Completed
Branch 0.7.dev (7db1ce), created by Andrei at 01:27

ema.__probabilities()    A

Complexity
    Conditions      2

Size
    Total Lines     7

Duplication
    Lines           0
    Ratio           0 %

Importance
    Changes         0

Metric    Value
cc        2
dl        0
loc       7
rs        9.4285
c         0
b         0
f         0
"""!

@brief Cluster analysis algorithm: Expectation-Maximization Algorithm (EMA).
@details Implementation based on article:
         - 

@authors Andrei Novikov ([email protected])
@date 2014-2017
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy;
Issue (configuration): the import numpy could not be resolved. This usually means either that the dependency is not installed in the analysis environment (it can be installed through before_commands in .scrutinizer.yml, using pip or pip3 for the matching Python version; virtualenv is not used for the pylint run), or that an __init__.py file is missing in one of the module folders.

from pyclustering.utils import pi;
import matplotlib.pyplot as plt;
Issues: the import matplotlib.pyplot could not be resolved (same possible causes as for the numpy import above). Unused code: matplotlib.pyplot imported as plt is never used.

from _operator import index
Issue (unused code): index imported from _operator is never used.


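# Evaluates the multivariate normal density for every point in 'data':
# N(x | mean, covariance) = (2*pi)^(-d/2) * |covariance|^(-1/2)
#                           * exp(-0.5 * (x - mean)^T covariance^(-1) (x - mean)).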
def gaussian_multivariable(data, mean = None, covariance = None):
    dimension = len(data[0]);

    if (mean is None):
        mean = numpy.mean(data, axis = 0);
    
    if (covariance is None):
        covariance = numpy.cov(data, rowvar = False);
    
    inv_covariance = numpy.linalg.inv(covariance);
    right_const = 1.0 / ( (pi * 2.0) ** (dimension / 2.0) * numpy.linalg.det(covariance) ** 0.5 );
    
    result = [];
    
    for point in data:
        mean_delta = point - mean;
        point_gaussian = right_const * numpy.exp(-0.5 * mean_delta.dot(inv_covariance).dot(mean_delta));
        result.append(point_gaussian);
    
    return result;


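# Evaluates the one-dimensional normal density N(x | mean, variance) for every
# point in 'data', defaulting to the sample mean and the unbiased sample variance.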
def gaussian_singlevariable(data, mean = None, variance = None):
    if (mean is None):
        mean = numpy.mean(data);
    
    if (variance is None):
        variance = numpy.var(data, ddof = 1);

    right_const = 1.0 / ( 2.0 * pi * variance ) ** 0.5;
    result = [];
    
    for point in data:
        mean_delta = point - mean;
        point_gaussian = right_const * numpy.exp(-mean_delta ** 2.0 / (2.0 * variance) );
        result.append(point_gaussian);
    
    return result;


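# Dispatches to the single- or multi-dimensional density depending on whether the
# samples in 'data' are scalars or vectors.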
def gaussian(data, mean = None, variance = None):
    try: dimension = len(data[0]);
    except TypeError: dimension = 1;
    
    if (dimension == 1):
        return gaussian_singlevariable(data, mean, variance);
    else:
        return gaussian_multivariable(data, mean, variance);


# data = numpy.random.normal(0, 0.1, 100);
# one_gaussian = gaussian(data);
# one_gaussian.sort();
# 
# print(one_gaussian);
# 
# axis = plt.subplot(111);
# plt.plot(one_gaussian, 'b-', linewidth = 2.0);
# plt.show();


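# Expectation-Maximization clustering: starting from (random) means and
# covariances, the algorithm alternates an expectation step (compute the
# responsibility of every cluster for every point) with a maximization step
# (re-estimate cluster weights, means and covariances from those
# responsibilities) until the log-likelihood stops changing noticeably.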
class ema:
    def __init__(self, data, amount_clusters, means = None, variances = None):
        self.__data = data;
        self.__amount_clusters = amount_clusters;
        
        self.__means = means;
        if (means is None):
            self.__means = self.__get_random_means(data, amount_clusters);

        self.__variances = variances;
        if (variances is None):
            self.__variances = self.__get_random_covariances(data, amount_clusters);
        
        self.__rc = [ [0.0] * len(self.__data) for _ in range(amount_clusters) ];
        self.__pic = [1.0] * amount_clusters;
        self.__clusters = [];
        self.__gaussians = [ [] for _ in range(amount_clusters) ];


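    # Runs EM until the absolute change of the log-likelihood between two
    # consecutive iterations drops below 0.1.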
    def process(self):
        previous_likelihood = -1000.0;
        current_likelihood = 0.0;
        
        while(abs(previous_likelihood - current_likelihood) > 0.1):
            self.__expectation_step();
            self.__maximization_step();
            
            previous_likelihood = current_likelihood;
            current_likelihood = self.__log_likelihood();


    def get_clusters(self):
        return self.__clusters;


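Note: self.__clusters is never filled in by process() in this revision, so get_clusters() always returns an empty list. A minimal sketch of an extraction step (the __extract_clusters name and its placement are assumptions, not part of the reviewed code) would assign every point to the cluster with the highest responsibility after convergence:

    def __extract_clusters(self):
        self.__clusters = [ [] for _ in range(self.__amount_clusters) ];
        for index_point in range(len(self.__data)):
            responsibilities = [ self.__rc[index_cluster][index_point] for index_cluster in range(self.__amount_clusters) ];
            self.__clusters[responsibilities.index(max(responsibilities))].append(index_point);

    # Log-likelihood of the data under the current mixture model:
    # log L = sum_i log( sum_c pic[c] * N(x_i | means[c], variances[c]) ).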
    def __log_likelihood(self):
        likelihood = 0.0;
        
        for index_point in range(len(self.__data)):
            particle = 0.0;
            for index_cluster in range(self.__amount_clusters):
                particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point];
            
            likelihood += numpy.log(particle);
        
        return likelihood;


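    # Responsibility of cluster 'index_cluster' for point 'index_point':
    # rc = pic[c] * N(x_i | means[c], variances[c]) / sum_k pic[k] * N(x_i | means[k], variances[k]).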
    def __probabilities(self, index_cluster, index_point):
        divider = 0.0;
        for i in range(self.__amount_clusters):
            divider += self.__pic[i] * self.__gaussians[i][index_point];
        
        rc = self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider;
        return rc;


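    # Expectation step: recompute the Gaussian densities of every point for every
    # cluster and refresh the responsibility matrix self.__rc.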
    def __expectation_step(self):
        for index in range(self.__amount_clusters):

Issue (comprehensibility): index re-defines a name that is already available in the outer scope (the index imported from _operator above). Shadowing outer-scope names is usually unintentional and can lead to unexpected behavior.

            self.__gaussians[index] = gaussian(self.__data, self.__means[index], self.__variances[index]);
        
        for index_cluster in range(self.__amount_clusters):
            for index_point in range(len(self.__data)):
                self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point);


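    # Maximization step: re-estimate the weight (pic), mean and variance of every
    # cluster from the current responsibilities, where mc is the effective number
    # of points assigned to the cluster.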
    def __maximization_step(self):
        for index_cluster in range(self.__amount_clusters):
            mc = numpy.sum(self.__rc[index_cluster]);
            
            self.__pic[index_cluster] = mc / len(self.__data);
            self.__means[index_cluster] = self.__update_mean(index_cluster, mc);
            self.__variances[index_cluster] = self.__update_variance(index_cluster, mc);


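    # Responsibility-weighted covariance of the cluster:
    # variances[c] = sum_i rc[c][i] * (x_i - means[c]) (x_i - means[c])^T / mc.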
    def __update_variance(self, index_cluster, mc):
        variance = 0.0;
        for index_point in range(len(self.__data)):
            deviation = numpy.array(self.__data[index_point]) - self.__means[index_cluster];
            variance += self.__rc[index_cluster][index_point] * numpy.outer(deviation, deviation);
        
        variance = variance / mc;
        return variance;


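    # Responsibility-weighted mean of the cluster: means[c] = sum_i rc[c][i] * x_i / mc.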
    def __update_mean(self, index_cluster, mc):
        mean = 0.0;
        for index_point in range(len(self.__data)):
            mean += self.__rc[index_cluster][index_point] * numpy.array(self.__data[index_point]);
        
        mean = mean / mc;
        return mean;


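    # Random initialization helpers: covariances (and variances) start from the
    # statistics of the whole data set plus a small random perturbation, means
    # start from randomly chosen points.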
    def __get_random_covariances(self, data, amount):
        covariances = [];
        data_covariance = numpy.cov(data, rowvar = False);
        for _ in range(amount):
            random_appendix = numpy.min(data_covariance) * 0.2 * numpy.random.random();
            covariances.append(data_covariance + random_appendix);
        
        return covariances;


    def __get_random_variances(self, data, amount):
        variances = [];
        data_variance = numpy.var(data, ddof = 1);
        for _ in range(amount):
            random_appendix = data_variance * 0.1 * numpy.random.random();
            variances.append(data_variance + random_appendix);
        
        return variances;


    def __get_random_means(self, data, amount):
        means = [];
        for _ in range(amount):
            random_index = numpy.random.randint(0, len(data));
            means.append(numpy.array(data[random_index]));
        
        return means;


# from pyclustering.samples.definitions import SIMPLE_SAMPLES, FCPS_SAMPLES;
# from pyclustering.utils import read_sample;
# 
# sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1);
# ema_instance = ema(sample, 2);
# ema_instance.process();
# clusters = ema_instance.get_clusters();
# 
# print(clusters);