Completed
Push — 0.7.dev ( 619631...fe456d )
by Andrei
01:27
created

ema   B

Complexity

Total Complexity 39

Size/Duplication

Total Lines 152
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
dl 0
loc 152
rs 8.2857
c 0
b 0
f 0
wmc 39

14 Methods

Rating   Name   Duplication   Size   Complexity  
A get_centers() 0 2 1
A __update_mean() 0 7 2
A process() 0 14 4
B __init__() 0 17 5
B get_clusters() 0 14 5
A __get_random_means() 0 12 3
A __expectation_step() 0 8 4
A __maximization_step() 0 8 2
A __probabilities() 0 7 2
A get_covariances() 0 2 1
A __log_likelihood() 0 11 3
A __update_covariance() 0 8 2
A __get_random_covariances() 0 8 2
A __get_stop_flag() 0 7 3
1
"""!
2
3
@brief Cluster analysis algorithm: Expectation-Maximization Algorithm (EMA).
4
@details Implementation based on article:
5
         - 
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
import numpy;
0 ignored issues
show
Configuration introduced by
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
30
31
from pyclustering.utils import pi;
32
33
import matplotlib.pyplot as plt;
0 ignored issues
show
Configuration introduced by
The import matplotlib.pyplot could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
Unused Code introduced by
Unused matplotlib.pyplot imported as plt
Loading history...
34
from _operator import index
0 ignored issues
show
Unused Code introduced by
Unused index imported from _operator
Loading history...
35
36
37
def gaussian(data, mean = None, covariance = None):
38
    dimension = len(data[0]);
39
 
40
    if (mean is None):
41
        mean = numpy.mean(data);
42
     
43
    if (covariance is None):
44
        covariance = numpy.cov(data, rowvar = False);
45
     
46
    inv_variance = numpy.linalg.inv(covariance);
47
    right_const = 1.0 / ( (pi * 2.0) ** (dimension / 2.0) * numpy.linalg.norm(covariance) ** 0.5 );
48
     
49
    result = [];
50
     
51
    for point in data:
52
        mean_delta = point - mean;
53
        point_gaussian = right_const * numpy.exp( -0.5 * mean_delta.dot(inv_variance).dot(numpy.transpose(mean_delta)) );
54
        result.append(point_gaussian);
55
     
56
    return result;
57
58
59
class ema:
    """!
    @brief Cluster analysis via the Expectation-Maximization Algorithm (EMA)
            for a mixture of gaussians.

    @details A point is assigned to cluster 'c' when its responsibility
              rc[c][point] >= 0.5, so with overlapping mixtures a point may
              appear in more than one cluster.

    """

    def __init__(self, data, amount_clusters, means = None, variances = None):
        """!
        @brief Initializes the EM algorithm.

        @param[in] data (array_like): Input points, shape (n_points, dimension).
        @param[in] amount_clusters (uint): Amount of gaussian components (clusters).
        @param[in] means (list): Initial mean of each component; random data points are picked when None.
        @param[in] variances (list): Initial covariance matrix of each component; randomized sample covariances are used when None.

        """
        self.__data = numpy.array(data)
        self.__amount_clusters = amount_clusters

        self.__means = means
        if means is None:
            self.__means = self.__get_random_means(data, amount_clusters)

        self.__variances = variances
        if variances is None:
            self.__variances = self.__get_random_covariances(data, amount_clusters)

        # rc[c][p] - responsibility of component 'c' for point 'p'.
        self.__rc = [ [0.0] * len(self.__data) for _ in range(amount_clusters) ]
        # pic[c] - mixing coefficient (prior weight) of component 'c'.
        self.__pic = [1.0] * amount_clusters
        self.__clusters = []
        # gaussians[c][p] - density of component 'c' at point 'p' (filled by the E-step).
        self.__gaussians = [ [] for _ in range(amount_clusters) ]
        self.__stop = False

    def process(self):
        """!
        @brief Runs EM iterations until the log-likelihood stops improving
                (absolute change <= 1e-5), becomes non-negative, or a
                covariance matrix degenerates (see __get_stop_flag()).

        """
        self.__clusters = None

        previous_likelihood = -10000500
        current_likelihood = -10000000

        # Likelihoods are scalars, so they are compared directly
        # (the previous numpy.min() wrapping was a no-op).
        while (self.__stop is False) and (abs(previous_likelihood - current_likelihood) > 0.00001) and (current_likelihood < 0.0):
            self.__expectation_step()
            self.__maximization_step()

            previous_likelihood = current_likelihood
            current_likelihood = self.__log_likelihood()
            self.__stop = self.__get_stop_flag()

    def get_clusters(self):
        """!
        @brief Builds (on first call after processing) and returns clusters as
                lists of point indexes; a point belongs to every cluster whose
                responsibility for it is >= 0.5.

        @return (list) List of clusters, each cluster is a list of point indexes.

        """
        if self.__clusters is not None:
            return self.__clusters

        self.__clusters = []
        for index_cluster in range(self.__amount_clusters):
            cluster = [ index_point for index_point in range(len(self.__data))
                        if self.__rc[index_cluster][index_point] >= 0.5 ]
            self.__clusters.append(cluster)

        return self.__clusters

    def get_centers(self):
        """!
        @brief Returns the current mean of each gaussian component.

        """
        return self.__means

    def get_covariances(self):
        """!
        @brief Returns the current covariance matrix of each gaussian component.

        """
        return self.__variances

    def __log_likelihood(self):
        """!
        @brief Calculates the data log-likelihood under the current mixture:
                sum over points of log(sum_c pic[c] * N(point | c)).

        """
        likelihood = 0.0

        for index_point in range(len(self.__data)):
            particle = 0.0
            for index_cluster in range(self.__amount_clusters):
                particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point]

            likelihood += numpy.log(particle)

        return likelihood

    def __probabilities(self, index_cluster, index_point):
        """!
        @brief Calculates the responsibility of 'index_cluster' for
                'index_point' (posterior probability via Bayes rule).

        """
        divider = 0.0
        for i in range(self.__amount_clusters):
            divider += self.__pic[i] * self.__gaussians[i][index_point]

        return self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider

    def __expectation_step(self):
        """!
        @brief E-step: evaluates each component's density at every point and
                updates all responsibilities.

        """
        # Renamed loop variable (was 'index', shadowing a module-level import).
        for index_cluster in range(self.__amount_clusters):
            self.__gaussians[index_cluster] = gaussian(self.__data, self.__means[index_cluster], self.__variances[index_cluster])

        for index_cluster in range(self.__amount_clusters):
            for index_point in range(len(self.__data)):
                self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point)

    def __maximization_step(self):
        """!
        @brief M-step: re-estimates mixing coefficient, mean and covariance of
                each component from the current responsibilities.

        """
        for index_cluster in range(self.__amount_clusters):
            mc = numpy.sum(self.__rc[index_cluster])

            self.__pic[index_cluster] = mc / len(self.__data)
            self.__means[index_cluster] = self.__update_mean(index_cluster, mc)
            self.__variances[index_cluster] = self.__update_covariance(index_cluster, mc)

    def __get_stop_flag(self):
        """!
        @brief Reports whether iterations should stop because a covariance
                matrix degenerated (contains a zero entry).

        @return (bool) True when a degenerate covariance is detected.

        """
        for covariance in self.__variances:
            # NOTE(review): only the first row of each covariance matrix is
            # inspected, as in the original implementation - confirm intent.
            if min(covariance[0]) == 0:
                return True

        return False

    def __update_covariance(self, index_cluster, mc):
        """!
        @brief Calculates the responsibility-weighted covariance matrix of the
                specified component (normalized by 'mc').

        """
        covariance = 0.0
        for index_point in range(len(self.__data)):
            deviation = numpy.array( [ self.__data[index_point] - self.__means[index_cluster] ] )
            covariance += self.__rc[index_cluster][index_point] * deviation.T.dot(deviation)

        return covariance / mc

    def __update_mean(self, index_cluster, mc):
        """!
        @brief Calculates the responsibility-weighted mean of the specified
                component (normalized by 'mc').

        """
        mean = 0.0
        for index_point in range(len(self.__data)):
            mean += self.__rc[index_cluster][index_point] * self.__data[index_point]

        return mean / mc

    def __get_random_covariances(self, data, amount):
        """!
        @brief Produces 'amount' initial covariance matrices: the sample
                covariance of 'data' plus a small random perturbation.

        """
        covariances = []
        data_covariance = numpy.cov(data, rowvar = False)
        for _ in range(amount):
            random_appendix = numpy.min(data_covariance) * 0.2 * numpy.random.random()
            covariances.append(data_covariance + random_appendix)

        return covariances

    def __get_random_means(self, data, amount):
        """!
        @brief Picks 'amount' distinct random data points as initial means.

        @details Assumes amount <= len(data); otherwise no distinct choice
                  exists and the re-draw loop cannot terminate.

        """
        means = []
        used_indexes = []
        for _ in range(amount):
            random_index = numpy.random.randint(0, len(data))
            # Bug fix: re-draw while the index is already taken, then record
            # the chosen index; previously nothing was ever recorded, so
            # duplicate initial means were possible.
            while random_index in used_indexes:
                random_index = numpy.random.randint(0, len(data))

            used_indexes.append(random_index)
            means.append(numpy.array(data[random_index]))

        return means
211
212
213
214
# from pyclustering.samples.definitions import SIMPLE_SAMPLES, FCPS_SAMPLES;
215
# from pyclustering.utils import read_sample;
216
#   
217
# # sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE9);
218
# # ema_instance = ema(sample, 2);
219
# 
220
# sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2);
221
# ema_instance = ema(sample, 3);
222
# 
223
# ema_instance.process();
224
# clusters = ema_instance.get_clusters();
225
#   
226
# print(clusters);