Completed
Push — 0.7.dev ( 9bc136...42a05b )
by Andrei
01:07
created

ema_observer.get_evolution_covariances()   A

Complexity

Conditions 1

Size

Total Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 2
rs 10
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: Expectation-Maximization Algorithm (EMA).
4
@details Implementation based on article:
5
         - 
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
import numpy;
0 ignored issues
show
Configuration introduced by
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
30
import random;
31
32
from pyclustering.cluster import cluster_visualizer;
33
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer;
34
from pyclustering.cluster.kmeans import kmeans;
35
36
from pyclustering.utils import pi, calculate_ellipse_description;
37
38
from enum import IntEnum;
39
40
import matplotlib.pyplot as plt;
0 ignored issues
show
Configuration introduced by
The import matplotlib.pyplot could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
41
import matplotlib.animation as animation;
0 ignored issues
show
Configuration introduced by
The import matplotlib.animation could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
42
from matplotlib import patches;
0 ignored issues
show
Configuration introduced by
The import matplotlib could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
43
44
45
46
def gaussian(data, mean, covariance):
47
    dimension = float(len(data[0]));
48
 
49
    if (dimension != 1.0):
50
        inv_variance = numpy.linalg.pinv(covariance);
51
    else:
52
        inv_variance = 1.0 / covariance;
53
    
54
    divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.linalg.norm(covariance));
55
    right_const = 1.0 / divider;
56
     
57
    result = [];
58
     
59
    for point in data:
60
        mean_delta = point - mean;
61
        point_gaussian = right_const * numpy.exp( -0.5 * mean_delta.dot(inv_variance).dot(numpy.transpose(mean_delta)) );
62
        result.append(point_gaussian);
63
     
64
    return result;
65
66
67
68
class ema_init_type(IntEnum):
69
    RANDOM_INITIALIZATION = 0;
70
    KMEANS_INITIALIZATION = 1;
71
72
73
74
class ema_initializer():
75
    def __init__(self, sample, amount):
76
        self.__sample = sample;
77
        self.__amount = amount;
78
79
80
    def initialize(self, init_type = ema_init_type.KMEANS_INITIALIZATION):
81
        if (init_type == ema_init_type.KMEANS_INITIALIZATION):
82
            return self.__initialize_kmeans();
83
        
84
        elif (init_type == ema_init_type.RANDOM_INITIALIZATION):
85
            return self.__initialize_random();
86
        
87
        raise NameError("Unknown type of EM algorithm initialization is specified.");
88
89
90
    def __initialize_random(self):
91
        initial_means = [];
92
        initial_covariance = [];
93
        
94
        for _ in range(self.__amount):
95
            mean = self.__sample[ random.randint(0, len(self.__sample)) - 1 ];
96
            while (mean in initial_means):
97
                mean = self.__sample[ random.randint(0, len(self.__sample)) - 1 ];
98
            
99
            initial_means.append(mean);
100
            
101
            covariance = numpy.cov(self.__sample, rowvar = False);
102
            covariance = numpy.divide(covariance, self.__amount + 1);
103
        
104
            initial_covariance.append(covariance);
105
        
106
        return initial_means, initial_covariance;
107
108
109
    def __initialize_kmeans(self):
110
        initial_centers = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize();
111
        kmeans_instance = kmeans(self.__sample, initial_centers, ccore = True);
112
        kmeans_instance.process();
113
        
114
        means = kmeans_instance.get_centers();
115
        
116
        covariances = [];
117
        initial_clusters = kmeans_instance.get_clusters();
118
        for initial_cluster in initial_clusters:
119
            cluster_sample = [ self.__sample[index_point] for index_point in initial_cluster ];
120
            covariances.append(numpy.cov(cluster_sample, rowvar = False));
121
        
122
        return means, covariances;
123
124
125
126
class ema_observer:
127
    def __init__(self):
128
        self.__means_evolution = [];
129
        self.__covariances_evolution = [];
130
        self.__clusters_evolution = [];
131
132
133
    def __len__(self):
134
        return len(self.__means_evolution);
135
136
137
    def get_iterations(self):
138
        return len(self.__means_evolution);
139
140
141
    def get_evolution_means(self):
142
        return self.__means_evolution;
143
144
145
    def get_evolution_covariances(self):
146
        return self.__covariances_evolution;
147
148
149
    def get_evolution_clusters(self):
150
        return self.__clusters_evolution;
151
152
153
    def notify(self, means, covariances, clusters):
154
        self.__means_evolution.append(means);
155
        self.__covariances_evolution.append(covariances);
156
        self.__clusters_evolution.append(clusters);
157
158
159
160
class ema_visualizer:
161
    @staticmethod
162
    def show_clusters(clusters, sample, covariances, means, figure = None, display = True):
163
        visualizer = cluster_visualizer();
164
        visualizer.append_clusters(clusters, sample);
165
        
166
        if (figure is None):
167
            figure = visualizer.show(display = False);
168
        else:
169
            visualizer.show(figure = figure, display = False);
170
        
171
        if (len(sample[0]) == 2):
172
            ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means);
173
174
        if (display is True): 
175
            plt.show();
176
177
        return figure;
178
179
180
    @staticmethod
181
    def animate_cluster_allocation(data, observer, animation_velocity = 75, movie_fps = 1, save_movie = None):
182
        figure = plt.figure();
183
        
184
        def init_frame():
185
            return frame_generation(0);
186
        
187
        def frame_generation(index_iteration):
188
            figure.clf();
189
            
190
            figure.suptitle("Expectation maximixation algorithm (iteration: " + str(index_iteration) +")", fontsize = 18, fontweight = 'bold');
191
            
192
            clusters = observer.get_evolution_clusters()[index_iteration];
193
            covariances = observer.get_evolution_covariances()[index_iteration];
194
            means = observer.get_evolution_means()[index_iteration];
195
            
196
            ema_visualizer.show_clusters(clusters, data, covariances, means, figure, False);
197
            figure.subplots_adjust(top = 0.85);
198
            
199
            return [ figure.gca() ];
200
201
        iterations = len(observer);
202
        cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval = animation_velocity, init_func = init_frame, repeat_delay = 5000);
203
204
        if (save_movie is not None):
205
            cluster_animation.save(save_movie, writer = 'ffmpeg', fps = movie_fps, bitrate = 1500);
206
        else:
207
            plt.show();
208
209
210
    @staticmethod
211
    def __draw_ellipses(figure, visualizer, clusters, covariances, means):
212
        print(len(clusters));
213
        print([len(cluster) for cluster in clusters]);
214
        print(clusters);
215
        
216
        ax = figure.get_axes()[0];
217
        
218
        for index in range(len(clusters)):
219
            angle, width, height = calculate_ellipse_description(covariances[index]);
220
            color = visualizer.get_cluster_color(index, 0);
221
            
222
            ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color);
223
224
225
    @staticmethod
226
    def __draw_ellipse(ax, x, y, angle, width, height, color):
227
        ellipse = patches.Ellipse((x, y), width, height, alpha=0.2, angle=angle, linewidth=2, fill=True, zorder=2, color=color);
228
        ax.add_patch(ellipse);
229
230
231
232
class ema:
233
    def __init__(self, data, amount_clusters, means = None, variances = None, observer = None, tolerance = 0.00001, iterations = 100):
234
        self.__data = numpy.array(data);
235
        self.__amount_clusters = amount_clusters;
236
        self.__tolerance = tolerance;
237
        self.__iterations = iterations;
238
        self.__observer = observer;
239
        
240
        self.__means = means;
241
        self.__variances = variances;
242
        
243
        if ((means is None) or (variances is None)):
244
            self.__means, self.__variances = self.__get_initial_parameters(data, amount_clusters);
245
        
246
        self.__rc = [ [0.0] * len(self.__data) for _ in range(amount_clusters) ];
247
        self.__pic = [1.0] * amount_clusters;
248
        self.__clusters = [];
249
        self.__gaussians = [ [] for _ in range(amount_clusters) ];
250
        self.__stop = False;
251
252
253
    def process(self):
254
        self.__clusters = None;
255
        
256
        previous_likelihood = -200000;
257
        current_likelihood = -100000;
258
        
259
        current_iteration = 0;
260
        while( (self.__stop is False) and (abs(previous_likelihood - current_likelihood) > self.__tolerance) and (current_iteration < self.__iterations) ):
261
            self.__expectation_step();
262
            self.__maximization_step();
263
            
264
            previous_likelihood = current_likelihood;
265
            current_likelihood = self.__log_likelihood();
266
            self.__stop = self.__get_stop_condition();
267
            
268
            current_iteration += 1;
269
270
            if (self.__observer is not None):
271
                self.__observer.notify(self.__means, self.__variances, self.__extract_clusters());
272
        
273
        self.__clusters = self.__extract_clusters();
274
275
276
    def get_clusters(self):
277
        return self.__clusters;
278
279
280
    def get_centers(self):
281
        return self.__means;
282
283
284
    def get_covariances(self):
285
        return self.__variances;
286
287
288
    def __notify(self):
289
        if (self.__observer is not None):
290
            clusters = self.__extract_clusters();
291
            self.__notify(self.__means, self.__variances, clusters);
0 ignored issues
show
Bug introduced by
There seem to be too many positional arguments for this method call.
Loading history...
292
293
294
    def __extract_clusters(self):
295
        clusters = [ [] for _ in range(self.__amount_clusters) ];
296
        for index_point in range(len(self.__data)):
297
            candidates = [];
298
            for index_cluster in range(self.__amount_clusters):
299
                candidates.append((index_cluster, self.__rc[index_cluster][index_point]));
300
            
301
            index_winner = max(candidates, key = lambda candidate : candidate[1])[0];
302
            clusters[index_winner].append(index_point);
303
        
304
        return clusters;
305
306
307
    def __log_likelihood(self):
308
        likelihood = 0.0;
309
        
310
        for index_point in range(len(self.__data)):
311
            particle = 0.0;
312
            for index_cluster in range(self.__amount_clusters):
313
                particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point];
314
            
315
            likelihood += numpy.log(particle);
316
        
317
        return likelihood;
318
319
320
    def __probabilities(self, index_cluster, index_point):
321
        divider = 0.0;
322
        for i in range(self.__amount_clusters):
323
            divider += self.__pic[i] * self.__gaussians[i][index_point];
324
        
325
        rc = self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider;
326
        return rc;
327
328
329
    def __expectation_step(self):
330
        self.__gaussians = [ [] for _ in range(self.__amount_clusters) ];
331
        for index in range(self.__amount_clusters):
332
            self.__gaussians[index] = gaussian(self.__data, self.__means[index], self.__variances[index]);
333
        
334
        self.__rc = [ [0.0] * len(self.__data) for _ in range(self.__amount_clusters) ];
335
        for index_cluster in range(self.__amount_clusters):
336
            for index_point in range(len(self.__data)):
337
                self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point);
338
339
340
    def __maximization_step(self):
341
        self.__pic = [];
342
        self.__means = [];
343
        self.__variances = [];
344
        
345
        amount_impossible_clusters = 0;
346
        
347
        for index_cluster in range(self.__amount_clusters):
348
            mc = numpy.sum(self.__rc[index_cluster]);
349
            
350
            if (mc == 0.0):
351
                amount_impossible_clusters += 1;
352
                continue;
353
            
354
            self.__pic.append( mc / len(self.__data) );
355
            self.__means.append( self.__update_mean(self.__rc[index_cluster], mc) );
356
            self.__variances.append( self.__update_covariance(self.__means[-1], self.__rc[index_cluster], mc) );
357
        
358
        self.__amount_clusters -= amount_impossible_clusters;
359
360
361
    def __get_stop_condition(self):
362
        for covariance in self.__variances:
363
            if (numpy.linalg.norm(covariance) == 0.0):
364
                return True;
365
        
366
        return False;
367
368
369
    def __update_covariance(self, means, rc, mc):
370
        covariance = 0.0;
371
        for index_point in range(len(self.__data)):
372
            deviation = numpy.array( [ self.__data[index_point] - means ]);
373
            covariance += rc[index_point] * deviation.T.dot(deviation);
374
        
375
        covariance = covariance / mc;
376
        return covariance;
377
378
379
    def __update_mean(self, rc, mc):
380
        mean = 0.0;
381
        for index_point in range(len(self.__data)):
382
            mean += rc[index_point] * self.__data[index_point];
383
        
384
        mean = mean / mc;
385
        return mean;
386
387
388
    def __get_initial_parameters(self, sample, amount_clusters):
389
        return ema_initializer(sample, amount_clusters).initialize(ema_init_type.KMEANS_INITIALIZATION);