Completed
Push — 0.7.dev ( c2d6e2...fccf24 )
by Andrei
52s
created

gaussian()   B

Complexity

Conditions 4

Size

Total Lines 22

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
dl 0
loc 22
rs 8.9197
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: Expectation-Maximization Algorithm for Gaussian Mixture Model.
4
5
@authors Andrei Novikov ([email protected])
6
@date 2014-2017
7
@copyright GNU Public License
8
9
@cond GNU_PUBLIC_LICENSE
10
    PyClustering is free software: you can redistribute it and/or modify
11
    it under the terms of the GNU General Public License as published by
12
    the Free Software Foundation, either version 3 of the License, or
13
    (at your option) any later version.
14
    
15
    PyClustering is distributed in the hope that it will be useful,
16
    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
    GNU General Public License for more details.
19
    
20
    You should have received a copy of the GNU General Public License
21
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
22
@endcond
23
24
"""
25
26
27
import numpy;
0 ignored issues
show
Configuration introduced by
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
28
import random;
29
30
from pyclustering.cluster import cluster_visualizer;
31
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer;
32
from pyclustering.cluster.kmeans import kmeans;
33
34
from pyclustering.utils import pi, calculate_ellipse_description, euclidean_distance_sqrt;
35
36
from enum import IntEnum;
37
38
import matplotlib.pyplot as plt;
0 ignored issues
show
Configuration introduced by
The import matplotlib.pyplot could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
39
import matplotlib.animation as animation;
0 ignored issues
show
Configuration introduced by
The import matplotlib.animation could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
40
from matplotlib import patches;
0 ignored issues
show
Configuration introduced by
The import matplotlib could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
41
42
43
44
def gaussian(data, mean, covariance):
45
    dimension = float(len(data[0]));
46
 
47
    if (dimension != 1.0):
48
        inv_variance = numpy.linalg.pinv(covariance);
49
    else:
50
        inv_variance = 1.0 / covariance;
51
    
52
    divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.linalg.norm(covariance));
53
    if (divider != 0.0):
54
        right_const = 1.0 / divider;
55
    else:
56
        right_const = float('inf');
57
    
58
    result = [];
59
    
60
    for point in data:
61
        mean_delta = point - mean;
62
        point_gaussian = right_const * numpy.exp( -0.5 * mean_delta.dot(inv_variance).dot(numpy.transpose(mean_delta)) );
63
        result.append(point_gaussian);
64
    
65
    return result;
66
67
68
69
class ema_init_type(IntEnum):
70
    RANDOM_INITIALIZATION = 0;
71
    KMEANS_INITIALIZATION = 1;
72
73
74
75
class ema_initializer():
76
    __MAX_GENERATION_ATTEPTS = 10;
77
    
78
    def __init__(self, sample, amount):
79
        self.__sample = sample;
80
        self.__amount = amount;
81
82
83
    def initialize(self, init_type = ema_init_type.KMEANS_INITIALIZATION):
84
        if (init_type == ema_init_type.KMEANS_INITIALIZATION):
85
            return self.__initialize_kmeans();
86
        
87
        elif (init_type == ema_init_type.RANDOM_INITIALIZATION):
88
            return self.__initialize_random();
89
        
90
        raise NameError("Unknown type of EM algorithm initialization is specified.");
91
92
93
    def __calculate_initial_clusters(self, centers):
94
        """!
95
        @brief Calculate Euclidean distance to each point from the each cluster. 
96
        @brief Nearest points are captured by according clusters and as a result clusters are updated.
97
        
98
        @return (list) updated clusters as list of clusters. Each cluster contains indexes of objects from data.
99
        
100
        """
101
        
102
        clusters = [[] for _ in range(len(centers))];
103
        for index_point in range(len(self.__sample)):
104
            index_optim, dist_optim = -1, 0.0;
105
             
106
            for index in range(len(centers)):
107
                dist = euclidean_distance_sqrt(self.__sample[index_point], centers[index]);
108
                 
109
                if ( (dist < dist_optim) or (index is 0)):
110
                    index_optim, dist_optim = index, dist;
111
             
112
            clusters[index_optim].append(index_point);
113
        
114
        return clusters;
115
116
117
    def __calculate_initial_covariances(self, initial_clusters):
118
        covariances = [];
119
        for initial_cluster in initial_clusters:
120
            if (len(initial_cluster) > 1):
121
                cluster_sample = [ self.__sample[index_point] for index_point in initial_cluster ];
122
                covariances.append(numpy.cov(cluster_sample, rowvar = False));
123
            else:
124
                dimension = len(self.__sample[0]);
125
                covariances.append(numpy.zeros((dimension, dimension))  + random.random() / 10.0);
126
        
127
        return covariances;
128
129
130
    def __initialize_random(self):
131
        initial_means = [];
132
        
133
        for _ in range(self.__amount):
134
            mean = self.__sample[ random.randint(0, len(self.__sample)) - 1 ];
135
            attempts = 0;
136
            while ( (mean in initial_means) and (attempts < ema_initializer.__MAX_GENERATION_ATTEPTS) ):
137
                mean = self.__sample[ random.randint(0, len(self.__sample)) - 1 ];
138
                attempts += 1;
139
            
140
            if (attempts == ema_initializer.__MAX_GENERATION_ATTEPTS):
141
                mean = [ value + (random.random() - 0.5) * value * 0.2 for value in mean ];
142
            
143
            initial_means.append(mean);
144
        
145
        initial_clusters = self.__calculate_initial_clusters(initial_means);
146
        initial_covariance = self.__calculate_initial_covariances(initial_clusters);
147
        
148
        return initial_means, initial_covariance;
149
150
151
    def __initialize_kmeans(self):
152
        initial_centers = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize();
153
        kmeans_instance = kmeans(self.__sample, initial_centers, ccore = True);
154
        kmeans_instance.process();
155
        
156
        means = kmeans_instance.get_centers();
157
        
158
        covariances = [];
159
        initial_clusters = kmeans_instance.get_clusters();
160
        for initial_cluster in initial_clusters:
161
            if (len(initial_cluster) > 1):
162
                cluster_sample = [ self.__sample[index_point] for index_point in initial_cluster ];
163
                covariances.append(numpy.cov(cluster_sample, rowvar = False));
164
            else:
165
                dimension = len(self.__sample[0]);
166
                covariances.append(numpy.zeros((dimension, dimension))  + random.random() / 10.0);
167
        
168
        return means, covariances;
169
170
171
172
class ema_observer:
173
    def __init__(self):
174
        self.__means_evolution = [];
175
        self.__covariances_evolution = [];
176
        self.__clusters_evolution = [];
177
178
179
    def __len__(self):
180
        return len(self.__means_evolution);
181
182
183
    def get_iterations(self):
184
        return len(self.__means_evolution);
185
186
187
    def get_evolution_means(self):
188
        return self.__means_evolution;
189
190
191
    def get_evolution_covariances(self):
192
        return self.__covariances_evolution;
193
194
195
    def get_evolution_clusters(self):
196
        return self.__clusters_evolution;
197
198
199
    def notify(self, means, covariances, clusters):
200
        self.__means_evolution.append(means);
201
        self.__covariances_evolution.append(covariances);
202
        self.__clusters_evolution.append(clusters);
203
204
205
206
class ema_visualizer:
207
    @staticmethod
208
    def show_clusters(clusters, sample, covariances, means, figure = None, display = True):
209
        visualizer = cluster_visualizer();
210
        visualizer.append_clusters(clusters, sample);
211
        
212
        if (figure is None):
213
            figure = visualizer.show(display = False);
214
        else:
215
            visualizer.show(figure = figure, display = False);
216
        
217
        if (len(sample[0]) == 2):
218
            ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means);
219
220
        if (display is True): 
221
            plt.show();
222
223
        return figure;
224
225
226
    @staticmethod
227
    def animate_cluster_allocation(data, observer, animation_velocity = 75, movie_fps = 1, save_movie = None):
228
        figure = plt.figure();
229
        
230
        def init_frame():
231
            return frame_generation(0);
232
        
233
        def frame_generation(index_iteration):
234
            figure.clf();
235
            
236
            figure.suptitle("Expectation maximixation algorithm (iteration: " + str(index_iteration) +")", fontsize = 18, fontweight = 'bold');
237
            
238
            clusters = observer.get_evolution_clusters()[index_iteration];
239
            covariances = observer.get_evolution_covariances()[index_iteration];
240
            means = observer.get_evolution_means()[index_iteration];
241
            
242
            ema_visualizer.show_clusters(clusters, data, covariances, means, figure, False);
243
            figure.subplots_adjust(top = 0.85);
244
            
245
            return [ figure.gca() ];
246
247
        iterations = len(observer);
248
        cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval = animation_velocity, init_func = init_frame, repeat_delay = 5000);
249
250
        if (save_movie is not None):
251
            cluster_animation.save(save_movie, writer = 'ffmpeg', fps = movie_fps, bitrate = 1500);
252
        else:
253
            plt.show();
254
255
256
    @staticmethod
257
    def __draw_ellipses(figure, visualizer, clusters, covariances, means):
258
        ax = figure.get_axes()[0];
259
        
260
        for index in range(len(clusters)):
261
            angle, width, height = calculate_ellipse_description(covariances[index]);
262
            color = visualizer.get_cluster_color(index, 0);
263
264
            ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color);
265
266
267
    @staticmethod
268
    def __draw_ellipse(ax, x, y, angle, width, height, color):
269
        if ((width > 0.0) and (height > 0.0)):
270
            ellipse = patches.Ellipse((x, y), width, height, alpha=0.2, angle=angle, linewidth=2, fill=True, zorder=2, color=color);
271
            ax.add_patch(ellipse);
272
273
274
275
class ema:
276
    def __init__(self, data, amount_clusters, means = None, variances = None, observer = None, tolerance = 0.00001, iterations = 100):
277
        self.__data = numpy.array(data);
278
        self.__amount_clusters = amount_clusters;
279
        self.__tolerance = tolerance;
280
        self.__iterations = iterations;
281
        self.__observer = observer;
282
        
283
        self.__means = means;
284
        self.__variances = variances;
285
        
286
        if ((means is None) or (variances is None)):
287
            self.__means, self.__variances = ema_initializer(data, amount_clusters).initialize(ema_init_type.KMEANS_INITIALIZATION);
288
            
289
            if (len(self.__means) != amount_clusters):
290
                self.__amount_clusters = len(self.__means);
291
        
292
        self.__rc = [ [0.0] * len(self.__data) for _ in range(amount_clusters) ];
293
        self.__pic = [1.0] * amount_clusters;
294
        self.__clusters = [];
295
        self.__gaussians = [ [] for _ in range(amount_clusters) ];
296
        self.__stop = False;
297
298
299
    def process(self):
300
        previous_likelihood = -200000;
301
        current_likelihood = -100000;
302
        
303
        current_iteration = 0;
304
        while( (self.__stop is False) and (abs(previous_likelihood - current_likelihood) > self.__tolerance) and (current_iteration < self.__iterations) ):
305
            self.__expectation_step();
306
            self.__maximization_step();
307
            
308
            current_iteration += 1;
309
            
310
            self.__extract_clusters();
311
            self.__notify();
312
            
313
            previous_likelihood = current_likelihood;
314
            current_likelihood = self.__log_likelihood();
315
            self.__stop = self.__get_stop_condition();
316
317
318
    def get_clusters(self):
319
        return self.__clusters;
320
321
322
    def get_centers(self):
323
        return self.__means;
324
325
326
    def get_covariances(self):
327
        return self.__variances;
328
329
330
    def __erase_empty_clusters(self):
331
        clusters, means, variances, pic, gaussians, rc = [], [], [], [], [], [];
332
333
        for index_cluster in range(len(self.__clusters)):
334
            if (len(self.__clusters[index_cluster]) > 0):
335
                clusters.append(self.__clusters[index_cluster]);
336
                means.append(self.__means[index_cluster]);
337
                variances.append(self.__variances[index_cluster]);
338
                pic.append(self.__pic[index_cluster]);
339
                gaussians.append(self.__gaussians[index_cluster]);
340
                rc.append(self.__rc[index_cluster]);
341
        
342
        if (len(self.__clusters) != len(clusters)):
343
            self.__clusters, self.__means, self.__variances, self.__pic = clusters, means, variances, pic;
344
            self.__gaussians, self.__rc = gaussians, rc;
345
            self.__amount_clusters = len(self.__clusters);
346
347
348
    def __notify(self):
349
        if (self.__observer is not None):
350
            self.__observer.notify(self.__means, self.__variances, self.__clusters);
351
352
353
    def __extract_clusters(self):
354
        self.__clusters = [ [] for _ in range(self.__amount_clusters) ];
355
        for index_point in range(len(self.__data)):
356
            candidates = [];
357
            for index_cluster in range(self.__amount_clusters):
358
                candidates.append((index_cluster, self.__rc[index_cluster][index_point]));
359
            
360
            index_winner = max(candidates, key = lambda candidate : candidate[1])[0];
361
            self.__clusters[index_winner].append(index_point);
362
        
363
        self.__erase_empty_clusters();
364
365
366
    def __log_likelihood(self):
367
        likelihood = 0.0;
368
        
369
        for index_point in range(len(self.__data)):
370
            particle = 0.0;
371
            for index_cluster in range(self.__amount_clusters):
372
                particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point];
373
            
374
            likelihood += numpy.log(particle);
375
        
376
        return likelihood;
377
378
379
    def __probabilities(self, index_cluster, index_point):
380
        divider = 0.0;
381
        for i in range(self.__amount_clusters):
382
            divider += self.__pic[i] * self.__gaussians[i][index_point];
383
        
384
        if ( (divider != 0.0) and (divider != float('inf')) ):
385
            return self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider;
386
        
387
        return float('nan');
388
389
390
    def __expectation_step(self):
391
        self.__gaussians = [ [] for _ in range(self.__amount_clusters) ];
392
        for index in range(self.__amount_clusters):
393
            self.__gaussians[index] = gaussian(self.__data, self.__means[index], self.__variances[index]);
394
        
395
        self.__rc = [ [0.0] * len(self.__data) for _ in range(self.__amount_clusters) ];
396
        for index_cluster in range(self.__amount_clusters):
397
            for index_point in range(len(self.__data)):
398
                self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point);
399
400
401
    def __maximization_step(self):
402
        self.__pic = [];
403
        self.__means = [];
404
        self.__variances = [];
405
        
406
        amount_impossible_clusters = 0;
407
        
408
        for index_cluster in range(self.__amount_clusters):
409
            mc = numpy.sum(self.__rc[index_cluster]);
410
            
411
            if (mc == 0.0):
412
                amount_impossible_clusters += 1;
413
                continue;
414
            
415
            self.__pic.append( mc / len(self.__data) );
416
            self.__means.append( self.__update_mean(self.__rc[index_cluster], mc) );
417
            self.__variances.append( self.__update_covariance(self.__means[-1], self.__rc[index_cluster], mc) );
418
        
419
        self.__amount_clusters -= amount_impossible_clusters;
420
421
422
    def __get_stop_condition(self):
423
        for covariance in self.__variances:
424
            if (numpy.linalg.norm(covariance) == 0.0):
425
                return True;
426
        
427
        return False;
428
429
430
    def __update_covariance(self, means, rc, mc):
431
        covariance = 0.0;
432
        for index_point in range(len(self.__data)):
433
            deviation = numpy.array( [ self.__data[index_point] - means ]);
434
            covariance += rc[index_point] * deviation.T.dot(deviation);
435
        
436
        covariance = covariance / mc;
437
        return covariance;
438
439
440
    def __update_mean(self, rc, mc):
441
        mean = 0.0;
442
        for index_point in range(len(self.__data)):
443
            mean += rc[index_point] * self.__data[index_point];
444
        
445
        mean = mean / mc;
446
        return mean;