Completed
Push — master (e45971...e8eda8) by Andrei
created 01:01
ema_visualizer.__draw_ellipses()   Rating: A

Complexity:   Conditions 2
Size:         Total Lines 9
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0

Metric   Value
cc       2
dl       0
loc      9
rs       9.6666
c        0
b        0
f        0
"""!

@brief Cluster analysis algorithm: Expectation-Maximization Algorithm for Gaussian Mixture Model.

@authors Andrei Novikov ([email protected])
@date 2014-2017
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy;
import random;

from pyclustering.cluster import cluster_visualizer;
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer;
from pyclustering.cluster.kmeans import kmeans;

from pyclustering.utils import pi, calculate_ellipse_description, euclidean_distance_sqrt;

from enum import IntEnum;

import matplotlib.pyplot as plt;
import matplotlib.animation as animation;
from matplotlib import patches;


def gaussian(data, mean, covariance):
    """!
    @brief Calculates gaussian for dataset using specified mean (mathematical expectation) and variance
            or covariance in case of multi-dimensional data.
    
    @param[in] data (list): Data that is used for gaussian calculation.
    @param[in] mean (float|numpy.array): Mathematical expectation used for calculation.
    @param[in] covariance (float|numpy.array): Variance or covariance matrix for calculation.
    
    @return (list) Value of gaussian function for each point in dataset.
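    
    Below is a minimal usage sketch; the sample values are hypothetical and serve illustration only:
    
    @code
        import numpy;
        
        from pyclustering.cluster.ema import gaussian;
        
        data = [ numpy.array([0.0, 0.0]), numpy.array([1.0, 1.0]), numpy.array([2.0, 0.5]) ];
        mean = numpy.array([1.0, 0.5]);
        covariance = numpy.array([[1.0, 0.0], [0.0, 1.0]]);
        
        densities = gaussian(data, mean, covariance);   # one density value per point
    @endcode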
    
    """
    dimension = float(len(data[0]));
 
    if (dimension != 1.0):
        inv_variance = numpy.linalg.pinv(covariance);
        # The gaussian normalization constant requires the determinant of the covariance
        # matrix; 'numpy.linalg.norm' (Frobenius norm) is not a substitute for it.
        determinant = numpy.linalg.det(covariance);
    else:
        inv_variance = 1.0 / covariance;
        determinant = covariance;
    
    divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.abs(determinant));
    if (divider != 0.0):
        right_const = 1.0 / divider;
    else:
        right_const = float('inf');
    
    result = [];
    
    for point in data:
        mean_delta = point - mean;
        point_gaussian = right_const * numpy.exp( -0.5 * mean_delta.dot(inv_variance).dot(numpy.transpose(mean_delta)) );
        result.append(point_gaussian);
    
    return result;


class ema_init_type(IntEnum):
    """!
    @brief Enumeration of initialization types for Expectation-Maximization algorithm.
    
    """
    
    ## Means are randomly taken from the input dataset and variance or covariance is calculated
    ## from the points that are nearest to each chosen mean.
    RANDOM_INITIALIZATION = 0;
    
    ## Two-step initialization. The first step is calculation of initial centers using the K-Means++
    ## method. The second step is K-Means clustering using the centers obtained in the first step.
    ## The obtained clusters and their centers are used for calculation of variances (covariances in
    ## case of multi-dimensional data).
    KMEANS_INITIALIZATION = 1;


class ema_initializer():
    """!
    @brief Provides services for preparing initial means and covariances for Expectation-Maximization algorithm.
    @details Initialization strategy is defined by the enumerator 'ema_init_type': random initialization or
              kmeans with kmeans++ initialization. Here is an example of initialization using the kmeans strategy:
    
    @code
        from pyclustering.utils import read_sample;
        from pyclustering.samples.definitions import COMMON_SAMPLES;
        from pyclustering.cluster.ema import ema_initializer, ema_init_type;
        
        sample = read_sample(COMMON_SAMPLES.SAMPLE_OLD_FAITHFUL);
        amount_clusters = 2;
        
        initializer = ema_init_type.KMEANS_INITIALIZATION;
        initial_means, initial_covariance = ema_initializer(sample, amount_clusters).initialize(initializer);
        print(initial_means);
        print(initial_covariance);
    @endcode
    
    """
    
    __MAX_GENERATION_ATTEMPTS = 10;
    
    def __init__(self, sample, amount):
        """!
        @brief Constructs EM initializer.
        
        @param[in] sample (list): Data that will be used by the EM algorithm.
        @param[in] amount (uint): Amount of clusters that should be allocated by the EM algorithm.
        
        """
        self.__sample = sample;
        self.__amount = amount;


    def initialize(self, init_type = ema_init_type.KMEANS_INITIALIZATION):
        """!
        @brief Calculates initial parameters for EM algorithm: means and covariances using
                the specified strategy.
        
        @param[in] init_type (ema_init_type): Strategy for initialization.
        
        @return (float|list, float|numpy.array) Initial means and variances (covariance matrices in case of multi-dimensional data).
        
        """
        if (init_type == ema_init_type.KMEANS_INITIALIZATION):
            return self.__initialize_kmeans();
        
        elif (init_type == ema_init_type.RANDOM_INITIALIZATION):
            return self.__initialize_random();
        
        raise ValueError("Unknown type of EM algorithm initialization is specified.");


    def __calculate_initial_clusters(self, centers):
        """!
        @brief Calculates Euclidean distance from each point to each cluster center.
        @details The nearest points are captured by the corresponding clusters, and as a result the clusters are updated.
        
        @return (list) Updated clusters as a list of clusters, where each cluster contains indexes of objects from the data.
        
        """
        
        clusters = [[] for _ in range(len(centers))];
        for index_point in range(len(self.__sample)):
            index_optim, dist_optim = -1, 0.0;
             
            for index in range(len(centers)):
                dist = euclidean_distance_sqrt(self.__sample[index_point], centers[index]);
                 
                # 'index == 0' (not 'index is 0') guarantees that the first distance always initializes the optimum.
                if ( (dist < dist_optim) or (index == 0) ):
                    index_optim, dist_optim = index, dist;
             
            clusters[index_optim].append(index_point);
        
        return clusters;


    def __calculate_initial_covariances(self, initial_clusters):
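        # Estimate a covariance matrix per cluster; a singleton cluster gets a small random
        # jitter matrix instead, since the covariance of a single point is degenerate.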
        covariances = [];
        for initial_cluster in initial_clusters:
            if (len(initial_cluster) > 1):
                cluster_sample = [ self.__sample[index_point] for index_point in initial_cluster ];
                covariances.append(numpy.cov(cluster_sample, rowvar = False));
            else:
                dimension = len(self.__sample[0]);
                covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0);
        
        return covariances;


    def __initialize_random(self):
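        # Pick 'amount' distinct points from the sample as initial means; when a duplicate keeps
        # being drawn, jitter the point by up to +/-10% per coordinate to separate the means.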
        initial_means = [];
        
        for _ in range(self.__amount):
            mean = self.__sample[ random.randint(0, len(self.__sample) - 1) ];
            attempts = 0;
            while ( (mean in initial_means) and (attempts < ema_initializer.__MAX_GENERATION_ATTEMPTS) ):
                mean = self.__sample[ random.randint(0, len(self.__sample) - 1) ];
                attempts += 1;
            
            if (attempts == ema_initializer.__MAX_GENERATION_ATTEMPTS):
                mean = [ value + (random.random() - 0.5) * value * 0.2 for value in mean ];
            
            initial_means.append(mean);
        
        initial_clusters = self.__calculate_initial_clusters(initial_means);
        initial_covariance = self.__calculate_initial_covariances(initial_clusters);
        
        return initial_means, initial_covariance;


    def __initialize_kmeans(self):
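        # Initial means come from K-Means seeded by K-Means++; covariances are then estimated
        # from the points of each obtained cluster.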
        initial_centers = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize();
        kmeans_instance = kmeans(self.__sample, initial_centers, ccore = True);
        kmeans_instance.process();
        
        means = kmeans_instance.get_centers();
        covariances = self.__calculate_initial_covariances(kmeans_instance.get_clusters());
        
        return means, covariances;


class ema_observer:
    """!
    @brief Observer of EM algorithm for collecting algorithm state on each step.
    @details It can be used to obtain the whole picture of the clustering process of the EM algorithm. Allocated clusters,
              means and covariances are stored in the observer on each step. Here is an example of usage:
    
    @code
        from pyclustering.cluster.ema import ema, ema_observer;
        from pyclustering.utils import read_sample;
        from pyclustering.samples.definitions import SIMPLE_SAMPLES;
        
        # Read data from text file
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
        
        # Create EM observer
        observer = ema_observer();
        
        # Create EM algorithm to allocate four clusters and pass the observer to it
        ema_instance = ema(sample, 4, observer=observer);
        
        # Run clustering process
        ema_instance.process();
        
        # Print amount of steps that were done by the algorithm
        print("EMA steps:", observer.get_iterations());
        
        # Print evolution of means and covariances
        print("Means evolution:", observer.get_evolution_means());
        print("Covariances evolution:", observer.get_evolution_covariances());
        
        # Print evolution of clusters
        print("Clusters evolution:", observer.get_evolution_clusters());
        
        # Print final clusters
        print("Allocated clusters:", observer.get_evolution_clusters()[-1]);
    @endcode
    
    """
    def __init__(self):
        """!
        @brief Initializes EM observer.
        
        """
        self.__means_evolution = [];
        self.__covariances_evolution = [];
        self.__clusters_evolution = [];


    def __len__(self):
        """!
        @return (uint) Amount of iterations that were done by the EM algorithm.
        
        """
        return len(self.__means_evolution);


    def get_iterations(self):
        """!
        @return (uint) Amount of iterations that were done by the EM algorithm.
        
        """
        return len(self.__means_evolution);


    def get_evolution_means(self):
        """!
        @return (list) Mean of each cluster on each step of clustering.
        
        """
        return self.__means_evolution;


    def get_evolution_covariances(self):
        """!
        @return (list) Covariance matrix (or variance in case of one-dimensional data) of each cluster on each step of clustering.
        
        """
        return self.__covariances_evolution;


    def get_evolution_clusters(self):
        """!
        @return (list) Allocated clusters on each step of clustering.
        
        """
        return self.__clusters_evolution;


    def notify(self, means, covariances, clusters):
        """!
        @brief This method is used by the algorithm to notify the observer about changes; the algorithm
                provides new values of means, covariances and allocated clusters.
        
        @param[in] means (list): Mean of each cluster on the current step.
        @param[in] covariances (list): Covariances of each cluster on the current step.
        @param[in] clusters (list): Allocated clusters on the current step.
        
        """
        self.__means_evolution.append(means);
        self.__covariances_evolution.append(covariances);
        self.__clusters_evolution.append(clusters);


class ema_visualizer:
    """!
    @brief Visualizer of EM algorithm's results.
    @details Provides services for visualization of particular features of the algorithm, for example,
              in case of two-dimensional dataset it shows covariance ellipses.
    
    """
    
    @staticmethod
    def show_clusters(clusters, sample, covariances, means, figure = None, display = True):
        """!
        @brief Draws clusters and, in case of two-dimensional dataset, draws their ellipses.
        
        @param[in] clusters (list): Clusters that were allocated by the algorithm.
        @param[in] sample (list): Dataset that was used for clustering.
        @param[in] covariances (list): Covariances of the clusters.
        @param[in] means (list): Means of the clusters.
        @param[in] figure (figure): If 'None' then a new figure is created, otherwise the specified figure is used
                    for visualization.
        @param[in] display (bool): If 'True' then the figure will be shown by the method, otherwise it should be
                    shown manually using the matplotlib function 'plt.show()'.
        
        @return (figure) Figure where clusters were drawn.
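        
        A short usage sketch (assuming 'ema_instance' has been processed as in the example of the 'ema' class):
        
        @code
            clusters = ema_instance.get_clusters();
            covariances = ema_instance.get_covariances();
            means = ema_instance.get_centers();
            
            ema_visualizer.show_clusters(clusters, sample, covariances, means);
        @endcode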
        
        """
        
        visualizer = cluster_visualizer();
        visualizer.append_clusters(clusters, sample);
        
        if (figure is None):
            figure = visualizer.show(display = False);
        else:
            visualizer.show(figure = figure, display = False);
        
        if (len(sample[0]) == 2):
            ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means);

        if (display is True): 
            plt.show();

        return figure;


    @staticmethod
    def animate_cluster_allocation(data, observer, animation_velocity = 75, movie_fps = 1, save_movie = None):
        """!
        @brief Animates the clustering process that is performed by the EM algorithm.
        
        @param[in] data (list): Dataset that is used for clustering.
        @param[in] observer (ema_observer): EM observer that was used for collecting information about the clustering process.
        @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
        @param[in] movie_fps (uint): Defines frames per second (for rendering movie only).
        @param[in] save_movie (string): If it is specified then the animation will be stored to the file that is specified in this parameter.
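        
        A short usage sketch (assuming 'observer' was filled during 'ema_instance.process()' as shown above;
        saving to a file requires ffmpeg to be available):
        
        @code
            # show animation at run-time
            ema_visualizer.animate_cluster_allocation(sample, observer);
            
            # or render it to a movie file instead
            ema_visualizer.animate_cluster_allocation(sample, observer, save_movie = "ema_clustering.mp4");
        @endcode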
        
        """
        
        figure = plt.figure();
        
        def init_frame():
            return frame_generation(0);
        
        def frame_generation(index_iteration):
            figure.clf();
            
            figure.suptitle("Expectation maximization algorithm (iteration: " + str(index_iteration) + ")", fontsize = 18, fontweight = 'bold');
            
            clusters = observer.get_evolution_clusters()[index_iteration];
            covariances = observer.get_evolution_covariances()[index_iteration];
            means = observer.get_evolution_means()[index_iteration];
            
            ema_visualizer.show_clusters(clusters, data, covariances, means, figure, False);
            figure.subplots_adjust(top = 0.85);
            
            return [ figure.gca() ];

        iterations = len(observer);
        cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval = animation_velocity, init_func = init_frame, repeat_delay = 5000);

        if (save_movie is not None):
            cluster_animation.save(save_movie, writer = 'ffmpeg', fps = movie_fps, bitrate = 1500);
        else:
            plt.show();


    @staticmethod
    def __draw_ellipses(figure, visualizer, clusters, covariances, means):
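        # Each two-dimensional cluster gets an ellipse whose angle and axes are derived from its covariance matrix.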
        ax = figure.get_axes()[0];
        
        for index in range(len(clusters)):
            angle, width, height = calculate_ellipse_description(covariances[index]);
            color = visualizer.get_cluster_color(index, 0);

            ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color);


    @staticmethod
    def __draw_ellipse(ax, x, y, angle, width, height, color):
        if ((width > 0.0) and (height > 0.0)):
            ellipse = patches.Ellipse((x, y), width, height, alpha = 0.2, angle = -angle, linewidth = 2, fill = True, zorder = 2, color = color);
            ax.add_patch(ellipse);


class ema:
    """!
    @brief Expectation-Maximization clustering algorithm for Gaussian Mixture Model (GMM).
    @details The algorithm provides only clustering services (unsupervised learning).
              Here is an example of the data clustering process:
    @code
        from pyclustering.cluster.ema import ema, ema_observer, ema_initializer, ema_init_type, ema_visualizer;
        from pyclustering.utils import read_sample;
        from pyclustering.samples.definitions import COMMON_SAMPLES;
        
        # Read dataset from text file
        sample = read_sample(COMMON_SAMPLES.SAMPLE_OLD_FAITHFUL);
        
        # Amount of clusters that should be allocated
        amount = 2;
        
        # Prepare initial means and covariances using K-Means initializer
        initializer = ema_init_type.KMEANS_INITIALIZATION;
        initial_means, initial_covariance = ema_initializer(sample, amount).initialize(initializer);
        
        # Let's create an observer to see the clustering process
        observer = ema_observer();
        
        # Create instance of the EM algorithm
        ema_instance = ema(sample, amount, initial_means, initial_covariance, observer=observer);
        
        # Run clustering process
        ema_instance.process();
        
        # Extract clusters
        clusters = ema_instance.get_clusters();
        print("Obtained clusters:", clusters);
        
        # Display allocated clusters using the visualizer
        covariances = ema_instance.get_covariances();
        means = ema_instance.get_centers();
        ema_visualizer.show_clusters(clusters, sample, covariances, means);
        
        # Show animation of the clustering process
        ema_visualizer.animate_cluster_allocation(sample, observer);
    @endcode
    
    Here are the clustering results of the Expectation-Maximization clustering algorithm on the popular sample 'OldFaithful'.
    Initial random means and covariances were used for this example. The first step is presented on the left side of the figure and
    the final result (the last step) is on the right side:
    @image html ema_old_faithful_clustering.png
    
    @see ema_visualizer
    @see ema_observer
    
    """
    def __init__(self, data, amount_clusters, means = None, variances = None, observer = None, tolerance = 0.00001, iterations = 100):
        """!
        @brief Initializes Expectation-Maximization algorithm for cluster analysis.
        
        @param[in] data (list): Dataset that should be analysed and where each point (object) is represented by a list of coordinates.
        @param[in] amount_clusters (uint): Amount of clusters that should be allocated.
        @param[in] means (list): Initial means of clusters (amount of means should be equal to amount of clusters for allocation).
                    If this parameter is 'None' then the K-Means algorithm with the K-Means++ method will be used for initialization by default.
        @param[in] variances (list): Initial cluster variances (or covariances in case of multi-dimensional data). Amount of
                    covariances should be equal to amount of clusters that should be allocated. If this parameter is 'None' then
                    the K-Means algorithm with the K-Means++ method will be used for initialization by default.
        @param[in] observer (ema_observer): Observer for gathering information about the clustering process.
        @param[in] tolerance (float): Defines stop condition of the algorithm (when the difference between current and
                    previous log-likelihood estimations is less than 'tolerance' then clustering is over).
        @param[in] iterations (uint): Additional stop condition parameter that defines the maximum number of steps that can be
                    performed by the algorithm during the clustering process.
        
        """
        
        self.__data = numpy.array(data);
        self.__amount_clusters = amount_clusters;
        self.__tolerance = tolerance;
        self.__iterations = iterations;
        self.__observer = observer;
        
        self.__means = means;
        self.__variances = variances;
        
        if ((means is None) or (variances is None)):
            self.__means, self.__variances = ema_initializer(data, amount_clusters).initialize(ema_init_type.KMEANS_INITIALIZATION);
            
            if (len(self.__means) != amount_clusters):
                self.__amount_clusters = len(self.__means);
        
        # 'self.__amount_clusters' is used here (instead of the 'amount_clusters' argument) because
        # initialization may have reduced the amount of clusters.
        self.__rc = [ [0.0] * len(self.__data) for _ in range(self.__amount_clusters) ];
        self.__pic = [1.0] * self.__amount_clusters;
        self.__clusters = [];
        self.__gaussians = [ [] for _ in range(self.__amount_clusters) ];
        self.__stop = False;


    def process(self):
        """!
        @brief Runs the clustering process of the algorithm.
        @details This method should be called before calling 'get_clusters()'.
        
        """
        previous_likelihood = -200000;
        current_likelihood = -100000;
        
        current_iteration = 0;
        while ( (self.__stop is False) and (abs(previous_likelihood - current_likelihood) > self.__tolerance) and (current_iteration < self.__iterations) ):
            self.__expectation_step();
            self.__maximization_step();
            
            current_iteration += 1;
            
            self.__extract_clusters();
            self.__notify();
            
            previous_likelihood = current_likelihood;
            current_likelihood = self.__log_likelihood();
            self.__stop = self.__get_stop_condition();


    def get_clusters(self):
        """!
        @return (list) Allocated clusters where each cluster is represented by a list of indexes of points from the dataset;
                        for example, two clusters may have the following representation: [[0, 1, 4], [2, 3, 5, 6]].
        
        """
        return self.__clusters;


    def get_centers(self):
        """!
        @return (list) Corresponding centers (means) of clusters.
        
        """
        
        return self.__means;


    def get_covariances(self):
        """!
        @return (list) Corresponding variances (or covariances in case of multi-dimensional data) of clusters.
        
        """
        
        return self.__variances;


    def __erase_empty_clusters(self):
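        # Drop clusters that lost all their points so that subsequent steps do not operate on
        # empty responsibility lists.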
        clusters, means, variances, pic, gaussians, rc = [], [], [], [], [], [];

        for index_cluster in range(len(self.__clusters)):
            if (len(self.__clusters[index_cluster]) > 0):
                clusters.append(self.__clusters[index_cluster]);
                means.append(self.__means[index_cluster]);
                variances.append(self.__variances[index_cluster]);
                pic.append(self.__pic[index_cluster]);
                gaussians.append(self.__gaussians[index_cluster]);
                rc.append(self.__rc[index_cluster]);
        
        if (len(self.__clusters) != len(clusters)):
            self.__clusters, self.__means, self.__variances, self.__pic = clusters, means, variances, pic;
            self.__gaussians, self.__rc = gaussians, rc;
            self.__amount_clusters = len(self.__clusters);


    def __notify(self):
        if (self.__observer is not None):
            self.__observer.notify(self.__means, self.__variances, self.__clusters);


    def __extract_clusters(self):
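        # Hard assignment for reporting: each point goes to the cluster with the highest responsibility.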
        self.__clusters = [ [] for _ in range(self.__amount_clusters) ];
        for index_point in range(len(self.__data)):
            candidates = [];
            for index_cluster in range(self.__amount_clusters):
                candidates.append((index_cluster, self.__rc[index_cluster][index_point]));
            
            index_winner = max(candidates, key = lambda candidate : candidate[1])[0];
            self.__clusters[index_winner].append(index_point);
        
        self.__erase_empty_clusters();


    def __log_likelihood(self):
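        # Log-likelihood of the mixture under the current parameters:
        #   L = sum_n log( sum_k pic[k] * N(x_n | mean_k, covariance_k) )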
        likelihood = 0.0;
        
        for index_point in range(len(self.__data)):
            particle = 0.0;
            for index_cluster in range(self.__amount_clusters):
                particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point];
            
            likelihood += numpy.log(particle);
        
        return likelihood;


    def __probabilities(self, index_cluster, index_point):
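        # Responsibility of the cluster for the point:
        #   r[k][n] = pic[k] * N(x_n | mean_k, covariance_k) / sum_j pic[j] * N(x_n | mean_j, covariance_j)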
        divider = 0.0;
        for i in range(self.__amount_clusters):
            divider += self.__pic[i] * self.__gaussians[i][index_point];
        
        if ( (divider != 0.0) and (divider != float('inf')) ):
            return self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider;
        
        return float('nan');


    def __expectation_step(self):
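        # E-step: evaluate every gaussian for every point and refresh the responsibilities r[k][n].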
        self.__gaussians = [ [] for _ in range(self.__amount_clusters) ];
        for index in range(self.__amount_clusters):
            self.__gaussians[index] = gaussian(self.__data, self.__means[index], self.__variances[index]);
        
        self.__rc = [ [0.0] * len(self.__data) for _ in range(self.__amount_clusters) ];
        for index_cluster in range(self.__amount_clusters):
            for index_point in range(len(self.__data)):
                self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point);


    def __maximization_step(self):
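        # M-step: with m_k = sum_n r[k][n], the updates are
        #   pic[k] = m_k / N,
        #   mean_k = (1 / m_k) * sum_n r[k][n] * x_n,
        #   covariance_k = (1 / m_k) * sum_n r[k][n] * (x_n - mean_k) * (x_n - mean_k)^T.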
        self.__pic = [];
        self.__means = [];
        self.__variances = [];
        
        amount_impossible_clusters = 0;
        
        for index_cluster in range(self.__amount_clusters):
            mc = numpy.sum(self.__rc[index_cluster]);
            
            if (mc == 0.0):
                amount_impossible_clusters += 1;
                continue;
            
            self.__pic.append( mc / len(self.__data) );
            self.__means.append( self.__update_mean(self.__rc[index_cluster], mc) );
            self.__variances.append( self.__update_covariance(self.__means[-1], self.__rc[index_cluster], mc) );
        
        self.__amount_clusters -= amount_impossible_clusters;


    def __get_stop_condition(self):
        for covariance in self.__variances:
            if (numpy.linalg.norm(covariance) == 0.0):
                return True;
        
        return False;


    def __update_covariance(self, means, rc, mc):
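        # Weighted covariance: covariance_k = (1 / m_k) * sum_n r[k][n] * (x_n - mean_k)(x_n - mean_k)^T.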
        covariance = 0.0;
        for index_point in range(len(self.__data)):
            deviation = numpy.array([ self.__data[index_point] - means ]);
            covariance += rc[index_point] * deviation.T.dot(deviation);
        
        covariance = covariance / mc;
        return covariance;


    def __update_mean(self, rc, mc):
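        # Weighted mean: mean_k = (1 / m_k) * sum_n r[k][n] * x_n.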
        mean = 0.0;
        for index_point in range(len(self.__data)):
            mean += rc[index_point] * self.__data[index_point];
        
        mean = mean / mc;
        return mean;