Completed
Push — 0.7.dev ( fccf24...9cbc00 )
by Andrei
51s
created

ema.process()   B

Complexity

Conditions 4

Size

Total Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 4
dl 0
loc 23
rs 8.7972
c 1
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: Expectation-Maximization Algorithm for Gaussian Mixture Model.
4
5
@authors Andrei Novikov ([email protected])
6
@date 2014-2017
7
@copyright GNU Public License
8
9
@cond GNU_PUBLIC_LICENSE
10
    PyClustering is free software: you can redistribute it and/or modify
11
    it under the terms of the GNU General Public License as published by
12
    the Free Software Foundation, either version 3 of the License, or
13
    (at your option) any later version.
14
    
15
    PyClustering is distributed in the hope that it will be useful,
16
    but WITHOUT ANY WARRANTY; without even the implied warranty of
17
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
    GNU General Public License for more details.
19
    
20
    You should have received a copy of the GNU General Public License
21
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
22
@endcond
23
24
"""
25
26
27
import numpy;
0 ignored issues
show
Configuration introduced by
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
28
import random;
29
30
from pyclustering.cluster import cluster_visualizer;
31
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer;
32
from pyclustering.cluster.kmeans import kmeans;
33
34
from pyclustering.utils import pi, calculate_ellipse_description, euclidean_distance_sqrt;
35
36
from enum import IntEnum;
37
38
import matplotlib.pyplot as plt;
0 ignored issues
show
Configuration introduced by
The import matplotlib.pyplot could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
39
import matplotlib.animation as animation;
0 ignored issues
show
Configuration introduced by
The import matplotlib.animation could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
40
from matplotlib import patches;
0 ignored issues
show
Configuration introduced by
The import matplotlib could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
41
42
43
44
def gaussian(data, mean, covariance):
45
    """!
46
    @brief Calculates gaussian for dataset using specified mean (mathematical expectation) and variance or covariance in case
47
            multi-dimensional data.
48
    
49
    @param[in] data (list): Data that is used for gaussian calculation.
50
    @param[in] mean (float|numpy.array): Mathematical expectation used for calculation.
51
    @param[in] covariance (float|numpy.array): Variance or covariance matrix for calculation.
52
    
53
    @return (list) Value of gaussian function for each point in dataset.
54
    
55
    """
56
    dimension = float(len(data[0]));
57
 
58
    if (dimension != 1.0):
59
        inv_variance = numpy.linalg.pinv(covariance);
60
    else:
61
        inv_variance = 1.0 / covariance;
62
    
63
    divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.linalg.norm(covariance));
64
    if (divider != 0.0):
65
        right_const = 1.0 / divider;
66
    else:
67
        right_const = float('inf');
68
    
69
    result = [];
70
    
71
    for point in data:
72
        mean_delta = point - mean;
73
        point_gaussian = right_const * numpy.exp( -0.5 * mean_delta.dot(inv_variance).dot(numpy.transpose(mean_delta)) );
74
        result.append(point_gaussian);
75
    
76
    return result;
77
78
79
80
class ema_init_type(IntEnum):
81
    """!
82
    @brief Enumeration of initialization types for Expectation-Maximization algorithm.
83
    
84
    """
85
    
86
    ## Means are randomly taken from input dataset and variance or covariance is calculated based on
87
    ## spherical data that belongs to the chosen means.
88
    RANDOM_INITIALIZATION = 0;
89
    
90
    ## Two step initialization. The first is calculation of initial centers using K-Means++ method.
91
    ## The second is K-Means clustering using obtained centers in the first step. Obtained clusters
92
    ## and its centers are used for calculation of variance (covariance in case of multi-dimensional)
93
    ## data.
94
    KMEANS_INITIALIZATION = 1;
95
96
97
98
class ema_initializer():
99
    """!
100
    @brief Provides servies for preparing initial means and covariances for Expectation-Maximization algorithm.
101
    @details Initialization strategy is defined by enumerator 'ema_init_type': random initialization and 
102
              kmeans with kmeans++ initialization. Here an example of initialization using kmeans strategy:
103
    
104
    @code
105
        from pyclustering.utils import read_sample;
106
        from pyclustering.samples.definitions import COMMON_SAMPLES;
107
        from pyclustering.cluster.ema import ema_initializer;
108
        
109
        sample = read_sample(COMMON_SAMPLES.SAMPLE_OLD_FAITHFUL);
110
        amount_clusters = 2;
111
        
112
        initial_means, initial_covariance = ema_initializer(sample, amount_clusters).initialize(initializer);
113
        print(initial_means);
114
        print(initial_covariance);
115
    @endcode
116
    
117
    """
118
    
119
    __MAX_GENERATION_ATTEPTS = 10;
120
    
121
    def __init__(self, sample, amount):
122
        """!
123
        @brief Constructs EM initializer.
124
        
125
        @param[in] sample (list): Data that will be used by the EM algorithm.
126
        @param[in] amount (uint): Amount of clusters that should be allocated by the EM algorithm.
127
        
128
        """
129
        self.__sample = sample;
130
        self.__amount = amount;
131
132
133
    def initialize(self, init_type = ema_init_type.KMEANS_INITIALIZATION):
134
        """!
135
        @brief Calculates initial parameters for EM algorithm: means and covariances using
136
                specified strategy.
137
        
138
        @param[in] init_type (ema_init_type): Strategy for initialization.
139
        
140
        @return (float|list, float|numpy.array) Initial means and variance (covariance matrix in case multi-dimensional data).
141
        
142
        """
143
        if (init_type == ema_init_type.KMEANS_INITIALIZATION):
144
            return self.__initialize_kmeans();
145
        
146
        elif (init_type == ema_init_type.RANDOM_INITIALIZATION):
147
            return self.__initialize_random();
148
        
149
        raise NameError("Unknown type of EM algorithm initialization is specified.");
150
151
152
    def __calculate_initial_clusters(self, centers):
153
        """!
154
        @brief Calculate Euclidean distance to each point from the each cluster. 
155
        @brief Nearest points are captured by according clusters and as a result clusters are updated.
156
        
157
        @return (list) updated clusters as list of clusters. Each cluster contains indexes of objects from data.
158
        
159
        """
160
        
161
        clusters = [[] for _ in range(len(centers))];
162
        for index_point in range(len(self.__sample)):
163
            index_optim, dist_optim = -1, 0.0;
164
             
165
            for index in range(len(centers)):
166
                dist = euclidean_distance_sqrt(self.__sample[index_point], centers[index]);
167
                 
168
                if ( (dist < dist_optim) or (index is 0)):
169
                    index_optim, dist_optim = index, dist;
170
             
171
            clusters[index_optim].append(index_point);
172
        
173
        return clusters;
174
175
176
    def __calculate_initial_covariances(self, initial_clusters):
177
        covariances = [];
178
        for initial_cluster in initial_clusters:
179
            if (len(initial_cluster) > 1):
180
                cluster_sample = [ self.__sample[index_point] for index_point in initial_cluster ];
181
                covariances.append(numpy.cov(cluster_sample, rowvar = False));
182
            else:
183
                dimension = len(self.__sample[0]);
184
                covariances.append(numpy.zeros((dimension, dimension))  + random.random() / 10.0);
185
        
186
        return covariances;
187
188
189
    def __initialize_random(self):
190
        initial_means = [];
191
        
192
        for _ in range(self.__amount):
193
            mean = self.__sample[ random.randint(0, len(self.__sample)) - 1 ];
194
            attempts = 0;
195
            while ( (mean in initial_means) and (attempts < ema_initializer.__MAX_GENERATION_ATTEPTS) ):
196
                mean = self.__sample[ random.randint(0, len(self.__sample)) - 1 ];
197
                attempts += 1;
198
            
199
            if (attempts == ema_initializer.__MAX_GENERATION_ATTEPTS):
200
                mean = [ value + (random.random() - 0.5) * value * 0.2 for value in mean ];
201
            
202
            initial_means.append(mean);
203
        
204
        initial_clusters = self.__calculate_initial_clusters(initial_means);
205
        initial_covariance = self.__calculate_initial_covariances(initial_clusters);
206
        
207
        return initial_means, initial_covariance;
208
209
210
    def __initialize_kmeans(self):
211
        initial_centers = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize();
212
        kmeans_instance = kmeans(self.__sample, initial_centers, ccore = True);
213
        kmeans_instance.process();
214
        
215
        means = kmeans_instance.get_centers();
216
        
217
        covariances = [];
218
        initial_clusters = kmeans_instance.get_clusters();
219
        for initial_cluster in initial_clusters:
220
            if (len(initial_cluster) > 1):
221
                cluster_sample = [ self.__sample[index_point] for index_point in initial_cluster ];
222
                covariances.append(numpy.cov(cluster_sample, rowvar = False));
223
            else:
224
                dimension = len(self.__sample[0]);
225
                covariances.append(numpy.zeros((dimension, dimension))  + random.random() / 10.0);
226
        
227
        return means, covariances;
228
229
230
231
class ema_observer:
232
    """!
233
    @brief Observer of EM algorithm for collecting algorithm state on each step.
234
    @details It can be used to obtain whole picture about clustering process of EM algorithm. Allocated clusters,
235
              means and covariances are stored in observer on each step. Here an example of usage:
236
    
237
    @code
238
        from pyclustering.cluster.ema import ema, ema_observer;
239
        from pyclustering.utils import read_sample;
240
        from pyclustering.samples.definitions import SIMPLE_SAMPLES;
241
        
242
        # Read data from text file
243
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
244
        
245
        # Create EM observer
246
        observer = ema_observer();
247
        
248
        # Create EM algorithm to allocated four clusters and pass observer to it
249
        ema_instance = ema(sample, 4, observer=observer);
250
        
251
        # Run clustering process
252
        ema_instance.process();
253
        
254
        # Print amount of steps that were done by the algorithm
255
        print("EMA steps:", observer.get_iterations());
256
        
257
        # Print evolution of means and covariances
258
        print("Means evolution:", observer.get_evolution_means());
259
        print("Covariances evolution:", observer.get_evolution_covariances());
260
        
261
        # Print evolution of clusters
262
        print("Clusters evolution:", observer.get_evolution_clusters());
263
        
264
        # Print final clusters
265
        print("Allocated clusters:", observer.get_evolution_clusters()[-1]);
266
    @endcode
267
    
268
    """
269
    def __init__(self):
270
        """!
271
        @brief Initializes EM observer.
272
        
273
        """
274
        self.__means_evolution = [];
275
        self.__covariances_evolution = [];
276
        self.__clusters_evolution = [];
277
278
279
    def __len__(self):
280
        """!
281
        @return (uint) Amount of iterations that were done by the EM algorithm.
282
        
283
        """
284
        return len(self.__means_evolution);
285
286
287
    def get_iterations(self):
288
        """!
289
        @return (uint) Amount of iterations that were done by the EM algorithm.
290
        
291
        """
292
        return len(self.__means_evolution);
293
294
295
    def get_evolution_means(self):
296
        """!
297
        @return (list) Mean of each cluster on each step of clustering.
298
        
299
        """
300
        return self.__means_evolution;
301
302
303
    def get_evolution_covariances(self):
304
        """!
305
        @return (list) Covariance matrix (or variance in case of one-dimensional data) of each cluster on each step of clustering.
306
        
307
        """
308
        return self.__covariances_evolution;
309
310
311
    def get_evolution_clusters(self):
312
        """!
313
        @return (list) Allocated clusters on each step of clustering.
314
        
315
        """
316
        return self.__clusters_evolution;
317
318
319
    def notify(self, means, covariances, clusters):
320
        """!
321
        @brief This method is used by the algorithm to notify observer about changes where the algorithm
322
                should provide new values: means, covariances and allocated clusters.
323
        
324
        @param[in] means (list): Mean of each cluster on currect step.
325
        @param[in] covariances (list): Covariances of each cluster on current step.
326
        @param[in] clusters (list): Allocated cluster on current step.
327
        
328
        """
329
        self.__means_evolution.append(means);
330
        self.__covariances_evolution.append(covariances);
331
        self.__clusters_evolution.append(clusters);
332
333
334
335
class ema_visualizer:
336
    """!
337
    @brief Visualizer of EM algorithm's results.
338
    @details Provides services for visualization of particular features of the algorithm, for example,
339
              in case of two-dimensional dataset it shows covariance ellipses.
340
    
341
    """
342
    
343
    @staticmethod
344
    def show_clusters(clusters, sample, covariances, means, figure = None, display = True):
345
        """!
346
        @brief Draws clusters and in case of two-dimensional dataset draws their ellipses.
347
        
348
        @param[in] clusters (list): Clusters that were allocated by the algorithm.
349
        @param[in] sample (list): Dataset that were used for clustering.
350
        @param[in] covariances (list): Covariances of the clusters.
351
        @param[in] means (list): Means of the clusters.
352
        @param[in] figure (figure): If 'None' then new is figure is creater, otherwise specified figure is used
353
                    for visualization.
354
        @param[in] display (bool): If 'True' then figure will be shown by the method, otherwise it should be
355
                    shown manually using matplotlib function 'plt.show()'.
356
        
357
        @return (figure) Figure where clusters were drawn.
358
        
359
        """
360
        
361
        visualizer = cluster_visualizer();
362
        visualizer.append_clusters(clusters, sample);
363
        
364
        if (figure is None):
365
            figure = visualizer.show(display = False);
366
        else:
367
            visualizer.show(figure = figure, display = False);
368
        
369
        if (len(sample[0]) == 2):
370
            ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means);
371
372
        if (display is True): 
373
            plt.show();
374
375
        return figure;
376
377
378
    @staticmethod
379
    def animate_cluster_allocation(data, observer, animation_velocity = 75, movie_fps = 1, save_movie = None):
380
        """!
381
        @brief Animates clustering process that is performed by EM algorithm.
382
        
383
        @param[in] data (list): Dataset that is used for clustering.
384
        @param[in] observer (ema_observer): EM observer that was used for collection information about clustering process.
385
        @param[in] animation_velocity (uint): Interval between frames in milliseconds (for run-time animation only).
386
        @param[in] movie_fps (uint): Defines frames per second (for rendering movie only).
387
        @param[in] save_movie (string): If it is specified then animation will be stored to file that is specified in this parameter.
388
        
389
        """
390
        
391
        figure = plt.figure();
392
        
393
        def init_frame():
394
            return frame_generation(0);
395
        
396
        def frame_generation(index_iteration):
397
            figure.clf();
398
            
399
            figure.suptitle("Expectation maximixation algorithm (iteration: " + str(index_iteration) +")", fontsize = 18, fontweight = 'bold');
400
            
401
            clusters = observer.get_evolution_clusters()[index_iteration];
402
            covariances = observer.get_evolution_covariances()[index_iteration];
403
            means = observer.get_evolution_means()[index_iteration];
404
            
405
            ema_visualizer.show_clusters(clusters, data, covariances, means, figure, False);
406
            figure.subplots_adjust(top = 0.85);
407
            
408
            return [ figure.gca() ];
409
410
        iterations = len(observer);
411
        cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval = animation_velocity, init_func = init_frame, repeat_delay = 5000);
412
413
        if (save_movie is not None):
414
            cluster_animation.save(save_movie, writer = 'ffmpeg', fps = movie_fps, bitrate = 1500);
415
        else:
416
            plt.show();
417
418
419
    @staticmethod
420
    def __draw_ellipses(figure, visualizer, clusters, covariances, means):
421
        ax = figure.get_axes()[0];
422
        
423
        for index in range(len(clusters)):
424
            angle, width, height = calculate_ellipse_description(covariances[index]);
425
            color = visualizer.get_cluster_color(index, 0);
426
427
            ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color);
428
429
430
    @staticmethod
431
    def __draw_ellipse(ax, x, y, angle, width, height, color):
432
        if ((width > 0.0) and (height > 0.0)):
433
            ellipse = patches.Ellipse((x, y), width, height, alpha=0.2, angle=angle, linewidth=2, fill=True, zorder=2, color=color);
434
            ax.add_patch(ellipse);
435
436
437
438
class ema:
439
    """!
440
    @brief Expectation-Maximization clustering algorithm for Gaussian Mixture Model (GMM).
441
    @details The algorithm provides only clustering services (unsupervise learning).
442
              Here an example of data clustering process:
443
    @code
444
        # Read dataset from text file
445
        sample = read_sample(COMMON_SAMPLES.);
446
        
447
        # Amount of cluster that should be allocated
448
        amount = 2;
449
        
450
        # Prepare initial means and covariances using K-Means initializer
451
        initializer = ema_init_type.KMEANS_INITIALIZATION;
452
        initial_means, initial_covariance = ema_initializer(sample, amount).initialize(initializer);
453
        
454
        # Lets create observer to see clustering process
455
        observer = ema_observer();
456
        
457
        # Create instance of the EM algorithm
458
        ema_instance = ema(sample, amount, initial_means, initial_covariance, observer=observer);
459
        
460
        # Run clustering process
461
        ema_instance.process();
462
        
463
        # Extract clusters
464
        clusters = ema_instance.get_clusters();
465
        print("Obtained clusters:", clusters);
466
        
467
        # Display allocated clusters using visualizer
468
        covariances = ema_instance.get_covariances();
469
        means = ema_instance.get_centers();
470
        ema_visualizer.show_clusters(clusters, sample, covariances, means);
471
        
472
        # Show animation process
473
        ema_visualizer.animate_cluster_allocation(sample, observer);
474
        
475
    @endcode
476
    
477
    @see ema_visualizer
478
    @see ema_observer
479
    
480
    """
481
    def __init__(self, data, amount_clusters, means = None, variances = None, observer = None, tolerance = 0.00001, iterations = 100):
482
        """!
483
        @brief Initializes Expectation-Maximization algorithm for cluster analysis.
484
        
485
        @param[in] data (list): Dataset that should be analysed and where each point (object) is represented by the list of coordinates.
486
        @param[in] amount_clusters (uint): Amount of clusters that should be allocated.
487
        @param[in] means (list): Initial means of clusters (amount of means should be equal to amount of clusters for allocation).
488
                    If this parameter is 'None' then K-Means algorithm with K-Means++ method will be used for initialization by default.
489
        @param[in] variances (list): Initial cluster variances (or covariances in case of multi-dimensional data). Amount of
490
                    covariances should be equal to amount of clusters that should be allocated. If this parameter is 'None' then
491
                    K-Means algorithm with K-Means++ method will be used for initialization by default.
492
        @param[in] observer (ema_observer): Observer for gathering information about clustering process.
493
        @param[in] tolerance (float): Defines stop condition of the algorithm (when difference between current and
494
                    previous log-likelihood estimation is less then 'tolerance' then clustering is over).
495
        @param[in] iterations (uint): Additional stop condition parameter that defines maximum number of steps that can be
496
                    performed by the algorithm during clustering process.
497
        
498
        """
499
        
500
        self.__data = numpy.array(data);
501
        self.__amount_clusters = amount_clusters;
502
        self.__tolerance = tolerance;
503
        self.__iterations = iterations;
504
        self.__observer = observer;
505
        
506
        self.__means = means;
507
        self.__variances = variances;
508
        
509
        if ((means is None) or (variances is None)):
510
            self.__means, self.__variances = ema_initializer(data, amount_clusters).initialize(ema_init_type.KMEANS_INITIALIZATION);
511
            
512
            if (len(self.__means) != amount_clusters):
513
                self.__amount_clusters = len(self.__means);
514
        
515
        self.__rc = [ [0.0] * len(self.__data) for _ in range(amount_clusters) ];
516
        self.__pic = [1.0] * amount_clusters;
517
        self.__clusters = [];
518
        self.__gaussians = [ [] for _ in range(amount_clusters) ];
519
        self.__stop = False;
520
521
522
    def process(self):
523
        """!
524
        @brief Run clustering process of the algorithm.
525
        @details This method should be called before call 'get_clusters()'.
526
        
527
        """
528
        
529
        previous_likelihood = -200000;
530
        current_likelihood = -100000;
531
        
532
        current_iteration = 0;
533
        while( (self.__stop is False) and (abs(previous_likelihood - current_likelihood) > self.__tolerance) and (current_iteration < self.__iterations) ):
534
            self.__expectation_step();
535
            self.__maximization_step();
536
            
537
            current_iteration += 1;
538
            
539
            self.__extract_clusters();
540
            self.__notify();
541
            
542
            previous_likelihood = current_likelihood;
543
            current_likelihood = self.__log_likelihood();
544
            self.__stop = self.__get_stop_condition();
545
546
547
    def get_clusters(self):
548
        """!
549
        @return (list) Allocated clusters where each cluster is represented by list of indexes of points from dataset,
550
                        for example, two cluster may have following representation [[0, 1, 4], [2, 3, 5, 6]].
551
        
552
        """
553
        return self.__clusters;
554
555
556
    def get_centers(self):
557
        """!
558
        @return (list) Corresponding centers (means) of clusters.
559
        
560
        """
561
        
562
        return self.__means;
563
564
565
    def get_covariances(self):
566
        """!
567
        @return (list) Corresponding variances (or covariances in case of multi-dimensional data) of clusters.
568
        
569
        """
570
        
571
        return self.__variances;
572
573
574
    def __erase_empty_clusters(self):
575
        clusters, means, variances, pic, gaussians, rc = [], [], [], [], [], [];
576
577
        for index_cluster in range(len(self.__clusters)):
578
            if (len(self.__clusters[index_cluster]) > 0):
579
                clusters.append(self.__clusters[index_cluster]);
580
                means.append(self.__means[index_cluster]);
581
                variances.append(self.__variances[index_cluster]);
582
                pic.append(self.__pic[index_cluster]);
583
                gaussians.append(self.__gaussians[index_cluster]);
584
                rc.append(self.__rc[index_cluster]);
585
        
586
        if (len(self.__clusters) != len(clusters)):
587
            self.__clusters, self.__means, self.__variances, self.__pic = clusters, means, variances, pic;
588
            self.__gaussians, self.__rc = gaussians, rc;
589
            self.__amount_clusters = len(self.__clusters);
590
591
592
    def __notify(self):
593
        if (self.__observer is not None):
594
            self.__observer.notify(self.__means, self.__variances, self.__clusters);
595
596
597
    def __extract_clusters(self):
598
        self.__clusters = [ [] for _ in range(self.__amount_clusters) ];
599
        for index_point in range(len(self.__data)):
600
            candidates = [];
601
            for index_cluster in range(self.__amount_clusters):
602
                candidates.append((index_cluster, self.__rc[index_cluster][index_point]));
603
            
604
            index_winner = max(candidates, key = lambda candidate : candidate[1])[0];
605
            self.__clusters[index_winner].append(index_point);
606
        
607
        self.__erase_empty_clusters();
608
609
610
    def __log_likelihood(self):
611
        likelihood = 0.0;
612
        
613
        for index_point in range(len(self.__data)):
614
            particle = 0.0;
615
            for index_cluster in range(self.__amount_clusters):
616
                particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point];
617
            
618
            likelihood += numpy.log(particle);
619
        
620
        return likelihood;
621
622
623
    def __probabilities(self, index_cluster, index_point):
624
        divider = 0.0;
625
        for i in range(self.__amount_clusters):
626
            divider += self.__pic[i] * self.__gaussians[i][index_point];
627
        
628
        if ( (divider != 0.0) and (divider != float('inf')) ):
629
            return self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider;
630
        
631
        return float('nan');
632
633
634
    def __expectation_step(self):
635
        self.__gaussians = [ [] for _ in range(self.__amount_clusters) ];
636
        for index in range(self.__amount_clusters):
637
            self.__gaussians[index] = gaussian(self.__data, self.__means[index], self.__variances[index]);
638
        
639
        self.__rc = [ [0.0] * len(self.__data) for _ in range(self.__amount_clusters) ];
640
        for index_cluster in range(self.__amount_clusters):
641
            for index_point in range(len(self.__data)):
642
                self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point);
643
644
645
    def __maximization_step(self):
646
        self.__pic = [];
647
        self.__means = [];
648
        self.__variances = [];
649
        
650
        amount_impossible_clusters = 0;
651
        
652
        for index_cluster in range(self.__amount_clusters):
653
            mc = numpy.sum(self.__rc[index_cluster]);
654
            
655
            if (mc == 0.0):
656
                amount_impossible_clusters += 1;
657
                continue;
658
            
659
            self.__pic.append( mc / len(self.__data) );
660
            self.__means.append( self.__update_mean(self.__rc[index_cluster], mc) );
661
            self.__variances.append( self.__update_covariance(self.__means[-1], self.__rc[index_cluster], mc) );
662
        
663
        self.__amount_clusters -= amount_impossible_clusters;
664
665
666
    def __get_stop_condition(self):
667
        for covariance in self.__variances:
668
            if (numpy.linalg.norm(covariance) == 0.0):
669
                return True;
670
        
671
        return False;
672
673
674
    def __update_covariance(self, means, rc, mc):
675
        covariance = 0.0;
676
        for index_point in range(len(self.__data)):
677
            deviation = numpy.array( [ self.__data[index_point] - means ]);
678
            covariance += rc[index_point] * deviation.T.dot(deviation);
679
        
680
        covariance = covariance / mc;
681
        return covariance;
682
683
684
    def __update_mean(self, rc, mc):
685
        mean = 0.0;
686
        for index_point in range(len(self.__data)):
687
            mean += rc[index_point] * self.__data[index_point];
688
        
689
        mean = mean / mc;
690
        return mean;