Completed
Push — 0.7.dev (ce3005...c2d6e2) by Andrei
56s
created

ema_initializer.__calculate_initial_clusters()   B

Complexity
    Conditions    6

Size
    Total Lines   22

Duplication
    Lines         0
    Ratio         0 %

Importance
    Changes       0

Metric                          Value
cc  (cyclomatic complexity)     6
dl                              0
loc (lines of code)             22
rs                              7.7857
c                               0
b                               0
f                               0
"""!

@brief Cluster analysis algorithm: Expectation-Maximization Algorithm (EMA).
@details Implementation based on article:
         - 

@authors Andrei Novikov ([email protected])
@date 2014-2017
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy;
0 ignored issues

Configuration introduced by
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3

Tip: We are currently not using virtualenv to run Pylint; when installing your modules, make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.
import random;

from pyclustering.cluster import cluster_visualizer;
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer;
from pyclustering.cluster.kmeans import kmeans;

from pyclustering.utils import pi, calculate_ellipse_description, euclidean_distance_sqrt;

from enum import IntEnum;

import matplotlib.pyplot as plt;
0 ignored issues (same unresolved-import notice as for the numpy import above)
import matplotlib.animation as animation;
0 ignored issues (same unresolved-import notice as for the numpy import above)
from matplotlib import patches;
0 ignored issues (same unresolved-import notice as for the numpy import above)


def gaussian(data, mean, covariance):
    dimension = float(len(data[0]));

    if (dimension != 1.0):
        inv_variance = numpy.linalg.pinv(covariance);
    else:
        inv_variance = 1.0 / covariance;
    
    divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.linalg.norm(covariance));
    right_const = 1.0 / divider;
     
    result = [];
     
    for point in data:
        mean_delta = point - mean;
        point_gaussian = right_const * numpy.exp( -0.5 * mean_delta.dot(inv_variance).dot(numpy.transpose(mean_delta)) );
        result.append(point_gaussian);
     
    return result;
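
# The gaussian() helper above evaluates, for every point x in 'data', the
# multivariate normal density
#     N(x | mu, Sigma) = exp( -0.5 * (x - mu) * inv(Sigma) * (x - mu)^T ) / ( (2 * pi)^(d/2) * sqrt(|Sigma|) )
# where d is the dimension of the data. Note that the normalization constant in
# the code uses numpy.linalg.norm(covariance) where the closed form uses the
# determinant |Sigma|.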


class ema_init_type(IntEnum):
    RANDOM_INITIALIZATION = 0;
    KMEANS_INITIALIZATION = 1;



class ema_initializer():
    def __init__(self, sample, amount):
        self.__sample = sample;
        self.__amount = amount;


    def initialize(self, init_type = ema_init_type.KMEANS_INITIALIZATION):
        if (init_type == ema_init_type.KMEANS_INITIALIZATION):
            return self.__initialize_kmeans();
        
        elif (init_type == ema_init_type.RANDOM_INITIALIZATION):
            return self.__initialize_random();
        
        raise NameError("Unknown type of EM algorithm initialization is specified.");


    def __calculate_initial_clusters(self, centers):
        """!
        @brief Calculates the (squared) Euclidean distance from each point to each center.
        @details The nearest points are captured by the corresponding clusters, and as a result the clusters are updated.
        
        @return (list) Updated clusters as a list of clusters; each cluster contains indexes of objects from the data.
        
        """
        
        clusters = [[] for _ in range(len(centers))];
        for index_point in range(len(self.__sample)):
            index_optim, dist_optim = -1, 0.0;
             
            for index in range(len(centers)):
                dist = euclidean_distance_sqrt(self.__sample[index_point], centers[index]);
                 
                if ( (dist < dist_optim) or (index == 0) ):
                    index_optim, dist_optim = index, dist;
             
            clusters[index_optim].append(index_point);
        
        return clusters;


    def __calculate_initial_covariances(self, initial_clusters):
        covariances = [];
        for initial_cluster in initial_clusters:
            if (len(initial_cluster) > 1):
                cluster_sample = [ self.__sample[index_point] for index_point in initial_cluster ];
                covariances.append(numpy.cov(cluster_sample, rowvar = False));
            else:
                dimension = len(self.__sample[0]);
                covariances.append(numpy.zeros((dimension, dimension)) + random.random());
        
        return covariances;


    def __initialize_random(self):
        initial_means = [];
        
        for _ in range(self.__amount):
            mean = self.__sample[ random.randint(0, len(self.__sample)) - 1 ];
            while (mean in initial_means):
                mean = self.__sample[ random.randint(0, len(self.__sample)) - 1 ];
            
            initial_means.append(mean);
            
            #covariance = numpy.cov(self.__sample, rowvar = False);
            #covariance = numpy.divide(covariance, self.__amount * 10.0);
            #initial_covariance.append(covariance);
        
        initial_clusters = self.__calculate_initial_clusters(initial_means);
        initial_covariance = self.__calculate_initial_covariances(initial_clusters);
        
        return initial_means, initial_covariance;


    def __initialize_kmeans(self):
        initial_centers = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize();
        kmeans_instance = kmeans(self.__sample, initial_centers, ccore = True);
        kmeans_instance.process();
        
        means = kmeans_instance.get_centers();
        
        covariances = [];
        initial_clusters = kmeans_instance.get_clusters();
        for initial_cluster in initial_clusters:
            if (len(initial_cluster) > 1):
                cluster_sample = [ self.__sample[index_point] for index_point in initial_cluster ];
                covariances.append(numpy.cov(cluster_sample, rowvar = False));
            else:
                dimension = len(self.__sample[0]);
                covariances.append(numpy.zeros((dimension, dimension)) + random.random());
        
        return means, covariances;
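
# Example of how the initializer above is typically used (the sample and the
# number of clusters are hypothetical):
#     initial_means, initial_covariances = ema_initializer(sample, 3).initialize(ema_init_type.KMEANS_INITIALIZATION);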


class ema_observer:
    def __init__(self):
        self.__means_evolution = [];
        self.__covariances_evolution = [];
        self.__clusters_evolution = [];


    def __len__(self):
        return len(self.__means_evolution);


    def get_iterations(self):
        return len(self.__means_evolution);


    def get_evolution_means(self):
        return self.__means_evolution;


    def get_evolution_covariances(self):
        return self.__covariances_evolution;


    def get_evolution_clusters(self):
        return self.__clusters_evolution;


    def notify(self, means, covariances, clusters):
        self.__means_evolution.append(means);
        self.__covariances_evolution.append(covariances);
        self.__clusters_evolution.append(clusters);


class ema_visualizer:
    @staticmethod
    def show_clusters(clusters, sample, covariances, means, figure = None, display = True):
        visualizer = cluster_visualizer();
        visualizer.append_clusters(clusters, sample);
        
        if (figure is None):
            figure = visualizer.show(display = False);
        else:
            visualizer.show(figure = figure, display = False);
        
        if (len(sample[0]) == 2):
            ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means);

        if (display is True): 
            plt.show();

        return figure;


    @staticmethod
    def animate_cluster_allocation(data, observer, animation_velocity = 75, movie_fps = 1, save_movie = None):
        figure = plt.figure();
        
        def init_frame():
            return frame_generation(0);
        
        def frame_generation(index_iteration):
            figure.clf();
            
            figure.suptitle("Expectation maximization algorithm (iteration: " + str(index_iteration) + ")", fontsize = 18, fontweight = 'bold');
            
            clusters = observer.get_evolution_clusters()[index_iteration];
            covariances = observer.get_evolution_covariances()[index_iteration];
            means = observer.get_evolution_means()[index_iteration];
            
            ema_visualizer.show_clusters(clusters, data, covariances, means, figure, False);
            figure.subplots_adjust(top = 0.85);
            
            return [ figure.gca() ];

        iterations = len(observer);
        cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval = animation_velocity, init_func = init_frame, repeat_delay = 5000);

        if (save_movie is not None):
            cluster_animation.save(save_movie, writer = 'ffmpeg', fps = movie_fps, bitrate = 1500);
        else:
            plt.show();


    @staticmethod
    def __draw_ellipses(figure, visualizer, clusters, covariances, means):
        ax = figure.get_axes()[0];
        
        for index in range(len(clusters)):
            angle, width, height = calculate_ellipse_description(covariances[index]);
            color = visualizer.get_cluster_color(index, 0);
            
            ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color);


    @staticmethod
    def __draw_ellipse(ax, x, y, angle, width, height, color):
        ellipse = patches.Ellipse((x, y), width, height, alpha=0.2, angle=angle, linewidth=2, fill=True, zorder=2, color=color);
        ax.add_patch(ellipse);


class ema:
    def __init__(self, data, amount_clusters, means = None, variances = None, observer = None, tolerance = 0.00001, iterations = 100):
        self.__data = numpy.array(data);
        self.__amount_clusters = amount_clusters;
        self.__tolerance = tolerance;
        self.__iterations = iterations;
        self.__observer = observer;
        
        self.__means = means;
        self.__variances = variances;
        
        if ((means is None) or (variances is None)):
            self.__means, self.__variances = ema_initializer(data, amount_clusters).initialize(ema_init_type.KMEANS_INITIALIZATION);
            
            if (len(self.__means) != amount_clusters):
                self.__amount_clusters = len(self.__means);
        
        self.__rc = [ [0.0] * len(self.__data) for _ in range(amount_clusters) ];
        self.__pic = [1.0] * amount_clusters;
        self.__clusters = [];
        self.__gaussians = [ [] for _ in range(amount_clusters) ];
        self.__stop = False;


    def process(self):
        previous_likelihood = -200000;
        current_likelihood = -100000;
        
        current_iteration = 0;
        while( (self.__stop is False) and (abs(previous_likelihood - current_likelihood) > self.__tolerance) and (current_iteration < self.__iterations) ):
            self.__expectation_step();
            self.__maximization_step();
            
            previous_likelihood = current_likelihood;
            current_likelihood = self.__log_likelihood();
            self.__stop = self.__get_stop_condition();
            
            current_iteration += 1;
            
            self.__extract_clusters();
            self.__notify();
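
    # process() runs the classic EM loop: __expectation_step() computes the
    # responsibilities, __maximization_step() re-estimates the mixture
    # parameters, and iteration stops when the change in log-likelihood falls
    # below 'tolerance', when 'iterations' is exhausted, or when a covariance
    # matrix degenerates (see __get_stop_condition()).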


    def get_clusters(self):
        return self.__clusters;


    def get_centers(self):
        return self.__means;


    def get_covariances(self):
        return self.__variances;


    def __erase_empty_clusters(self):
        clusters, means, variances, pic = [], [], [], [];

        for index_cluster in range(len(self.__clusters)):
            if (len(self.__clusters[index_cluster]) > 0):
                clusters.append(self.__clusters[index_cluster]);
                means.append(self.__means[index_cluster]);
                variances.append(self.__variances[index_cluster]);
                pic.append(self.__pic[index_cluster]);
        
        if (len(self.__clusters) != len(clusters)):
            self.__clusters, self.__means, self.__variances, self.__pic = clusters, means, variances, pic;
            self.__amount_clusters = len(self.__clusters);


    def __notify(self):
        if (self.__observer is not None):
            self.__observer.notify(self.__means, self.__variances, self.__clusters);


    def __extract_clusters(self):
        self.__clusters = [ [] for _ in range(self.__amount_clusters) ];
        for index_point in range(len(self.__data)):
            candidates = [];
            for index_cluster in range(self.__amount_clusters):
                candidates.append((index_cluster, self.__rc[index_cluster][index_point]));
            
            index_winner = max(candidates, key = lambda candidate : candidate[1])[0];
            self.__clusters[index_winner].append(index_point);
        
        self.__erase_empty_clusters();


    def __log_likelihood(self):
        likelihood = 0.0;
        
        for index_point in range(len(self.__data)):
            particle = 0.0;
            for index_cluster in range(self.__amount_clusters):
                particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point];
            
            likelihood += numpy.log(particle);
        
        return likelihood;
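
    # __log_likelihood() computes the observed-data log-likelihood
    #     L = sum_i log( sum_c pi_c * N(x_i | mu_c, Sigma_c) ),
    # which process() monitors for convergence.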


    def __probabilities(self, index_cluster, index_point):
        divider = 0.0;
        for i in range(self.__amount_clusters):
            divider += self.__pic[i] * self.__gaussians[i][index_point];
        
        rc = self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider;
        return rc;
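
    # __probabilities() returns the E-step responsibility
    #     r_ic = pi_c * N(x_i | mu_c, Sigma_c) / sum_k pi_k * N(x_i | mu_k, Sigma_k),
    # i.e. the posterior probability that point i belongs to component c.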


    def __expectation_step(self):
        self.__gaussians = [ [] for _ in range(self.__amount_clusters) ];
        for index in range(self.__amount_clusters):
            self.__gaussians[index] = gaussian(self.__data, self.__means[index], self.__variances[index]);
        
        self.__rc = [ [0.0] * len(self.__data) for _ in range(self.__amount_clusters) ];
        for index_cluster in range(self.__amount_clusters):
            for index_point in range(len(self.__data)):
                self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point);


    def __maximization_step(self):
        self.__pic = [];
        self.__means = [];
        self.__variances = [];
        
        amount_impossible_clusters = 0;
        
        for index_cluster in range(self.__amount_clusters):
            mc = numpy.sum(self.__rc[index_cluster]);
            
            if (mc == 0.0):
                amount_impossible_clusters += 1;
                continue;
            
            self.__pic.append( mc / len(self.__data) );
            self.__means.append( self.__update_mean(self.__rc[index_cluster], mc) );
            self.__variances.append( self.__update_covariance(self.__means[-1], self.__rc[index_cluster], mc) );
        
        self.__amount_clusters -= amount_impossible_clusters;
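
    # The M-step above re-estimates the mixture parameters from the
    # responsibilities r_ic:
    #     m_c     = sum_i r_ic                                (effective cluster size)
    #     pi_c    = m_c / N                                   (mixing weight)
    #     mu_c    = (1 / m_c) * sum_i r_ic * x_i              (__update_mean)
    #     Sigma_c = (1 / m_c) * sum_i r_ic * (x_i - mu_c)(x_i - mu_c)^T   (__update_covariance)
    # Clusters with m_c == 0 are counted as impossible and dropped.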


    def __get_stop_condition(self):
        for covariance in self.__variances:
            if (numpy.linalg.norm(covariance) == 0.0):
                return True;
        
        return False;


    def __update_covariance(self, means, rc, mc):
        covariance = 0.0;
        for index_point in range(len(self.__data)):
            deviation = numpy.array( [ self.__data[index_point] - means ] );
            covariance += rc[index_point] * deviation.T.dot(deviation);
        
        covariance = covariance / mc;
        return covariance;


    def __update_mean(self, rc, mc):
        mean = 0.0;
        for index_point in range(len(self.__data)):
            mean += rc[index_point] * self.__data[index_point];
        
        mean = mean / mc;
        return mean;
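
A minimal usage sketch based on the classes above (the sample data, the number of clusters and the module path pyclustering.cluster.ema are assumptions for illustration, not part of the reviewed file):

    from pyclustering.cluster.ema import ema, ema_observer, ema_visualizer;

    # small two-dimensional sample with two well-separated groups
    sample = [[1.0, 1.0], [1.2, 0.8], [0.9, 1.1], [5.0, 5.0], [5.2, 4.8], [4.9, 5.1]];
    observer = ema_observer();

    # run EM for two clusters and collect the per-iteration evolution
    ema_instance = ema(sample, 2, observer = observer);
    ema_instance.process();

    clusters = ema_instance.get_clusters();
    means = ema_instance.get_centers();
    covariances = ema_instance.get_covariances();

    print("iterations:", observer.get_iterations());
    print("clusters:", clusters);

    # draw the allocated clusters with the fitted covariance ellipses
    ema_visualizer.show_clusters(clusters, sample, covariances, means);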