gaussian() - Code Metrics - Inspection of "#16: Improvement of EMA, unit-tests and examples." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — 0.7.dev ( c2d6e2...fccf24 )

by Andrei

created 2017-09-23 17:50 UTC

gaussian() B

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	4
dl	0
loc	22
rs	8.9197
c	0
b	0
f	0

"""!

@brief Cluster analysis algorithm: Expectation-Maximization Algorithm for Gaussian Mixture Model.

@authors Andrei Novikov ([email protected])
@date 2014-2017
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy;
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
import random;

from pyclustering.cluster import cluster_visualizer;
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer;
from pyclustering.cluster.kmeans import kmeans;

from pyclustering.utils import pi, calculate_ellipse_description, euclidean_distance_sqrt;

from enum import IntEnum;

import matplotlib.pyplot as plt;
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
import matplotlib.animation as animation;
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
from matplotlib import patches;
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3



def gaussian(data, mean, covariance):
    dimension = float(len(data[0]));
 
    if (dimension != 1.0):
        inv_variance = numpy.linalg.pinv(covariance);
    else:
        inv_variance = 1.0 / covariance;
    
    divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.linalg.norm(covariance));
    if (divider != 0.0):
        right_const = 1.0 / divider;
    else:
        right_const = float('inf');
    
    result = [];
    
    for point in data:
        mean_delta = point - mean;
        point_gaussian = right_const * numpy.exp( -0.5 * mean_delta.dot(inv_variance).dot(numpy.transpose(mean_delta)) );
        result.append(point_gaussian);
    
    return result;



class ema_init_type(IntEnum):
    RANDOM_INITIALIZATION = 0;
    KMEANS_INITIALIZATION = 1;



class ema_initializer():
    __MAX_GENERATION_ATTEPTS = 10;
    
    def __init__(self, sample, amount):
        self.__sample = sample;
        self.__amount = amount;


    def initialize(self, init_type = ema_init_type.KMEANS_INITIALIZATION):
        if (init_type == ema_init_type.KMEANS_INITIALIZATION):
            return self.__initialize_kmeans();
        
        elif (init_type == ema_init_type.RANDOM_INITIALIZATION):
            return self.__initialize_random();
        
        raise NameError("Unknown type of EM algorithm initialization is specified.");


    def __calculate_initial_clusters(self, centers):
        """!
        @brief Calculate Euclidean distance to each point from the each cluster. 
        @brief Nearest points are captured by according clusters and as a result clusters are updated.
        
        @return (list) updated clusters as list of clusters. Each cluster contains indexes of objects from data.
        
        """
        
        clusters = [[] for _ in range(len(centers))];
        for index_point in range(len(self.__sample)):
            index_optim, dist_optim = -1, 0.0;
             
            for index in range(len(centers)):
                dist = euclidean_distance_sqrt(self.__sample[index_point], centers[index]);
                 
                if ( (dist < dist_optim) or (index is 0)):
                    index_optim, dist_optim = index, dist;
             
            clusters[index_optim].append(index_point);
        
        return clusters;


    def __calculate_initial_covariances(self, initial_clusters):
        covariances = [];
        for initial_cluster in initial_clusters:
            if (len(initial_cluster) > 1):
                cluster_sample = [ self.__sample[index_point] for index_point in initial_cluster ];
                covariances.append(numpy.cov(cluster_sample, rowvar = False));
            else:
                dimension = len(self.__sample[0]);
                covariances.append(numpy.zeros((dimension, dimension))  + random.random() / 10.0);
        
        return covariances;


    def __initialize_random(self):
        initial_means = [];
        
        for _ in range(self.__amount):
            mean = self.__sample[ random.randint(0, len(self.__sample)) - 1 ];
            attempts = 0;
            while ( (mean in initial_means) and (attempts < ema_initializer.__MAX_GENERATION_ATTEPTS) ):
                mean = self.__sample[ random.randint(0, len(self.__sample)) - 1 ];
                attempts += 1;
            
            if (attempts == ema_initializer.__MAX_GENERATION_ATTEPTS):
                mean = [ value + (random.random() - 0.5) * value * 0.2 for value in mean ];
            
            initial_means.append(mean);
        
        initial_clusters = self.__calculate_initial_clusters(initial_means);
        initial_covariance = self.__calculate_initial_covariances(initial_clusters);
        
        return initial_means, initial_covariance;


    def __initialize_kmeans(self):
        initial_centers = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize();
        kmeans_instance = kmeans(self.__sample, initial_centers, ccore = True);
        kmeans_instance.process();
        
        means = kmeans_instance.get_centers();
        
        covariances = [];
        initial_clusters = kmeans_instance.get_clusters();
        for initial_cluster in initial_clusters:
            if (len(initial_cluster) > 1):
                cluster_sample = [ self.__sample[index_point] for index_point in initial_cluster ];
                covariances.append(numpy.cov(cluster_sample, rowvar = False));
            else:
                dimension = len(self.__sample[0]);
                covariances.append(numpy.zeros((dimension, dimension))  + random.random() / 10.0);
        
        return means, covariances;



class ema_observer:
    def __init__(self):
        self.__means_evolution = [];
        self.__covariances_evolution = [];
        self.__clusters_evolution = [];


    def __len__(self):
        return len(self.__means_evolution);


    def get_iterations(self):
        return len(self.__means_evolution);


    def get_evolution_means(self):
        return self.__means_evolution;


    def get_evolution_covariances(self):
        return self.__covariances_evolution;


    def get_evolution_clusters(self):
        return self.__clusters_evolution;


    def notify(self, means, covariances, clusters):
        self.__means_evolution.append(means);
        self.__covariances_evolution.append(covariances);
        self.__clusters_evolution.append(clusters);



class ema_visualizer:
    @staticmethod
    def show_clusters(clusters, sample, covariances, means, figure = None, display = True):
        visualizer = cluster_visualizer();
        visualizer.append_clusters(clusters, sample);
        
        if (figure is None):
            figure = visualizer.show(display = False);
        else:
            visualizer.show(figure = figure, display = False);
        
        if (len(sample[0]) == 2):
            ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means);

        if (display is True): 
            plt.show();

        return figure;


    @staticmethod
    def animate_cluster_allocation(data, observer, animation_velocity = 75, movie_fps = 1, save_movie = None):
        figure = plt.figure();
        
        def init_frame():
            return frame_generation(0);
        
        def frame_generation(index_iteration):
            figure.clf();
            
            figure.suptitle("Expectation maximixation algorithm (iteration: " + str(index_iteration) +")", fontsize = 18, fontweight = 'bold');
            
            clusters = observer.get_evolution_clusters()[index_iteration];
            covariances = observer.get_evolution_covariances()[index_iteration];
            means = observer.get_evolution_means()[index_iteration];
            
            ema_visualizer.show_clusters(clusters, data, covariances, means, figure, False);
            figure.subplots_adjust(top = 0.85);
            
            return [ figure.gca() ];

        iterations = len(observer);
        cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval = animation_velocity, init_func = init_frame, repeat_delay = 5000);

        if (save_movie is not None):
            cluster_animation.save(save_movie, writer = 'ffmpeg', fps = movie_fps, bitrate = 1500);
        else:
            plt.show();


    @staticmethod
    def __draw_ellipses(figure, visualizer, clusters, covariances, means):
        ax = figure.get_axes()[0];
        
        for index in range(len(clusters)):
            angle, width, height = calculate_ellipse_description(covariances[index]);
            color = visualizer.get_cluster_color(index, 0);

            ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color);


    @staticmethod
    def __draw_ellipse(ax, x, y, angle, width, height, color):
        if ((width > 0.0) and (height > 0.0)):
            ellipse = patches.Ellipse((x, y), width, height, alpha=0.2, angle=angle, linewidth=2, fill=True, zorder=2, color=color);
            ax.add_patch(ellipse);



class ema:
    def __init__(self, data, amount_clusters, means = None, variances = None, observer = None, tolerance = 0.00001, iterations = 100):
        self.__data = numpy.array(data);
        self.__amount_clusters = amount_clusters;
        self.__tolerance = tolerance;
        self.__iterations = iterations;
        self.__observer = observer;
        
        self.__means = means;
        self.__variances = variances;
        
        if ((means is None) or (variances is None)):
            self.__means, self.__variances = ema_initializer(data, amount_clusters).initialize(ema_init_type.KMEANS_INITIALIZATION);
            
            if (len(self.__means) != amount_clusters):
                self.__amount_clusters = len(self.__means);
        
        self.__rc = [ [0.0] * len(self.__data) for _ in range(amount_clusters) ];
        self.__pic = [1.0] * amount_clusters;
        self.__clusters = [];
        self.__gaussians = [ [] for _ in range(amount_clusters) ];
        self.__stop = False;


    def process(self):
        previous_likelihood = -200000;
        current_likelihood = -100000;
        
        current_iteration = 0;
        while( (self.__stop is False) and (abs(previous_likelihood - current_likelihood) > self.__tolerance) and (current_iteration < self.__iterations) ):
            self.__expectation_step();
            self.__maximization_step();
            
            current_iteration += 1;
            
            self.__extract_clusters();
            self.__notify();
            
            previous_likelihood = current_likelihood;
            current_likelihood = self.__log_likelihood();
            self.__stop = self.__get_stop_condition();


    def get_clusters(self):
        return self.__clusters;


    def get_centers(self):
        return self.__means;


    def get_covariances(self):
        return self.__variances;


    def __erase_empty_clusters(self):
        clusters, means, variances, pic, gaussians, rc = [], [], [], [], [], [];

        for index_cluster in range(len(self.__clusters)):
            if (len(self.__clusters[index_cluster]) > 0):
                clusters.append(self.__clusters[index_cluster]);
                means.append(self.__means[index_cluster]);
                variances.append(self.__variances[index_cluster]);
                pic.append(self.__pic[index_cluster]);
                gaussians.append(self.__gaussians[index_cluster]);
                rc.append(self.__rc[index_cluster]);
        
        if (len(self.__clusters) != len(clusters)):
            self.__clusters, self.__means, self.__variances, self.__pic = clusters, means, variances, pic;
            self.__gaussians, self.__rc = gaussians, rc;
            self.__amount_clusters = len(self.__clusters);


    def __notify(self):
        if (self.__observer is not None):
            self.__observer.notify(self.__means, self.__variances, self.__clusters);


    def __extract_clusters(self):
        self.__clusters = [ [] for _ in range(self.__amount_clusters) ];
        for index_point in range(len(self.__data)):
            candidates = [];
            for index_cluster in range(self.__amount_clusters):
                candidates.append((index_cluster, self.__rc[index_cluster][index_point]));
            
            index_winner = max(candidates, key = lambda candidate : candidate[1])[0];
            self.__clusters[index_winner].append(index_point);
        
        self.__erase_empty_clusters();


    def __log_likelihood(self):
        likelihood = 0.0;
        
        for index_point in range(len(self.__data)):
            particle = 0.0;
            for index_cluster in range(self.__amount_clusters):
                particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point];
            
            likelihood += numpy.log(particle);
        
        return likelihood;


    def __probabilities(self, index_cluster, index_point):
        divider = 0.0;
        for i in range(self.__amount_clusters):
            divider += self.__pic[i] * self.__gaussians[i][index_point];
        
        if ( (divider != 0.0) and (divider != float('inf')) ):
            return self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider;
        
        return float('nan');


    def __expectation_step(self):
        self.__gaussians = [ [] for _ in range(self.__amount_clusters) ];
        for index in range(self.__amount_clusters):
            self.__gaussians[index] = gaussian(self.__data, self.__means[index], self.__variances[index]);
        
        self.__rc = [ [0.0] * len(self.__data) for _ in range(self.__amount_clusters) ];
        for index_cluster in range(self.__amount_clusters):
            for index_point in range(len(self.__data)):
                self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point);


    def __maximization_step(self):
        self.__pic = [];
        self.__means = [];
        self.__variances = [];
        
        amount_impossible_clusters = 0;
        
        for index_cluster in range(self.__amount_clusters):
            mc = numpy.sum(self.__rc[index_cluster]);
            
            if (mc == 0.0):
                amount_impossible_clusters += 1;
                continue;
            
            self.__pic.append( mc / len(self.__data) );
            self.__means.append( self.__update_mean(self.__rc[index_cluster], mc) );
            self.__variances.append( self.__update_covariance(self.__means[-1], self.__rc[index_cluster], mc) );
        
        self.__amount_clusters -= amount_impossible_clusters;


    def __get_stop_condition(self):
        for covariance in self.__variances:
            if (numpy.linalg.norm(covariance) == 0.0):
                return True;
        
        return False;


    def __update_covariance(self, means, rc, mc):
        covariance = 0.0;
        for index_point in range(len(self.__data)):
            deviation = numpy.array( [ self.__data[index_point] - means ]);
            covariance += rc[index_point] * deviation.T.dot(deviation);
        
        covariance = covariance / mc;
        return covariance;


    def __update_mean(self, rc, mc):
        mean = 0.0;
        for index_point in range(len(self.__data)):
            mean += rc[index_point] * self.__data[index_point];
        
        mean = mean / mc;
        return mean;

Push — 0.7.dev ( c2d6e2...fccf24 )

gaussian() B

Complexity

Size

Duplication

Importance

1. Missing Dependencies

2. Missing init.py files

1. Missing Dependencies

2. Missing init.py files

1. Missing Dependencies

2. Missing init.py files

1. Missing Dependencies

2. Missing init.py files

1			"""!
2
3			@brief Cluster analysis algorithm: Expectation-Maximization Algorithm for Gaussian Mixture Model.
4
5			@authors Andrei Novikov ([email protected])
6			@date 2014-2017
7			@copyright GNU Public License
8
9			@cond GNU_PUBLIC_LICENSE
10			PyClustering is free software: you can redistribute it and/or modify
11			it under the terms of the GNU General Public License as published by
12			the Free Software Foundation, either version 3 of the License, or
13			(at your option) any later version.
14
15			PyClustering is distributed in the hope that it will be useful,
16			but WITHOUT ANY WARRANTY; without even the implied warranty of
17			MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18			GNU General Public License for more details.
19
20			You should have received a copy of the GNU General Public License
21			along with this program. If not, see <http://www.gnu.org/licenses/>.
22			@endcond
23
24			"""
25
26
27			import numpy;
			0 ignored issues – show Configuration introduced 2017-09-19 14:51 UTC by Report Bug Copy Issue Report The import `numpy` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
28			import random;
29
30			from pyclustering.cluster import cluster_visualizer;
31			from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer;
32			from pyclustering.cluster.kmeans import kmeans;
33
34			from pyclustering.utils import pi, calculate_ellipse_description, euclidean_distance_sqrt;
35
36			from enum import IntEnum;
37
38			import matplotlib.pyplot as plt;
			0 ignored issues – show Configuration introduced 2017-09-19 14:51 UTC by Report Bug Copy Issue Report The import `matplotlib.pyplot` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
39			import matplotlib.animation as animation;
			0 ignored issues – show Configuration introduced 2017-09-19 14:51 UTC by Report Bug Copy Issue Report The import `matplotlib.animation` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
40			from matplotlib import patches;
			0 ignored issues – show Configuration introduced 2017-09-16 22:44 UTC by Report Bug Copy Issue Report The import `matplotlib` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
41
42
43
44			def gaussian(data, mean, covariance):
45			dimension = float(len(data[0]));
46
47			if (dimension != 1.0):
48			inv_variance = numpy.linalg.pinv(covariance);
49			else:
50			inv_variance = 1.0 / covariance;
51
52			divider = (pi * 2.0) ** (dimension / 2.0) * numpy.sqrt(numpy.linalg.norm(covariance));
53			if (divider != 0.0):
54			right_const = 1.0 / divider;
55			else:
56			right_const = float('inf');
57
58			result = [];
59
60			for point in data:
61			mean_delta = point - mean;
62			point_gaussian = right_const * numpy.exp( -0.5 * mean_delta.dot(inv_variance).dot(numpy.transpose(mean_delta)) );
63			result.append(point_gaussian);
64
65			return result;
66
67
68
69			class ema_init_type(IntEnum):
70			RANDOM_INITIALIZATION = 0;
71			KMEANS_INITIALIZATION = 1;
72
73
74
75			class ema_initializer():
76			__MAX_GENERATION_ATTEPTS = 10;
77
78			def __init__(self, sample, amount):
79			self.__sample = sample;
80			self.__amount = amount;
81
82
83			def initialize(self, init_type = ema_init_type.KMEANS_INITIALIZATION):
84			if (init_type == ema_init_type.KMEANS_INITIALIZATION):
85			return self.__initialize_kmeans();
86
87			elif (init_type == ema_init_type.RANDOM_INITIALIZATION):
88			return self.__initialize_random();
89
90			raise NameError("Unknown type of EM algorithm initialization is specified.");
91
92
93			def __calculate_initial_clusters(self, centers):
94			"""!
95			@brief Calculate Euclidean distance to each point from the each cluster.
96			@brief Nearest points are captured by according clusters and as a result clusters are updated.
97
98			@return (list) updated clusters as list of clusters. Each cluster contains indexes of objects from data.
99
100			"""
101
102			clusters = [[] for _ in range(len(centers))];
103			for index_point in range(len(self.__sample)):
104			index_optim, dist_optim = -1, 0.0;
105
106			for index in range(len(centers)):
107			dist = euclidean_distance_sqrt(self.__sample[index_point], centers[index]);
108
109			if ( (dist < dist_optim) or (index is 0)):
110			index_optim, dist_optim = index, dist;
111
112			clusters[index_optim].append(index_point);
113
114			return clusters;
115
116
117			def __calculate_initial_covariances(self, initial_clusters):
118			covariances = [];
119			for initial_cluster in initial_clusters:
120			if (len(initial_cluster) > 1):
121			cluster_sample = [ self.__sample[index_point] for index_point in initial_cluster ];
122			covariances.append(numpy.cov(cluster_sample, rowvar = False));
123			else:
124			dimension = len(self.__sample[0]);
125			covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0);
126
127			return covariances;
128
129
130			def __initialize_random(self):
131			initial_means = [];
132
133			for _ in range(self.__amount):
134			mean = self.__sample[ random.randint(0, len(self.__sample)) - 1 ];
135			attempts = 0;
136			while ( (mean in initial_means) and (attempts < ema_initializer.__MAX_GENERATION_ATTEPTS) ):
137			mean = self.__sample[ random.randint(0, len(self.__sample)) - 1 ];
138			attempts += 1;
139
140			if (attempts == ema_initializer.__MAX_GENERATION_ATTEPTS):
141			mean = [ value + (random.random() - 0.5) * value * 0.2 for value in mean ];
142
143			initial_means.append(mean);
144
145			initial_clusters = self.__calculate_initial_clusters(initial_means);
146			initial_covariance = self.__calculate_initial_covariances(initial_clusters);
147
148			return initial_means, initial_covariance;
149
150
151			def __initialize_kmeans(self):
152			initial_centers = kmeans_plusplus_initializer(self.__sample, self.__amount).initialize();
153			kmeans_instance = kmeans(self.__sample, initial_centers, ccore = True);
154			kmeans_instance.process();
155
156			means = kmeans_instance.get_centers();
157
158			covariances = [];
159			initial_clusters = kmeans_instance.get_clusters();
160			for initial_cluster in initial_clusters:
161			if (len(initial_cluster) > 1):
162			cluster_sample = [ self.__sample[index_point] for index_point in initial_cluster ];
163			covariances.append(numpy.cov(cluster_sample, rowvar = False));
164			else:
165			dimension = len(self.__sample[0]);
166			covariances.append(numpy.zeros((dimension, dimension)) + random.random() / 10.0);
167
168			return means, covariances;
169
170
171
172			class ema_observer:
173			def __init__(self):
174			self.__means_evolution = [];
175			self.__covariances_evolution = [];
176			self.__clusters_evolution = [];
177
178
179			def __len__(self):
180			return len(self.__means_evolution);
181
182
183			def get_iterations(self):
184			return len(self.__means_evolution);
185
186
187			def get_evolution_means(self):
188			return self.__means_evolution;
189
190
191			def get_evolution_covariances(self):
192			return self.__covariances_evolution;
193
194
195			def get_evolution_clusters(self):
196			return self.__clusters_evolution;
197
198
199			def notify(self, means, covariances, clusters):
200			self.__means_evolution.append(means);
201			self.__covariances_evolution.append(covariances);
202			self.__clusters_evolution.append(clusters);
203
204
205
206			class ema_visualizer:
207			@staticmethod
208			def show_clusters(clusters, sample, covariances, means, figure = None, display = True):
209			visualizer = cluster_visualizer();
210			visualizer.append_clusters(clusters, sample);
211
212			if (figure is None):
213			figure = visualizer.show(display = False);
214			else:
215			visualizer.show(figure = figure, display = False);
216
217			if (len(sample[0]) == 2):
218			ema_visualizer.__draw_ellipses(figure, visualizer, clusters, covariances, means);
219
220			if (display is True):
221			plt.show();
222
223			return figure;
224
225
226			@staticmethod
227			def animate_cluster_allocation(data, observer, animation_velocity = 75, movie_fps = 1, save_movie = None):
228			figure = plt.figure();
229
230			def init_frame():
231			return frame_generation(0);
232
233			def frame_generation(index_iteration):
234			figure.clf();
235
236			figure.suptitle("Expectation maximixation algorithm (iteration: " + str(index_iteration) +")", fontsize = 18, fontweight = 'bold');
237
238			clusters = observer.get_evolution_clusters()[index_iteration];
239			covariances = observer.get_evolution_covariances()[index_iteration];
240			means = observer.get_evolution_means()[index_iteration];
241
242			ema_visualizer.show_clusters(clusters, data, covariances, means, figure, False);
243			figure.subplots_adjust(top = 0.85);
244
245			return [ figure.gca() ];
246
247			iterations = len(observer);
248			cluster_animation = animation.FuncAnimation(figure, frame_generation, iterations, interval = animation_velocity, init_func = init_frame, repeat_delay = 5000);
249
250			if (save_movie is not None):
251			cluster_animation.save(save_movie, writer = 'ffmpeg', fps = movie_fps, bitrate = 1500);
252			else:
253			plt.show();
254
255
256			@staticmethod
257			def __draw_ellipses(figure, visualizer, clusters, covariances, means):
258			ax = figure.get_axes()[0];
259
260			for index in range(len(clusters)):
261			angle, width, height = calculate_ellipse_description(covariances[index]);
262			color = visualizer.get_cluster_color(index, 0);
263
264			ema_visualizer.__draw_ellipse(ax, means[index][0], means[index][1], angle, width, height, color);
265
266
267			@staticmethod
268			def __draw_ellipse(ax, x, y, angle, width, height, color):
269			if ((width > 0.0) and (height > 0.0)):
270			ellipse = patches.Ellipse((x, y), width, height, alpha=0.2, angle=angle, linewidth=2, fill=True, zorder=2, color=color);
271			ax.add_patch(ellipse);
272
273
274
275			class ema:
276			def __init__(self, data, amount_clusters, means = None, variances = None, observer = None, tolerance = 0.00001, iterations = 100):
277			self.__data = numpy.array(data);
278			self.__amount_clusters = amount_clusters;
279			self.__tolerance = tolerance;
280			self.__iterations = iterations;
281			self.__observer = observer;
282
283			self.__means = means;
284			self.__variances = variances;
285
286			if ((means is None) or (variances is None)):
287			self.__means, self.__variances = ema_initializer(data, amount_clusters).initialize(ema_init_type.KMEANS_INITIALIZATION);
288
289			if (len(self.__means) != amount_clusters):
290			self.__amount_clusters = len(self.__means);
291
292			self.__rc = [ [0.0] * len(self.__data) for _ in range(amount_clusters) ];
293			self.__pic = [1.0] * amount_clusters;
294			self.__clusters = [];
295			self.__gaussians = [ [] for _ in range(amount_clusters) ];
296			self.__stop = False;
297
298
299			def process(self):
300			previous_likelihood = -200000;
301			current_likelihood = -100000;
302
303			current_iteration = 0;
304			while( (self.__stop is False) and (abs(previous_likelihood - current_likelihood) > self.__tolerance) and (current_iteration < self.__iterations) ):
305			self.__expectation_step();
306			self.__maximization_step();
307
308			current_iteration += 1;
309
310			self.__extract_clusters();
311			self.__notify();
312
313			previous_likelihood = current_likelihood;
314			current_likelihood = self.__log_likelihood();
315			self.__stop = self.__get_stop_condition();
316
317
318			def get_clusters(self):
319			return self.__clusters;
320
321
322			def get_centers(self):
323			return self.__means;
324
325
326			def get_covariances(self):
327			return self.__variances;
328
329
330			def __erase_empty_clusters(self):
331			clusters, means, variances, pic, gaussians, rc = [], [], [], [], [], [];
332
333			for index_cluster in range(len(self.__clusters)):
334			if (len(self.__clusters[index_cluster]) > 0):
335			clusters.append(self.__clusters[index_cluster]);
336			means.append(self.__means[index_cluster]);
337			variances.append(self.__variances[index_cluster]);
338			pic.append(self.__pic[index_cluster]);
339			gaussians.append(self.__gaussians[index_cluster]);
340			rc.append(self.__rc[index_cluster]);
341
342			if (len(self.__clusters) != len(clusters)):
343			self.__clusters, self.__means, self.__variances, self.__pic = clusters, means, variances, pic;
344			self.__gaussians, self.__rc = gaussians, rc;
345			self.__amount_clusters = len(self.__clusters);
346
347
348			def __notify(self):
349			if (self.__observer is not None):
350			self.__observer.notify(self.__means, self.__variances, self.__clusters);
351
352
353			def __extract_clusters(self):
354			self.__clusters = [ [] for _ in range(self.__amount_clusters) ];
355			for index_point in range(len(self.__data)):
356			candidates = [];
357			for index_cluster in range(self.__amount_clusters):
358			candidates.append((index_cluster, self.__rc[index_cluster][index_point]));
359
360			index_winner = max(candidates, key = lambda candidate : candidate[1])[0];
361			self.__clusters[index_winner].append(index_point);
362
363			self.__erase_empty_clusters();
364
365
366			def __log_likelihood(self):
367			likelihood = 0.0;
368
369			for index_point in range(len(self.__data)):
370			particle = 0.0;
371			for index_cluster in range(self.__amount_clusters):
372			particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point];
373
374			likelihood += numpy.log(particle);
375
376			return likelihood;
377
378
379			def __probabilities(self, index_cluster, index_point):
380			divider = 0.0;
381			for i in range(self.__amount_clusters):
382			divider += self.__pic[i] * self.__gaussians[i][index_point];
383
384			if ( (divider != 0.0) and (divider != float('inf')) ):
385			return self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider;
386
387			return float('nan');
388
389
390			def __expectation_step(self):
391			self.__gaussians = [ [] for _ in range(self.__amount_clusters) ];
392			for index in range(self.__amount_clusters):
393			self.__gaussians[index] = gaussian(self.__data, self.__means[index], self.__variances[index]);
394
395			self.__rc = [ [0.0] * len(self.__data) for _ in range(self.__amount_clusters) ];
396			for index_cluster in range(self.__amount_clusters):
397			for index_point in range(len(self.__data)):
398			self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point);
399
400
401			def __maximization_step(self):
402			self.__pic = [];
403			self.__means = [];
404			self.__variances = [];
405
406			amount_impossible_clusters = 0;
407
408			for index_cluster in range(self.__amount_clusters):
409			mc = numpy.sum(self.__rc[index_cluster]);
410
411			if (mc == 0.0):
412			amount_impossible_clusters += 1;
413			continue;
414
415			self.__pic.append( mc / len(self.__data) );
416			self.__means.append( self.__update_mean(self.__rc[index_cluster], mc) );
417			self.__variances.append( self.__update_covariance(self.__means[-1], self.__rc[index_cluster], mc) );
418
419			self.__amount_clusters -= amount_impossible_clusters;
420
421
422			def __get_stop_condition(self):
423			for covariance in self.__variances:
424			if (numpy.linalg.norm(covariance) == 0.0):
425			return True;
426
427			return False;
428
429
430			def __update_covariance(self, means, rc, mc):
431			covariance = 0.0;
432			for index_point in range(len(self.__data)):
433			deviation = numpy.array( [ self.__data[index_point] - means ]);
434			covariance += rc[index_point] * deviation.T.dot(deviation);
435
436			covariance = covariance / mc;
437			return covariance;
438
439
440			def __update_mean(self, rc, mc):
441			mean = 0.0;
442			for index_point in range(len(self.__data)):
443			mean += rc[index_point] * self.__data[index_point];
444
445			mean = mean / mc;
446			return mean;

annoviko / pyclustering

Push — 0.7.dev ( c2d6e2...fccf24 )

gaussian() B

Complexity

Size

Duplication

Importance

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

Duplication Side-by-Side

Filter issues like

2. Missing init.py files

2. Missing init.py files

2. Missing init.py files

2. Missing init.py files