ema.__get_random_variances() - Code Metrics - Inspection of "#16: Expectation maximization algorithm prototype." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — 0.7.dev (7db1ce)

by Andrei

created 2017-09-11 12:45 UTC

ema.__get_random_variances() A

↳ Parent: ema

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
dl	0
loc	8
rs	9.4285
c	0
b	0
f	0

"""!

@brief Cluster analysis algorithm: Expectation-Maximization Algorithm (EMA).
@details Implementation based on article:
         - 

@authors Andrei Novikov ([email protected])
@date 2014-2017
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy;
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3

from pyclustering.utils import pi;

import matplotlib.pyplot as plt;
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
from _operator import index



def gaussion_multivariable(data, mean = None, covariance = None):
    dimension = len(data[0]);

    if (mean is None):
        mean = numpy.mean(data);
    
    if (covariance is None):
        covariance = numpy.cov(data);
    
    inv_variance = numpy.linalg.inv(covariance);
    right_const = 1.0 / ( (pi * 2.0) ** (dimension / 2.0) * numpy.linalg.norm(covariance) ** 0.5 );
    
    result = [];
    
    for point in data:
        mean_delta = point - mean;
        point_gaussian = right_const * numpy.exp(-0.5 * mean_delta.T * inv_variance * mean_delta);
        result.append(point_gaussian);
    
    return result;


def gaussian_singlevariable(data, mean = None, variance = None):
    if (mean is None):
        mean = numpy.mean(data);
    
    if (variance is None):
        variance = numpy.var(data, ddof = 1);

    right_const = 1.0 / ( 2.0 * pi * variance ) ** 0.5;
    result = [];
    
    for point in data:
        mean_delta = point - mean;
        point_gaussian = right_const * numpy.exp(-mean_delta ** 2.0 / (2.0 * variance) );
        result.append(point_gaussian);
    
    return result;


def gaussian(data, mean = None, variance = None):
    try: dimension = len(data[0]);
    except: dimension = 1;
    
    if (dimension == 1):
        return gaussian_singlevariable(data, mean, variance);
    else:
        return gaussion_multivariable(data, mean, variance);


# data = numpy.random.normal(0, 0.1, 100);
# one_gaussian = gaussian(data);
# one_gaussian.sort();
# 
# print(one_gaussian);
# 
# axis = plt.subplot(111);
# plt.plot(one_gaussian, 'b-', linewidth = 2.0);
# plt.show();



class ema:
    def __init__(self, data, amount_clusters, means = None, variances = None):
        self.__data = data;
        self.__amount_clusters = amount_clusters;
        
        self.__means = means;
        if (means is None):
            self.__means = self.__get_random_means(data, amount_clusters);

        self.__variances = variances;
        if (variances is None):
            self.__variances = self.__get_random_covariances(data, amount_clusters);
        
        self.__rc = [ [0.0] * len(self.__data) for _ in range(amount_clusters) ];
        self.__pic = [1.0] * amount_clusters;
        self.__clusters = [];
        self.__gaussians = [ [] for _ in range(amount_clusters) ];


    def process(self):
        previous_likelihood = -1000.0;
        current_likelihood = 0.0;
        
        while(abs(previous_likelihood - current_likelihood) > 0.1):
            self.__expectation_step();
            self.__maximization_step();
            
            previous_likelihood = current_likelihood;
            current_likelihood = self.__log_likelihood();


    def get_clusters(self):
        return self.__clusters;


    def __log_likelihood(self):
        likelihood = 0.0;
        
        for index_point in range(len(self.__data)):
            particle = 0.0;
            for index_cluster in range(self.__amount_clusters):
                particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point];
            
            likelihood += numpy.log(particle);
        
        return likelihood;


    def __probabilities(self, index_cluster, index_point):
        divider = 0.0;
        for i in range(self.__amount_clusters):
            divider += self.__pic[i] * self.__gaussians[i][index_point];
        
        rc = self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider;
        return rc;


    def __expectation_step(self):
        for index in range(self.__amount_clusters):
param = 5

class Foo:
    def __init__(self, param):   # "param" would be flagged here
        self.param = param
            self.__gaussians[index] = gaussian(self.__data, self.__means[index], self.__variances[index]);
        
        for index_cluster in range(self.__amount_clusters):
            for index_point in range(self.__data):
                self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point);


    def __maximization_step(self):
        for index_cluster in range(self.__amount_clusters):
            mc = numpy.sum(self.__rc[index_cluster]);
            
            self.__pic[index_cluster] = mc / len(self.__data);
            self.__means[index_cluster] = self.__update_mean(index_cluster, mc);
            self.__variances[index_cluster] = self.__update_variance(index_cluster, mc);


    def __update_variance(self, index_cluster, mc):
        variance = 0.0;
        for index_point in range(len(self.__data)):
            deviation = self.__data[index_point] - self.__means[index_cluster];
            variance += self.__rc[index_cluster][index_point] * deviation.T * deviation;
        
        variance = variance / mc;
        return variance;


    def __update_mean(self, index_cluster, mc):
        mean = 0.0;
        for index_point in range(len(self.__data)):
            mean += self.__rc[index_cluster][index_point] * self.__data[index_point];
        
        mean = mean / mc;
        return mean;


    def __get_random_covariances(self, data, amount):
        covariances = [];
        data_covariance = numpy.cov(data);
        for _ in range(amount):
            random_appendix = numpy.min(data_covariance) * 0.2 * numpy.random.random();
            covariances.append(data_covariance + random_appendix);
        
        return covariances;


    def __get_random_variances(self, data, amount):
        variances = [];
        data_variance = numpy.var(data, ddof = 1);
        for _ in range(amount):
            random_appendix = data_variance * 0.1 * numpy.random.random();
            variances.append(random_appendix + variances);
        
        return variances;


    def __get_random_means(self, data, amount):
        means = [];
        for _ in range(amount):
            random_index = numpy.random.randint(0, len(data));
            means.append(numpy.array(data[random_index]));
        
        return means;
    


# from pyclustering.samples.definitions import SIMPLE_SAMPLES, FCPS_SAMPLES;
# from pyclustering.utils import read_sample;
# 
# sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1);
# ema_instance = ema(sample, 2);
# ema_instance.process();
# clusters = ema_instance.get_clusters();
# 
# print(clusters);

Branch — 0.7.dev (7db1ce)

ema.__get_random_variances() A

Complexity

Size

Duplication

Importance

1. Missing Dependencies

2. Missing init.py files

1. Missing Dependencies

2. Missing init.py files

1			"""!
2
3			@brief Cluster analysis algorithm: Expectation-Maximization Algorithm (EMA).
4			@details Implementation based on article:
5			-
6
7			@authors Andrei Novikov ([email protected])
8			@date 2014-2017
9			@copyright GNU Public License
10
11			@cond GNU_PUBLIC_LICENSE
12			PyClustering is free software: you can redistribute it and/or modify
13			it under the terms of the GNU General Public License as published by
14			the Free Software Foundation, either version 3 of the License, or
15			(at your option) any later version.
16
17			PyClustering is distributed in the hope that it will be useful,
18			but WITHOUT ANY WARRANTY; without even the implied warranty of
19			MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20			GNU General Public License for more details.
21
22			You should have received a copy of the GNU General Public License
23			along with this program. If not, see <http://www.gnu.org/licenses/>.
24			@endcond
25
26			"""
27
28
29			import numpy;
			0 ignored issues – show Configuration introduced 2017-09-11 12:46 UTC by Report Bug Copy Issue Report The import `numpy` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
30
31			from pyclustering.utils import pi;
32
33			import matplotlib.pyplot as plt;
			0 ignored issues – show Configuration introduced 2017-09-11 12:46 UTC by Report Bug Copy Issue Report The import `matplotlib.pyplot` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history... Unused Code introduced 2017-09-11 12:46 UTC by Report Bug Copy Issue Report Unused matplotlib.pyplot imported as plt Loading history...
34			from _operator import index
			0 ignored issues – show Unused Code introduced 2017-09-11 12:46 UTC by Report Bug Copy Issue Report Unused index imported from _operator Loading history...
35
36
37			def gaussion_multivariable(data, mean = None, covariance = None):
38			dimension = len(data[0]);
39
40			if (mean is None):
41			mean = numpy.mean(data);
42
43			if (covariance is None):
44			covariance = numpy.cov(data);
45
46			inv_variance = numpy.linalg.inv(covariance);
47			right_const = 1.0 / ( (pi * 2.0) ** (dimension / 2.0) * numpy.linalg.norm(covariance) ** 0.5 );
48
49			result = [];
50
51			for point in data:
52			mean_delta = point - mean;
53			point_gaussian = right_const * numpy.exp(-0.5 * mean_delta.T * inv_variance * mean_delta);
54			result.append(point_gaussian);
55
56			return result;
57
58
59			def gaussian_singlevariable(data, mean = None, variance = None):
60			if (mean is None):
61			mean = numpy.mean(data);
62
63			if (variance is None):
64			variance = numpy.var(data, ddof = 1);
65
66			right_const = 1.0 / ( 2.0 * pi * variance ) ** 0.5;
67			result = [];
68
69			for point in data:
70			mean_delta = point - mean;
71			point_gaussian = right_const * numpy.exp(-mean_delta ** 2.0 / (2.0 * variance) );
72			result.append(point_gaussian);
73
74			return result;
75
76
77			def gaussian(data, mean = None, variance = None):
78			try: dimension = len(data[0]);
79			except: dimension = 1;
80
81			if (dimension == 1):
82			return gaussian_singlevariable(data, mean, variance);
83			else:
84			return gaussion_multivariable(data, mean, variance);
85
86
87			# data = numpy.random.normal(0, 0.1, 100);
88			# one_gaussian = gaussian(data);
89			# one_gaussian.sort();
90			#
91			# print(one_gaussian);
92			#
93			# axis = plt.subplot(111);
94			# plt.plot(one_gaussian, 'b-', linewidth = 2.0);
95			# plt.show();
96
97
98
99			class ema:
100			def __init__(self, data, amount_clusters, means = None, variances = None):
101			self.__data = data;
102			self.__amount_clusters = amount_clusters;
103
104			self.__means = means;
105			if (means is None):
106			self.__means = self.__get_random_means(data, amount_clusters);
107
108			self.__variances = variances;
109			if (variances is None):
110			self.__variances = self.__get_random_covariances(data, amount_clusters);
111
112			self.__rc = [ [0.0] * len(self.__data) for _ in range(amount_clusters) ];
113			self.__pic = [1.0] * amount_clusters;
114			self.__clusters = [];
115			self.__gaussians = [ [] for _ in range(amount_clusters) ];
116
117
118			def process(self):
119			previous_likelihood = -1000.0;
120			current_likelihood = 0.0;
121
122			while(abs(previous_likelihood - current_likelihood) > 0.1):
123			self.__expectation_step();
124			self.__maximization_step();
125
126			previous_likelihood = current_likelihood;
127			current_likelihood = self.__log_likelihood();
128
129
130			def get_clusters(self):
131			return self.__clusters;
132
133
134			def __log_likelihood(self):
135			likelihood = 0.0;
136
137			for index_point in range(len(self.__data)):
138			particle = 0.0;
139			for index_cluster in range(self.__amount_clusters):
140			particle += self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point];
141
142			likelihood += numpy.log(particle);
143
144			return likelihood;
145
146
147			def __probabilities(self, index_cluster, index_point):
148			divider = 0.0;
149			for i in range(self.__amount_clusters):
150			divider += self.__pic[i] * self.__gaussians[i][index_point];
151
152			rc = self.__pic[index_cluster] * self.__gaussians[index_cluster][index_point] / divider;
153			return rc;
154
155
156			def __expectation_step(self):
157			for index in range(self.__amount_clusters):
			0 ignored issues – show Comprehensibility Bug introduced 2017-09-11 12:46 UTC by Report Bug Copy Issue Report `index` is re-defining a name which is already available in the outer-scope (previously defined on line `34`). It is generally a bad practice to shadow variables from the outer-scope. In most cases, this is done unintentionally and might lead to unexpected behavior: param = 5 class Foo: def __init__(self, param): # "param" would be flagged here self.param = param Loading history...
158			self.__gaussians[index] = gaussian(self.__data, self.__means[index], self.__variances[index]);
159
160			for index_cluster in range(self.__amount_clusters):
161			for index_point in range(self.__data):
162			self.__rc[index_cluster][index_point] = self.__probabilities(index_cluster, index_point);
163
164
165			def __maximization_step(self):
166			for index_cluster in range(self.__amount_clusters):
167			mc = numpy.sum(self.__rc[index_cluster]);
168
169			self.__pic[index_cluster] = mc / len(self.__data);
170			self.__means[index_cluster] = self.__update_mean(index_cluster, mc);
171			self.__variances[index_cluster] = self.__update_variance(index_cluster, mc);
172
173
174			def __update_variance(self, index_cluster, mc):
175			variance = 0.0;
176			for index_point in range(len(self.__data)):
177			deviation = self.__data[index_point] - self.__means[index_cluster];
178			variance += self.__rc[index_cluster][index_point] * deviation.T * deviation;
179
180			variance = variance / mc;
181			return variance;
182
183
184			def __update_mean(self, index_cluster, mc):
185			mean = 0.0;
186			for index_point in range(len(self.__data)):
187			mean += self.__rc[index_cluster][index_point] * self.__data[index_point];
188
189			mean = mean / mc;
190			return mean;
191
192
193			def __get_random_covariances(self, data, amount):
194			covariances = [];
195			data_covariance = numpy.cov(data);
196			for _ in range(amount):
197			random_appendix = numpy.min(data_covariance) * 0.2 * numpy.random.random();
198			covariances.append(data_covariance + random_appendix);
199
200			return covariances;
201
202
203			def __get_random_variances(self, data, amount):
204			variances = [];
205			data_variance = numpy.var(data, ddof = 1);
206			for _ in range(amount):
207			random_appendix = data_variance * 0.1 * numpy.random.random();
208			variances.append(random_appendix + variances);
209
210			return variances;
211
212
213			def __get_random_means(self, data, amount):
214			means = [];
215			for _ in range(amount):
216			random_index = numpy.random.randint(0, len(data));
217			means.append(numpy.array(data[random_index]));
218
219			return means;
220
221
222
223			# from pyclustering.samples.definitions import SIMPLE_SAMPLES, FCPS_SAMPLES;
224			# from pyclustering.utils import read_sample;
225			#
226			# sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE1);
227			# ema_instance = ema(sample, 2);
228			# ema_instance.process();
229			# clusters = ema_instance.get_clusters();
230			#
231			# print(clusters);

annoviko / pyclustering

Branch — 0.7.dev (7db1ce)

ema.__get_random_variances() A

Complexity

Size

Duplication

Importance

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

Duplication Side-by-Side

Filter issues like

2. Missing init.py files

2. Missing init.py files