kmeans.__init__() - Code Metrics - Inspection of "[.] Merge '0.8.dev' branch." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( a74470...c8bc69 )

by Andrei

created 2018-02-03 15:09 UTC

kmeans.init() A

↳ Parent: kmeans

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
dl	0
loc	21
rs	9.3142
c	0
b	0
f	0

"""!

@brief Cluster analysis algorithm: K-Means
@details Based on book description:
         - J.B.MacQueen. Some Methods for Classification and Analysis of Multivariate Observations. 1967.

@authors Andrei Novikov ([email protected])
@date 2014-2018
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy;
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3

import pyclustering.core.kmeans_wrapper as wrapper;

from pyclustering.core.wrapper import ccore_library;

from pyclustering.cluster.encoder import type_encoding;


class kmeans:
    """!
    @brief Class represents clustering algorithm K-Means.
    @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
    
             CCORE implementation of the algorithm uses thread pool to parallelize the clustering process.
             
             K-Means clustering results depend on initial centers. Algorithm K-Means++ can used for initialization 
             initial centers from module 'pyclustering.cluster.center_initializer'.
    
    Example #1 - Trivial clustering:
    @code
        # load list of points for cluster analysis
        sample = read_sample(path);
        
        # create instance of K-Means algorithm
        kmeans_instance = kmeans(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
        
        # run cluster analysis and obtain results
        kmeans_instance.process();
        clusters = kmeans_instance.get_clusters();
    @endcode
    
    Example #2 - Clustering using K-Means++ for center initialization:
    @code
        # load list of points for cluster analysis
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2);
        
        # initialize initial centers using K-Means++ method
        initial_centers = kmeans_plusplus_initializer(sample, 3).initialize();
        
        # create instance of K-Means algorithm with prepared centers
        kmeans_instance = kmeans(sample, initial_centers);
        
        # run cluster analysis and obtain results
        kmeans_instance.process();
        clusters = kmeans_instance.get_clusters();
        final_centers = kmeans_instance.get_centers();
    @endcode
    
    @see center_initializer
    
    """
    
    def __init__(self, data, initial_centers, tolerance = 0.001, ccore = True):
        """!
        @brief Constructor of clustering algorithm K-Means.
        @details Center initializer can be used for creating initial centers, for example, K-Means++ method.
        
        @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
        @param[in] initial_centers (array_like): Initial coordinates of centers of clusters that are represented by array_like data structure: [center1, center2, ...].
        @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing.
        @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
        
        @see center_initializer
        
        """
        self.__pointer_data = numpy.matrix(data);
        self.__clusters = [];
        self.__centers = numpy.matrix(initial_centers);
        self.__tolerance = tolerance;
        
        self.__ccore = ccore;
        if (self.__ccore):
            self.__ccore = ccore_library.workable();


    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of K-Means algorithm.
        
        @remark Results of clustering can be obtained using corresponding get methods.
        
        @see get_clusters()
        @see get_centers()
        
        """
        
        if (self.__ccore is True):
            self.__clusters = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance);
            self.__centers = self.__update_centers();
        else: 
            maximum_change = float('inf');
             
            stop_condition = self.__tolerance * self.__tolerance;   # Fast solution
            #stop_condition = self.__tolerance;              # Slow solution
             
            # Check for dimension
            if (len(self.__pointer_data[0]) != len(self.__centers[0])):
                raise NameError('Dimension of the input data and dimension of the initial cluster centers must be equal.');
             
            while (maximum_change > stop_condition):
                self.__clusters = self.__update_clusters();
                updated_centers = self.__update_centers();  # changes should be calculated before asignment
                
                if (len(self.__centers) != len(updated_centers)):
                    maximum_change = float('inf');
                
                else:
                    changes = numpy.sum(numpy.square(self.__centers - updated_centers), axis=1);
                    maximum_change = numpy.max(changes);
                
                self.__centers = updated_centers;


    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @see process()
        @see get_centers()
        
        """
        
        return self.__clusters;
    
    
    def get_centers(self):
        """!
        @brief Returns list of centers of allocated clusters.
        
        @see process()
        @see get_clusters()
        
        """

        return self.__centers.tolist();


    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicate how clusters are encoded.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
        
        """
        
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;


    def __update_clusters(self):
        """!
        @brief Calculate Euclidean distance to each point from the each cluster. Nearest points are captured by according clusters and as a result clusters are updated.
        
        @return (list) updated clusters as list of clusters. Each cluster contains indexes of objects from data.
        
        """
        
        clusters = [[] for _ in range(len(self.__centers))];
        
        dataset_differences = numpy.zeros((len(clusters), len(self.__pointer_data)));
        for index_center in range(len(self.__centers)):
            dataset_differences[index_center] = numpy.sum(numpy.square(self.__pointer_data - self.__centers[index_center]), axis=1).T;
        
        optimum_indexes = numpy.argmin(dataset_differences, axis=0);
        for index_point in range(len(optimum_indexes)):
            index_cluster = optimum_indexes[index_point];
            clusters[index_cluster].append(index_point);
        
        clusters = [cluster for cluster in clusters if len(cluster) > 0];
        
        return clusters;
    
    
    def __update_centers(self):
        """!
        @brief Calculate centers of clusters in line with contained objects.
        
        @return (numpy.matrix) Updated centers as list of centers.
        
        """
        
        dimension = self.__pointer_data.shape[1];
        centers = numpy.zeros((len(self.__clusters), dimension));
        
        for index in range(len(self.__clusters)):
            cluster_points = self.__pointer_data[self.__clusters[index], :];
            centers[index] = cluster_points.mean(axis=0);

        return numpy.matrix(centers);


Push — master ( a74470...c8bc69 )

kmeans.init() A

Complexity

Size

Duplication

Importance

1. Missing Dependencies

2. Missing init.py files

1			"""!
2
3			@brief Cluster analysis algorithm: K-Means
4			@details Based on book description:
5			- J.B.MacQueen. Some Methods for Classification and Analysis of Multivariate Observations. 1967.
6
7			@authors Andrei Novikov ([email protected])
8			@date 2014-2018
9			@copyright GNU Public License
10
11			@cond GNU_PUBLIC_LICENSE
12			PyClustering is free software: you can redistribute it and/or modify
13			it under the terms of the GNU General Public License as published by
14			the Free Software Foundation, either version 3 of the License, or
15			(at your option) any later version.
16
17			PyClustering is distributed in the hope that it will be useful,
18			but WITHOUT ANY WARRANTY; without even the implied warranty of
19			MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20			GNU General Public License for more details.
21
22			You should have received a copy of the GNU General Public License
23			along with this program. If not, see <http://www.gnu.org/licenses/>.
24			@endcond
25
26			"""
27
28
29			import numpy;
			0 ignored issues – show Configuration introduced 2018-01-20 23:17 UTC by Report Bug Copy Issue Report The import `numpy` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
30
31			import pyclustering.core.kmeans_wrapper as wrapper;
32
33			from pyclustering.core.wrapper import ccore_library;
34
35			from pyclustering.cluster.encoder import type_encoding;
36
37
38			class kmeans:
39			"""!
40			@brief Class represents clustering algorithm K-Means.
41			@details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
42
43			CCORE implementation of the algorithm uses thread pool to parallelize the clustering process.
44
45			K-Means clustering results depend on initial centers. Algorithm K-Means++ can used for initialization
46			initial centers from module 'pyclustering.cluster.center_initializer'.
47
48			Example #1 - Trivial clustering:
49			@code
50			# load list of points for cluster analysis
51			sample = read_sample(path);
52
53			# create instance of K-Means algorithm
54			kmeans_instance = kmeans(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
55
56			# run cluster analysis and obtain results
57			kmeans_instance.process();
58			clusters = kmeans_instance.get_clusters();
59			@endcode
60
61			Example #2 - Clustering using K-Means++ for center initialization:
62			@code
63			# load list of points for cluster analysis
64			sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2);
65
66			# initialize initial centers using K-Means++ method
67			initial_centers = kmeans_plusplus_initializer(sample, 3).initialize();
68
69			# create instance of K-Means algorithm with prepared centers
70			kmeans_instance = kmeans(sample, initial_centers);
71
72			# run cluster analysis and obtain results
73			kmeans_instance.process();
74			clusters = kmeans_instance.get_clusters();
75			final_centers = kmeans_instance.get_centers();
76			@endcode
77
78			@see center_initializer
79
80			"""
81
82			def __init__(self, data, initial_centers, tolerance = 0.001, ccore = True):
83			"""!
84			@brief Constructor of clustering algorithm K-Means.
85			@details Center initializer can be used for creating initial centers, for example, K-Means++ method.
86
87			@param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
88			@param[in] initial_centers (array_like): Initial coordinates of centers of clusters that are represented by array_like data structure: [center1, center2, ...].
89			@param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing.
90			@param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
91
92			@see center_initializer
93
94			"""
95			self.__pointer_data = numpy.matrix(data);
96			self.__clusters = [];
97			self.__centers = numpy.matrix(initial_centers);
98			self.__tolerance = tolerance;
99
100			self.__ccore = ccore;
101			if (self.__ccore):
102			self.__ccore = ccore_library.workable();
103
104
105			def process(self):
106			"""!
107			@brief Performs cluster analysis in line with rules of K-Means algorithm.
108
109			@remark Results of clustering can be obtained using corresponding get methods.
110
111			@see get_clusters()
112			@see get_centers()
113
114			"""
115
116			if (self.__ccore is True):
117			self.__clusters = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance);
118			self.__centers = self.__update_centers();
119			else:
120			maximum_change = float('inf');
121
122			stop_condition = self.__tolerance * self.__tolerance; # Fast solution
123			#stop_condition = self.__tolerance; # Slow solution
124
125			# Check for dimension
126			if (len(self.__pointer_data[0]) != len(self.__centers[0])):
127			raise NameError('Dimension of the input data and dimension of the initial cluster centers must be equal.');
128
129			while (maximum_change > stop_condition):
130			self.__clusters = self.__update_clusters();
131			updated_centers = self.__update_centers(); # changes should be calculated before asignment
132
133			if (len(self.__centers) != len(updated_centers)):
134			maximum_change = float('inf');
135
136			else:
137			changes = numpy.sum(numpy.square(self.__centers - updated_centers), axis=1);
138			maximum_change = numpy.max(changes);
139
140			self.__centers = updated_centers;
141
142
143			def get_clusters(self):
144			"""!
145			@brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
146
147			@see process()
148			@see get_centers()
149
150			"""
151
152			return self.__clusters;
153
154
155			def get_centers(self):
156			"""!
157			@brief Returns list of centers of allocated clusters.
158
159			@see process()
160			@see get_clusters()
161
162			"""
163
164			return self.__centers.tolist();
165
166
167			def get_cluster_encoding(self):
168			"""!
169			@brief Returns clustering result representation type that indicate how clusters are encoded.
170
171			@return (type_encoding) Clustering result representation.
172
173			@see get_clusters()
174
175			"""
176
177			return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
178
179
180			def __update_clusters(self):
181			"""!
182			@brief Calculate Euclidean distance to each point from the each cluster. Nearest points are captured by according clusters and as a result clusters are updated.
183
184			@return (list) updated clusters as list of clusters. Each cluster contains indexes of objects from data.
185
186			"""
187
188			clusters = [[] for _ in range(len(self.__centers))];
189
190			dataset_differences = numpy.zeros((len(clusters), len(self.__pointer_data)));
191			for index_center in range(len(self.__centers)):
192			dataset_differences[index_center] = numpy.sum(numpy.square(self.__pointer_data - self.__centers[index_center]), axis=1).T;
193
194			optimum_indexes = numpy.argmin(dataset_differences, axis=0);
195			for index_point in range(len(optimum_indexes)):
196			index_cluster = optimum_indexes[index_point];
197			clusters[index_cluster].append(index_point);
198
199			clusters = [cluster for cluster in clusters if len(cluster) > 0];
200
201			return clusters;
202
203
204			def __update_centers(self):
205			"""!
206			@brief Calculate centers of clusters in line with contained objects.
207
208			@return (numpy.matrix) Updated centers as list of centers.
209
210			"""
211
212			dimension = self.__pointer_data.shape[1];
213			centers = numpy.zeros((len(self.__clusters), dimension));
214
215			for index in range(len(self.__clusters)):
216			cluster_points = self.__pointer_data[self.__clusters[index], :];
217			centers[index] = cluster_points.mean(axis=0);
218
219			return numpy.matrix(centers);
220

annoviko / pyclustering

Push — master ( a74470...c8bc69 )

kmeans.__init__() A

Complexity

Size

Duplication

Importance

1. Missing Dependencies

2. Missing __init__.py files

Duplication Side-by-Side

Filter issues like

kmeans.init() A

2. Missing init.py files