kmedians.__init__() - Code Metrics - Inspection of "Merge branch '0.8.dev'." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 769c99...57e143 )

by Andrei

created 2018-05-27 21:36 UTC

kmedians.init() B

↳ Parent: kmedians

Complexity

Conditions

Size

Total Lines

Duplication

Lines	26
Ratio	100 %

Importance

Changes

Metric	Value
cc	3
c	0
b	0
f	0
dl	26
loc	26
rs	8.8571

"""!

@brief Cluster analysis algorithm: K-Medians
@details Implementation based on paper @cite book::algorithms_for_clustering_data.

@authors Andrei Novikov ([email protected])
@date 2014-2018
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import math

from pyclustering.cluster.encoder import type_encoding

from pyclustering.utils.metric import distance_metric, type_metric

import pyclustering.core.kmedians_wrapper as wrapper

from pyclustering.core.wrapper import ccore_library
from pyclustering.core.metric_wrapper import metric_wrapper


class kmedians:
    """!
    @brief Class represents clustering algorithm K-Medians.
    @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
    
             CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
    
    Example:
    @code
        # load list of points for cluster analysis
        sample = read_sample(path);
        
        # create instance of K-Medians algorithm
        kmedians_instance = kmedians(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
        
        # run cluster analysis and obtain results
        kmedians_instance.process();
        kmedians_instance.get_clusters();    
    @endcode
    
    """
    
    def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):

        """!
        @brief Constructor of clustering algorithm K-Medians.
        
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
        @param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
        @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
        @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').

        <b>Keyword Args:</b><br>
            - metric (distance_metric): Metric that is used for distance calculation between two points.
        
        """
        self.__pointer_data = data
        self.__clusters = []
        self.__medians = initial_centers[:]
        self.__tolerance = tolerance

        self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
        if self.__metric is None:
            self.__metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

        self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
        if self.__ccore:
            self.__ccore = ccore_library.workable()


    def process(self):

        """!
        @brief Performs cluster analysis in line with rules of K-Medians algorithm.
        
        @remark Results of clustering can be obtained using corresponding get methods.
        
        @see get_clusters()
        @see get_medians()
        
        """
        
        if self.__ccore is True:
            ccore_metric = metric_wrapper.create_instance(self.__metric)

            self.__clusters = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance, ccore_metric.get_pointer())
            self.__medians = self.__update_medians()
            
        else:
            changes = float('inf')
             
            # Check for dimension
            if len(self.__pointer_data[0]) != len(self.__medians[0]):
                raise NameError('Dimension of the input data and dimension of the initial medians must be equal.')
             
            while changes > self.__tolerance:
                self.__clusters = self.__update_clusters()
                updated_centers = self.__update_medians()
             
                changes = max([self.__metric(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))])
                 
                self.__medians = updated_centers


    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @see process()
        @see get_medians()
        
        """
        
        return self.__clusters
    
    
    def get_medians(self):
        """!
        @brief Returns list of centers of allocated clusters.
        
        @see process()
        @see get_clusters()
        
        """

        return self.__medians


    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicate how clusters are encoded.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
        
        """
        
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION


    def __update_clusters(self):

        """!
        @brief Calculate Manhattan distance to each point from the each cluster. 
        @details Nearest points are captured by according clusters and as a result clusters are updated.
        
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
        
        """
        
        clusters = [[] for i in range(len(self.__medians))]
        for index_point in range(len(self.__pointer_data)):
            index_optim = -1
            dist_optim = 0.0
             
            for index in range(len(self.__medians)):
                dist = self.__metric(self.__pointer_data[index_point], self.__medians[index])
                 
                if (dist < dist_optim) or (index is 0):
                    index_optim = index
                    dist_optim = dist
             
            clusters[index_optim].append(index_point)
            
        # If cluster is not able to capture object it should be removed
        clusters = [cluster for cluster in clusters if len(cluster) > 0]
        
        return clusters
    
    
    def __update_medians(self):
        """!
        @brief Calculate medians of clusters in line with contained objects.
        
        @return (list) list of medians for current number of clusters.
        
        """
         
        medians = [[] for i in range(len(self.__clusters))]
         
        for index in range(len(self.__clusters)):
            medians[index] = [ 0.0 for i in range(len(self.__pointer_data[0]))]
            length_cluster = len(self.__clusters[index])
            
            for index_dimension in range(len(self.__pointer_data[0])):
                sorted_cluster = sorted(self.__clusters[index], key=lambda x: self.__pointer_data[x][index_dimension])
                
                relative_index_median = int(math.floor((length_cluster - 1) / 2))
                index_median = sorted_cluster[relative_index_median]
                
                if (length_cluster % 2) == 0:
                    index_median_second = sorted_cluster[relative_index_median + 1]
                    medians[index][index_dimension] = (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0
                    
                else:
                    medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension]
             
        return medians


1		"""!
2
3		@brief Cluster analysis algorithm: K-Medians
4		@details Implementation based on paper @cite book::algorithms_for_clustering_data.
5
6		@authors Andrei Novikov ([email protected])
7		@date 2014-2018
8		@copyright GNU Public License
9
10		@cond GNU_PUBLIC_LICENSE
11		PyClustering is free software: you can redistribute it and/or modify
12		it under the terms of the GNU General Public License as published by
13		the Free Software Foundation, either version 3 of the License, or
14		(at your option) any later version.
15
16		PyClustering is distributed in the hope that it will be useful,
17		but WITHOUT ANY WARRANTY; without even the implied warranty of
18		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19		GNU General Public License for more details.
20
21		You should have received a copy of the GNU General Public License
22		along with this program. If not, see <http://www.gnu.org/licenses/>.
23		@endcond
24
25		"""
26
27
28		import math
29
30		from pyclustering.cluster.encoder import type_encoding
31
32		from pyclustering.utils.metric import distance_metric, type_metric
33
34		import pyclustering.core.kmedians_wrapper as wrapper
35
36		from pyclustering.core.wrapper import ccore_library
37		from pyclustering.core.metric_wrapper import metric_wrapper
38
39
40		class kmedians:
41		"""!
42		@brief Class represents clustering algorithm K-Medians.
43		@details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
44
45		CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
46
47		Example:
48		@code
49		# load list of points for cluster analysis
50		sample = read_sample(path);
51
52		# create instance of K-Medians algorithm
53		kmedians_instance = kmedians(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
54
55		# run cluster analysis and obtain results
56		kmedians_instance.process();
57		kmedians_instance.get_clusters();
58		@endcode
59
60		"""
61
62	View Code Duplication	def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
		0 ignored issues – show Duplication introduced 2018-05-17 09:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
63		"""!
64		@brief Constructor of clustering algorithm K-Medians.
65
66		@param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
67		@param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
68		@param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
69		@param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
70		@param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').
71
72		<b>Keyword Args:</b><br>
73		- metric (distance_metric): Metric that is used for distance calculation between two points.
74
75		"""
76		self.__pointer_data = data
77		self.__clusters = []
78		self.__medians = initial_centers[:]
79		self.__tolerance = tolerance
80
81		self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
82		if self.__metric is None:
83		self.__metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
84
85		self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
86		if self.__ccore:
87		self.__ccore = ccore_library.workable()
88
89
90	View Code Duplication	def process(self):
		0 ignored issues – show Duplication introduced 2018-05-17 09:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
91		"""!
92		@brief Performs cluster analysis in line with rules of K-Medians algorithm.
93
94		@remark Results of clustering can be obtained using corresponding get methods.
95
96		@see get_clusters()
97		@see get_medians()
98
99		"""
100
101		if self.__ccore is True:
102		ccore_metric = metric_wrapper.create_instance(self.__metric)
103
104		self.__clusters = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance, ccore_metric.get_pointer())
105		self.__medians = self.__update_medians()
106
107		else:
108		changes = float('inf')
109
110		# Check for dimension
111		if len(self.__pointer_data[0]) != len(self.__medians[0]):
112		raise NameError('Dimension of the input data and dimension of the initial medians must be equal.')
113
114		while changes > self.__tolerance:
115		self.__clusters = self.__update_clusters()
116		updated_centers = self.__update_medians()
117
118		changes = max([self.__metric(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))])
119
120		self.__medians = updated_centers
121
122
123		def get_clusters(self):
124		"""!
125		@brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
126
127		@see process()
128		@see get_medians()
129
130		"""
131
132		return self.__clusters
133
134
135		def get_medians(self):
136		"""!
137		@brief Returns list of centers of allocated clusters.
138
139		@see process()
140		@see get_clusters()
141
142		"""
143
144		return self.__medians
145
146
147		def get_cluster_encoding(self):
148		"""!
149		@brief Returns clustering result representation type that indicate how clusters are encoded.
150
151		@return (type_encoding) Clustering result representation.
152
153		@see get_clusters()
154
155		"""
156
157		return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
158
159
160	View Code Duplication	def __update_clusters(self):
		0 ignored issues – show Duplication introduced 2017-08-07 16:14 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
161		"""!
162		@brief Calculate Manhattan distance to each point from the each cluster.
163		@details Nearest points are captured by according clusters and as a result clusters are updated.
164
165		@return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
166
167		"""
168
169		clusters = [[] for i in range(len(self.__medians))]
170		for index_point in range(len(self.__pointer_data)):
171		index_optim = -1
172		dist_optim = 0.0
173
174		for index in range(len(self.__medians)):
175		dist = self.__metric(self.__pointer_data[index_point], self.__medians[index])
176
177		if (dist < dist_optim) or (index is 0):
178		index_optim = index
179		dist_optim = dist
180
181		clusters[index_optim].append(index_point)
182
183		# If cluster is not able to capture object it should be removed
184		clusters = [cluster for cluster in clusters if len(cluster) > 0]
185
186		return clusters
187
188
189		def __update_medians(self):
190		"""!
191		@brief Calculate medians of clusters in line with contained objects.
192
193		@return (list) list of medians for current number of clusters.
194
195		"""
196
197		medians = [[] for i in range(len(self.__clusters))]
198
199		for index in range(len(self.__clusters)):
200		medians[index] = [ 0.0 for i in range(len(self.__pointer_data[0]))]
201		length_cluster = len(self.__clusters[index])
202
203		for index_dimension in range(len(self.__pointer_data[0])):
204		sorted_cluster = sorted(self.__clusters[index], key=lambda x: self.__pointer_data[x][index_dimension])
205
206		relative_index_median = int(math.floor((length_cluster - 1) / 2))
207		index_median = sorted_cluster[relative_index_median]
208
209		if (length_cluster % 2) == 0:
210		index_median_second = sorted_cluster[relative_index_median + 1]
211		medians[index][index_dimension] = (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0
212
213		else:
214		medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension]
215
216		return medians
217

annoviko / pyclustering

Push — master ( 769c99...57e143 )

kmedians.__init__() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

kmedians.init() B