Completed
Push — master ( 769c99...57e143 )
by Andrei
01:09
created

kmedians.__init__()   B

Complexity

Conditions 3

Size

Total Lines 26

Duplication

Lines 26
Ratio 100 %

Importance

Changes 0
Metric Value
cc 3
c 0
b 0
f 0
dl 26
loc 26
rs 8.8571
1
"""!
2
3
@brief Cluster analysis algorithm: K-Medians
4
@details Implementation based on paper @cite book::algorithms_for_clustering_data.
5
6
@authors Andrei Novikov ([email protected])
7
@date 2014-2018
8
@copyright GNU Public License
9
10
@cond GNU_PUBLIC_LICENSE
11
    PyClustering is free software: you can redistribute it and/or modify
12
    it under the terms of the GNU General Public License as published by
13
    the Free Software Foundation, either version 3 of the License, or
14
    (at your option) any later version.
15
    
16
    PyClustering is distributed in the hope that it will be useful,
17
    but WITHOUT ANY WARRANTY; without even the implied warranty of
18
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19
    GNU General Public License for more details.
20
    
21
    You should have received a copy of the GNU General Public License
22
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
23
@endcond
24
25
"""
26
27
28
import math
29
30
from pyclustering.cluster.encoder import type_encoding
31
32
from pyclustering.utils.metric import distance_metric, type_metric
33
34
import pyclustering.core.kmedians_wrapper as wrapper
35
36
from pyclustering.core.wrapper import ccore_library
37
from pyclustering.core.metric_wrapper import metric_wrapper
38
39
40
class kmedians:
41
    """!
42
    @brief Class represents clustering algorithm K-Medians.
43
    @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
44
    
45
             CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
46
    
47
    Example:
48
    @code
49
        # load list of points for cluster analysis
50
        sample = read_sample(path);
51
        
52
        # create instance of K-Medians algorithm
53
        kmedians_instance = kmedians(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
54
        
55
        # run cluster analysis and obtain results
56
        kmedians_instance.process();
57
        kmedians_instance.get_clusters();    
58
    @endcode
59
    
60
    """
61
    
62 View Code Duplication
    def __init__(self, data, initial_centers, tolerance=0.001, ccore=True, **kwargs):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
63
        """!
64
        @brief Constructor of clustering algorithm K-Medians.
65
        
66
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
67
        @param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
68
        @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
69
        @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
70
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').
71
72
        <b>Keyword Args:</b><br>
73
            - metric (distance_metric): Metric that is used for distance calculation between two points.
74
        
75
        """
76
        self.__pointer_data = data
77
        self.__clusters = []
78
        self.__medians = initial_centers[:]
79
        self.__tolerance = tolerance
80
81
        self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE))
82
        if self.__metric is None:
83
            self.__metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
84
85
        self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED
86
        if self.__ccore:
87
            self.__ccore = ccore_library.workable()
88
89
90 View Code Duplication
    def process(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
91
        """!
92
        @brief Performs cluster analysis in line with rules of K-Medians algorithm.
93
        
94
        @remark Results of clustering can be obtained using corresponding get methods.
95
        
96
        @see get_clusters()
97
        @see get_medians()
98
        
99
        """
100
        
101
        if self.__ccore is True:
102
            ccore_metric = metric_wrapper.create_instance(self.__metric)
103
104
            self.__clusters = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance, ccore_metric.get_pointer())
105
            self.__medians = self.__update_medians()
106
            
107
        else:
108
            changes = float('inf')
109
             
110
            # Check for dimension
111
            if len(self.__pointer_data[0]) != len(self.__medians[0]):
112
                raise NameError('Dimension of the input data and dimension of the initial medians must be equal.')
113
             
114
            while changes > self.__tolerance:
115
                self.__clusters = self.__update_clusters()
116
                updated_centers = self.__update_medians()
117
             
118
                changes = max([self.__metric(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))])
119
                 
120
                self.__medians = updated_centers
121
122
123
    def get_clusters(self):
124
        """!
125
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
126
        
127
        @see process()
128
        @see get_medians()
129
        
130
        """
131
        
132
        return self.__clusters
133
    
134
    
135
    def get_medians(self):
136
        """!
137
        @brief Returns list of centers of allocated clusters.
138
        
139
        @see process()
140
        @see get_clusters()
141
        
142
        """
143
144
        return self.__medians
145
146
147
    def get_cluster_encoding(self):
148
        """!
149
        @brief Returns clustering result representation type that indicate how clusters are encoded.
150
        
151
        @return (type_encoding) Clustering result representation.
152
        
153
        @see get_clusters()
154
        
155
        """
156
        
157
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
158
159
160 View Code Duplication
    def __update_clusters(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
161
        """!
162
        @brief Calculate Manhattan distance to each point from the each cluster. 
163
        @details Nearest points are captured by according clusters and as a result clusters are updated.
164
        
165
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
166
        
167
        """
168
        
169
        clusters = [[] for i in range(len(self.__medians))]
170
        for index_point in range(len(self.__pointer_data)):
171
            index_optim = -1
172
            dist_optim = 0.0
173
             
174
            for index in range(len(self.__medians)):
175
                dist = self.__metric(self.__pointer_data[index_point], self.__medians[index])
176
                 
177
                if (dist < dist_optim) or (index is 0):
178
                    index_optim = index
179
                    dist_optim = dist
180
             
181
            clusters[index_optim].append(index_point)
182
            
183
        # If cluster is not able to capture object it should be removed
184
        clusters = [cluster for cluster in clusters if len(cluster) > 0]
185
        
186
        return clusters
187
    
188
    
189
    def __update_medians(self):
190
        """!
191
        @brief Calculate medians of clusters in line with contained objects.
192
        
193
        @return (list) list of medians for current number of clusters.
194
        
195
        """
196
         
197
        medians = [[] for i in range(len(self.__clusters))]
198
         
199
        for index in range(len(self.__clusters)):
200
            medians[index] = [ 0.0 for i in range(len(self.__pointer_data[0]))]
201
            length_cluster = len(self.__clusters[index])
202
            
203
            for index_dimension in range(len(self.__pointer_data[0])):
204
                sorted_cluster = sorted(self.__clusters[index], key=lambda x: self.__pointer_data[x][index_dimension])
205
                
206
                relative_index_median = int(math.floor((length_cluster - 1) / 2))
207
                index_median = sorted_cluster[relative_index_median]
208
                
209
                if (length_cluster % 2) == 0:
210
                    index_median_second = sorted_cluster[relative_index_median + 1]
211
                    medians[index][index_dimension] = (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0
212
                    
213
                else:
214
                    medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension]
215
             
216
        return medians
217