Completed
Push — master ( af3192...a7eabf )
by Andrei
02:20
created

kmedoids.__update_clusters()   B

Complexity

Conditions 6

Size

Total Lines 27

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 6
dl 0
loc 27
rs 7.5384
c 1
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: K-Medoids (PAM - Partitioning Around Medoids).
4
@details Based on book description:
5
         - A.K. Jain, R.C Dubes, Algorithms for Clustering Data. 1988.
6
         - L. Kaufman, P.J. Rousseeuw, Finding Groups in Data: an Introduction to Cluster Analysis. 1990.
7
8
@authors Andrei Novikov ([email protected])
9
@date 2014-2018
10
@copyright GNU Public License
11
12
@cond GNU_PUBLIC_LICENSE
13
    PyClustering is free software: you can redistribute it and/or modify
14
    it under the terms of the GNU General Public License as published by
15
    the Free Software Foundation, either version 3 of the License, or
16
    (at your option) any later version.
17
    
18
    PyClustering is distributed in the hope that it will be useful,
19
    but WITHOUT ANY WARRANTY; without even the implied warranty of
20
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
    GNU General Public License for more details.
22
    
23
    You should have received a copy of the GNU General Public License
24
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
25
@endcond
26
27
"""
28
29
30
from enum import IntEnum;
0 ignored issues
show
Unused Code introduced by
Unused IntEnum imported from enum
Loading history...
31
32
from pyclustering.cluster.encoder import type_encoding;
33
34
from pyclustering.utils import median;
35
from pyclustering.utils.metric import distance_metric, type_metric;
36
37
import pyclustering.core.kmedoids_wrapper as wrapper;
38
39
from pyclustering.core.wrapper import ccore_library;
40
from pyclustering.core.metric_wrapper import metric_wrapper;
41
42
43
class kmedoids:
44
    """!
45
    @brief Class represents clustering algorithm K-Medoids (another one title is PAM - Partitioning Around Medoids).
46
    @details The algorithm is less sensitive to outliers tham K-Means. The principle difference between K-Medoids and K-Medians is that
47
             K-Medoids uses existed points from input data space as medoids, but median in K-Medians can be unreal object (not from
48
             input data space).
49
             
50
             CCORE option can be used to use core pyclustering - C/C++ shared library for processing that significantly increases performance.
51
    
52
    Clustering example:
53
    @code
54
        # load list of points for cluster analysis
55
        sample = read_sample(path);
56
        
57
        # set random initial medoids
58
        initial_medoids = [1, 10];
59
        
60
        # create instance of K-Medoids algorithm
61
        kmedoids_instance = kmedoids(sample, initial_medoids);
62
        
63
        # run cluster analysis and obtain results
64
        kmedoids_instance.process();
65
        clusters = kmedoids_instance.get_clusters();
66
        
67
        # show allocated clusters
68
        print(clusters);
69
    @endcode
70
71
    Metric for calculation distance between points can be specified by parameter additional 'metric':
72
    @code
73
        # create Minkowski distance metric with degree equals to '2'
74
        metric = distance_metric(type_metric.MINKOWSKI, degree=2);
75
76
        # create K-Medoids algorithm with specific distance metric
77
        kmedoids_instance = kmedoids(sample, initial_medoids, metric=metric);
78
79
        # run cluster analysis and obtain results
80
        kmedoids_instance.process();
81
        clusters = kmedoids_instance.get_clusters();
82
    @endcode
83
84
    Distance matrix can be used instead of sequence of points to increase performance and for that purpose parameter 'data_type' should be used:
85
    @code
86
        # calculate distance matrix for sample
87
        sample = read_sample(path_to_sample);
88
        matrix = calculate_distance_matrix(sample);
89
90
        # create K-Medoids algorithm for processing distance matrix instead of points
91
        kmedoids_instance = kmedoids(matrix, initial_medoids, data_type='distance_matrix');
92
93
        # run cluster analysis and obtain results
94
        kmedoids_instance.process();
95
96
        clusters = kmedoids_instance.get_clusters();
97
        medoids = kmedoids_instance.get_medoids();
98
    @endcode
99
100
    """
101
    
102
    
103
    def __init__(self, data, initial_index_medoids, tolerance = 0.001, ccore = True, **kwargs):
104
        """!
105
        @brief Constructor of clustering algorithm K-Medoids.
106
        
107
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
108
        @param[in] initial_index_medoids (list): Indexes of intial medoids (indexes of points in input data).
109
        @param[in] tolerance (double): Stop condition: if maximum value of distance change of medoids of clusters is less than tolerance than algorithm will stop processing.
110
        @param[in] ccore (bool): If specified than CCORE library (C++ pyclustering library) is used for clustering instead of Python code.
111
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'data_type').
112
113
        <b>Keyword Args:</b><br>
114
            - metric (distance_metric): Metric that is used for distance calculation between two points.
115
            - data_type (string): Data type of input sample 'data' that is processed by the algorithm ('points', 'distance_matrix').
116
117
        """
118
        self.__pointer_data = data;
119
        self.__clusters = [];
120
        self.__medoid_indexes = initial_index_medoids;
121
        self.__tolerance = tolerance;
122
123
        self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE));
124
        if self.__metric is None:
125
            self.__metric = distance_metric(type_metric.EUCLIDEAN_SQUARE);
126
127
        self.__data_type = kwargs.get('data_type', 'points');
128
        self.__distance_calculator = self.__create_distance_calculator();
129
130
        self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED;
131
        if self.__ccore:
132
            self.__ccore = ccore_library.workable();
133
134
135
    def process(self):
136
        """!
137
        @brief Performs cluster analysis in line with rules of K-Medoids algorithm.
138
        
139
        @remark Results of clustering can be obtained using corresponding get methods.
140
        
141
        @see get_clusters()
142
        @see get_medoids()
143
        
144
        """
145
        
146
        if self.__ccore is True:
147
            ccore_metric = metric_wrapper.create_instance(self.__metric);
148
149
            self.__clusters = wrapper.kmedoids(self.__pointer_data, self.__medoid_indexes, self.__tolerance, ccore_metric.get_pointer(), self.__data_type);
150
            self.__medoid_indexes = self.__update_medoids();
151
        
152
        else:
153
            changes = float('inf');
154
             
155
            stop_condition = self.__tolerance;
156
             
157
            while changes > stop_condition:
158
                self.__clusters = self.__update_clusters();
159
                update_medoid_indexes = self.__update_medoids();
160
             
161
                changes = max([self.__metric(self.__pointer_data[self.__medoid_indexes[index]], self.__pointer_data[update_medoid_indexes[index]]) for index in range(len(update_medoid_indexes))]);
162
163
                self.__medoid_indexes = update_medoid_indexes;
164
165
166
    def get_clusters(self):
167
        """!
168
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
169
        
170
        @see process()
171
        @see get_medoids()
172
        
173
        """
174
        
175
        return self.__clusters;
176
    
177
    
178
    def get_medoids(self):
179
        """!
180
        @brief Returns list of medoids of allocated clusters represented by indexes from the input data.
181
        
182
        @see process()
183
        @see get_clusters()
184
        
185
        """
186
187
        return self.__medoid_indexes;
188
189
190
    def get_cluster_encoding(self):
191
        """!
192
        @brief Returns clustering result representation type that indicate how clusters are encoded.
193
        
194
        @return (type_encoding) Clustering result representation.
195
        
196
        @see get_clusters()
197
        
198
        """
199
        
200
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
201
202
203
    def __create_distance_calculator(self):
204
        """!
205
        @brief Creates distance calculator in line with algorithms parameters.
206
207
        @return (callable) Distance calculator.
208
209
        """
210
        if self.__data_type == 'points':
211
            return lambda index1, index2: self.__metric(self.__pointer_data[index1], self.__pointer_data[index2]);
212
213
        elif self.__data_type == 'distance_matrix':
214
            return lambda index1, index2: self.__pointer_data[index1][index2];
215
216
        else:
217
            raise TypeError("Unknown type of data is specified '%s'" % self.__data_type);
218
219
220
    def __update_clusters(self):
221
        """!
222
        @brief Calculate distance to each point from the each cluster. 
223
        @details Nearest points are captured by according clusters and as a result clusters are updated.
224
        
225
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
226
        
227
        """
228
        
229
        clusters = [[self.__medoid_indexes[i]] for i in range(len(self.__medoid_indexes))];
230
        for index_point in range(len(self.__pointer_data)):
231
            if index_point in self.__medoid_indexes:
232
                continue;
233
234
            index_optim = -1;
235
            dist_optim = float('Inf');
236
            
237
            for index in range(len(self.__medoid_indexes)):
238
                dist = self.__distance_calculator(index_point, self.__medoid_indexes[index]);
239
                
240
                if dist < dist_optim:
241
                    index_optim = index;
242
                    dist_optim = dist;
243
            
244
            clusters[index_optim].append(index_point);
245
        
246
        return clusters;
247
    
248
    
249
    def __update_medoids(self):
250
        """!
251
        @brief Find medoids of clusters in line with contained objects.
252
        
253
        @return (list) list of medoids for current number of clusters.
254
        
255
        """
256
257
        medoid_indexes = [-1] * len(self.__clusters);
258
        
259
        for index in range(len(self.__clusters)):
260
            medoid_index = median(self.__pointer_data, self.__clusters[index], metric=self.__metric, data_type=self.__data_type);
261
            medoid_indexes[index] = medoid_index;
262
             
263
        return medoid_indexes;
264