Completed
Push — master ( 7a46cc...a1c9a1 )
by Andrei
01:04
created

kmedoids.__init__()   B

Complexity

Conditions 4

Size

Total Lines 28

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
dl 0
loc 28
rs 8.5806
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: K-Medoids (PAM - Partitioning Around Medoids).
4
@details Based on book description:
5
         - A.K. Jain, R.C Dubes, Algorithms for Clustering Data. 1988.
6
         - L. Kaufman, P.J. Rousseeuw, Finding Groups in Data: an Introduction to Cluster Analysis. 1990.
7
8
@authors Andrei Novikov ([email protected])
9
@date 2014-2018
10
@copyright GNU Public License
11
12
@cond GNU_PUBLIC_LICENSE
13
    PyClustering is free software: you can redistribute it and/or modify
14
    it under the terms of the GNU General Public License as published by
15
    the Free Software Foundation, either version 3 of the License, or
16
    (at your option) any later version.
17
    
18
    PyClustering is distributed in the hope that it will be useful,
19
    but WITHOUT ANY WARRANTY; without even the implied warranty of
20
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
    GNU General Public License for more details.
22
    
23
    You should have received a copy of the GNU General Public License
24
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
25
@endcond
26
27
"""
28
29
30
from pyclustering.cluster.encoder import type_encoding;
31
32
from pyclustering.utils import median;
33
from pyclustering.utils.metric import distance_metric, type_metric;
34
35
from pyclustering.core.wrapper import ccore_library;
36
37
import pyclustering.core.kmedoids_wrapper as wrapper;
38
39
from pyclustering.core.metric_wrapper import metric_wrapper;
40
41
42
class kmedoids:
43
    """!
44
    @brief Class represents clustering algorithm K-Medoids (another one title is PAM - Partitioning Around Medoids).
45
    @details The algorithm is less sensitive to outliers tham K-Means. The principle difference between K-Medoids and K-Medians is that
46
             K-Medoids uses existed points from input data space as medoids, but median in K-Medians can be unreal object (not from
47
             input data space).
48
             
49
             CCORE option can be used to use core pyclustering - C/C++ shared library for processing that significantly increases performance.
50
    
51
    Clustering example:
52
    @code
53
        # load list of points for cluster analysis
54
        sample = read_sample(path);
55
        
56
        # set random initial medoids
57
        initial_medoids = [1, 10];
58
        
59
        # create instance of K-Medoids algorithm
60
        kmedoids_instance = kmedoids(sample, initial_medoids);
61
        
62
        # run cluster analysis and obtain results
63
        kmedoids_instance.process();
64
        clusters = kmedoids_instance.get_clusters();
65
        
66
        # show allocated clusters
67
        print(clusters);
68
    @endcode
69
70
    Metric foc calculation distance between points can be specified by parameter additional 'metric':
71
    @code
72
        from pyclustering.utils.metric import type_metric, distance_metric;
73
74
        # create Minkowski distance metric with degree equals to '2'
75
        metric = distance_metric(type_metric.MINKOWSKI, degree=2);
76
77
        # create K-Medoids algorithm with specific distance metric
78
        kmedoids_instance = kmedoids(sample, initial_medoids, metric=metric);
79
80
        # run cluster analysis and obtain results
81
        kmedoids_instance.process();
82
        clusters = kmedoids_instance.get_clusters();
83
    @endcode
84
    
85
    """
86
    
87
    
88
    def __init__(self, data, initial_index_medoids, tolerance = 0.25, ccore = True, **kwargs):
89
        """!
90
        @brief Constructor of clustering algorithm K-Medoids.
91
        
92
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
93
        @param[in] initial_index_medoids (list): Indexes of intial medoids (indexes of points in input data).
94
        @param[in] tolerance (double): Stop condition: if maximum value of distance change of medoids of clusters is less than tolerance than algorithm will stop processing.
95
        @param[in] ccore (bool): If specified than CCORE library (C++ pyclustering library) is used for clustering instead of Python code.
96
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric').
97
98
        Keyword Args:
99
            metric (distance_metric): Metric that is used for distance calculation between two points.
100
101
        """
102
        self.__pointer_data = data;
103
        self.__clusters = [];
104
        self.__medoids = [ data[medoid_index] for medoid_index in initial_index_medoids ];
105
        self.__medoid_indexes = initial_index_medoids;
106
        self.__tolerance = tolerance;
107
        self.__metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN_SQUARE));
108
109
        if self.__metric is None:
110
            self.__metric = distance_metric(type_metric.EUCLIDEAN_SQUARE);
111
112
        self.__ccore = ccore and self.__metric.get_type() != type_metric.USER_DEFINED;
113
114
        if self.__ccore:
115
            self.__ccore = ccore_library.workable();
116
117
118
    def process(self):
119
        """!
120
        @brief Performs cluster analysis in line with rules of K-Medoids algorithm.
121
        
122
        @remark Results of clustering can be obtained using corresponding get methods.
123
        
124
        @see get_clusters()
125
        @see get_medoids()
126
        
127
        """
128
        
129
        if (self.__ccore is True):
130
            ccore_metric = metric_wrapper.create_instance(self.__metric);
131
132
            self.__clusters = wrapper.kmedoids(self.__pointer_data, self.__medoid_indexes, self.__tolerance, ccore_metric.get_pointer());
133
            self.__medoids, self.__medoid_indexes = self.__update_medoids();
134
        
135
        else:
136
            changes = float('inf');
137
             
138
            stop_condition = self.__tolerance * self.__tolerance;
139
             
140
            while changes > stop_condition:
141
                self.__clusters = self.__update_clusters();
142
                updated_medoids, update_medoid_indexes = self.__update_medoids();
143
             
144
                changes = max([distance_metric(type_metric.EUCLIDEAN_SQUARE)(self.__medoids[index], updated_medoids[index]) for index in range(len(updated_medoids))]);    # Fast solution
145
                 
146
                self.__medoids = updated_medoids;
147
                self.__medoid_indexes = update_medoid_indexes;
148
149
150
    def get_clusters(self):
151
        """!
152
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
153
        
154
        @see process()
155
        @see get_medoids()
156
        
157
        """
158
        
159
        return self.__clusters;
160
    
161
    
162
    def get_medoids(self):
163
        """!
164
        @brief Returns list of medoids of allocated clusters.
165
        
166
        @see process()
167
        @see get_clusters()
168
        
169
        """
170
171
        return self.__medoids;
172
173
174
    def get_cluster_encoding(self):
175
        """!
176
        @brief Returns clustering result representation type that indicate how clusters are encoded.
177
        
178
        @return (type_encoding) Clustering result representation.
179
        
180
        @see get_clusters()
181
        
182
        """
183
        
184
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
185
186
187
    def __update_clusters(self):
188
        """!
189
        @brief Calculate distance to each point from the each cluster. 
190
        @details Nearest points are captured by according clusters and as a result clusters are updated.
191
        
192
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
193
        
194
        """
195
        
196
        clusters = [[self.__medoid_indexes[i]] for i in range(len(self.__medoids))];
197
        for index_point in range(len(self.__pointer_data)):
198
            if index_point in self.__medoid_indexes:
199
                continue;
200
201
            index_optim = -1;
202
            dist_optim = float('Inf');
203
            
204
            for index in range(len(self.__medoids)):
205
                dist = self.__metric(self.__pointer_data[index_point], self.__medoids[index]);
206
                
207
                if (dist < dist_optim) or (index is 0):
208
                    index_optim = index;
209
                    dist_optim = dist;
210
            
211
            clusters[index_optim].append(index_point);
212
        
213
        return clusters;
214
    
215
    
216
    def __update_medoids(self):
217
        """!
218
        @brief Find medoids of clusters in line with contained objects.
219
        
220
        @return (list) list of medoids for current number of clusters.
221
        
222
        """
223
         
224
        medoids = [[] for _ in range(len(self.__clusters))];
225
        medoid_indexes = [-1] * len(self.__clusters);
226
        
227
        for index in range(len(self.__clusters)):
228
            medoid_index = median(self.__pointer_data, self.__clusters[index], metric=self.__metric);
229
            medoids[index] = self.__pointer_data[medoid_index];
230
            medoid_indexes[index] = medoid_index;
231
             
232
        return medoids, medoid_indexes;