Completed
Push — 0.8.dev ( e2d899...cfe6de )
by Andrei
02:27
created

kmedoids.__init__()   B

Complexity

Conditions 3

Size

Total Lines 26

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
dl 0
loc 26
rs 8.8571
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: K-Medoids (PAM - Partitioning Around Medoids).
4
@details Based on book description:
5
         - A.K. Jain, R.C Dubes, Algorithms for Clustering Data. 1988.
6
         - L. Kaufman, P.J. Rousseeuw, Finding Groups in Data: an Introduction to Cluster Analysis. 1990.
7
8
@authors Andrei Novikov ([email protected])
9
@date 2014-2018
10
@copyright GNU Public License
11
12
@cond GNU_PUBLIC_LICENSE
13
    PyClustering is free software: you can redistribute it and/or modify
14
    it under the terms of the GNU General Public License as published by
15
    the Free Software Foundation, either version 3 of the License, or
16
    (at your option) any later version.
17
    
18
    PyClustering is distributed in the hope that it will be useful,
19
    but WITHOUT ANY WARRANTY; without even the implied warranty of
20
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
    GNU General Public License for more details.
22
    
23
    You should have received a copy of the GNU General Public License
24
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
25
@endcond
26
27
"""
28
29
30
from pyclustering.cluster.encoder import type_encoding;
31
32
from pyclustering.utils import get_argument, median;
33
from pyclustering.utils.metric import calculate_metric, type_metric;
34
35
from pyclustering.core.wrapper import ccore_library;
36
37
import pyclustering.core.kmedoids_wrapper as wrapper;
38
39
40
class kmedoids:
41
    """!
42
    @brief Class represents clustering algorithm K-Medoids (another one title is PAM - Partitioning Around Medoids).
43
    @details The algorithm is less sensitive to outliers tham K-Means. The principle difference between K-Medoids and K-Medians is that
44
             K-Medoids uses existed points from input data space as medoids, but median in K-Medians can be unreal object (not from
45
             input data space).
46
             
47
             CCORE option can be used to use core pyclustering - C/C++ shared library for processing that significantly increases performance.
48
    
49
    Example:
50
    @code
51
        # load list of points for cluster analysis
52
        sample = read_sample(path);
53
        
54
        # set random initial medoids
55
        initial_medoids = [1, 10];
56
        
57
        # create instance of K-Medoids algorithm
58
        kmedoids_instance = kmedoids(sample, initial_medoids);
59
        
60
        # run cluster analysis and obtain results
61
        kmedoids_instance.process();
62
        clusters = kmedoids_instance.get_clusters();
63
        
64
        # show allocated clusters
65
        print(clusters);
66
    @endcode
67
    
68
    """
69
    
70
    
71
    def __init__(self, data, initial_index_medoids, tolerance = 0.25, ccore = True, **kwargs):
72
        """!
73
        @brief Constructor of clustering algorithm K-Medoids.
74
        
75
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
76
        @param[in] initial_index_medoids (list): Indexes of intial medoids (indexes of points in input data).
77
        @param[in] tolerance (double): Stop condition: if maximum value of distance change of medoids of clusters is less than tolerance than algorithm will stop processing.
78
        @param[in] ccore (bool): If specified than CCORE library (C++ pyclustering library) is used for clustering instead of Python code.
79
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric', 'func').
80
81
        Keyword Args:
82
            metric (type_metric): Metric that is used for distance calculation between two points.
83
            func (callable): Callable object with two arguments (point #1 and point #2) that is used only if metric is 'type_metric.USER_DEFINED'.
84
85
        """
86
        self.__pointer_data = data;
87
        self.__clusters = [];
88
        self.__medoids = [ data[medoid_index] for medoid_index in initial_index_medoids ];
89
        self.__medoid_indexes = initial_index_medoids;
90
        self.__tolerance = tolerance;
91
        self.__metric = get_argument("metric", type_metric.EUCLIDEAN_SQUARE, **kwargs);
92
        self.__func = get_argument("func", None, **kwargs);
93
        self.__ccore = ccore;
94
        
95
        if (self.__ccore):
96
            self.__ccore = ccore_library.workable();
97
98
99
    def process(self):
100
        """!
101
        @brief Performs cluster analysis in line with rules of K-Medoids algorithm.
102
        
103
        @remark Results of clustering can be obtained using corresponding get methods.
104
        
105
        @see get_clusters()
106
        @see get_medoids()
107
        
108
        """
109
        
110
        if (self.__ccore is True):
111
            self.__clusters = wrapper.kmedoids(self.__pointer_data, self.__medoid_indexes, self.__tolerance);
112
            self.__medoids, self.__medoid_indexes = self.__update_medoids();
113
        
114
        else:
115
            changes = float('inf');
116
             
117
            stop_condition = self.__tolerance * self.__tolerance;
118
             
119
            while changes > stop_condition:
120
                self.__clusters = self.__update_clusters();
121
                updated_medoids, update_medoid_indexes = self.__update_medoids();
122
             
123
                changes = max([calculate_metric(self.__medoids[index], updated_medoids[index], type_metric.EUCLIDEAN_SQUARE) for index in range(len(updated_medoids))]);    # Fast solution
124
                 
125
                self.__medoids = updated_medoids;
126
                self.__medoid_indexes = update_medoid_indexes;
127
128
129
    def get_clusters(self):
130
        """!
131
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
132
        
133
        @see process()
134
        @see get_medoids()
135
        
136
        """
137
        
138
        return self.__clusters;
139
    
140
    
141
    def get_medoids(self):
142
        """!
143
        @brief Returns list of medoids of allocated clusters.
144
        
145
        @see process()
146
        @see get_clusters()
147
        
148
        """
149
150
        return self.__medoids;
151
152
153
    def get_cluster_encoding(self):
154
        """!
155
        @brief Returns clustering result representation type that indicate how clusters are encoded.
156
        
157
        @return (type_encoding) Clustering result representation.
158
        
159
        @see get_clusters()
160
        
161
        """
162
        
163
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
164
165
166
    def __update_clusters(self):
167
        """!
168
        @brief Calculate distance to each point from the each cluster. 
169
        @details Nearest points are captured by according clusters and as a result clusters are updated.
170
        
171
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
172
        
173
        """
174
        
175
        clusters = [[self.__medoid_indexes[i]] for i in range(len(self.__medoids))];
176
        for index_point in range(len(self.__pointer_data)):
177
            if index_point in self.__medoid_indexes:
178
                continue;
179
180
            index_optim = -1;
181
            dist_optim = float('Inf');
182
            
183
            for index in range(len(self.__medoids)):
184
                dist = calculate_metric(self.__pointer_data[index_point], self.__medoids[index], self.__metric, self.__func);
185
                
186
                if ( (dist < dist_optim) or (index is 0)):
187
                    index_optim = index;
188
                    dist_optim = dist;
189
            
190
            clusters[index_optim].append(index_point);
191
        
192
        return clusters;
193
    
194
    
195
    def __update_medoids(self):
196
        """!
197
        @brief Find medoids of clusters in line with contained objects.
198
        
199
        @return (list) list of medoids for current number of clusters.
200
        
201
        """
202
         
203
        medoids = [[] for _ in range(len(self.__clusters))];
204
        medoid_indexes = [-1] * len(self.__clusters);
205
        
206
        for index in range(len(self.__clusters)):
207
            medoid_index = median(self.__pointer_data, self.__clusters[index]);
208
            medoids[index] = self.__pointer_data[medoid_index];
209
            medoid_indexes[index] = medoid_index;
210
             
211
        return medoids, medoid_indexes;