Completed
Push — master ( 941fee...e45971 )
by Andrei
57s
created

kmedoids.__update_clusters()   C

Complexity

Conditions 7

Size

Total Lines 27

Duplication

Lines 22
Ratio 81.48 %

Importance

Changes 0
Metric Value
cc 7
dl 22
loc 27
rs 5.5
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: K-Medoids (PAM - Partitioning Around Medoids).
4
@details Based on book description:
5
         - A.K. Jain, R.C Dubes, Algorithms for Clustering Data. 1988.
6
         - L. Kaufman, P.J. Rousseeuw, Finding Groups in Data: an Introduction to Cluster Analysis. 1990.
7
8
@authors Andrei Novikov ([email protected])
9
@date 2014-2017
10
@copyright GNU Public License
11
12
@cond GNU_PUBLIC_LICENSE
13
    PyClustering is free software: you can redistribute it and/or modify
14
    it under the terms of the GNU General Public License as published by
15
    the Free Software Foundation, either version 3 of the License, or
16
    (at your option) any later version.
17
    
18
    PyClustering is distributed in the hope that it will be useful,
19
    but WITHOUT ANY WARRANTY; without even the implied warranty of
20
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
    GNU General Public License for more details.
22
    
23
    You should have received a copy of the GNU General Public License
24
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
25
@endcond
26
27
"""
28
29
30
from pyclustering.cluster.encoder import type_encoding;
31
32
from pyclustering.utils import euclidean_distance_sqrt, median;
33
34
import pyclustering.core.kmedoids_wrapper as wrapper;
35
36
37
class kmedoids:
38
    """!
39
    @brief Class represents clustering algorithm K-Medoids (another one title is PAM - Partitioning Around Medoids).
40
    @details The algorithm is less sensitive to outliers tham K-Means. The principle difference between K-Medoids and K-Medians is that
41
             K-Medoids uses existed points from input data space as medoids, but median in K-Medians can be unreal object (not from
42
             input data space).
43
    
44
    Example:
45
    @code
46
        # load list of points for cluster analysis
47
        sample = read_sample(path);
48
        
49
        # set random initial medoids
50
        initial_medoids = [1, 10];
51
        
52
        # create instance of K-Medoids algorithm
53
        kmedoids_instance = kmedoids(sample, initial_medoids);
54
        
55
        # run cluster analysis and obtain results
56
        kmedoids_instance.process();
57
        clusters = kmedoids_instance.get_clusters();
58
        
59
        # show allocated clusters
60
        print(clusters);
61
    @endcode
62
    
63
    """
64
    
65
    
66
    def __init__(self, data, initial_index_medoids, tolerance = 0.25, ccore = False):
67
        """!
68
        @brief Constructor of clustering algorithm K-Medoids.
69
        
70
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
71
        @param[in] initial_index_medoids (list): Indexes of intial medoids (indexes of points in input data).
72
        @param[in] tolerance (double): Stop condition: if maximum value of distance change of medoids of clusters is less than tolerance than algorithm will stop processing.
73
        @param[in] ccore (bool): If specified than CCORE library (C++ pyclustering library) is used for clustering instead of Python code.
74
        
75
        """
76
        self.__pointer_data = data;
77
        self.__clusters = [];
78
        self.__medoids = [ data[medoid_index] for medoid_index in initial_index_medoids ];
79
        self.__medoid_indexes = initial_index_medoids;
80
        self.__tolerance = tolerance;
81
        self.__ccore = ccore;
82
83
84
    def process(self):
85
        """!
86
        @brief Performs cluster analysis in line with rules of K-Medoids algorithm.
87
        
88
        @remark Results of clustering can be obtained using corresponding get methods.
89
        
90
        @see get_clusters()
91
        @see get_medoids()
92
        
93
        """
94
        
95
        if (self.__ccore is True):
96
            self.__clusters = wrapper.kmedoids(self.__pointer_data, self.__medoid_indexes, self.__tolerance);
97
            self.__medoids, self.__medoid_indexes = self.__update_medoids();
98
        
99
        else:
100
            changes = float('inf');
101
             
102
            stop_condition = self.__tolerance * self.__tolerance;   # Fast solution
103
            #stop_condition = self.__tolerance;              # Slow solution
104
             
105
            while (changes > stop_condition):
106
                self.__clusters = self.__update_clusters();
107
                updated_medoids, update_medoid_indexes = self.__update_medoids();  # changes should be calculated before asignment
108
             
109
                changes = max([euclidean_distance_sqrt(self.__medoids[index], updated_medoids[index]) for index in range(len(updated_medoids))]);    # Fast solution
110
                 
111
                self.__medoids = updated_medoids;
112
                self.__medoid_indexes = update_medoid_indexes;
113
114
115
    def get_clusters(self):
116
        """!
117
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
118
        
119
        @see process()
120
        @see get_medoids()
121
        
122
        """
123
        
124
        return self.__clusters;
125
    
126
    
127
    def get_medoids(self):
128
        """!
129
        @brief Returns list of medoids of allocated clusters.
130
        
131
        @see process()
132
        @see get_clusters()
133
        
134
        """
135
136
        return self.__medoids;
137
138
139
    def get_cluster_encoding(self):
140
        """!
141
        @brief Returns clustering result representation type that indicate how clusters are encoded.
142
        
143
        @return (type_encoding) Clustering result representation.
144
        
145
        @see get_clusters()
146 View Code Duplication
        
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
147
        """
148
        
149
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
150
151
152
    def __update_clusters(self):
153
        """!
154
        @brief Calculate distance to each point from the each cluster. 
155
        @details Nearest points are captured by according clusters and as a result clusters are updated.
156
        
157
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
158
        
159
        """
160
        
161
        clusters = [[self.__medoid_indexes[i]] for i in range(len(self.__medoids))];
162
        for index_point in range(len(self.__pointer_data)):
163
            if (index_point in self.__medoid_indexes):
164
                continue;
165
166
            index_optim = -1;
167
            dist_optim = float('Inf');
168
            
169
            for index in range(len(self.__medoids)):
170
                dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__medoids[index]);
171
                
172
                if ( (dist < dist_optim) or (index is 0)):
173
                    index_optim = index;
174
                    dist_optim = dist;
175
            
176
            clusters[index_optim].append(index_point);
177
        
178 View Code Duplication
        return clusters;
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
179
    
180
    
181
    def __update_medoids(self):
182
        """!
183
        @brief Find medoids of clusters in line with contained objects.
184
        
185
        @return (list) list of medoids for current number of clusters.
186
        
187
        """
188
         
189
        medoids = [[] for _ in range(len(self.__clusters))];
190
        medoid_indexes = [-1] * len(self.__clusters);
191
        
192
        for index in range(len(self.__clusters)):
193
            medoid_index = median(self.__pointer_data, self.__clusters[index]);
194
            medoids[index] = self.__pointer_data[medoid_index];
195
            medoid_indexes[index] = medoid_index;
196
             
197
        return medoids, medoid_indexes;