Completed
Push — 0.7.dev ( 4c996b...97a20f )
by Andrei
59s
created

kmedoids.get_clusters()   A

Complexity

Conditions 1

Size

Total Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 10
rs 9.4285
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: K-Medoids (PAM - Partitioning Around Medoids).
4
@details Based on book description:
5
         - A.K. Jain, R.C Dubes, Algorithms for Clustering Data. 1988.
6
         - L. Kaufman, P.J. Rousseeuw, Finding Groups in Data: an Introduction to Cluster Analysis. 1990.
7
8
@authors Andrei Novikov ([email protected])
9
@date 2014-2017
10
@copyright GNU Public License
11
12
@cond GNU_PUBLIC_LICENSE
13
    PyClustering is free software: you can redistribute it and/or modify
14
    it under the terms of the GNU General Public License as published by
15
    the Free Software Foundation, either version 3 of the License, or
16
    (at your option) any later version.
17
    
18
    PyClustering is distributed in the hope that it will be useful,
19
    but WITHOUT ANY WARRANTY; without even the implied warranty of
20
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
    GNU General Public License for more details.
22
    
23
    You should have received a copy of the GNU General Public License
24
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
25
@endcond
26
27
"""
28
29
30
from pyclustering.cluster.encoder import type_encoding;
31
32
from pyclustering.utils import euclidean_distance_sqrt, median;
33
34
import pyclustering.core.kmedoids_wrapper as wrapper;
35
36
37
class kmedoids:
38
    """!
39
    @brief Class represents clustering algorithm K-Medoids (another one title is PAM - Parti).
40
    @details The algorithm is less sensitive to outliers tham K-Means. The principle difference between K-Medoids and K-Medians is that
41
             K-Medoids uses existed points from input data space as medoids, but median in K-Medians can be unreal object (not from
42
             input data space).
43
    
44
    Example:
45
    @code
46
        # load list of points for cluster analysis
47
        sample = read_sample(path);
48
        
49
        # create instance of K-Medoids algorithm
50
        kmedoids_instance = kmedoids(sample, [1, 10]);
51
        
52
        # run cluster analysis and obtain results
53
        kmedoids_instance.process();
54
        kmedoids_instance.get_clusters();
55
    @endcode
56
    
57
    """
58
    
59
    
60
    def __init__(self, data, initial_index_medoids, tolerance = 0.25, ccore = False):
61
        """!
62
        @brief Constructor of clustering algorithm K-Medoids.
63
        
64
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
65
        @param[in] initial_index_medoids (list): Indexes of intial medoids (indexes of points in input data).
66
        @param[in] tolerance (double): Stop condition: if maximum value of distance change of medoids of clusters is less than tolerance than algorithm will stop processing.
67
        @param[in] ccore (bool): If specified than CCORE library (C++ pyclustering library) is used for clustering instead of Python code.
68
        
69
        """
70
        self.__pointer_data = data;
71
        self.__clusters = [];
72
        self.__medoids = [ data[medoid_index] for medoid_index in initial_index_medoids ];
73
        self.__medoid_indexes = initial_index_medoids;
74
        self.__tolerance = tolerance;
75
        self.__ccore = ccore;
76
77
78
    def process(self):
79
        """!
80
        @brief Performs cluster analysis in line with rules of K-Medoids algorithm.
81
        
82
        @remark Results of clustering can be obtained using corresponding get methods.
83
        
84
        @see get_clusters()
85
        @see get_medoids()
86
        
87
        """
88
        
89
        if (self.__ccore is True):
90
            self.__clusters = wrapper.kmedoids(self.__pointer_data, self.__medoid_indexes, self.__tolerance);
91
            self.__medoids, self.__medoid_indexes = self.__update_medoids();
92
        
93
        else:
94
            changes = float('inf');
95
             
96
            stop_condition = self.__tolerance * self.__tolerance;   # Fast solution
97
            #stop_condition = self.__tolerance;              # Slow solution
98
             
99
            while (changes > stop_condition):
100
                self.__clusters = self.__update_clusters();
101
                updated_medoids, update_medoid_indexes = self.__update_medoids();  # changes should be calculated before asignment
102
             
103
                changes = max([euclidean_distance_sqrt(self.__medoids[index], updated_medoids[index]) for index in range(len(updated_medoids))]);    # Fast solution
104
                 
105
                self.__medoids = updated_medoids;
106
                self.__medoid_indexes = update_medoid_indexes;
107
108
109
    def get_clusters(self):
110
        """!
111
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
112
        
113
        @see process()
114
        @see get_medoids()
115
        
116
        """
117
        
118
        return self.__clusters;
119
    
120
    
121
    def get_medoids(self):
122
        """!
123
        @brief Returns list of medoids of allocated clusters.
124
        
125
        @see process()
126
        @see get_clusters()
127
        
128
        """
129
130
        return self.__medoids;
131
132
133
    def get_cluster_encoding(self):
134
        """!
135
        @brief Returns clustering result representation type that indicate how clusters are encoded.
136
        
137
        @return (type_encoding) Clustering result representation.
138
        
139
        @see get_clusters()
140
        
141
        """
142
        
143
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
144
145
146 View Code Duplication
    def __update_clusters(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
147
        """!
148
        @brief Calculate distance to each point from the each cluster. 
149
        @details Nearest points are captured by according clusters and as a result clusters are updated.
150
        
151
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
152
        
153
        """
154
        
155
        clusters = [[self.__medoid_indexes[i]] for i in range(len(self.__medoids))];
156
        for index_point in range(len(self.__pointer_data)):
157
            if (index_point in self.__medoid_indexes):
158
                continue;
159
160
            index_optim = -1;
161
            dist_optim = float('Inf');
162
            
163
            for index in range(len(self.__medoids)):
164
                dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__medoids[index]);
165
                
166
                if ( (dist < dist_optim) or (index is 0)):
167
                    index_optim = index;
168
                    dist_optim = dist;
169
                
170
            clusters[index_optim].append(index_point);
171
        
172
        # If cluster is not able to capture object it should be removed
173
        clusters = [cluster for cluster in clusters if len(cluster) > 0];
174
        
175
        return clusters;
176
    
177
    
178 View Code Duplication
    def __update_medoids(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
179
        """!
180
        @brief Find medoids of clusters in line with contained objects.
181
        
182
        @return (list) list of medoids for current number of clusters.
183
        
184
        """
185
         
186
        medoids = [[] for _ in range(len(self.__clusters))];
187
        medoid_indexes = [-1] * len(self.__clusters);
188
        
189
        for index in range(len(self.__clusters)):
190
            medoid_index = median(self.__pointer_data, self.__clusters[index]);
191
            medoids[index] = self.__pointer_data[medoid_index];
192
            medoid_indexes[index] = medoid_index;
193
             
194
        return medoids, medoid_indexes;