Completed
Push — master ( 39d70b...288fe5 )
by Andrei
01:35
created

kmedoids.get_medoids()   A

Complexity

Conditions 1

Size

Total Lines 10

Duplication

Lines 6
Ratio 60 %
Metric Value
cc 1
dl 6
loc 10
rs 9.4285
1
"""!
2
3
@brief Cluster analysis algorithm: K-Medoids (PAM - Partitioning Around Medoids).
4
@details Based on book description:
5
         - A.K. Jain, R.C Dubes, Algorithms for Clustering Data. 1988.
6
         - L. Kaufman, P.J. Rousseeuw, Finding Groups in Data: an Introduction to Cluster Analysis. 1990.
7
8
@authors Andrei Novikov ([email protected])
9
@date 2014-2016
10
@copyright GNU Public License
11
12
@cond GNU_PUBLIC_LICENSE
13
    PyClustering is free software: you can redistribute it and/or modify
14
    it under the terms of the GNU General Public License as published by
15
    the Free Software Foundation, either version 3 of the License, or
16
    (at your option) any later version.
17
    
18
    PyClustering is distributed in the hope that it will be useful,
19
    but WITHOUT ANY WARRANTY; without even the implied warranty of
20
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
    GNU General Public License for more details.
22
    
23
    You should have received a copy of the GNU General Public License
24
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
25
@endcond
26
27
"""
28
29
30
from pyclustering.utils import euclidean_distance_sqrt, median;
31
32
import pyclustering.core.kmedoids_wrapper as wrapper;
33
34
35
class kmedoids:
36
    """!
37
    @brief Class represents clustering algorithm K-Medoids (another one title is PAM - Parti).
38
    @details The algorithm is less sensitive to outliers tham K-Means. The principle difference between K-Medoids and K-Medians is that
39
             K-Medoids uses existed points from input data space as medoids, but median in K-Medians can be unreal object (not from
40
             input data space).
41
    
42
    Example:
43
    @code
44
        # load list of points for cluster analysis
45
        sample = read_sample(path);
46
        
47
        # create instance of K-Medoids algorithm
48
        kmedians_instance = kmedians(sample, [1, 10]);
49
        
50
        # run cluster analysis and obtain results
51
        kmedians_instance.process();
52
        kmedians_instance.get_clusters();    
53
    @endcode
54
    
55
    """
56
    
57
    
58
    def __init__(self, data, initial_index_medoids, tolerance = 0.25, ccore = False):
59
        """!
60
        @brief Constructor of clustering algorithm K-Medoids.
61
        
62
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
63
        @param[in] initial_index_medoids (list): Indexes of intial medoids (indexes of points in input data).
64
        @param[in] tolerance (double): Stop condition: if maximum value of distance change of medoids of clusters is less than tolerance than algorithm will stop processing.
65
        @param[in] ccore (bool): If specified than CCORE library (C++ pyclustering library) is used for clustering instead of Python code.
66
        
67
        """
68
        self.__pointer_data = data;
69
        self.__clusters = [];
70
        self.__medoids = initial_index_medoids;
71
        self.__tolerance = tolerance;
72
        self.__ccore = ccore;
73
74
75 View Code Duplication
    def process(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
76
        """!
77
        @brief Performs cluster analysis in line with rules of K-Medoids algorithm.
78
        
79
        @remark Results of clustering can be obtained using corresponding get methods.
80
        
81
        @see get_clusters()
82
        @see get_medoids()
83
        
84
        """
85
        
86
        if (self.__ccore is True):
87
            self.__clusters = wrapper.kmedoids(self.__pointer_data, self.__medoids, self.__tolerance);
88
            self.__medoids = self.__update_medoids();
89
        
90
        else:
91
            self.__medoids = [ self.__pointer_data[medoid_index] for medoid_index in self.__medoids ];
92
            
93
            changes = float('inf');
94
             
95
            stop_condition = self.__tolerance * self.__tolerance;   # Fast solution
96
            #stop_condition = self.__tolerance;              # Slow solution
97
             
98
            while (changes > stop_condition):
99
                self.__clusters = self.__update_clusters();
100
                updated_medoids = self.__update_medoids();  # changes should be calculated before asignment
101
             
102
                changes = max([euclidean_distance_sqrt(self.__medoids[index], updated_medoids[index]) for index in range(len(updated_medoids))]);    # Fast solution
103
                 
104
                self.__medoids = updated_medoids;
105
106
107
    def get_clusters(self):
108
        """!
109
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
110
        
111
        @see process()
112
        @see get_medoids()
113
        
114
        """
115
        
116
        return self.__clusters;
117
    
118
    
119
    def get_medoids(self):
120
        """!
121
        @brief Returns list of medoids of allocated clusters.
122
        
123 View Code Duplication
        @see process()
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
124
        @see get_clusters()
125
        
126
        """
127
128
        return self.__medoids;
129
130
131
    def __update_clusters(self):
132
        """!
133
        @brief Calculate distance to each point from the each cluster. 
134
        @details Nearest points are captured by according clusters and as a result clusters are updated.
135
        
136
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
137
        
138
        """
139
        
140
        clusters = [[] for i in range(len(self.__medoids))];
141
        for index_point in range(len(self.__pointer_data)):
142
            index_optim = -1;
143
            dist_optim = 0.0;
144
             
145
            for index in range(len(self.__medoids)):
146
                dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__medoids[index]);
147
                 
148
                if ( (dist < dist_optim) or (index is 0)):
149
                    index_optim = index;
150
                    dist_optim = dist;
151
             
152
            clusters[index_optim].append(index_point);
153
        
154
        # If cluster is not able to capture object it should be removed
155
        clusters = [cluster for cluster in clusters if len(cluster) > 0];
156
        
157
        return clusters;
158
    
159
    
160
    def __update_medoids(self):
161
        """!
162
        @brief Find medoids of clusters in line with contained objects.
163
        
164
        @return (list) list of medoids for current number of clusters.
165
        
166
        """
167
         
168
        medoids = [[] for i in range(len(self.__clusters))];
169
        
170
        for index in range(len(self.__clusters)):
171
            medoid_index = median(self.__pointer_data, self.__clusters[index]);
172
            medoids[index] = self.__pointer_data[medoid_index];
173
             
174
        return medoids;