Completed
Push — master ( d2e29b...b6987f )
by Andrei
01:49
created

kmedians.__update_medians()   C

Complexity

Conditions 7

Size

Total Lines 28

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
c 0
b 0
f 0
dl 0
loc 28
rs 5.5
1
"""!
2
3
@brief Cluster analysis algorithm: K-Medians
4
@details Based on book description:
5
         - A.K. Jain, R.C Dubes, Algorithms for Clustering Data. 1988.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
import math;
30
31
from pyclustering.cluster.encoder import type_encoding;
32
33
from pyclustering.utils import euclidean_distance_sqrt;
34
35
import pyclustering.core.kmedians_wrapper as wrapper;
36
37
38
class kmedians:
39
    """!
40
    @brief Class represents clustering algorithm K-Medians.
41
    @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
42
    
43
    Example:
44
    @code
45
        # load list of points for cluster analysis
46
        sample = read_sample(path);
47
        
48
        # create instance of K-Medians algorithm
49
        kmedians_instance = kmedians(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
50
        
51
        # run cluster analysis and obtain results
52
        kmedians_instance.process();
53
        kmedians_instance.get_clusters();    
54
    @endcode
55
    
56
    """
57
    
58
    def __init__(self, data, initial_centers, tolerance = 0.25, ccore = False):
59
        """!
60
        @brief Constructor of clustering algorithm K-Medians.
61
        
62
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
63
        @param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
64
        @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
65
        @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
66
        
67
        """
68
        self.__pointer_data = data;
69
        self.__clusters = [];
70
        self.__medians = initial_centers[:];
71
        self.__tolerance = tolerance;
72
        self.__ccore = ccore;
73
74
75 View Code Duplication
    def process(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
76
        """!
77
        @brief Performs cluster analysis in line with rules of K-Medians algorithm.
78
        
79
        @remark Results of clustering can be obtained using corresponding get methods.
80
        
81
        @see get_clusters()
82
        @see get_medians()
83
        
84
        """
85
        
86
        if (self.__ccore is True):
87
            self.__clusters = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance);
88
            self.__medians = self.__update_medians();
89
            
90
        else:
91
            changes = float('inf');
92
             
93
            stop_condition = self.__tolerance * self.__tolerance;   # Fast solution
94
            #stop_condition = self.__tolerance;              # Slow solution
95
             
96
            # Check for dimension
97
            if (len(self.__pointer_data[0]) != len(self.__medians[0])):
98
                raise NameError('Dimension of the input data and dimension of the initial cluster medians must be equal.');
99
             
100
            while (changes > stop_condition):
101
                self.__clusters = self.__update_clusters();
102
                updated_centers = self.__update_medians();  # changes should be calculated before asignment
103
             
104
                changes = max([euclidean_distance_sqrt(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))]);    # Fast solution
105
                 
106
                self.__medians = updated_centers;
107
108
109
    def get_clusters(self):
110
        """!
111
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
112
        
113
        @see process()
114
        @see get_medians()
115
        
116
        """
117
        
118
        return self.__clusters;
119
    
120
    
121
    def get_medians(self):
122
        """!
123
        @brief Returns list of centers of allocated clusters.
124
        
125
        @see process()
126
        @see get_clusters()
127
        
128
        """
129 View Code Duplication
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
130
        return self.__medians;
131
132
133
    def get_cluster_encoding(self):
134
        """!
135
        @brief Returns clustering result representation type that indicate how clusters are encoded.
136
        
137
        @return (type_encoding) Clustering result representation.
138
        
139
        @see get_clusters()
140
        
141
        """
142
        
143
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
144
145
146
    def __update_clusters(self):
147
        """!
148
        @brief Calculate Manhattan distance to each point from the each cluster. 
149
        @details Nearest points are captured by according clusters and as a result clusters are updated.
150
        
151
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
152
        
153
        """
154
        
155
        clusters = [[] for i in range(len(self.__medians))];
156
        for index_point in range(len(self.__pointer_data)):
157
            index_optim = -1;
158
            dist_optim = 0.0;
159
             
160
            for index in range(len(self.__medians)):
161
                dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__medians[index]);
162
                 
163
                if ( (dist < dist_optim) or (index is 0)):
164
                    index_optim = index;
165
                    dist_optim = dist;
166
             
167
            clusters[index_optim].append(index_point);
168
            
169
        # If cluster is not able to capture object it should be removed
170
        clusters = [cluster for cluster in clusters if len(cluster) > 0];
171
        
172
        return clusters;
173
    
174
    
175
    def __update_medians(self):
176
        """!
177
        @brief Calculate medians of clusters in line with contained objects.
178
        
179
        @return (list) list of medians for current number of clusters.
180
        
181
        """
182
         
183
        medians = [[] for i in range(len(self.__clusters))];
184
         
185
        for index in range(len(self.__clusters)):
186
            medians[index] = [ 0.0 for i in range(len(self.__pointer_data[0]))];
187
            length_cluster = len(self.__clusters[index]);
188
            
189
            for index_dimension in range(len(self.__pointer_data[0])):
190
                sorted_cluster = sorted(self.__clusters[index], key = lambda x: self.__pointer_data[x][index_dimension]);
191
                
192
                relative_index_median = math.floor(length_cluster / 2);
193
                index_median = sorted_cluster[relative_index_median];
194
                
195
                if ( (length_cluster % 2) == 0 ):
196
                    index_median_second = sorted_cluster[relative_index_median + 1];
197
                    medians[index][index_dimension] =  (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0;
198
                    
199
                else:
200
                    medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension];
201
             
202
        return medians;