Completed
Push — master ( 916c89...8edca8 )
by Andrei
01:23
created

kmedians.__update_medians()   D

Complexity

Conditions 8

Size

Total Lines 28

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 8
dl 0
loc 28
rs 4
1
"""!
2
3
@brief Cluster analysis algorithm: K-Medians
4
@details Based on book description:
5
         - A.K. Jain, R.C Dubes, Algorithms for Clustering Data. 1988.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
import math;
29
30
from pyclustering.utils import euclidean_distance_sqrt;
31
32
import pyclustering.core.wrapper as wrapper;
33
34
class kmedians:
35
    """!
36
    @brief Class represents clustering algorithm K-Medians.
37
    @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
38
    
39
    Example:
40
    @code
41
        # load list of points for cluster analysis
42
        sample = read_sample(path);
43
        
44
        # create instance of K-Medians algorithm
45
        kmedians_instance = kmedians(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
46
        
47
        # run cluster analysis and obtain results
48
        kmedians_instance.process();
49
        kmedians_instance.get_clusters();    
50
    @endcode
51
    
52
    """
53
    
54
    def __init__(self, data, initial_centers, tolerance = 0.25, ccore = False):
55
        """!
56
        @brief Constructor of clustering algorithm K-Medians.
57
        
58
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
59
        @param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
60
        @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
61
        @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
62
        
63
        """
64
        self.__pointer_data = data;
65
        self.__clusters = [];
66
        self.__medians = initial_centers[:];
67
        self.__tolerance = tolerance;
68
        self.__ccore = ccore;
69
70
71 View Code Duplication
    def process(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
72
        """!
73
        @brief Performs cluster analysis in line with rules of K-Medians algorithm.
74
        
75
        @remark Results of clustering can be obtained using corresponding get methods.
76
        
77
        @see get_clusters()
78
        @see get_medians()
79
        
80
        """
81
        
82
        if (self.__ccore is True):
83
            self.__clusters = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance);
84
            self.__medians = self.__update_medians();
85
            
86
        else:
87
            changes = float('inf');
88
             
89
            stop_condition = self.__tolerance * self.__tolerance;   # Fast solution
90
            #stop_condition = self.__tolerance;              # Slow solution
91
             
92
            # Check for dimension
93
            if (len(self.__pointer_data[0]) != len(self.__medians[0])):
94
                raise NameError('Dimension of the input data and dimension of the initial cluster medians must be equal.');
95
             
96
            while (changes > stop_condition):
97
                self.__clusters = self.__update_clusters();
98
                updated_centers = self.__update_medians();  # changes should be calculated before asignment
99
             
100
                changes = max([euclidean_distance_sqrt(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))]);    # Fast solution
101
                 
102
                self.__medians = updated_centers;
103
104
105
    def get_clusters(self):
106
        """!
107
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
108
        
109
        @see process()
110
        @see get_medians()
111
        
112
        """
113
        
114
        return self.__clusters;
115
    
116
    
117
    def get_medians(self):
118
        """!
119
        @brief Returns list of centers of allocated clusters.
120
        
121
        @see process()
122
        @see get_clusters()
123
        
124
        """
125
126
        return self.__medians;
127
128
129 View Code Duplication
    def __update_clusters(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
130
        """!
131
        @brief Calculate Manhattan distance to each point from the each cluster. 
132
        @details Nearest points are captured by according clusters and as a result clusters are updated.
133
        
134
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
135
        
136
        """
137
        
138
        clusters = [[] for i in range(len(self.__medians))];
139
        for index_point in range(len(self.__pointer_data)):
140
            index_optim = -1;
141
            dist_optim = 0.0;
142
             
143
            for index in range(len(self.__medians)):
144
                dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__medians[index]);
145
                 
146
                if ( (dist < dist_optim) or (index is 0)):
147
                    index_optim = index;
148
                    dist_optim = dist;
149
             
150
            clusters[index_optim].append(index_point);
151
            
152
        # If cluster is not able to capture object it should be removed
153
        clusters = [cluster for cluster in clusters if len(cluster) > 0];
154
        
155
        return clusters;
156
    
157
    
158
    def __update_medians(self):
159
        """!
160
        @brief Calculate medians of clusters in line with contained objects.
161
        
162
        @return (list) list of medians for current number of clusters.
163
        
164
        """
165
         
166
        medians = [[] for i in range(len(self.__clusters))];
167
         
168
        for index in range(len(self.__clusters)):
169
            medians[index] = [ 0.0 for i in range(len(self.__pointer_data[0]))];
170
            length_cluster = len(self.__clusters[index]);
171
            
172
            for index_dimension in range(len(self.__pointer_data[0])):
173
                sorted_cluster = sorted(self.__clusters[index], key = lambda x: self.__pointer_data[x][index_dimension]);
174
                
175
                relative_index_median = math.floor(length_cluster / 2);
176
                index_median = sorted_cluster[relative_index_median];
177
                
178
                if ( (length_cluster % 2) and (relative_index_median + 1 < len(sorted_cluster)) ):
179
                    index_median_second = sorted_cluster[relative_index_median + 1];
180
                    medians[index][index_dimension] =  (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0;
181
                    
182
                else:
183
                    medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension];
184
             
185
        return medians;