Completed
Push — master ( 6ed467...f3f897 )
by Andrei
01:29
created

kmeans.get_cluster_encoding()   A

Complexity

Conditions 1

Size

Total Lines 11

Duplication

Lines 11
Ratio 100 %

Importance

Changes 0
Metric Value
cc 1
dl 11
loc 11
rs 9.4285
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: K-Means
4
@details Based on book description:
5
         - J.B.MacQueen. Some Methods for Classification and Analysis of Multivariate Observations. 1967.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
import pyclustering.core.kmeans_wrapper as wrapper;
30
31
from pyclustering.cluster.encoder import type_encoding;
32
33
from pyclustering.utils import euclidean_distance_sqrt, list_math_addition, list_math_division_number;
34
35
36
class kmeans:
37
    """!
38
    @brief Class represents clustering algorithm K-Means.
39
    
40
    Example:
41
    @code
42
        # load list of points for cluster analysis
43
        sample = read_sample(path);
44
        
45
        # create instance of K-Means algorithm
46
        kmeans_instance = kmeans(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
47
        
48
        # run cluster analysis and obtain results
49
        kmeans_instance.process();
50
        kmeans_instance.get_clusters();    
51
    @endcode
52
    """
53
    
54
    def __init__(self, data, initial_centers, tolerance = 0.25, ccore = False):
55
        """!
56
        @brief Constructor of clustering algorithm K-Means.
57
        
58
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
59
        @param[in] initial_centers (list): Initial coordinates of centers of clusters that are represented by list: [center1, center2, ...].
60
        @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
61
        @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
62
        
63
        """
64
        self.__pointer_data = data;
65
        self.__clusters = [];
66
        self.__centers = initial_centers[:];     # initial centers shouldn't be chaged
67
        self.__tolerance = tolerance;
68
        
69
        self.__ccore = ccore;
70
71
72
    def process(self):
73
        """!
74
        @brief Performs cluster analysis in line with rules of K-Means algorithm.
75
        
76
        @remark Results of clustering can be obtained using corresponding get methods.
77
        
78
        @see get_clusters()
79
        @see get_centers()
80
        
81
        """
82
        
83 View Code Duplication
        if (self.__ccore is True):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
84
            self.__clusters = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance);
85
            self.__centers = self.__update_centers();
86
        else: 
87
            changes = float('inf');
88
             
89
            stop_condition = self.__tolerance * self.__tolerance;   # Fast solution
90
            #stop_condition = self.__tolerance;              # Slow solution
91
             
92
            # Check for dimension
93
            if (len(self.__pointer_data[0]) != len(self.__centers[0])):
94
                raise NameError('Dimension of the input data and dimension of the initial cluster centers must be equal.');
95
             
96
            while (changes > stop_condition):
97
                self.__clusters = self.__update_clusters();
98
                updated_centers = self.__update_centers();  # changes should be calculated before asignment
99
             
100
                #changes = max([euclidean_distance(self.__centers[index], updated_centers[index]) for index in range(len(self.__centers))]);        # Slow solution
101
                changes = max([euclidean_distance_sqrt(self.__centers[index], updated_centers[index]) for index in range(len(updated_centers))]);    # Fast solution
102
                 
103
                self.__centers = updated_centers;
104
105
106
    def get_clusters(self):
107
        """!
108
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
109
        
110
        @see process()
111
        @see get_centers()
112
        
113
        """
114
        
115
        return self.__clusters;
116
    
117
    
118
    def get_centers(self):
119
        """!
120
        @brief Returns list of centers of allocated clusters.
121
        
122
        @see process()
123
        @see get_clusters()
124
        
125
        """
126 View Code Duplication
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
127
        return self.__centers;
128
129
130
    def get_cluster_encoding(self):
131
        """!
132
        @brief Returns clustering result representation type that indicate how clusters are encoded.
133
        
134
        @return (type_encoding) Clustering result representation.
135
        
136
        @see get_clusters()
137
        
138
        """
139
        
140
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
141
142
143
    def __update_clusters(self):
144
        """!
145
        @brief Calculate Euclidean distance to each point from the each cluster. Nearest points are captured by according clusters and as a result clusters are updated.
146
        
147
        @return (list) updated clusters as list of clusters. Each cluster contains indexes of objects from data.
148
        
149
        """
150
        
151
        clusters = [[] for i in range(len(self.__centers))];
152
        for index_point in range(len(self.__pointer_data)):
153
            index_optim = -1;
154
            dist_optim = 0.0;
155 View Code Duplication
             
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
156
            for index in range(len(self.__centers)):
157
                # dist = euclidean_distance(data[index_point], centers[index]);         # Slow solution
158
                dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__centers[index]);      # Fast solution
159
                 
160
                if ( (dist < dist_optim) or (index is 0)):
161
                    index_optim = index;
162
                    dist_optim = dist;
163
             
164
            clusters[index_optim].append(index_point);
165
        
166
        # If cluster is not able to capture object it should be removed
167
        clusters = [cluster for cluster in clusters if len(cluster) > 0];
168
        
169
        return clusters;
170
    
171
    
172
    def __update_centers(self):
173
        """!
174
        @brief Calculate centers of clusters in line with contained objects.
175
        
176
        @return (list) Updated centers as list of centers.
177
        
178
        """
179
         
180
        centers = [[] for i in range(len(self.__clusters))];
181
         
182
        for index in range(len(self.__clusters)):
183
            point_sum = [0] * len(self.__pointer_data[0]);
184
             
185
            for index_point in self.__clusters[index]:
186
                point_sum = list_math_addition(point_sum, self.__pointer_data[index_point]);
187
            
188
            centers[index] = list_math_division_number(point_sum, len(self.__clusters[index]));
189
             
190
        return centers;
191