Completed
Push — 0.8.dev ( 8817a1...2aeeeb )
by Andrei
01:40
created

kmeans_visualizer.show_clusters()   B

Complexity

Conditions 3

Size

Total Lines 31

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
dl 0
loc 31
rs 8.8571
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: K-Means
4
@details Based on book description:
5
         - J.B.MacQueen. Some Methods for Classification and Analysis of Multivariate Observations. 1967.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2018
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
import numpy;
0 ignored issues
show
Configuration introduced by
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
30
31
import matplotlib.pyplot as plt;
0 ignored issues
show
Configuration introduced by
The import matplotlib.pyplot could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
32
33
import pyclustering.core.kmeans_wrapper as wrapper;
34
35
from pyclustering.core.wrapper import ccore_library;
36
37
from pyclustering.cluster.encoder import type_encoding;
38
from pyclustering.cluster import cluster_visualizer;
39
40
41
class kmeans_observer:
42
    """!
43
    @brief Observer of K-Means algorithm that is used to collect information about clustering process on each iteration of the algorithm.
44
    
45
    @see kmeans
46
    
47
    """
48
    
49
    def __init__(self):
50
        """!
51
        @brief Initializer of observer of K-Means algorithm.
52
        
53
        """
54
        self.__evolution_clusters   = [];
55
        self.__evolution_centers    = [];
56
57
58
    def __len__(self):
59
        """!
60
        @brief Returns amount of steps that were observer during clustering process in K-Means algorithm.
61
        
62
        """
63
        return len(self.__evolution_clusters);
64
65
66
    def notify(self, clusters, centers):
67
        """!
68
        @brief This method is called by K-Means algorithm to notify about changes.
69
        
70
        @param[in] clusters (array_like): Allocated clusters by K-Means algorithm.
71
        @param[in] centers (array_like): Allocated centers by K-Means algorithm.
72
        
73
        """
74
        self.__evolution_clusters.append(clusters);
75
        self.__evolution_clusters.append(centers);
76
77
78
    def get_centers(self, iteration):
79
        """!
80
        @brief Get method to return centers at specific iteration of clustering process.
81
        
82
        @param[in] iteration (uint): Clustering process iteration at which centers are required.
83
        
84
        @return (array_like) Centers at specific iteration.
85
        
86
        """
87
        return self.__evolution_centers[iteration];
88
89
90
    def get_clusters(self, iteration):
91
        """!
92
        @brief Get method to return allocated clusters at specific iteration of clustering process.
93
        
94
        @param[in] iteration (uint): Clustering process iteration at which clusters are required.
95
        
96
        @return (array_like) Clusters at specific iteration.
97
        
98
        """
99
        return self.__evolution_clusters[iteration];
100
101
102
103
class kmeans_visualizer:
104
    @staticmethod
105
    def show_clusters(sample, clusters, centers, figure = None, display = True):
106
        """!
107
        @brief 
108
        
109
        @param[in] sample (list): Dataset that were used for clustering.
110
        @param[in] clusters (array_like): Clusters that were allocated by the algorithm.
111
        @param[in] centers (array_like): Centers that were allocated by the algorithm.
112
        @param[in] figure (figure): If 'None' then new is figure is creater, otherwise specified figure is used
113
                    for visualization.
114
        @param[in] display (bool): If 'True' then figure will be shown by the method, otherwise it should be
115
                    shown manually using matplotlib function 'plt.show()'.
116
        
117
        @return (figure) Figure where clusters were drawn.
118
        
119
        """
120
121
        visualizer = cluster_visualizer();
122
        visualizer.append_clusters(clusters, sample);
123
        
124
        if (figure is None):
125
            figure = visualizer.show(display = False);
126
        else:
127
            visualizer.show(figure = figure, display = False);
128
        
129
        kmeans_visualizer.__draw_rays(figure, visualizer, sample, clusters, centers)
130
        
131
        if (display is True):
132
            plt.show();
133
134
        return figure;
135
136
137
    @staticmethod
138
    def __draw_rays(figure, visualizer, sample, clusters, centers):
139
        ax = figure.get_axes()[0];
140
        
141
        for index_cluster in range(len(clusters)):
142
            color = visualizer.get_cluster_color(index_cluster, 0);
143
            kmeans_visualizer.__draw_center(ax, color, centers[index_cluster]);
144
            kmeans_visualizer.__draw_cluster_rays(ax, color, sample, clusters[index_cluster], centers[index_cluster])
145
146
147
    @staticmethod
148
    def __draw_center(ax, color, center):
149
        dimension = len(center);
150
        
151
        if (dimension == 1):
152
            ax.plot(center[0], 0.0, color = color, marker = '*', markersize = 15);
153
        elif (dimension == 2):
154
            ax.plot(center[0], center[1], color = color, marker = '*', markersize = 15);
155
        elif (dimension == 3):
156
            ax.scatter(center[0], center[1], center[2], c = color, marker = '*', s = 70);
157
158
    @staticmethod
159
    def __draw_cluster_rays(ax, color, sample, cluster, center):
160
        dimension = len(sample[0]);
161
        
162
        for index_point in cluster:
163
            point = sample[index_point];
164
            if (dimension == 2):
165
                ax.plot([point[0], center[0]], [point[1], center[1]], '-', color=color, linewidth=0.5);
166
            elif (dimension == 3):
167
                ax.plot([point[0], center[0]], [point[1], center[1]], [point[2], center[2]], '-', color=color, linewidth=0.5)
168
169
170
class kmeans:
171
    """!
172
    @brief Class represents clustering algorithm K-Means.
173
    @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
174
    
175
             CCORE implementation of the algorithm uses thread pool to parallelize the clustering process.
176
             
177
             K-Means clustering results depend on initial centers. Algorithm K-Means++ can used for initialization 
178
             initial centers from module 'pyclustering.cluster.center_initializer'.
179
    
180
    Example #1 - Trivial clustering:
181
    @code
182
        # load list of points for cluster analysis
183
        sample = read_sample(path);
184
        
185
        # create instance of K-Means algorithm
186
        kmeans_instance = kmeans(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
187
        
188
        # run cluster analysis and obtain results
189
        kmeans_instance.process();
190
        clusters = kmeans_instance.get_clusters();
191
    @endcode
192
    
193
    Example #2 - Clustering using K-Means++ for center initialization:
194
    @code
195
        # load list of points for cluster analysis
196
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2);
197
        
198
        # initialize initial centers using K-Means++ method
199
        initial_centers = kmeans_plusplus_initializer(sample, 3).initialize();
200
        
201
        # create instance of K-Means algorithm with prepared centers
202
        kmeans_instance = kmeans(sample, initial_centers);
203
        
204
        # run cluster analysis and obtain results
205
        kmeans_instance.process();
206
        clusters = kmeans_instance.get_clusters();
207
        final_centers = kmeans_instance.get_centers();
208
    @endcode
209
    
210
    @see center_initializer
211
    
212
    """
213
    
214
    def __init__(self, data, initial_centers, tolerance = 0.001, ccore = True, **kwargs):
215
        """!
216
        @brief Constructor of clustering algorithm K-Means.
217
        @details Center initializer can be used for creating initial centers, for example, K-Means++ method.
218
        
219
        @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
220
        @param[in] initial_centers (array_like): Initial coordinates of centers of clusters that are represented by array_like data structure: [center1, center2, ...].
221
        @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing.
222
        @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
223
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'observer').
224
        
225
        Keyword Args:
226
            observer (kmeans_observer): Observer of the algorithm to collect information about clustering process on each iteration.
227
        
228
        @see center_initializer
229
        
230
        """
231
        self.__pointer_data = numpy.matrix(data);
232
        self.__clusters = [];
233
        self.__centers = numpy.matrix(initial_centers);
234
        self.__tolerance = tolerance;
235
        
236
        self.__observer = None;
237
        if 'observer' in kwargs:
238
            self.__observer = kwargs['observer'];
239
        
240
        self.__ccore = ccore;
241
        if (self.__ccore):
242
            self.__ccore = ccore_library.workable();
243
244
245
    def process(self):
246
        """!
247
        @brief Performs cluster analysis in line with rules of K-Means algorithm.
248
        
249
        @remark Results of clustering can be obtained using corresponding get methods.
250
        
251
        @see get_clusters()
252
        @see get_centers()
253
        
254
        """
255
        
256
        if (self.__ccore is True):
257
            self.__clusters = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance);
258
            self.__centers = self.__update_centers();
259
        else:
260
            maximum_change = float('inf');
261
             
262
            stop_condition = self.__tolerance * self.__tolerance;   # Fast solution
263
            #stop_condition = self.__tolerance;              # Slow solution
264
             
265
            # Check for dimension
266
            if (len(self.__pointer_data[0]) != len(self.__centers[0])):
267
                raise NameError('Dimension of the input data and dimension of the initial cluster centers must be equal.');
268
             
269
            while (maximum_change > stop_condition):
270
                self.__clusters = self.__update_clusters();
271
                updated_centers = self.__update_centers();  # changes should be calculated before asignment
272
                
273
                if self.__observer:
274
                    self.__observer.notify(self.__clusters, updated_centers);
275
                
276
                if (len(self.__centers) != len(updated_centers)):
277
                    maximum_change = float('inf');
278
                
279
                else:
280
                    changes = numpy.sum(numpy.square(self.__centers - updated_centers), axis=1);
281
                    maximum_change = numpy.max(changes);
282
                
283
                self.__centers = updated_centers;
284
285
286
    def get_clusters(self):
287
        """!
288
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
289
        
290
        @see process()
291
        @see get_centers()
292
        
293
        """
294
        
295
        return self.__clusters;
296
    
297
    
298
    def get_centers(self):
299
        """!
300
        @brief Returns list of centers of allocated clusters.
301
        
302
        @see process()
303
        @see get_clusters()
304
        
305
        """
306
307
        return self.__centers.tolist();
308
309
310
    def get_cluster_encoding(self):
311
        """!
312
        @brief Returns clustering result representation type that indicate how clusters are encoded.
313
        
314
        @return (type_encoding) Clustering result representation.
315
        
316
        @see get_clusters()
317
        
318
        """
319
        
320
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
321
322
323
    def __update_clusters(self):
324
        """!
325
        @brief Calculate Euclidean distance to each point from the each cluster. Nearest points are captured by according clusters and as a result clusters are updated.
326
        
327
        @return (list) updated clusters as list of clusters. Each cluster contains indexes of objects from data.
328
        
329
        """
330
        
331
        clusters = [[] for _ in range(len(self.__centers))];
332
        
333
        dataset_differences = numpy.zeros((len(clusters), len(self.__pointer_data)));
334
        for index_center in range(len(self.__centers)):
335
            dataset_differences[index_center] = numpy.sum(numpy.square(self.__pointer_data - self.__centers[index_center]), axis=1).T;
336
        
337
        optimum_indexes = numpy.argmin(dataset_differences, axis=0);
338
        for index_point in range(len(optimum_indexes)):
339
            index_cluster = optimum_indexes[index_point];
340
            clusters[index_cluster].append(index_point);
341
        
342
        clusters = [cluster for cluster in clusters if len(cluster) > 0];
343
        
344
        return clusters;
345
    
346
    
347
    def __update_centers(self):
348
        """!
349
        @brief Calculate centers of clusters in line with contained objects.
350
        
351
        @return (numpy.matrix) Updated centers as list of centers.
352
        
353
        """
354
        
355
        dimension = self.__pointer_data.shape[1];
356
        centers = numpy.zeros((len(self.__clusters), dimension));
357
        
358
        for index in range(len(self.__clusters)):
359
            cluster_points = self.__pointer_data[self.__clusters[index], :];
360
            centers[index] = cluster_points.mean(axis=0);
361
362
        return numpy.matrix(centers);
363