Completed
Push — 0.8.dev ( 77f982...1c45d7 )
by Andrei
01:15
created

kmeans_observer   A

Complexity

Total Complexity 7

Size/Duplication

Total Lines 79
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
dl 0
loc 79
rs 10
c 0
b 0
f 0
wmc 7

7 Methods

Rating   Name   Duplication   Size   Complexity  
A __len__() 0 6 1
A notify() 0 10 1
A __init__() 0 7 1
A set_evolution_clusters() 0 8 1
A get_centers() 0 10 1
A get_clusters() 0 10 1
A set_evolution_centers() 0 8 1
1
"""!
2
3
@brief Cluster analysis algorithm: K-Means
4
@details Based on book description:
5
         - J.B.MacQueen. Some Methods for Classification and Analysis of Multivariate Observations. 1967.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2018
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
import numpy;
0 ignored issues
show
Configuration introduced by
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
30
31
import matplotlib.pyplot as plt;
0 ignored issues
show
Configuration introduced by
The import matplotlib.pyplot could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
32
33
import pyclustering.core.kmeans_wrapper as wrapper;
34
35
from pyclustering.core.wrapper import ccore_library;
36
37
from pyclustering.cluster.encoder import type_encoding;
38
from pyclustering.cluster import cluster_visualizer;
39
40
41
class kmeans_observer:
42
    """!
43
    @brief Observer of K-Means algorithm that is used to collect information about clustering process on each iteration of the algorithm.
44
    
45
    @see kmeans
46
    
47
    """
48
    
49
    def __init__(self):
50
        """!
51
        @brief Initializer of observer of K-Means algorithm.
52
        
53
        """
54
        self.__evolution_clusters   = [];
55
        self.__evolution_centers    = [];
56
57
58
    def __len__(self):
59
        """!
60
        @brief Returns amount of steps that were observer during clustering process in K-Means algorithm.
61
        
62
        """
63
        return len(self.__evolution_clusters);
64
65
66
    def notify(self, clusters, centers):
67
        """!
68
        @brief This method is called by K-Means algorithm to notify about changes.
69
        
70
        @param[in] clusters (array_like): Allocated clusters by K-Means algorithm.
71
        @param[in] centers (array_like): Allocated centers by K-Means algorithm.
72
        
73
        """
74
        self.__evolution_clusters.append(clusters);
75
        self.__evolution_centers.append(centers);
76
77
78
    def set_evolution_centers(self, evolution_centers):
79
        """!
80
        @brief Set evolution of changes of centers during clustering process.
81
        
82
        @param[in] evolution_clusters (array_like): Evolution of changes of centers during clustering process.
83
        
84
        """
85
        self.__evolution_centers = evolution_centers;
86
87
88
    def get_centers(self, iteration):
89
        """!
90
        @brief Get method to return centers at specific iteration of clustering process.
91
        
92
        @param[in] iteration (uint): Clustering process iteration at which centers are required.
93
        
94
        @return (array_like) Centers at specific iteration.
95
        
96
        """
97
        return self.__evolution_centers[iteration];
98
99
100
    def set_evolution_clusters(self, evolution_clusters):
101
        """!
102
        @brief Set evolution of changes of centers during clustering process.
103
        
104
        @param[in] evolution_centers (array_like): Evolution of changes of clusters during clustering process.
105
        
106
        """
107
        self.__evolution_clusters = evolution_clusters;
108
109
110
    def get_clusters(self, iteration):
111
        """!
112
        @brief Get method to return allocated clusters at specific iteration of clustering process.
113
        
114
        @param[in] iteration (uint): Clustering process iteration at which clusters are required.
115
        
116
        @return (array_like) Clusters at specific iteration.
117
        
118
        """
119
        return self.__evolution_clusters[iteration];
120
121
122
123
class kmeans_visualizer:
124
    """!
125
    @brief Visualizer of K-Means algorithm's results.
126
    @details K-Means visualizer provides visualization services that are specific for K-Means algorithm.
127
    
128
    """
129
    
130
    __default_2d_marker_size = 15;
131
    __default_3d_marker_size = 70;
132
    
133
    
134
    @staticmethod
135
    def show_clusters(sample, clusters, centers, initial_centers = None, **kwargs):
136
        """!
137
        @brief Display K-Means clustering results.
138
        
139
        @param[in] sample (list): Dataset that were used for clustering.
140
        @param[in] clusters (array_like): Clusters that were allocated by the algorithm.
141
        @param[in] centers (array_like): Centers that were allocated by the algorithm.
142
        @param[in] initial_centers (array_like): Initial centers that were used by the algorithm, if 'None' then initial centers are not displyed.
143
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'figure', 'display').
144
        
145
        Keyword Args:
146
            figure (figure): If 'None' then new is figure is creater, otherwise specified figure is used for visualization.
147
            display (bool): If 'True' then figure will be shown by the method, otherwise it should be shown manually using matplotlib function 'plt.show()'.
148
            offset (uint): Specify axes index on the figure where results should be drawn (only if argument 'figure' is specified).
149
        
150
        @return (figure) Figure where clusters were drawn.
151
        
152
        """
153
154
        visualizer = cluster_visualizer();
155
        visualizer.append_clusters(clusters, sample);
156
        
157
        offset = kmeans_visualizer.__get_argument('offset', 0, **kwargs);
158
        
159
        if (kmeans_visualizer.__get_argument('figure', None, **kwargs) is None):
160
            figure = visualizer.show(display = False);
161
        else:
162
            visualizer.show(figure = figure, display = False);
163
        
164
        kmeans_visualizer.__draw_centers(figure, offset, visualizer, centers, initial_centers);
165
        kmeans_visualizer.__draw_rays(figure, offset, visualizer, sample, clusters, centers);
166
        
167
        if (kmeans_visualizer.__get_argument('display', True, **kwargs) is True):
168
            plt.show();
169
170
        return figure;
171
172
173
    @staticmethod
174
    def __draw_rays(figure, offset, visualizer, sample, clusters, centers):
175
        ax = figure.get_axes()[offset];
176
        
177
        for index_cluster in range(len(clusters)):
178
            color = visualizer.get_cluster_color(index_cluster, 0);
179
            kmeans_visualizer.__draw_cluster_rays(ax, color, sample, clusters[index_cluster], centers[index_cluster])
180
181
182
    @staticmethod
183
    def __draw_cluster_rays(ax, color, sample, cluster, center):
184
        dimension = len(sample[0]);
185
        
186
        for index_point in cluster:
187
            point = sample[index_point];
188
            if (dimension == 1):
189
                ax.plot([point[0], center[0]], [0.0, 0.0], '-', color=color, linewidth=0.5);
190
            elif (dimension == 2):
191
                ax.plot([point[0], center[0]], [point[1], center[1]], '-', color=color, linewidth=0.5);
192
            elif (dimension == 3):
193
                ax.plot([point[0], center[0]], [point[1], center[1]], [point[2], center[2]], '-', color=color, linewidth=0.5)
194
195
196
    @staticmethod
197
    def __draw_center(ax, center, color, marker, alpha):
198
        dimension = len(center);
199
        
200
        if (dimension == 1):
201
            ax.plot(center[0], 0.0, color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size);
202
        elif (dimension == 2):
203
            ax.plot(center[0], center[1], color=color, alpha=alpha, marker=marker, markersize=kmeans_visualizer.__default_2d_marker_size);
204
        elif (dimension == 3):
205
            ax.scatter(center[0], center[1], center[2], c=color, alpha=alpha, marker=marker, s=kmeans_visualizer.__default_3d_marker_size);
206
207
208
    @staticmethod
209
    def __draw_centers(figure, offset, visualizer, centers, initial_centers):
210
        ax = figure.get_axes()[offset];
211
        
212
        for index_center in range(len(centers)):
213
            color = visualizer.get_cluster_color(index_center, 0);
214
            kmeans_visualizer.__draw_center(ax, centers[index_center], color, '*', 1.0);
215
            
216
            if initial_centers:
217
                kmeans_visualizer.__draw_center(ax, initial_centers[index_center], color, '*', 0.4);
218
219
220
    @staticmethod
221
    def __get_argument(argument_name, default_value, **kwargs):
222
        if (argument_name in kwargs):
223
            return kwargs[argument_name];
224
        
225
        return default_value;
226
227
228
229
class kmeans:
230
    """!
231
    @brief Class represents clustering algorithm K-Means.
232
    @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
233
    
234
             CCORE implementation of the algorithm uses thread pool to parallelize the clustering process.
235
             
236
             K-Means clustering results depend on initial centers. Algorithm K-Means++ can used for initialization 
237
             initial centers from module 'pyclustering.cluster.center_initializer'.
238
    
239
    @image html kmeans_example_clustering.png "K-Means clustering results. At the left - 'Simple03.data' sample, at the right - 'Lsun.data' sample."
240
    
241
    Example #1 - Trivial clustering:
242
    @code
243
        # load list of points for cluster analysis
244
        sample = read_sample(path);
245
        
246
        # create instance of K-Means algorithm
247
        kmeans_instance = kmeans(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
248
        
249
        # run cluster analysis and obtain results
250
        kmeans_instance.process();
251
        clusters = kmeans_instance.get_clusters();
252
    @endcode
253
    
254
    Example #2 - Clustering using K-Means++ for center initialization:
255
    @code
256
        # load list of points for cluster analysis
257
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2);
258
        
259
        # initialize initial centers using K-Means++ method
260
        initial_centers = kmeans_plusplus_initializer(sample, 3).initialize();
261
        
262
        # create instance of K-Means algorithm with prepared centers
263
        kmeans_instance = kmeans(sample, initial_centers);
264
        
265
        # run cluster analysis and obtain results
266
        kmeans_instance.process();
267
        clusters = kmeans_instance.get_clusters();
268
        final_centers = kmeans_instance.get_centers();
269
    @endcode
270
    
271
    @see center_initializer
272
    
273
    """
274
    
275
    def __init__(self, data, initial_centers, tolerance = 0.001, ccore = True, **kwargs):
276
        """!
277
        @brief Constructor of clustering algorithm K-Means.
278
        @details Center initializer can be used for creating initial centers, for example, K-Means++ method.
279
        
280
        @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
281
        @param[in] initial_centers (array_like): Initial coordinates of centers of clusters that are represented by array_like data structure: [center1, center2, ...].
282
        @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing.
283
        @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
284
        @param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'observer').
285
        
286
        Keyword Args:
287
            observer (kmeans_observer): Observer of the algorithm to collect information about clustering process on each iteration.
288
        
289
        @see center_initializer
290
        
291
        """
292
        self.__pointer_data = numpy.matrix(data);
293
        self.__clusters = [];
294
        self.__centers = numpy.matrix(initial_centers);
295
        self.__tolerance = tolerance;
296
        
297
        self.__observer = None;
298
        if 'observer' in kwargs:
299
            self.__observer = kwargs['observer'];
300
        
301
        self.__ccore = ccore;
302
        if (self.__ccore):
303
            self.__ccore = ccore_library.workable();
304
305
306
    def process(self):
307
        """!
308
        @brief Performs cluster analysis in line with rules of K-Means algorithm.
309
        
310
        @remark Results of clustering can be obtained using corresponding get methods.
311
        
312
        @see get_clusters()
313
        @see get_centers()
314
        
315
        """
316
        
317
        if (len(self.__pointer_data[0]) != len(self.__centers[0])):
318
            raise NameError('Dimension of the input data and dimension of the initial cluster centers must be equal.');
319
        
320
        if (self.__ccore is True):
321
            results = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance, (self.__observer is not None));
322
            self.__clusters = results[0];
323
            self.__centers  = results[1];
324
            
325
            if self.__observer is not None:
326
                self.__observer.set_evolution_clusters(results[2]);
327
                self.__observer.set_evolution_centers(results[3]);
328
        else:
329
            maximum_change = float('inf');
330
            stop_condition = self.__tolerance * self.__tolerance;
331
             
332
            while (maximum_change > stop_condition):
333
                self.__clusters = self.__update_clusters();
334
                updated_centers = self.__update_centers();  # changes should be calculated before asignment
335
                
336
                if self.__observer is not None:
337
                    self.__observer.notify(self.__clusters, updated_centers.tolist());
338
                
339
                if (len(self.__centers) != len(updated_centers)):
340
                    maximum_change = float('inf');
341
                
342
                else:
343
                    changes = numpy.sum(numpy.square(self.__centers - updated_centers), axis=1);
344
                    maximum_change = numpy.max(changes);
345
                
346
                self.__centers = updated_centers.tolist();
347
348
349
    def get_clusters(self):
350
        """!
351
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
352
        
353
        @see process()
354
        @see get_centers()
355
        
356
        """
357
        
358
        return self.__clusters;
359
    
360
    
361
    def get_centers(self):
362
        """!
363
        @brief Returns list of centers of allocated clusters.
364
        
365
        @see process()
366
        @see get_clusters()
367
        
368
        """
369
370
        if isinstance(self.__centers, list):
371
            return self.__centers;
372
        
373
        return self.__centers.tolist();
374
375
376
    def get_cluster_encoding(self):
377
        """!
378
        @brief Returns clustering result representation type that indicate how clusters are encoded.
379
        
380
        @return (type_encoding) Clustering result representation.
381
        
382
        @see get_clusters()
383
        
384
        """
385
        
386
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
387
388
389
    def __update_clusters(self):
390
        """!
391
        @brief Calculate Euclidean distance to each point from the each cluster. Nearest points are captured by according clusters and as a result clusters are updated.
392
        
393
        @return (list) updated clusters as list of clusters. Each cluster contains indexes of objects from data.
394
        
395
        """
396
        
397
        clusters = [[] for _ in range(len(self.__centers))];
398
        
399
        dataset_differences = numpy.zeros((len(clusters), len(self.__pointer_data)));
400
        for index_center in range(len(self.__centers)):
401
            dataset_differences[index_center] = numpy.sum(numpy.square(self.__pointer_data - self.__centers[index_center]), axis=1).T;
402
        
403
        optimum_indexes = numpy.argmin(dataset_differences, axis=0);
404
        for index_point in range(len(optimum_indexes)):
405
            index_cluster = optimum_indexes[index_point];
406
            clusters[index_cluster].append(index_point);
407
        
408
        clusters = [cluster for cluster in clusters if len(cluster) > 0];
409
        
410
        return clusters;
411
    
412
    
413
    def __update_centers(self):
414
        """!
415
        @brief Calculate centers of clusters in line with contained objects.
416
        
417
        @return (numpy.matrix) Updated centers as list of centers.
418
        
419
        """
420
        
421
        dimension = self.__pointer_data.shape[1];
422
        centers = numpy.zeros((len(self.__clusters), dimension));
423
        
424
        for index in range(len(self.__clusters)):
425
            cluster_points = self.__pointer_data[self.__clusters[index], :];
426
            centers[index] = cluster_points.mean(axis=0);
427
428
        return numpy.matrix(centers);
429