Completed
Push — master ( a74470...c8bc69 )
by Andrei
58s
created

kmeans.__init__()   A

Complexity

Conditions 2

Size

Total Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
dl 0
loc 21
rs 9.3142
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: K-Means
4
@details Based on book description:
5
         - J.B.MacQueen. Some Methods for Classification and Analysis of Multivariate Observations. 1967.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2018
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
import numpy;
0 ignored issues
show
Configuration introduced by
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
30
31
import pyclustering.core.kmeans_wrapper as wrapper;
32
33
from pyclustering.core.wrapper import ccore_library;
34
35
from pyclustering.cluster.encoder import type_encoding;
36
37
38
class kmeans:
39
    """!
40
    @brief Class represents clustering algorithm K-Means.
41
    @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
42
    
43
             CCORE implementation of the algorithm uses thread pool to parallelize the clustering process.
44
             
45
             K-Means clustering results depend on initial centers. Algorithm K-Means++ can used for initialization 
46
             initial centers from module 'pyclustering.cluster.center_initializer'.
47
    
48
    Example #1 - Trivial clustering:
49
    @code
50
        # load list of points for cluster analysis
51
        sample = read_sample(path);
52
        
53
        # create instance of K-Means algorithm
54
        kmeans_instance = kmeans(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
55
        
56
        # run cluster analysis and obtain results
57
        kmeans_instance.process();
58
        clusters = kmeans_instance.get_clusters();
59
    @endcode
60
    
61
    Example #2 - Clustering using K-Means++ for center initialization:
62
    @code
63
        # load list of points for cluster analysis
64
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2);
65
        
66
        # initialize initial centers using K-Means++ method
67
        initial_centers = kmeans_plusplus_initializer(sample, 3).initialize();
68
        
69
        # create instance of K-Means algorithm with prepared centers
70
        kmeans_instance = kmeans(sample, initial_centers);
71
        
72
        # run cluster analysis and obtain results
73
        kmeans_instance.process();
74
        clusters = kmeans_instance.get_clusters();
75
        final_centers = kmeans_instance.get_centers();
76
    @endcode
77
    
78
    @see center_initializer
79
    
80
    """
81
    
82
    def __init__(self, data, initial_centers, tolerance = 0.001, ccore = True):
83
        """!
84
        @brief Constructor of clustering algorithm K-Means.
85
        @details Center initializer can be used for creating initial centers, for example, K-Means++ method.
86
        
87
        @param[in] data (array_like): Input data that is presented as array of points (objects), each point should be represented by array_like data structure.
88
        @param[in] initial_centers (array_like): Initial coordinates of centers of clusters that are represented by array_like data structure: [center1, center2, ...].
89
        @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance then algorithm stops processing.
90
        @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
91
        
92
        @see center_initializer
93
        
94
        """
95
        self.__pointer_data = numpy.matrix(data);
96
        self.__clusters = [];
97
        self.__centers = numpy.matrix(initial_centers);
98
        self.__tolerance = tolerance;
99
        
100
        self.__ccore = ccore;
101
        if (self.__ccore):
102
            self.__ccore = ccore_library.workable();
103
104
105
    def process(self):
106
        """!
107
        @brief Performs cluster analysis in line with rules of K-Means algorithm.
108
        
109
        @remark Results of clustering can be obtained using corresponding get methods.
110
        
111
        @see get_clusters()
112
        @see get_centers()
113
        
114
        """
115
        
116
        if (self.__ccore is True):
117
            self.__clusters = wrapper.kmeans(self.__pointer_data, self.__centers, self.__tolerance);
118
            self.__centers = self.__update_centers();
119
        else: 
120
            maximum_change = float('inf');
121
             
122
            stop_condition = self.__tolerance * self.__tolerance;   # Fast solution
123
            #stop_condition = self.__tolerance;              # Slow solution
124
             
125
            # Check for dimension
126
            if (len(self.__pointer_data[0]) != len(self.__centers[0])):
127
                raise NameError('Dimension of the input data and dimension of the initial cluster centers must be equal.');
128
             
129
            while (maximum_change > stop_condition):
130
                self.__clusters = self.__update_clusters();
131
                updated_centers = self.__update_centers();  # changes should be calculated before asignment
132
                
133
                if (len(self.__centers) != len(updated_centers)):
134
                    maximum_change = float('inf');
135
                
136
                else:
137
                    changes = numpy.sum(numpy.square(self.__centers - updated_centers), axis=1);
138
                    maximum_change = numpy.max(changes);
139
                
140
                self.__centers = updated_centers;
141
142
143
    def get_clusters(self):
144
        """!
145
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
146
        
147
        @see process()
148
        @see get_centers()
149
        
150
        """
151
        
152
        return self.__clusters;
153
    
154
    
155
    def get_centers(self):
156
        """!
157
        @brief Returns list of centers of allocated clusters.
158
        
159
        @see process()
160
        @see get_clusters()
161
        
162
        """
163
164
        return self.__centers.tolist();
165
166
167
    def get_cluster_encoding(self):
168
        """!
169
        @brief Returns clustering result representation type that indicate how clusters are encoded.
170
        
171
        @return (type_encoding) Clustering result representation.
172
        
173
        @see get_clusters()
174
        
175
        """
176
        
177
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
178
179
180
    def __update_clusters(self):
181
        """!
182
        @brief Calculate Euclidean distance to each point from the each cluster. Nearest points are captured by according clusters and as a result clusters are updated.
183
        
184
        @return (list) updated clusters as list of clusters. Each cluster contains indexes of objects from data.
185
        
186
        """
187
        
188
        clusters = [[] for _ in range(len(self.__centers))];
189
        
190
        dataset_differences = numpy.zeros((len(clusters), len(self.__pointer_data)));
191
        for index_center in range(len(self.__centers)):
192
            dataset_differences[index_center] = numpy.sum(numpy.square(self.__pointer_data - self.__centers[index_center]), axis=1).T;
193
        
194
        optimum_indexes = numpy.argmin(dataset_differences, axis=0);
195
        for index_point in range(len(optimum_indexes)):
196
            index_cluster = optimum_indexes[index_point];
197
            clusters[index_cluster].append(index_point);
198
        
199
        clusters = [cluster for cluster in clusters if len(cluster) > 0];
200
        
201
        return clusters;
202
    
203
    
204
    def __update_centers(self):
205
        """!
206
        @brief Calculate centers of clusters in line with contained objects.
207
        
208
        @return (numpy.matrix) Updated centers as list of centers.
209
        
210
        """
211
        
212
        dimension = self.__pointer_data.shape[1];
213
        centers = numpy.zeros((len(self.__clusters), dimension));
214
        
215
        for index in range(len(self.__clusters)):
216
            cluster_points = self.__pointer_data[self.__clusters[index], :];
217
            centers[index] = cluster_points.mean(axis=0);
218
219
        return numpy.matrix(centers);
220