Completed
Push — master ( b7ad63...18ed0c )
by Andrei
01:27
created

kmeans_plusplus_initializer.initialize()   B

Complexity

Conditions 2

Size

Total Lines 28

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 2
dl 0
loc 28
rs 8.8571
c 2
b 0
f 0
1
"""!
2
3
@brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
4
@details Implementations based on articles:
5
         - K-Means++: The Advantages of careful seeding. D. Arthur, S. Vassilvitskii. 2007.
6
         
7
@authors Andrei Novikov, Aleksey Kukushkin ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@see kmeans
12
@see xmeans
13
14
@cond GNU_PUBLIC_LICENSE
15
    PyClustering is free software: you can redistribute it and/or modify
16
    it under the terms of the GNU General Public License as published by
17
    the Free Software Foundation, either version 3 of the License, or
18
    (at your option) any later version.
19
    
20
    PyClustering is distributed in the hope that it will be useful,
21
    but WITHOUT ANY WARRANTY; without even the implied warranty of
22
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
    GNU General Public License for more details.
24
    
25
    You should have received a copy of the GNU General Public License
26
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
27
@endcond
28
29
"""
30
31
import random;
32
33
from pyclustering.utils import euclidean_distance;
34
35
36
class random_center_initializer:
37
    """!
38
    @brief Random center initializer is for generation specified amount of random of centers for specified data.
39
    
40
    """
41
42
    def __init__(self, data, amount_centers):
43
        """!
44
        @brief Creates instance of random center initializer.
45
        
46
        @param[in] data (list): List of points where each point is represented by list of coordinates.
47
        @param[in] amount_centers (unit): Amount of centers that should be initialized.
48
        
49
        """
50
        
51
        self.__data = data;
52
        self.__amount = amount_centers;
53
54
        if self.__amount <= 0:
55
            raise AttributeError("Amount of cluster centers should be at least 1.");
56
57
58
    def initialize(self):
59
        """!
60
        @brief Generates random centers in line with input parameters.
61
        
62
        @return (list) List of centers where each center is represented by list of coordinates.
63
        
64
        """
65
        return [ self.__create_center() for _ in range(self.__amount) ];
66
67
68
    def __create_center(self):
69
        """!
70
        @brief Generates and returns random center.
71
        
72
        """
73
        return [ random.random() for _ in range(len(self.__data[0])) ];
74
75
76
77
class kmeans_plusplus_initializer:
78
    """!
79
    @brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
80
    @details Clustering results are depends on initial centers in case of K-Means algorithm and even in case of X-Means.
81
              This method is used to find out optimal initial centers. There is an example of initial centers that were
82
              calculated by the K-Means++ method:
83
    
84
    @image html kmeans_plusplus_initializer_results.png
85
    
86
    Code example:
87
    @code
88
        # Read data 'SampleSimple3' from Simple Sample collection.
89
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
90
        
91
        # Calculate initial centers using K-Means++ method.
92
        centers = kmeans_plusplus_initializer(sample, 4).initialize();
93
        
94
        # Display initial centers.
95
        visualizer = cluster_visualizer();
96
        visualizer.append_cluster(sample);
97
        visualizer.append_cluster(centers, marker = '*', markersize = 10);
98
        visualizer.show();
99
        
100
        # Perform cluster analysis using K-Means algorithm with initial centers.
101
        kmeans_instance = kmeans(sample, centers);
102
        
103
        # Run clustering process and obtain result.
104
        kmeans_instance.process();
105
        clusters = kmeans_instance.get_clusters();
106
    @endcode
107
    
108
    """
109
    
110
    def __init__(self, data, amount_centers):
111
        """!
112
        @brief Creates K-Means++ center initializer instance.
113
        
114
        @param[in] data (list): List of points where each point is represented by list of coordinates.
115
        @param[in] amount_centers (unit): Amount of centers that should be initialized.
116
        
117
        """
118
        
119
        self.__data = data;
120
        self.__amount = amount_centers;
121
        
122
        if self.__amount <= 0:
123
            raise AttributeError("Amount of cluster centers should be at least 1.");
124
    
125
    
126
    def __get_uniform(self, probabilities):
127
        """!
128
        @brief Returns index in probabilities.
129
        
130
        @param[in] probabilities (list): List with segments in increasing sequence with val in [0, 1], for example, [0 0.1 0.2 0.3 1.0].
131
        
132
        """
133
134
        # Initialize return value
135
        res_idx = None;
136
137
        # Get random num in range [0, 1)
138
        random_num = random.random();
139
140
        # Find segment with  val1 < random_num < val2
141
        for _idx in range(len(probabilities)):
142
            if random_num < probabilities[_idx]:
143
                res_idx = _idx;
144
                break;
145
146
        if res_idx is None:
147
            # print('Random number : ', random_num);
148
            # print('Probabilities : ', probabilities);
149
            raise AttributeError("List 'probabilities' should contain 1 as the end of last segment(s)");
150
151
        return res_idx
152
153
154
    def __get_first_center(self):
155
        """!
156
        @brief Returns first center chosen uniformly at random from data.
157
        
158
        """
159
160
        # Initialize list with uniform probabilities
161
        probabilities = [];
162
163
        # Fill probability list
164
        for i in range(len(self.__data)):
165
            probabilities.append((i + 1) / len(self.__data));
166
167
        return self.__data[self.__get_uniform(probabilities)];
168
169
170
    def __calc_distance_to_nearest_center(self, data, centers):
171
        """!
172
        @brief Calculates distance from each data point to nearest center.
173
        
174
        @param[in] data (list): List of points where each point is represented by list of coordinates.
175
        @param[in] centers (list): List of points that represents centers and where each center is represented by list of coordinates.
176
        
177
        @return (list) List of distances to closest center for each data point.
178
        
179
        """
180
181
        # Initialize
182
        distance_data = [];
183
184
        # For each data point x, compute D(x), the distance between x and the nearest center
185
        for _point in data:
186
187
            # Min dist to nearest center
188
            min_dist = float('inf');
189
190
            # For each center
191
            for _center in centers:
192
                min_dist = min(min_dist, euclidean_distance(_center, _point));
193
194
            # Add distance to nearest center into result list
195
            distance_data.append(min_dist);
196
197
        return distance_data;
198
199
200
    def __get_sum_for_normalize_distance(self, distance):
201
        """!
202
        @brief Calculates square sum distance that is used for normalization.
203
        
204
        @param[in] distance (list): List of minimum distances from each point to nearest center.
205
        
206
        @return (float) Square sum distance.
207
        
208
        """
209
210
        sum_distance = 0.0;
211
212
        for _dist in distance:
213
            sum_distance += _dist ** 2;
214
215
        return sum_distance;
216
217
218
    def __set_last_value_to_one(self, probabilities):
219
        """!
220
        @brief Update probabilities for all points.
221
        @details All values of probability list equals to the last element are set to 1.
222
        
223
        @param[in] probabilities (list): List of minimum distances from each point to nearest center.
224
        
225
        """
226
        # Start from the last elem
227
        back_idx = - 1;
228
229
        # All values equal to the last elem should be set to 1
230
        last_val = probabilities[back_idx];
231
232
        # for all elements or if a elem not equal to the last elem
233
        for _idx in range(-1, -len(probabilities) - 1):
0 ignored issues
show
Unused Code introduced by
The variable _idx seems to be unused.
Loading history...
234
            if probabilities[back_idx] == last_val:
235
                probabilities[back_idx] = 1;
236
            else:
237
                break;
238
239
240
    def __get_probabilities_from_distance(self, distance):
241
        """!
0 ignored issues
show
Bug introduced by
A suspicious escape sequence \s was found. Did you maybe forget to add an r prefix?

Escape sequences in Python are generally interpreted according to rules similar to standard C. Only if strings are prefixed with r or R are they interpreted as regular expressions.

The escape sequence that was used indicates that you might have intended to write a regular expression.

Learn more about the available escape sequences. in the Python documentation.

Loading history...
242
        @brief Calculates probabilities from distance.
243
        @details Probabilities are filled by following expression:
244
        
245
        \f[
246
        p[i]=\frac{dist_{i}^2}{\sum_{i=0}^{N}dist_{i}};
247
        \f]
248
        
249
        @param[in] distance (list): List of minimum distances from each point to nearest center.
250
        
251
        @return (list) Weighted belonging probability for each point to its nearest center.
252
        
253
        """
254
        # Normalize distance
255
        sum_for_normalize = self.__get_sum_for_normalize_distance(distance);
256
257
        # Create list with probabilities
258
259
        # Initialize list with probabilities
260
        probabilities = [];
261
262
        # Variable to accumulate probabilities
263
        prev_value = 0;
264
265
        # Fill probabilities as :
266
        #   p[idx] = D[idx]^2 / sum_2
267
        #       where sum_2 = D[0]^2 + D[1]^2 + ...
268
        for _dist in distance:
269
            prev_value = (_dist ** 2) / sum_for_normalize + prev_value;
270
            probabilities.append(prev_value);
271
272
        # Set last value to 1
273
        self.__set_last_value_to_one(probabilities);
274
275
        return probabilities;
276
277
278
    def initialize(self):
279
        """!
280
        @brief Calculates initial centers using K-Means++ method.
281
        
282
        @return (list) List of initialized initial centers.
283
        
284
        """
285
        # Initialize result list by the first centers
286
        centers = [self.__get_first_center()];
287
288
        # For each next center
289
        for _ in range(1, self.__amount):
290
291
            # Calc Distance for each data
292
            distance_data = self.__calc_distance_to_nearest_center(data = self.__data, centers = centers);
293
294
            # Create list with probabilities
295
            probabilities = self.__get_probabilities_from_distance(distance_data);
296
            # print('Probability : ', probabilities);
297
298
            # Choose one new data point at random as a new center, using a weighted probability distribution
299
            ret_idx = self.__get_uniform(probabilities);
300
301
            # Add new center
302
            centers.append(self.__data[ret_idx]);
303
304
        # Is all centers are initialized
305
        return centers;