1
|
|
|
"""! |
2
|
|
|
|
3
|
|
|
@brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means. |
4
|
|
|
@details Implementations based on articles: |
5
|
|
|
- K-Means++: The Advantages of careful seeding. D. Arthur, S. Vassilvitskii. 2007. |
6
|
|
|
|
7
|
|
|
@authors Andrei Novikov, Aleksey Kukushkin ([email protected]) |
8
|
|
|
@date 2014-2018 |
9
|
|
|
@copyright GNU Public License |
10
|
|
|
|
11
|
|
|
@see kmeans |
12
|
|
|
@see xmeans |
13
|
|
|
|
14
|
|
|
@cond GNU_PUBLIC_LICENSE |
15
|
|
|
PyClustering is free software: you can redistribute it and/or modify |
16
|
|
|
it under the terms of the GNU General Public License as published by |
17
|
|
|
the Free Software Foundation, either version 3 of the License, or |
18
|
|
|
(at your option) any later version. |
19
|
|
|
|
20
|
|
|
PyClustering is distributed in the hope that it will be useful, |
21
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
22
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23
|
|
|
GNU General Public License for more details. |
24
|
|
|
|
25
|
|
|
You should have received a copy of the GNU General Public License |
26
|
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>. |
27
|
|
|
@endcond |
28
|
|
|
|
29
|
|
|
""" |
30
|
|
|
|
31
|
|
|
|
32
|
|
|
import math; |
|
|
|
|
33
|
|
|
import numpy; |
|
|
|
|
34
|
|
|
import random; |
35
|
|
|
|
36
|
|
|
from pyclustering.utils import euclidean_distance; |
|
|
|
|
37
|
|
|
|
38
|
|
|
|
39
|
|
|
class random_center_initializer: |
40
|
|
|
"""! |
41
|
|
|
@brief Random center initializer is for generation specified amount of random of centers for specified data. |
42
|
|
|
|
43
|
|
|
""" |
44
|
|
|
|
45
|
|
|
def __init__(self, data, amount_centers): |
46
|
|
|
"""! |
47
|
|
|
@brief Creates instance of random center initializer. |
48
|
|
|
|
49
|
|
|
@param[in] data (list): List of points where each point is represented by list of coordinates. |
50
|
|
|
@param[in] amount_centers (unit): Amount of centers that should be initialized. |
51
|
|
|
|
52
|
|
|
""" |
53
|
|
|
|
54
|
|
|
self.__data = data; |
55
|
|
|
self.__amount = amount_centers; |
56
|
|
|
|
57
|
|
|
if self.__amount <= 0: |
58
|
|
|
raise AttributeError("Amount of cluster centers should be at least 1."); |
59
|
|
|
|
60
|
|
|
|
61
|
|
|
def initialize(self): |
62
|
|
|
"""! |
63
|
|
|
@brief Generates random centers in line with input parameters. |
64
|
|
|
|
65
|
|
|
@return (list) List of centers where each center is represented by list of coordinates. |
66
|
|
|
|
67
|
|
|
""" |
68
|
|
|
return [ self.__create_center() for _ in range(self.__amount) ]; |
69
|
|
|
|
70
|
|
|
|
71
|
|
|
def __create_center(self): |
72
|
|
|
"""! |
73
|
|
|
@brief Generates and returns random center. |
74
|
|
|
|
75
|
|
|
""" |
76
|
|
|
return [ random.random() for _ in range(len(self.__data[0])) ]; |
77
|
|
|
|
78
|
|
|
|
79
|
|
|
|
80
|
|
|
class kmeans_plusplus_initializer: |
81
|
|
|
"""! |
82
|
|
|
@brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means. |
83
|
|
|
@details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on |
84
|
|
|
initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find |
85
|
|
|
out optimal initial centers. There is an example of initial centers that were calculated by the |
86
|
|
|
K-Means++ method: |
87
|
|
|
|
88
|
|
|
@image html kmeans_plusplus_initializer_results.png |
89
|
|
|
|
90
|
|
|
Code example: |
91
|
|
|
@code |
92
|
|
|
# Read data 'SampleSimple3' from Simple Sample collection. |
93
|
|
|
sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3); |
94
|
|
|
|
95
|
|
|
# Calculate initial centers using K-Means++ method. |
96
|
|
|
centers = kmeans_plusplus_initializer(sample, 4).initialize(); |
97
|
|
|
|
98
|
|
|
# Display initial centers. |
99
|
|
|
visualizer = cluster_visualizer(); |
100
|
|
|
visualizer.append_cluster(sample); |
101
|
|
|
visualizer.append_cluster(centers, marker = '*', markersize = 10); |
102
|
|
|
visualizer.show(); |
103
|
|
|
|
104
|
|
|
# Perform cluster analysis using K-Means algorithm with initial centers. |
105
|
|
|
kmeans_instance = kmeans(sample, centers); |
106
|
|
|
|
107
|
|
|
# Run clustering process and obtain result. |
108
|
|
|
kmeans_instance.process(); |
109
|
|
|
clusters = kmeans_instance.get_clusters(); |
110
|
|
|
@endcode |
111
|
|
|
|
112
|
|
|
""" |
113
|
|
|
|
114
|
|
|
|
115
|
|
|
## Constant denotes that only points with highest probabilities should be considered as centers. |
116
|
|
|
FARTHEST_CENTER_CANDIDATE = "farthest"; |
117
|
|
|
|
118
|
|
|
|
119
|
|
|
def __init__(self, data, amount_centers, amount_candidates = 2): |
120
|
|
|
"""! |
121
|
|
|
@brief Creates K-Means++ center initializer instance. |
122
|
|
|
|
123
|
|
|
@param[in] data (array_like): List of points where each point is represented by list of coordinates. |
124
|
|
|
@param[in] amount_centers (uint): Amount of centers that should be initialized. |
125
|
|
|
@param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points (with the highest probability) should |
126
|
|
|
be considered as centers then special constant should be used 'FARTHEST_CENTER_CANDIDATE'. |
127
|
|
|
|
128
|
|
|
@see FARTHEST_CENTER_CANDIDATE |
129
|
|
|
|
130
|
|
|
""" |
131
|
|
|
|
132
|
|
|
self.__data = numpy.array(data); |
133
|
|
|
self.__amount = amount_centers; |
134
|
|
|
self.__candidates = amount_candidates; |
135
|
|
|
|
136
|
|
|
self.__check_parameters(); |
137
|
|
|
|
138
|
|
|
|
139
|
|
|
def __check_parameters(self): |
140
|
|
|
"""! |
141
|
|
|
@brief Checks input parameters of the algorithm and if something wrong then corresponding exception is thrown. |
142
|
|
|
|
143
|
|
|
""" |
144
|
|
|
if (self.__amount <= 0) or (self.__amount > len(self.__data)): |
145
|
|
|
raise AttributeError("Amount of cluster centers should be at least 1 and should be less or equal to amount of points in data."); |
146
|
|
|
|
147
|
|
|
if self.__candidates != kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE: |
148
|
|
|
if (self.__candidates <= 0) or (self.__candidates > len(self.__data)): |
149
|
|
|
raise AttributeError("Amount of candidates centers should be at least 1 and should be less or equal to amount of points in data."); |
150
|
|
|
|
151
|
|
|
if len(self.__data) == 0: |
152
|
|
|
raise AttributeError("Data is empty.") |
153
|
|
|
|
154
|
|
|
|
155
|
|
|
def __calc_distance_to_nearest_center(self, data, centers): |
156
|
|
|
"""! |
157
|
|
|
@brief Calculates distance from each data point to nearest center. |
158
|
|
|
|
159
|
|
|
@param[in] data (numpy.array): Array of points for that initialization is performed. |
160
|
|
|
@param[in] centers (numpy.array): Array of points that represents centers. |
161
|
|
|
|
162
|
|
|
@return (numpy.array) List of distances to closest center for each data point. |
163
|
|
|
|
164
|
|
|
""" |
165
|
|
|
|
166
|
|
|
dataset_differences = numpy.zeros((len(centers), len(data))); |
167
|
|
|
for index_center in range(len(centers)): |
168
|
|
|
dataset_differences[index_center] = numpy.sum( |
169
|
|
|
numpy.square(data - centers[index_center]), axis=1).T; |
170
|
|
|
|
171
|
|
|
shortest_distances = numpy.min(dataset_differences, axis=0); |
172
|
|
|
return shortest_distances; |
173
|
|
|
|
174
|
|
|
|
175
|
|
|
def __get_next_center(self, centers): |
176
|
|
|
"""! |
177
|
|
|
@brief Calculates the next center for the data. |
178
|
|
|
|
179
|
|
|
@param[in] centers (list): Current initialized centers. |
180
|
|
|
|
181
|
|
|
@return (list) Next initialized center. |
182
|
|
|
|
183
|
|
|
""" |
184
|
|
|
|
185
|
|
|
distance_data = self.__calc_distance_to_nearest_center(data=self.__data, centers=centers); |
186
|
|
|
center_index = numpy.argmax(distance_data); |
187
|
|
|
|
188
|
|
|
return self.__data[center_index]; |
189
|
|
|
|
190
|
|
|
|
191
|
|
|
def initialize(self): |
192
|
|
|
"""! |
193
|
|
|
@brief Calculates initial centers using K-Means++ method. |
194
|
|
|
|
195
|
|
|
@return (list) List of initialized initial centers. |
196
|
|
|
|
197
|
|
|
""" |
198
|
|
|
|
199
|
|
|
# Initialize result list by the first centers |
200
|
|
|
index_center = random.randint(0, len(self.__data) - 1); |
201
|
|
|
centers = [ self.__data[ index_center ] ]; |
202
|
|
|
|
203
|
|
|
# For each next center |
204
|
|
|
for _ in range(1, self.__amount): |
205
|
|
|
next_center = self.__get_next_center(centers); |
206
|
|
|
centers.append(next_center); |
207
|
|
|
|
208
|
|
|
return centers; |