|
1
|
|
|
import random |
|
2
|
|
|
import math |
|
3
|
|
|
|
|
4
|
|
|
|
|
5
|
|
|
class random_center_initializer: |
|
6
|
|
|
def __init__(self, data, amount_centers): |
|
7
|
|
|
self.__data = data; |
|
8
|
|
|
self.__amount = amount_centers; |
|
9
|
|
|
|
|
10
|
|
|
|
|
11
|
|
|
def initialize(self): |
|
12
|
|
|
return [ self.__create_center() for _ in range(len(self.__amount)) ]; |
|
13
|
|
|
|
|
14
|
|
|
|
|
15
|
|
|
def __create_center(self): |
|
16
|
|
|
return [ random.random() for _ in range(len(self.__data[0])) ]; |
|
17
|
|
|
|
|
18
|
|
|
|
|
19
|
|
|
class kmeans_plusplus_initializer: |
|
20
|
|
|
|
|
21
|
|
|
def __init__(self, data, amount_centers): |
|
22
|
|
|
self.__data = data |
|
23
|
|
|
self.__amount = amount_centers |
|
24
|
|
|
|
|
25
|
|
|
@staticmethod |
|
26
|
|
|
def get_euclidean_distance(point1, point2): |
|
27
|
|
|
""" |
|
28
|
|
|
Euclidean distance : d(q,p) = sqrt((q1 - p1)^2 + (q2 - p2)^2 + ...) |
|
29
|
|
|
:param point1: list with coordinated |
|
30
|
|
|
:param point2: list with coordinated |
|
31
|
|
|
""" |
|
32
|
|
|
|
|
33
|
|
|
# Initialize distance |
|
34
|
|
|
distance = 0.0 |
|
35
|
|
|
|
|
36
|
|
|
# Points should have equal size |
|
37
|
|
|
if len(point1) != len(point2): |
|
38
|
|
|
raise AttributeError('Try to calc Euclidean distance for points with different dimensions') |
|
39
|
|
|
|
|
40
|
|
|
# Calc (q1 - p1)^2 + (q2 - p2)^2 + ... |
|
41
|
|
|
for _p1, _p2 in zip(point1, point2): |
|
42
|
|
|
distance += (_p1 - _p2) ** 2 |
|
43
|
|
|
|
|
44
|
|
|
# Calc sqrt |
|
45
|
|
|
distance = math.sqrt(distance) |
|
46
|
|
|
|
|
47
|
|
|
return distance |
|
48
|
|
|
|
|
49
|
|
|
@staticmethod |
|
50
|
|
|
def get_uniform(probabilities): |
|
51
|
|
|
""" |
|
52
|
|
|
:param probabilities: List with segments in increasing sequence with val in [0, 1]. |
|
53
|
|
|
Example [0 0.1 0.2 0.3 1.0] |
|
54
|
|
|
:return: Index in 'probabilities' |
|
55
|
|
|
""" |
|
56
|
|
|
|
|
57
|
|
|
# Initialize return value |
|
58
|
|
|
res_idx = None |
|
59
|
|
|
|
|
60
|
|
|
# Get random num in range [0, 1) |
|
61
|
|
|
random_num = random.random() |
|
62
|
|
|
|
|
63
|
|
|
# Find segment with val1 < random_num < val2 |
|
64
|
|
|
for _idx in range(len(probabilities)): |
|
65
|
|
|
|
|
66
|
|
|
if random_num < probabilities[_idx]: |
|
67
|
|
|
res_idx = _idx |
|
68
|
|
|
break |
|
69
|
|
|
|
|
70
|
|
|
if res_idx is None: |
|
71
|
|
|
print('Random number : ', random_num) |
|
72
|
|
|
print('Probabilities : ', probabilities) |
|
73
|
|
|
raise AttributeError('list "probabilities" should contains 1 as the end of last segment(s)') |
|
74
|
|
|
|
|
75
|
|
|
return res_idx |
|
76
|
|
|
|
|
77
|
|
|
def get_first_center(self): |
|
78
|
|
|
""" Get first center chosen uniformly at random from data """ |
|
79
|
|
|
|
|
80
|
|
|
# Initialize list with uniform probabilities |
|
81
|
|
|
probabilities = [] |
|
82
|
|
|
|
|
83
|
|
|
# Fill probability list |
|
84
|
|
|
for i in range(len(self.__data)): |
|
85
|
|
|
probabilities.append((i + 1) / len(self.__data)) |
|
86
|
|
|
|
|
87
|
|
|
return self.__data[self.get_uniform(probabilities)] |
|
88
|
|
|
|
|
89
|
|
|
@staticmethod |
|
90
|
|
|
def calc_distance_to_nearest_center(data, centers): |
|
91
|
|
|
""" """ |
|
92
|
|
|
|
|
93
|
|
|
# Initialize |
|
94
|
|
|
distance_data = [] |
|
95
|
|
|
|
|
96
|
|
|
# For each data point x, compute D(x), the distance between x and the nearest center |
|
97
|
|
|
for _point in data: |
|
98
|
|
|
|
|
99
|
|
|
# Min dist to nearest center |
|
100
|
|
|
min_dist = float('inf') |
|
101
|
|
|
|
|
102
|
|
|
# For each center |
|
103
|
|
|
for _center in centers: |
|
104
|
|
|
min_dist = min(min_dist, kmeans_plusplus_initializer.get_euclidean_distance(_center, _point)) |
|
105
|
|
|
|
|
106
|
|
|
# Add distance to nearest center into result list |
|
107
|
|
|
distance_data.append(min_dist) |
|
108
|
|
|
|
|
109
|
|
|
return distance_data |
|
110
|
|
|
|
|
111
|
|
|
@staticmethod |
|
112
|
|
|
def get_sum_for_normalize_distance(distance): |
|
113
|
|
|
""" """ |
|
114
|
|
|
|
|
115
|
|
|
sum_distance = 0 |
|
116
|
|
|
|
|
117
|
|
|
for _dist in distance: |
|
118
|
|
|
sum_distance += _dist ** 2 |
|
119
|
|
|
|
|
120
|
|
|
return sum_distance |
|
121
|
|
|
|
|
122
|
|
|
@staticmethod |
|
123
|
|
|
def set_last_value_to_one(probabilities): |
|
124
|
|
|
""" """ |
|
125
|
|
|
|
|
126
|
|
|
# Start from the last elem |
|
127
|
|
|
back_idx = - 1 |
|
128
|
|
|
|
|
129
|
|
|
# All values equal to the last elem should be set to 1 |
|
130
|
|
|
last_val = probabilities[back_idx] |
|
131
|
|
|
|
|
132
|
|
|
# for all elements or if a elem not equal to the last elem |
|
133
|
|
|
for _idx in range(-1, -len(probabilities) - 1): |
|
|
|
|
|
|
134
|
|
|
if probabilities[back_idx] == last_val: |
|
135
|
|
|
probabilities[back_idx] = 1 |
|
136
|
|
|
else: |
|
137
|
|
|
break |
|
138
|
|
|
|
|
139
|
|
|
@staticmethod |
|
140
|
|
|
def get_probabilities_from_distance(distance): |
|
141
|
|
|
""" """ |
|
142
|
|
|
|
|
143
|
|
|
# Normalize distance |
|
144
|
|
|
sum_for_normalize = kmeans_plusplus_initializer.get_sum_for_normalize_distance(distance) |
|
145
|
|
|
|
|
146
|
|
|
# Create list with probabilities |
|
147
|
|
|
|
|
148
|
|
|
# Initialize list with probabilities |
|
149
|
|
|
probabilities = [] |
|
150
|
|
|
|
|
151
|
|
|
# Variable to accumulate probabilities |
|
152
|
|
|
prev_value = 0 |
|
153
|
|
|
|
|
154
|
|
|
# Fill probabilities as : |
|
155
|
|
|
# p[idx] = D[idx]^2 / sum_2 |
|
156
|
|
|
# where sum_2 = D[0]^2 + D[1]^2 + ... |
|
157
|
|
|
for _dist in distance: |
|
158
|
|
|
prev_value = (_dist ** 2) / sum_for_normalize + prev_value |
|
159
|
|
|
probabilities.append(prev_value) |
|
160
|
|
|
|
|
161
|
|
|
# Set last value to 1 |
|
162
|
|
|
kmeans_plusplus_initializer.set_last_value_to_one(probabilities) |
|
163
|
|
|
|
|
164
|
|
|
return probabilities |
|
165
|
|
|
|
|
166
|
|
|
def initialize(self): |
|
167
|
|
|
""" kmeans++ method for center initialization """ |
|
168
|
|
|
|
|
169
|
|
|
# Should be at least 1 center |
|
170
|
|
|
if self.__amount <= 0: |
|
171
|
|
|
raise AttributeError('Amount of cluster centers should be at least 1') |
|
172
|
|
|
|
|
173
|
|
|
# Initialize result list by the first centers |
|
174
|
|
|
centers = [self.get_first_center()] |
|
175
|
|
|
|
|
176
|
|
|
# For each next center |
|
177
|
|
|
for _ in range(1, self.__amount): |
|
178
|
|
|
|
|
179
|
|
|
# Calc Distance for each data |
|
180
|
|
|
distance_data = self.calc_distance_to_nearest_center(data=self.__data, centers=centers) |
|
181
|
|
|
|
|
182
|
|
|
# Create list with probabilities |
|
183
|
|
|
probabilities = self.get_probabilities_from_distance(distance_data) |
|
184
|
|
|
print('Probability : ', probabilities) |
|
185
|
|
|
|
|
186
|
|
|
# Choose one new data point at random as a new center, using a weighted probability distribution |
|
187
|
|
|
ret_idx = self.get_uniform(probabilities) |
|
188
|
|
|
|
|
189
|
|
|
# Add new center |
|
190
|
|
|
centers.append(self.__data[ret_idx]) |
|
191
|
|
|
|
|
192
|
|
|
print('Centers : ', centers) |
|
193
|
|
|
print('Return index : ', ret_idx) |
|
194
|
|
|
# where a point x is chosen with probability proportional to D(x)^2. |
|
195
|
|
|
|
|
196
|
|
|
# Is all centers are initialized |
|
197
|
|
|
|
|
198
|
|
|
|
|
199
|
|
|
# # Test |
|
200
|
|
|
# random.seed() |
|
201
|
|
|
# obj = kmeans_plusplus_initializer([[0, 1], [0, 0], [1, 0], [1, 1], [-1, 1], [-1, -1]], 6) |
|
202
|
|
|
# obj.initialize() |
|
203
|
|
|
|
|
204
|
|
|
# import time |
|
205
|
|
|
# import numpy as np |
|
206
|
|
|
# from scipy.spatial import distance |
|
207
|
|
|
# |
|
208
|
|
|
# # a = np.random.rand(100000000) |
|
209
|
|
|
# # b = np.random.rand(100000000) |
|
210
|
|
|
# |
|
211
|
|
|
# # point_1 = [0.21312, -1.214214] |
|
212
|
|
|
# # point_2 = [198237.1231, 3231.1231] |
|
213
|
|
|
# # point_3 = [-312.3213, 312.3213] |
|
214
|
|
|
# # point_4 = [-32131, -31231] |
|
215
|
|
|
# |
|
216
|
|
|
# # c = a - b |
|
217
|
|
|
# |
|
218
|
|
|
# time_start = time.clock() |
|
219
|
|
|
# # dist = np.linalg.norm(c) |
|
220
|
|
|
# # dst = distance.euclidean(a, b) |
|
221
|
|
|
# # dst = distance.cityblock(a, b) |
|
222
|
|
|
# # dst = distance.canberra(a, b) |
|
223
|
|
|
# # dst = distance.pdist(a, b, 'canberra') |
|
224
|
|
|
# |
|
225
|
|
|
# # print(distance.euclidean(point_1, point_2)) |
|
226
|
|
|
# # print(distance.euclidean(point_2, point_3)) |
|
227
|
|
|
# # print(distance.euclidean(point_3, point_4)) |
|
228
|
|
|
# |
|
229
|
|
|
# time_end = time.clock() |
|
230
|
|
|
# |
|
231
|
|
|
# print('time : ', time_end - time_start) |
|
232
|
|
|
|
|
233
|
|
|
|
|
234
|
|
|
|
|
235
|
|
|
|