Completed
Push — 0.7.dev ( a4d05c...92cf01 )
by
unknown
01:00
created

get_euclidean_distance()   A

Complexity

Conditions 3

Size

Total Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
c 1
b 0
f 0
dl 0
loc 23
rs 9.0856
1
import random
2
import math
3
4
5
class random_center_initializer:
6
    def __init__(self, data, amount_centers):
7
        self.__data = data;
8
        self.__amount = amount_centers;
9
10
11
    def initialize(self):
12
        return [ self.__create_center() for _ in range(len(self.__amount)) ];
13
14
15
    def __create_center(self):
16
        return [ random.random() for _ in range(len(self.__data[0])) ];
17
18
19
class kmeans_plusplus_initializer:
20
21
    def __init__(self, data, amount_centers):
22
        self.__data = data
23
        self.__amount = amount_centers
24
25
    @staticmethod
26
    def get_euclidean_distance(point1, point2):
27
        """
28
            Euclidean distance : d(q,p) = sqrt((q1 - p1)^2 + (q2 - p2)^2 + ...)
29
            :param point1: list with coordinated
30
            :param point2: list with coordinated
31
        """
32
33
        # Initialize distance
34
        distance = 0.0
35
36
        # Points should have equal size
37
        if len(point1) != len(point2):
38
            raise AttributeError('Try to calc Euclidean distance for points with different dimensions')
39
40
        # Calc (q1 - p1)^2 + (q2 - p2)^2 + ...
41
        for _p1, _p2 in zip(point1, point2):
42
            distance += (_p1 - _p2) ** 2
43
44
        # Calc sqrt
45
        distance = math.sqrt(distance)
46
47
        return distance
48
49
    @staticmethod
50
    def get_uniform(probabilities):
51
        """
52
        :param probabilities: List with segments in increasing sequence with val in [0, 1].
53
                    Example [0 0.1 0.2 0.3 1.0]
54
        :return: Index in 'probabilities'
55
        """
56
57
        # Initialize return value
58
        res_idx = None
59
60
        # Get random num in range [0, 1)
61
        random_num = random.random()
62
63
        # Find segment with  val1 < random_num < val2
64
        for _idx in range(len(probabilities)):
65
66
            if random_num < probabilities[_idx]:
67
                res_idx = _idx
68
                break
69
70
        if res_idx is None:
71
            print('Random number : ', random_num)
72
            print('Probabilities : ', probabilities)
73
            raise AttributeError('list "probabilities" should contains 1 as the end of last segment(s)')
74
75
        return res_idx
76
77
    def get_first_center(self):
78
        """  Get first center chosen uniformly at random from data """
79
80
        # Initialize list with uniform probabilities
81
        probabilities = []
82
83
        # Fill probability list
84
        for i in range(len(self.__data)):
85
            probabilities.append((i + 1) / len(self.__data))
86
87
        return self.__data[self.get_uniform(probabilities)]
88
89
    @staticmethod
90
    def calc_distance_to_nearest_center(data, centers):
91
        """   """
92
93
        # Initialize
94
        distance_data = []
95
96
        # For each data point x, compute D(x), the distance between x and the nearest center
97
        for _point in data:
98
99
            # Min dist to nearest center
100
            min_dist = float('inf')
101
102
            # For each center
103
            for _center in centers:
104
                min_dist = min(min_dist, kmeans_plusplus_initializer.get_euclidean_distance(_center, _point))
105
106
            # Add distance to nearest center into result list
107
            distance_data.append(min_dist)
108
109
        return distance_data
110
111
    @staticmethod
112
    def get_sum_for_normalize_distance(distance):
113
        """  """
114
115
        sum_distance = 0
116
117
        for _dist in distance:
118
            sum_distance += _dist ** 2
119
120
        return sum_distance
121
122
    @staticmethod
123
    def set_last_value_to_one(probabilities):
124
        """ """
125
126
        # Start from the last elem
127
        back_idx = - 1
128
129
        # All values equal to the last elem should be set to 1
130
        last_val = probabilities[back_idx]
131
132
        # for all elements or if a elem not equal to the last elem
133
        for _idx in range(-1, -len(probabilities) - 1):
0 ignored issues
show
Unused Code introduced by
The variable _idx seems to be unused.
Loading history...
134
            if probabilities[back_idx] == last_val:
135
                probabilities[back_idx] = 1
136
            else:
137
                break
138
139
    @staticmethod
140
    def get_probabilities_from_distance(distance):
141
        """   """
142
143
        # Normalize distance
144
        sum_for_normalize = kmeans_plusplus_initializer.get_sum_for_normalize_distance(distance)
145
146
        # Create list with probabilities
147
148
        # Initialize list with probabilities
149
        probabilities = []
150
151
        # Variable to accumulate probabilities
152
        prev_value = 0
153
154
        # Fill probabilities as :
155
        #   p[idx] = D[idx]^2 / sum_2
156
        #       where sum_2 = D[0]^2 + D[1]^2 + ...
157
        for _dist in distance:
158
            prev_value = (_dist ** 2) / sum_for_normalize + prev_value
159
            probabilities.append(prev_value)
160
161
        # Set last value to 1
162
        kmeans_plusplus_initializer.set_last_value_to_one(probabilities)
163
164
        return probabilities
165
166
    def initialize(self):
167
        """  kmeans++ method for center initialization  """
168
169
        # Should be at least 1 center
170
        if self.__amount <= 0:
171
            raise AttributeError('Amount of cluster centers should be at least 1')
172
173
        # Initialize result list by the first centers
174
        centers = [self.get_first_center()]
175
176
        # For each next center
177
        for _ in range(1, self.__amount):
178
179
            # Calc Distance for each data
180
            distance_data = self.calc_distance_to_nearest_center(data=self.__data, centers=centers)
181
182
            # Create list with probabilities
183
            probabilities = self.get_probabilities_from_distance(distance_data)
184
            print('Probability : ', probabilities)
185
186
            # Choose one new data point at random as a new center, using a weighted probability distribution
187
            ret_idx = self.get_uniform(probabilities)
188
189
            # Add new center
190
            centers.append(self.__data[ret_idx])
191
192
            print('Centers : ', centers)
193
            print('Return index : ', ret_idx)
194
            # where a point x is chosen with probability proportional to D(x)^2.
195
196
        # Is all centers are initialized
197
198
199
# # Test
200
# random.seed()
201
# obj = kmeans_plusplus_initializer([[0, 1], [0, 0], [1, 0], [1, 1], [-1, 1], [-1, -1]], 6)
202
# obj.initialize()
203
204
# import time
205
# import numpy as np
206
# from scipy.spatial import distance
207
#
208
# # a = np.random.rand(100000000)
209
# # b = np.random.rand(100000000)
210
#
211
# # point_1 = [0.21312, -1.214214]
212
# # point_2 = [198237.1231, 3231.1231]
213
# # point_3 = [-312.3213, 312.3213]
214
# # point_4 = [-32131, -31231]
215
#
216
# # c = a - b
217
#
218
# time_start = time.clock()
219
# # dist = np.linalg.norm(c)
220
# # dst = distance.euclidean(a, b)
221
# # dst = distance.cityblock(a, b)
222
# # dst = distance.canberra(a, b)
223
# # dst = distance.pdist(a, b, 'canberra')
224
#
225
# # print(distance.euclidean(point_1, point_2))
226
# # print(distance.euclidean(point_2, point_3))
227
# # print(distance.euclidean(point_3, point_4))
228
#
229
# time_end = time.clock()
230
#
231
# print('time : ', time_end - time_start)
232
233
234
235