kmeans_plusplus_initializer.initialize() - Code Metrics - Inspection of "#355: Order and local order parameter evolution fo..." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( b7ad63...18ed0c )

by Andrei

created 2017-09-04 10:23 UTC

kmeans_plusplus_initializer.initialize() B

↳ Parent: kmeans_plusplus_initializer

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes	2
Bugs	0	Features	0

Metric	Value
cc	2
dl	0
loc	28
rs	8.8571
c	2
b	0
f	0

"""!

@brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
@details Implementations based on articles:
         - K-Means++: The Advantages of careful seeding. D. Arthur, S. Vassilvitskii. 2007.
         
@authors Andrei Novikov, Aleksey Kukushkin ([email protected])
@date 2014-2017
@copyright GNU Public License

@see kmeans
@see xmeans

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""

import random;

from pyclustering.utils import euclidean_distance;


class random_center_initializer:
    """!
    @brief Random center initializer is for generation specified amount of random of centers for specified data.
    
    """

    def __init__(self, data, amount_centers):
        """!
        @brief Creates instance of random center initializer.
        
        @param[in] data (list): List of points where each point is represented by list of coordinates.
        @param[in] amount_centers (unit): Amount of centers that should be initialized.
        
        """
        
        self.__data = data;
        self.__amount = amount_centers;

        if self.__amount <= 0:
            raise AttributeError("Amount of cluster centers should be at least 1.");


    def initialize(self):
        """!
        @brief Generates random centers in line with input parameters.
        
        @return (list) List of centers where each center is represented by list of coordinates.
        
        """
        return [ self.__create_center() for _ in range(self.__amount) ];


    def __create_center(self):
        """!
        @brief Generates and returns random center.
        
        """
        return [ random.random() for _ in range(len(self.__data[0])) ];



class kmeans_plusplus_initializer:
    """!
    @brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
    @details Clustering results are depends on initial centers in case of K-Means algorithm and even in case of X-Means.
              This method is used to find out optimal initial centers. There is an example of initial centers that were
              calculated by the K-Means++ method:
    
    @image html kmeans_plusplus_initializer_results.png
    
    Code example:
    @code
        # Read data 'SampleSimple3' from Simple Sample collection.
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
        
        # Calculate initial centers using K-Means++ method.
        centers = kmeans_plusplus_initializer(sample, 4).initialize();
        
        # Display initial centers.
        visualizer = cluster_visualizer();
        visualizer.append_cluster(sample);
        visualizer.append_cluster(centers, marker = '*', markersize = 10);
        visualizer.show();
        
        # Perform cluster analysis using K-Means algorithm with initial centers.
        kmeans_instance = kmeans(sample, centers);
        
        # Run clustering process and obtain result.
        kmeans_instance.process();
        clusters = kmeans_instance.get_clusters();
    @endcode
    
    """
    
    def __init__(self, data, amount_centers):
        """!
        @brief Creates K-Means++ center initializer instance.
        
        @param[in] data (list): List of points where each point is represented by list of coordinates.
        @param[in] amount_centers (unit): Amount of centers that should be initialized.
        
        """
        
        self.__data = data;
        self.__amount = amount_centers;
        
        if self.__amount <= 0:
            raise AttributeError("Amount of cluster centers should be at least 1.");
    
    
    def __get_uniform(self, probabilities):
        """!
        @brief Returns index in probabilities.
        
        @param[in] probabilities (list): List with segments in increasing sequence with val in [0, 1], for example, [0 0.1 0.2 0.3 1.0].
        
        """

        # Initialize return value
        res_idx = None;

        # Get random num in range [0, 1)
        random_num = random.random();

        # Find segment with  val1 < random_num < val2
        for _idx in range(len(probabilities)):
            if random_num < probabilities[_idx]:
                res_idx = _idx;
                break;

        if res_idx is None:
            # print('Random number : ', random_num);
            # print('Probabilities : ', probabilities);
            raise AttributeError("List 'probabilities' should contain 1 as the end of last segment(s)");

        return res_idx


    def __get_first_center(self):
        """!
        @brief Returns first center chosen uniformly at random from data.
        
        """

        # Initialize list with uniform probabilities
        probabilities = [];

        # Fill probability list
        for i in range(len(self.__data)):
            probabilities.append((i + 1) / len(self.__data));

        return self.__data[self.__get_uniform(probabilities)];


    def __calc_distance_to_nearest_center(self, data, centers):
        """!
        @brief Calculates distance from each data point to nearest center.
        
        @param[in] data (list): List of points where each point is represented by list of coordinates.
        @param[in] centers (list): List of points that represents centers and where each center is represented by list of coordinates.
        
        @return (list) List of distances to closest center for each data point.
        
        """

        # Initialize
        distance_data = [];

        # For each data point x, compute D(x), the distance between x and the nearest center
        for _point in data:

            # Min dist to nearest center
            min_dist = float('inf');

            # For each center
            for _center in centers:
                min_dist = min(min_dist, euclidean_distance(_center, _point));

            # Add distance to nearest center into result list
            distance_data.append(min_dist);

        return distance_data;


    def __get_sum_for_normalize_distance(self, distance):
        """!
        @brief Calculates square sum distance that is used for normalization.
        
        @param[in] distance (list): List of minimum distances from each point to nearest center.
        
        @return (float) Square sum distance.
        
        """

        sum_distance = 0.0;

        for _dist in distance:
            sum_distance += _dist ** 2;

        return sum_distance;


    def __set_last_value_to_one(self, probabilities):
        """!
        @brief Update probabilities for all points.
        @details All values of probability list equals to the last element are set to 1.
        
        @param[in] probabilities (list): List of minimum distances from each point to nearest center.
        
        """
        # Start from the last elem
        back_idx = - 1;

        # All values equal to the last elem should be set to 1
        last_val = probabilities[back_idx];

        # for all elements or if a elem not equal to the last elem
        for _idx in range(-1, -len(probabilities) - 1):

            if probabilities[back_idx] == last_val:
                probabilities[back_idx] = 1;
            else:
                break;


    def __get_probabilities_from_distance(self, distance):
        """!

        @brief Calculates probabilities from distance.
        @details Probabilities are filled by following expression:
        
        \f[
        p[i]=\frac{dist_{i}^2}{\sum_{i=0}^{N}dist_{i}};
        \f]
        
        @param[in] distance (list): List of minimum distances from each point to nearest center.
        
        @return (list) Weighted belonging probability for each point to its nearest center.
        
        """
        # Normalize distance
        sum_for_normalize = self.__get_sum_for_normalize_distance(distance);

        # Create list with probabilities

        # Initialize list with probabilities
        probabilities = [];

        # Variable to accumulate probabilities
        prev_value = 0;

        # Fill probabilities as :
        #   p[idx] = D[idx]^2 / sum_2
        #       where sum_2 = D[0]^2 + D[1]^2 + ...
        for _dist in distance:
            prev_value = (_dist ** 2) / sum_for_normalize + prev_value;
            probabilities.append(prev_value);

        # Set last value to 1
        self.__set_last_value_to_one(probabilities);

        return probabilities;


    def initialize(self):
        """!
        @brief Calculates initial centers using K-Means++ method.
        
        @return (list) List of initialized initial centers.
        
        """
        # Initialize result list by the first centers
        centers = [self.__get_first_center()];

        # For each next center
        for _ in range(1, self.__amount):

            # Calc Distance for each data
            distance_data = self.__calc_distance_to_nearest_center(data = self.__data, centers = centers);

            # Create list with probabilities
            probabilities = self.__get_probabilities_from_distance(distance_data);
            # print('Probability : ', probabilities);

            # Choose one new data point at random as a new center, using a weighted probability distribution
            ret_idx = self.__get_uniform(probabilities);

            # Add new center
            centers.append(self.__data[ret_idx]);

        # Is all centers are initialized
        return centers;

1			"""!
2
3			@brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
4			@details Implementations based on articles:
5			- K-Means++: The Advantages of careful seeding. D. Arthur, S. Vassilvitskii. 2007.
6
7			@authors Andrei Novikov, Aleksey Kukushkin ([email protected])
8			@date 2014-2017
9			@copyright GNU Public License
10
11			@see kmeans
12			@see xmeans
13
14			@cond GNU_PUBLIC_LICENSE
15			PyClustering is free software: you can redistribute it and/or modify
16			it under the terms of the GNU General Public License as published by
17			the Free Software Foundation, either version 3 of the License, or
18			(at your option) any later version.
19
20			PyClustering is distributed in the hope that it will be useful,
21			but WITHOUT ANY WARRANTY; without even the implied warranty of
22			MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23			GNU General Public License for more details.
24
25			You should have received a copy of the GNU General Public License
26			along with this program. If not, see <http://www.gnu.org/licenses/>.
27			@endcond
28
29			"""
30
31			import random;
32
33			from pyclustering.utils import euclidean_distance;
34
35
36			class random_center_initializer:
37			"""!
38			@brief Random center initializer is for generation specified amount of random of centers for specified data.
39
40			"""
41
42			def __init__(self, data, amount_centers):
43			"""!
44			@brief Creates instance of random center initializer.
45
46			@param[in] data (list): List of points where each point is represented by list of coordinates.
47			@param[in] amount_centers (unit): Amount of centers that should be initialized.
48
49			"""
50
51			self.__data = data;
52			self.__amount = amount_centers;
53
54			if self.__amount <= 0:
55			raise AttributeError("Amount of cluster centers should be at least 1.");
56
57
58			def initialize(self):
59			"""!
60			@brief Generates random centers in line with input parameters.
61
62			@return (list) List of centers where each center is represented by list of coordinates.
63
64			"""
65			return [ self.__create_center() for _ in range(self.__amount) ];
66
67
68			def __create_center(self):
69			"""!
70			@brief Generates and returns random center.
71
72			"""
73			return [ random.random() for _ in range(len(self.__data[0])) ];
74
75
76
77			class kmeans_plusplus_initializer:
78			"""!
79			@brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
80			@details Clustering results are depends on initial centers in case of K-Means algorithm and even in case of X-Means.
81			This method is used to find out optimal initial centers. There is an example of initial centers that were
82			calculated by the K-Means++ method:
83
84			@image html kmeans_plusplus_initializer_results.png
85
86			Code example:
87			@code
88			# Read data 'SampleSimple3' from Simple Sample collection.
89			sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
90
91			# Calculate initial centers using K-Means++ method.
92			centers = kmeans_plusplus_initializer(sample, 4).initialize();
93
94			# Display initial centers.
95			visualizer = cluster_visualizer();
96			visualizer.append_cluster(sample);
97			visualizer.append_cluster(centers, marker = '*', markersize = 10);
98			visualizer.show();
99
100			# Perform cluster analysis using K-Means algorithm with initial centers.
101			kmeans_instance = kmeans(sample, centers);
102
103			# Run clustering process and obtain result.
104			kmeans_instance.process();
105			clusters = kmeans_instance.get_clusters();
106			@endcode
107
108			"""
109
110			def __init__(self, data, amount_centers):
111			"""!
112			@brief Creates K-Means++ center initializer instance.
113
114			@param[in] data (list): List of points where each point is represented by list of coordinates.
115			@param[in] amount_centers (unit): Amount of centers that should be initialized.
116
117			"""
118
119			self.__data = data;
120			self.__amount = amount_centers;
121
122			if self.__amount <= 0:
123			raise AttributeError("Amount of cluster centers should be at least 1.");
124
125
126			def __get_uniform(self, probabilities):
127			"""!
128			@brief Returns index in probabilities.
129
130			@param[in] probabilities (list): List with segments in increasing sequence with val in [0, 1], for example, [0 0.1 0.2 0.3 1.0].
131
132			"""
133
134			# Initialize return value
135			res_idx = None;
136
137			# Get random num in range [0, 1)
138			random_num = random.random();
139
140			# Find segment with val1 < random_num < val2
141			for _idx in range(len(probabilities)):
142			if random_num < probabilities[_idx]:
143			res_idx = _idx;
144			break;
145
146			if res_idx is None:
147			# print('Random number : ', random_num);
148			# print('Probabilities : ', probabilities);
149			raise AttributeError("List 'probabilities' should contain 1 as the end of last segment(s)");
150
151			return res_idx
152
153
154			def __get_first_center(self):
155			"""!
156			@brief Returns first center chosen uniformly at random from data.
157
158			"""
159
160			# Initialize list with uniform probabilities
161			probabilities = [];
162
163			# Fill probability list
164			for i in range(len(self.__data)):
165			probabilities.append((i + 1) / len(self.__data));
166
167			return self.__data[self.__get_uniform(probabilities)];
168
169
170			def __calc_distance_to_nearest_center(self, data, centers):
171			"""!
172			@brief Calculates distance from each data point to nearest center.
173
174			@param[in] data (list): List of points where each point is represented by list of coordinates.
175			@param[in] centers (list): List of points that represents centers and where each center is represented by list of coordinates.
176
177			@return (list) List of distances to closest center for each data point.
178
179			"""
180
181			# Initialize
182			distance_data = [];
183
184			# For each data point x, compute D(x), the distance between x and the nearest center
185			for _point in data:
186
187			# Min dist to nearest center
188			min_dist = float('inf');
189
190			# For each center
191			for _center in centers:
192			min_dist = min(min_dist, euclidean_distance(_center, _point));
193
194			# Add distance to nearest center into result list
195			distance_data.append(min_dist);
196
197			return distance_data;
198
199
200			def __get_sum_for_normalize_distance(self, distance):
201			"""!
202			@brief Calculates square sum distance that is used for normalization.
203
204			@param[in] distance (list): List of minimum distances from each point to nearest center.
205
206			@return (float) Square sum distance.
207
208			"""
209
210			sum_distance = 0.0;
211
212			for _dist in distance:
213			sum_distance += _dist ** 2;
214
215			return sum_distance;
216
217
218			def __set_last_value_to_one(self, probabilities):
219			"""!
220			@brief Update probabilities for all points.
221			@details All values of probability list equals to the last element are set to 1.
222
223			@param[in] probabilities (list): List of minimum distances from each point to nearest center.
224
225			"""
226			# Start from the last elem
227			back_idx = - 1;
228
229			# All values equal to the last elem should be set to 1
230			last_val = probabilities[back_idx];
231
232			# for all elements or if a elem not equal to the last elem
233			for _idx in range(-1, -len(probabilities) - 1):
			0 ignored issues – show Unused Code introduced 2017-09-01 15:07 UTC by Report Bug Copy Issue Report The variable `_idx` seems to be unused. Loading history...
234			if probabilities[back_idx] == last_val:
235			probabilities[back_idx] = 1;
236			else:
237			break;
238
239
240			def __get_probabilities_from_distance(self, distance):
241			"""!
			0 ignored issues – show Bug introduced 2017-09-02 13:52 UTC by Report Bug Copy Issue Report A suspicious escape sequence `\s` was found. Did you maybe forget to add an `r` prefix? Escape sequences in Python are generally interpreted according to rules similar to standard C. Only if strings are prefixed with `r` or `R` are they interpreted as regular expressions. The escape sequence that was used indicates that you might have intended to write a regular expression. Learn more about the available escape sequences. in the Python documentation. Loading history...
242			@brief Calculates probabilities from distance.
243			@details Probabilities are filled by following expression:
244
245			\f[
246			p[i]=\frac{dist_{i}^2}{\sum_{i=0}^{N}dist_{i}};
247			\f]
248
249			@param[in] distance (list): List of minimum distances from each point to nearest center.
250
251			@return (list) Weighted belonging probability for each point to its nearest center.
252
253			"""
254			# Normalize distance
255			sum_for_normalize = self.__get_sum_for_normalize_distance(distance);
256
257			# Create list with probabilities
258
259			# Initialize list with probabilities
260			probabilities = [];
261
262			# Variable to accumulate probabilities
263			prev_value = 0;
264
265			# Fill probabilities as :
266			# p[idx] = D[idx]^2 / sum_2
267			# where sum_2 = D[0]^2 + D[1]^2 + ...
268			for _dist in distance:
269			prev_value = (_dist ** 2) / sum_for_normalize + prev_value;
270			probabilities.append(prev_value);
271
272			# Set last value to 1
273			self.__set_last_value_to_one(probabilities);
274
275			return probabilities;
276
277
278			def initialize(self):
279			"""!
280			@brief Calculates initial centers using K-Means++ method.
281
282			@return (list) List of initialized initial centers.
283
284			"""
285			# Initialize result list by the first centers
286			centers = [self.__get_first_center()];
287
288			# For each next center
289			for _ in range(1, self.__amount):
290
291			# Calc Distance for each data
292			distance_data = self.__calc_distance_to_nearest_center(data = self.__data, centers = centers);
293
294			# Create list with probabilities
295			probabilities = self.__get_probabilities_from_distance(distance_data);
296			# print('Probability : ', probabilities);
297
298			# Choose one new data point at random as a new center, using a weighted probability distribution
299			ret_idx = self.__get_uniform(probabilities);
300
301			# Add new center
302			centers.append(self.__data[ret_idx]);
303
304			# Is all centers are initialized
305			return centers;

annoviko / pyclustering

Push — master ( b7ad63...18ed0c )

kmeans_plusplus_initializer.initialize() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like