kmeans_plusplus_initializer - Code Metrics - Inspection of "#382: Feature to use candidates and farthest point..." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — 0.8.dev ( a514fe...60b2bb )

by Andrei

created 2018-02-21 19:28 UTC

kmeans_plusplus_initializer A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	177
Duplicated Lines	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
c	1
b	0
f	0
dl	0
loc	177
rs	10
wmc	21

7 Methods

Rating	Name	Size	Complexity
A	__get_next_center()	19	2
A	initialize()	18	2
A	__calculate_probabilities()	12	1
B	__check_parameters()	14	7
A	__init__()	18	1
B	__get_probable_center()	27	6
A	__calculate_shortest_distances()	18	2

"""!

@brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
@details Implementations based on articles:
         - K-Means++: The Advantages of careful seeding. D. Arthur, S. Vassilvitskii. 2007.
         
@authors Andrei Novikov, Aleksey Kukushkin ([email protected])
@date 2014-2018
@copyright GNU Public License

@see kmeans
@see xmeans

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy;
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
import random;

from pyclustering.utils import euclidean_distance;



class random_center_initializer:
    """!
    @brief Random center initializer is for generation specified amount of random of centers for specified data.
    
    """

    def __init__(self, data, amount_centers):
        """!
        @brief Creates instance of random center initializer.
        
        @param[in] data (list): List of points where each point is represented by list of coordinates.
        @param[in] amount_centers (unit): Amount of centers that should be initialized.
        
        """
        
        self.__data = data;
        self.__amount = amount_centers;

        if self.__amount <= 0:
            raise AttributeError("Amount of cluster centers should be at least 1.");


    def initialize(self):
        """!
        @brief Generates random centers in line with input parameters.
        
        @return (list) List of centers where each center is represented by list of coordinates.
        
        """
        return [ self.__create_center() for _ in range(self.__amount) ];


    def __create_center(self):
        """!
        @brief Generates and returns random center.
        
        """
        return [ random.random() for _ in range(len(self.__data[0])) ];



class kmeans_plusplus_initializer:
    """!
    @brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
    @details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on
              initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find
              out optimal initial centers. There is an example of initial centers that were calculated by the
              K-Means++ method:
    
    @image html kmeans_plusplus_initializer_results.png
    
    Code example:
    @code
        # Read data 'SampleSimple3' from Simple Sample collection.
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
        
        # Calculate initial centers using K-Means++ method.
        centers = kmeans_plusplus_initializer(sample, 4).initialize();
        
        # Display initial centers.
        visualizer = cluster_visualizer();
        visualizer.append_cluster(sample);
        visualizer.append_cluster(centers, marker = '*', markersize = 10);
        visualizer.show();
        
        # Perform cluster analysis using K-Means algorithm with initial centers.
        kmeans_instance = kmeans(sample, centers);
        
        # Run clustering process and obtain result.
        kmeans_instance.process();
        clusters = kmeans_instance.get_clusters();
    @endcode
    
    """


    ## Constant denotes that only points with highest probabilities should be considered as centers.
    FARTHEST_CENTER_CANDIDATE = "farthest";


    def __init__(self, data, amount_centers, amount_candidates = 2):
        """!
        @brief Creates K-Means++ center initializer instance.
        
        @param[in] data (array_like): List of points where each point is represented by list of coordinates.
        @param[in] amount_centers (uint): Amount of centers that should be initialized.
        @param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points (with the highest probability) should
                    be considered as centers then special constant should be used 'FARTHEST_CENTER_CANDIDATE'.

        @see FARTHEST_CENTER_CANDIDATE

        """
        
        self.__data = numpy.array(data);
        self.__amount = amount_centers;
        self.__candidates = amount_candidates;

        self.__check_parameters();


    def __check_parameters(self):
        """!
        @brief Checks input parameters of the algorithm and if something wrong then corresponding exception is thrown.

        """
        if (self.__amount <= 0) or (self.__amount > len(self.__data)):
            raise AttributeError("Amount of cluster centers should be at least 1 and should be less or equal to amount of points in data.");

        if self.__candidates != kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
            if (self.__candidates <= 0) or (self.__candidates > len(self.__data)):
                raise AttributeError("Amount of candidates centers should be at least 1 and should be less or equal to amount of points in data.");

        if len(self.__data) == 0:
            raise AttributeError("Data is empty.")


    def __calculate_shortest_distances(self, data, centers):
        """!
        @brief Calculates distance from each data point to nearest center.
        
        @param[in] data (numpy.array): Array of points for that initialization is performed.
        @param[in] centers (numpy.array): Array of points that represents centers.
        
        @return (numpy.array) List of distances to closest center for each data point.
        
        """

        dataset_differences = numpy.zeros((len(centers), len(data)));
        for index_center in range(len(centers)):
            dataset_differences[index_center] = numpy.sum(
                numpy.square(data - centers[index_center]), axis=1).T;

        shortest_distances = numpy.min(dataset_differences, axis=0);
        return shortest_distances;


    def __get_next_center(self, centers):
        """!
        @brief Calculates the next center for the data.

        @param[in] centers (array_like): Current initialized centers.

        @return (array_like) Next initialized center.

        """

        distances = self.__calculate_shortest_distances(data=self.__data, centers=centers);

        if self.__candidates == kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
            center_index = numpy.argmax(distances);
        else:
            probabilities = self.__calculate_probabilities(distances);
            center_index = self.__get_probable_center(distances, probabilities);

        return self.__data[center_index];


    def __calculate_probabilities(self, distances):
        """!
        @brief Calculates cumulative probabilities of being center of each point.

        @param[in] distances (array_like): Distances from each point to closest center.

        @return (array_like) Cumulative probabilities of being center of each point.

        """

        probabilities = distances / numpy.sum(distances);
        return numpy.cumsum(probabilities);


    def __get_probable_center(self, distances, probabilities):
        """!
        @brief Calculates the next probable center considering amount candidates.

        @param[in] distances (array_like): Distances from each point to closest center.
        @param[in] probabilities (array_like): Cumulative probabilities of being center of each point.

        @return (uint) Index point that is next initialized center.

        """

        index_best_candidate = -1;
        for _ in range(self.__candidates):
            candidate_probability = random.random();
            index_candidate = 0;


            for index_object in range(len(probabilities)):
                if candidate_probability < probabilities[index_object]:
                    index_candidate = index_object;
                    break;

            if index_best_candidate == -1:
                index_best_candidate = index_object;

            elif distances[index_best_candidate] < distances[index_object]:

                index_best_candidate = index_object;


        return index_best_candidate;


    def initialize(self):
        """!
        @brief Calculates initial centers using K-Means++ method.
        
        @return (list) List of initialized initial centers.
        
        """

        # Initialize result list by the first centers
        index_center = random.randint(0, len(self.__data) - 1);
        centers = [ self.__data[ index_center ] ];

        # For each next center
        for _ in range(1, self.__amount):
            next_center = self.__get_next_center(centers);
            centers.append(next_center);

        return centers;

Push — 0.8.dev ( a514fe...60b2bb )

kmeans_plusplus_initializer A

Complexity

Size/Duplication

Importance

7 Methods

1. Missing Dependencies

2. Missing init.py files

1			"""!
2
3			@brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
4			@details Implementations based on articles:
5			- K-Means++: The Advantages of careful seeding. D. Arthur, S. Vassilvitskii. 2007.
6
7			@authors Andrei Novikov, Aleksey Kukushkin ([email protected])
8			@date 2014-2018
9			@copyright GNU Public License
10
11			@see kmeans
12			@see xmeans
13
14			@cond GNU_PUBLIC_LICENSE
15			PyClustering is free software: you can redistribute it and/or modify
16			it under the terms of the GNU General Public License as published by
17			the Free Software Foundation, either version 3 of the License, or
18			(at your option) any later version.
19
20			PyClustering is distributed in the hope that it will be useful,
21			but WITHOUT ANY WARRANTY; without even the implied warranty of
22			MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23			GNU General Public License for more details.
24
25			You should have received a copy of the GNU General Public License
26			along with this program. If not, see <http://www.gnu.org/licenses/>.
27			@endcond
28
29			"""
30
31
32			import numpy;
			0 ignored issues – show Configuration introduced 2018-02-21 18:56 UTC by Report Bug Copy Issue Report The import `numpy` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
33			import random;
34
35			from pyclustering.utils import euclidean_distance;
			0 ignored issues – show Unused Code introduced 2017-10-27 12:21 UTC by Report Bug Copy Issue Report Unused euclidean_distance imported from pyclustering.utils Loading history...
36
37
38			class random_center_initializer:
39			"""!
40			@brief Random center initializer is for generation specified amount of random of centers for specified data.
41
42			"""
43
44			def __init__(self, data, amount_centers):
45			"""!
46			@brief Creates instance of random center initializer.
47
48			@param[in] data (list): List of points where each point is represented by list of coordinates.
49			@param[in] amount_centers (unit): Amount of centers that should be initialized.
50
51			"""
52
53			self.__data = data;
54			self.__amount = amount_centers;
55
56			if self.__amount <= 0:
57			raise AttributeError("Amount of cluster centers should be at least 1.");
58
59
60			def initialize(self):
61			"""!
62			@brief Generates random centers in line with input parameters.
63
64			@return (list) List of centers where each center is represented by list of coordinates.
65
66			"""
67			return [ self.__create_center() for _ in range(self.__amount) ];
68
69
70			def __create_center(self):
71			"""!
72			@brief Generates and returns random center.
73
74			"""
75			return [ random.random() for _ in range(len(self.__data[0])) ];
76
77
78
79			class kmeans_plusplus_initializer:
80			"""!
81			@brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
82			@details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on
83			initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find
84			out optimal initial centers. There is an example of initial centers that were calculated by the
85			K-Means++ method:
86
87			@image html kmeans_plusplus_initializer_results.png
88
89			Code example:
90			@code
91			# Read data 'SampleSimple3' from Simple Sample collection.
92			sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
93
94			# Calculate initial centers using K-Means++ method.
95			centers = kmeans_plusplus_initializer(sample, 4).initialize();
96
97			# Display initial centers.
98			visualizer = cluster_visualizer();
99			visualizer.append_cluster(sample);
100			visualizer.append_cluster(centers, marker = '*', markersize = 10);
101			visualizer.show();
102
103			# Perform cluster analysis using K-Means algorithm with initial centers.
104			kmeans_instance = kmeans(sample, centers);
105
106			# Run clustering process and obtain result.
107			kmeans_instance.process();
108			clusters = kmeans_instance.get_clusters();
109			@endcode
110
111			"""
112
113
114			## Constant denotes that only points with highest probabilities should be considered as centers.
115			FARTHEST_CENTER_CANDIDATE = "farthest";
116
117
118			def __init__(self, data, amount_centers, amount_candidates = 2):
119			"""!
120			@brief Creates K-Means++ center initializer instance.
121
122			@param[in] data (array_like): List of points where each point is represented by list of coordinates.
123			@param[in] amount_centers (uint): Amount of centers that should be initialized.
124			@param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points (with the highest probability) should
125			be considered as centers then special constant should be used 'FARTHEST_CENTER_CANDIDATE'.
126
127			@see FARTHEST_CENTER_CANDIDATE
128
129			"""
130
131			self.__data = numpy.array(data);
132			self.__amount = amount_centers;
133			self.__candidates = amount_candidates;
134
135			self.__check_parameters();
136
137
138			def __check_parameters(self):
139			"""!
140			@brief Checks input parameters of the algorithm and if something wrong then corresponding exception is thrown.
141
142			"""
143			if (self.__amount <= 0) or (self.__amount > len(self.__data)):
144			raise AttributeError("Amount of cluster centers should be at least 1 and should be less or equal to amount of points in data.");
145
146			if self.__candidates != kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
147			if (self.__candidates <= 0) or (self.__candidates > len(self.__data)):
148			raise AttributeError("Amount of candidates centers should be at least 1 and should be less or equal to amount of points in data.");
149
150			if len(self.__data) == 0:
151			raise AttributeError("Data is empty.")
152
153
154			def __calculate_shortest_distances(self, data, centers):
155			"""!
156			@brief Calculates distance from each data point to nearest center.
157
158			@param[in] data (numpy.array): Array of points for that initialization is performed.
159			@param[in] centers (numpy.array): Array of points that represents centers.
160
161			@return (numpy.array) List of distances to closest center for each data point.
162
163			"""
164
165			dataset_differences = numpy.zeros((len(centers), len(data)));
166			for index_center in range(len(centers)):
167			dataset_differences[index_center] = numpy.sum(
168			numpy.square(data - centers[index_center]), axis=1).T;
169
170			shortest_distances = numpy.min(dataset_differences, axis=0);
171			return shortest_distances;
172
173
174			def __get_next_center(self, centers):
175			"""!
176			@brief Calculates the next center for the data.
177
178			@param[in] centers (array_like): Current initialized centers.
179
180			@return (array_like) Next initialized center.
181
182			"""
183
184			distances = self.__calculate_shortest_distances(data=self.__data, centers=centers);
185
186			if self.__candidates == kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
187			center_index = numpy.argmax(distances);
188			else:
189			probabilities = self.__calculate_probabilities(distances);
190			center_index = self.__get_probable_center(distances, probabilities);
191
192			return self.__data[center_index];
193
194
195			def __calculate_probabilities(self, distances):
196			"""!
197			@brief Calculates cumulative probabilities of being center of each point.
198
199			@param[in] distances (array_like): Distances from each point to closest center.
200
201			@return (array_like) Cumulative probabilities of being center of each point.
202
203			"""
204
205			probabilities = distances / numpy.sum(distances);
206			return numpy.cumsum(probabilities);
207
208
209			def __get_probable_center(self, distances, probabilities):
210			"""!
211			@brief Calculates the next probable center considering amount candidates.
212
213			@param[in] distances (array_like): Distances from each point to closest center.
214			@param[in] probabilities (array_like): Cumulative probabilities of being center of each point.
215
216			@return (uint) Index point that is next initialized center.
217
218			"""
219
220			index_best_candidate = -1;
221			for _ in range(self.__candidates):
222			candidate_probability = random.random();
223			index_candidate = 0;
			0 ignored issues – show Unused Code introduced 2018-02-21 19:29 UTC by Report Bug Copy Issue Report The variable `index_candidate` seems to be unused. Loading history...
224
225			for index_object in range(len(probabilities)):
226			if candidate_probability < probabilities[index_object]:
227			index_candidate = index_object;
228			break;
229
230			if index_best_candidate == -1:
231			index_best_candidate = index_object;
			0 ignored issues – show Bug introduced 2018-02-21 19:29 UTC by Report Bug Copy Issue Report The loop variable `index_object` might not be defined here. Loading history...
232			elif distances[index_best_candidate] < distances[index_object]:
			0 ignored issues – show Bug introduced 2018-02-21 19:29 UTC by Report Bug Copy Issue Report The loop variable `index_object` might not be defined here. Loading history...
233			index_best_candidate = index_object;
			0 ignored issues – show Bug introduced 2018-02-21 19:29 UTC by Report Bug Copy Issue Report The loop variable `index_object` might not be defined here. Loading history...
234
235			return index_best_candidate;
236
237
238			def initialize(self):
239			"""!
240			@brief Calculates initial centers using K-Means++ method.
241
242			@return (list) List of initialized initial centers.
243
244			"""
245
246			# Initialize result list by the first centers
247			index_center = random.randint(0, len(self.__data) - 1);
248			centers = [ self.__data[ index_center ] ];
249
250			# For each next center
251			for _ in range(1, self.__amount):
252			next_center = self.__get_next_center(centers);
253			centers.append(next_center);
254
255			return centers;

annoviko / pyclustering

Push — 0.8.dev ( a514fe...60b2bb )

kmeans_plusplus_initializer A

Complexity

Size/Duplication

Importance

7 Methods

1. Missing Dependencies

2. Missing __init__.py files

Duplication Side-by-Side

Filter issues like

2. Missing init.py files