kmeans_plusplus_initializer.initialize() - Code Metrics - Inspection of "#382: Optimization for python version of K-Means++..." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — 0.8.dev ( ce998b...a514fe )

by Andrei

created 2018-02-21 18:55 UTC

kmeans_plusplus_initializer.initialize() A

↳ Parent: kmeans_plusplus_initializer

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
dl	0
loc	18
rs	9.4285
c	0
b	0
f	0

"""!

@brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
@details Implementations based on articles:
         - K-Means++: The Advantages of careful seeding. D. Arthur, S. Vassilvitskii. 2007.
         
@authors Andrei Novikov, Aleksey Kukushkin ([email protected])
@date 2014-2018
@copyright GNU Public License

@see kmeans
@see xmeans

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import math;

import numpy;
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
import random;

from pyclustering.utils import euclidean_distance;



class random_center_initializer:
    """!
    @brief Random center initializer is for generation specified amount of random of centers for specified data.
    
    """

    def __init__(self, data, amount_centers):
        """!
        @brief Creates instance of random center initializer.
        
        @param[in] data (list): List of points where each point is represented by list of coordinates.
        @param[in] amount_centers (unit): Amount of centers that should be initialized.
        
        """
        
        self.__data = data;
        self.__amount = amount_centers;

        if self.__amount <= 0:
            raise AttributeError("Amount of cluster centers should be at least 1.");


    def initialize(self):
        """!
        @brief Generates random centers in line with input parameters.
        
        @return (list) List of centers where each center is represented by list of coordinates.
        
        """
        return [ self.__create_center() for _ in range(self.__amount) ];


    def __create_center(self):
        """!
        @brief Generates and returns random center.
        
        """
        return [ random.random() for _ in range(len(self.__data[0])) ];



class kmeans_plusplus_initializer:
    """!
    @brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
    @details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on
              initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find
              out optimal initial centers. There is an example of initial centers that were calculated by the
              K-Means++ method:
    
    @image html kmeans_plusplus_initializer_results.png
    
    Code example:
    @code
        # Read data 'SampleSimple3' from Simple Sample collection.
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
        
        # Calculate initial centers using K-Means++ method.
        centers = kmeans_plusplus_initializer(sample, 4).initialize();
        
        # Display initial centers.
        visualizer = cluster_visualizer();
        visualizer.append_cluster(sample);
        visualizer.append_cluster(centers, marker = '*', markersize = 10);
        visualizer.show();
        
        # Perform cluster analysis using K-Means algorithm with initial centers.
        kmeans_instance = kmeans(sample, centers);
        
        # Run clustering process and obtain result.
        kmeans_instance.process();
        clusters = kmeans_instance.get_clusters();
    @endcode
    
    """


    ## Constant denotes that only points with highest probabilities should be considered as centers.
    FARTHEST_CENTER_CANDIDATE = "farthest";


    def __init__(self, data, amount_centers, amount_candidates = 2):
        """!
        @brief Creates K-Means++ center initializer instance.
        
        @param[in] data (array_like): List of points where each point is represented by list of coordinates.
        @param[in] amount_centers (uint): Amount of centers that should be initialized.
        @param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points (with the highest probability) should
                    be considered as centers then special constant should be used 'FARTHEST_CENTER_CANDIDATE'.

        @see FARTHEST_CENTER_CANDIDATE

        """
        
        self.__data = numpy.array(data);
        self.__amount = amount_centers;
        self.__candidates = amount_candidates;

        self.__check_parameters();


    def __check_parameters(self):
        """!
        @brief Checks input parameters of the algorithm and if something wrong then corresponding exception is thrown.

        """
        if (self.__amount <= 0) or (self.__amount > len(self.__data)):
            raise AttributeError("Amount of cluster centers should be at least 1 and should be less or equal to amount of points in data.");

        if self.__candidates != kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
            if (self.__candidates <= 0) or (self.__candidates > len(self.__data)):
                raise AttributeError("Amount of candidates centers should be at least 1 and should be less or equal to amount of points in data.");

        if len(self.__data) == 0:
            raise AttributeError("Data is empty.")


    def __calc_distance_to_nearest_center(self, data, centers):
        """!
        @brief Calculates distance from each data point to nearest center.
        
        @param[in] data (numpy.array): Array of points for that initialization is performed.
        @param[in] centers (numpy.array): Array of points that represents centers.
        
        @return (numpy.array) List of distances to closest center for each data point.
        
        """

        dataset_differences = numpy.zeros((len(centers), len(data)));
        for index_center in range(len(centers)):
            dataset_differences[index_center] = numpy.sum(
                numpy.square(data - centers[index_center]), axis=1).T;

        shortest_distances = numpy.min(dataset_differences, axis=0);
        return shortest_distances;


    def __get_next_center(self, centers):
        """!
        @brief Calculates the next center for the data.

        @param[in] centers (list): Current initialized centers.

        @return (list) Next initialized center.

        """

        distance_data = self.__calc_distance_to_nearest_center(data=self.__data, centers=centers);
        center_index = numpy.argmax(distance_data);

        return self.__data[center_index];


    def initialize(self):
        """!
        @brief Calculates initial centers using K-Means++ method.
        
        @return (list) List of initialized initial centers.
        
        """

        # Initialize result list by the first centers
        index_center = random.randint(0, len(self.__data) - 1);
        centers = [ self.__data[ index_center ] ];

        # For each next center
        for _ in range(1, self.__amount):
            next_center = self.__get_next_center(centers);
            centers.append(next_center);

        return centers;

Push — 0.8.dev ( ce998b...a514fe )

kmeans_plusplus_initializer.initialize() A

Complexity

Size

Duplication

Importance

1. Missing Dependencies

2. Missing init.py files

1			"""!
2
3			@brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
4			@details Implementations based on articles:
5			- K-Means++: The Advantages of careful seeding. D. Arthur, S. Vassilvitskii. 2007.
6
7			@authors Andrei Novikov, Aleksey Kukushkin ([email protected])
8			@date 2014-2018
9			@copyright GNU Public License
10
11			@see kmeans
12			@see xmeans
13
14			@cond GNU_PUBLIC_LICENSE
15			PyClustering is free software: you can redistribute it and/or modify
16			it under the terms of the GNU General Public License as published by
17			the Free Software Foundation, either version 3 of the License, or
18			(at your option) any later version.
19
20			PyClustering is distributed in the hope that it will be useful,
21			but WITHOUT ANY WARRANTY; without even the implied warranty of
22			MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23			GNU General Public License for more details.
24
25			You should have received a copy of the GNU General Public License
26			along with this program. If not, see <http://www.gnu.org/licenses/>.
27			@endcond
28
29			"""
30
31
32			import math;
			0 ignored issues – show Unused Code introduced 2018-02-21 18:56 UTC by Report Bug Copy Issue Report The import `math` seems to be unused. Loading history...
33			import numpy;
			0 ignored issues – show Configuration introduced 2018-02-21 18:56 UTC by Report Bug Copy Issue Report The import `numpy` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
34			import random;
35
36			from pyclustering.utils import euclidean_distance;
			0 ignored issues – show Unused Code introduced 2017-10-27 12:21 UTC by Report Bug Copy Issue Report Unused euclidean_distance imported from pyclustering.utils Loading history...
37
38
39			class random_center_initializer:
40			"""!
41			@brief Random center initializer is for generation specified amount of random of centers for specified data.
42
43			"""
44
45			def __init__(self, data, amount_centers):
46			"""!
47			@brief Creates instance of random center initializer.
48
49			@param[in] data (list): List of points where each point is represented by list of coordinates.
50			@param[in] amount_centers (unit): Amount of centers that should be initialized.
51
52			"""
53
54			self.__data = data;
55			self.__amount = amount_centers;
56
57			if self.__amount <= 0:
58			raise AttributeError("Amount of cluster centers should be at least 1.");
59
60
61			def initialize(self):
62			"""!
63			@brief Generates random centers in line with input parameters.
64
65			@return (list) List of centers where each center is represented by list of coordinates.
66
67			"""
68			return [ self.__create_center() for _ in range(self.__amount) ];
69
70
71			def __create_center(self):
72			"""!
73			@brief Generates and returns random center.
74
75			"""
76			return [ random.random() for _ in range(len(self.__data[0])) ];
77
78
79
80			class kmeans_plusplus_initializer:
81			"""!
82			@brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
83			@details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on
84			initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find
85			out optimal initial centers. There is an example of initial centers that were calculated by the
86			K-Means++ method:
87
88			@image html kmeans_plusplus_initializer_results.png
89
90			Code example:
91			@code
92			# Read data 'SampleSimple3' from Simple Sample collection.
93			sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
94
95			# Calculate initial centers using K-Means++ method.
96			centers = kmeans_plusplus_initializer(sample, 4).initialize();
97
98			# Display initial centers.
99			visualizer = cluster_visualizer();
100			visualizer.append_cluster(sample);
101			visualizer.append_cluster(centers, marker = '*', markersize = 10);
102			visualizer.show();
103
104			# Perform cluster analysis using K-Means algorithm with initial centers.
105			kmeans_instance = kmeans(sample, centers);
106
107			# Run clustering process and obtain result.
108			kmeans_instance.process();
109			clusters = kmeans_instance.get_clusters();
110			@endcode
111
112			"""
113
114
115			## Constant denotes that only points with highest probabilities should be considered as centers.
116			FARTHEST_CENTER_CANDIDATE = "farthest";
117
118
119			def __init__(self, data, amount_centers, amount_candidates = 2):
120			"""!
121			@brief Creates K-Means++ center initializer instance.
122
123			@param[in] data (array_like): List of points where each point is represented by list of coordinates.
124			@param[in] amount_centers (uint): Amount of centers that should be initialized.
125			@param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points (with the highest probability) should
126			be considered as centers then special constant should be used 'FARTHEST_CENTER_CANDIDATE'.
127
128			@see FARTHEST_CENTER_CANDIDATE
129
130			"""
131
132			self.__data = numpy.array(data);
133			self.__amount = amount_centers;
134			self.__candidates = amount_candidates;
135
136			self.__check_parameters();
137
138
139			def __check_parameters(self):
140			"""!
141			@brief Checks input parameters of the algorithm and if something wrong then corresponding exception is thrown.
142
143			"""
144			if (self.__amount <= 0) or (self.__amount > len(self.__data)):
145			raise AttributeError("Amount of cluster centers should be at least 1 and should be less or equal to amount of points in data.");
146
147			if self.__candidates != kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
148			if (self.__candidates <= 0) or (self.__candidates > len(self.__data)):
149			raise AttributeError("Amount of candidates centers should be at least 1 and should be less or equal to amount of points in data.");
150
151			if len(self.__data) == 0:
152			raise AttributeError("Data is empty.")
153
154
155			def __calc_distance_to_nearest_center(self, data, centers):
156			"""!
157			@brief Calculates distance from each data point to nearest center.
158
159			@param[in] data (numpy.array): Array of points for that initialization is performed.
160			@param[in] centers (numpy.array): Array of points that represents centers.
161
162			@return (numpy.array) List of distances to closest center for each data point.
163
164			"""
165
166			dataset_differences = numpy.zeros((len(centers), len(data)));
167			for index_center in range(len(centers)):
168			dataset_differences[index_center] = numpy.sum(
169			numpy.square(data - centers[index_center]), axis=1).T;
170
171			shortest_distances = numpy.min(dataset_differences, axis=0);
172			return shortest_distances;
173
174
175			def __get_next_center(self, centers):
176			"""!
177			@brief Calculates the next center for the data.
178
179			@param[in] centers (list): Current initialized centers.
180
181			@return (list) Next initialized center.
182
183			"""
184
185			distance_data = self.__calc_distance_to_nearest_center(data=self.__data, centers=centers);
186			center_index = numpy.argmax(distance_data);
187
188			return self.__data[center_index];
189
190
191			def initialize(self):
192			"""!
193			@brief Calculates initial centers using K-Means++ method.
194
195			@return (list) List of initialized initial centers.
196
197			"""
198
199			# Initialize result list by the first centers
200			index_center = random.randint(0, len(self.__data) - 1);
201			centers = [ self.__data[ index_center ] ];
202
203			# For each next center
204			for _ in range(1, self.__amount):
205			next_center = self.__get_next_center(centers);
206			centers.append(next_center);
207
208			return centers;

annoviko / pyclustering

Push — 0.8.dev ( ce998b...a514fe )

kmeans_plusplus_initializer.initialize() A

Complexity

Size

Duplication

Importance

1. Missing Dependencies

2. Missing __init__.py files

Duplication Side-by-Side

Filter issues like

2. Missing init.py files