kmeans_plusplus_initializer.initialize() - Code Metrics - Inspection of "#382: X-Means uses K-Means++ for region initial ce..." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — 0.8.dev ( 56e704...0601a0 )

by Andrei

created 2018-02-18 19:44 UTC

kmeans_plusplus_initializer.initialize() B

↳ Parent: kmeans_plusplus_initializer

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	4
dl	0
loc	26
rs	8.5806
c	0
b	0
f	0

"""!

@brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
@details Implementations based on articles:
         - K-Means++: The Advantages of careful seeding. D. Arthur, S. Vassilvitskii. 2007.
         
@authors Andrei Novikov, Aleksey Kukushkin ([email protected])
@date 2014-2018
@copyright GNU Public License

@see kmeans
@see xmeans

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""

import random;
import copy


from pyclustering.utils import euclidean_distance;


class random_center_initializer:
    """!
    @brief Random center initializer is for generation specified amount of random of centers for specified data.
    
    """

    def __init__(self, data, amount_centers):
        """!
        @brief Creates instance of random center initializer.
        
        @param[in] data (list): List of points where each point is represented by list of coordinates.
        @param[in] amount_centers (unit): Amount of centers that should be initialized.
        
        """
        
        self.__data = data;
        self.__amount = amount_centers;

        if self.__amount <= 0:
            raise AttributeError("Amount of cluster centers should be at least 1.");


    def initialize(self):
        """!
        @brief Generates random centers in line with input parameters.
        
        @return (list) List of centers where each center is represented by list of coordinates.
        
        """
        return [ self.__create_center() for _ in range(self.__amount) ];


    def __create_center(self):
        """!
        @brief Generates and returns random center.
        
        """
        return [ random.random() for _ in range(len(self.__data[0])) ];



class kmeans_plusplus_initializer:
    """!
    @brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
    @details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on
              initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find
              out optimal initial centers. There is an example of initial centers that were calculated by the
              K-Means++ method:
    
    @image html kmeans_plusplus_initializer_results.png
    
    Code example:
    @code
        # Read data 'SampleSimple3' from Simple Sample collection.
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
        
        # Calculate initial centers using K-Means++ method.
        centers = kmeans_plusplus_initializer(sample, 4).initialize();
        
        # Display initial centers.
        visualizer = cluster_visualizer();
        visualizer.append_cluster(sample);
        visualizer.append_cluster(centers, marker = '*', markersize = 10);
        visualizer.show();
        
        # Perform cluster analysis using K-Means algorithm with initial centers.
        kmeans_instance = kmeans(sample, centers);
        
        # Run clustering process and obtain result.
        kmeans_instance.process();
        clusters = kmeans_instance.get_clusters();
    @endcode
    
    """
    
    def __init__(self, data, amount_centers):
        """!
        @brief Creates K-Means++ center initializer instance.
        
        @param[in] data (list): List of points where each point is represented by list of coordinates.
        @param[in] amount_centers (unit): Amount of centers that should be initialized.
        
        """
        
        self.__data = data;
        self.__amount = amount_centers;
        
        if self.__amount <= 0:
            raise AttributeError("Amount of cluster centers should be at least 1.");


    def __calc_distance_to_nearest_center(self, data, centers):
        """!
        @brief Calculates distance from each data point to nearest center.
        
        @param[in] data (list): List of points where each point is represented by list of coordinates.
        @param[in] centers (list): List of points that represents centers and where each center is represented by list of coordinates.
        
        @return (list) List of distances to closest center for each data point.
        
        """

        # Initialize
        distance_data = [];

        # For each data point x, compute D(x), the distance between x and the nearest center
        for _point in data:

            # Min dist to nearest center
            min_dist = float('inf');

            # For each center
            for _center in centers:
                min_dist = min(min_dist, euclidean_distance(_center, _point));

            # Add distance to nearest center into result list
            distance_data.append(min_dist);

        return distance_data;


    def initialize(self):
        """!
        @brief Calculates initial centers using K-Means++ method.
        
        @return (list) List of initialized initial centers.
        
        """
        # Initialize result list by the first centers
        index_center = random.randint(0, len(self.__data) - 1);
        centers = [ self.__data[ index_center ] ];

        # For each next center
        for _ in range(1, self.__amount):
            # Calc Distance for each data
            distance_data = self.__calc_distance_to_nearest_center(data = self.__data, centers = centers);

            center_index = 0;
            longest_distance = 0.0;
            for index_distance in range(len(distance_data)):
                if (longest_distance < distance_data[index_distance]):
                    center_index = index_distance;
                    longest_distance = distance_data[index_distance];

            centers.append(self.__data[center_index]);

        return centers;

1			"""!
2
3			@brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
4			@details Implementations based on articles:
5			- K-Means++: The Advantages of careful seeding. D. Arthur, S. Vassilvitskii. 2007.
6
7			@authors Andrei Novikov, Aleksey Kukushkin ([email protected])
8			@date 2014-2018
9			@copyright GNU Public License
10
11			@see kmeans
12			@see xmeans
13
14			@cond GNU_PUBLIC_LICENSE
15			PyClustering is free software: you can redistribute it and/or modify
16			it under the terms of the GNU General Public License as published by
17			the Free Software Foundation, either version 3 of the License, or
18			(at your option) any later version.
19
20			PyClustering is distributed in the hope that it will be useful,
21			but WITHOUT ANY WARRANTY; without even the implied warranty of
22			MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23			GNU General Public License for more details.
24
25			You should have received a copy of the GNU General Public License
26			along with this program. If not, see <http://www.gnu.org/licenses/>.
27			@endcond
28
29			"""
30
31			import random;
32			import copy
			0 ignored issues – show Unused Code introduced 2017-09-20 09:46 UTC by Report Bug Copy Issue Report The import `copy` seems to be unused. Loading history...
33
34			from pyclustering.utils import euclidean_distance;
35
36
37			class random_center_initializer:
38			"""!
39			@brief Random center initializer is for generation specified amount of random of centers for specified data.
40
41			"""
42
43			def __init__(self, data, amount_centers):
44			"""!
45			@brief Creates instance of random center initializer.
46
47			@param[in] data (list): List of points where each point is represented by list of coordinates.
48			@param[in] amount_centers (unit): Amount of centers that should be initialized.
49
50			"""
51
52			self.__data = data;
53			self.__amount = amount_centers;
54
55			if self.__amount <= 0:
56			raise AttributeError("Amount of cluster centers should be at least 1.");
57
58
59			def initialize(self):
60			"""!
61			@brief Generates random centers in line with input parameters.
62
63			@return (list) List of centers where each center is represented by list of coordinates.
64
65			"""
66			return [ self.__create_center() for _ in range(self.__amount) ];
67
68
69			def __create_center(self):
70			"""!
71			@brief Generates and returns random center.
72
73			"""
74			return [ random.random() for _ in range(len(self.__data[0])) ];
75
76
77
78			class kmeans_plusplus_initializer:
79			"""!
80			@brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
81			@details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on
82			initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find
83			out optimal initial centers. There is an example of initial centers that were calculated by the
84			K-Means++ method:
85
86			@image html kmeans_plusplus_initializer_results.png
87
88			Code example:
89			@code
90			# Read data 'SampleSimple3' from Simple Sample collection.
91			sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
92
93			# Calculate initial centers using K-Means++ method.
94			centers = kmeans_plusplus_initializer(sample, 4).initialize();
95
96			# Display initial centers.
97			visualizer = cluster_visualizer();
98			visualizer.append_cluster(sample);
99			visualizer.append_cluster(centers, marker = '*', markersize = 10);
100			visualizer.show();
101
102			# Perform cluster analysis using K-Means algorithm with initial centers.
103			kmeans_instance = kmeans(sample, centers);
104
105			# Run clustering process and obtain result.
106			kmeans_instance.process();
107			clusters = kmeans_instance.get_clusters();
108			@endcode
109
110			"""
111
112			def __init__(self, data, amount_centers):
113			"""!
114			@brief Creates K-Means++ center initializer instance.
115
116			@param[in] data (list): List of points where each point is represented by list of coordinates.
117			@param[in] amount_centers (unit): Amount of centers that should be initialized.
118
119			"""
120
121			self.__data = data;
122			self.__amount = amount_centers;
123
124			if self.__amount <= 0:
125			raise AttributeError("Amount of cluster centers should be at least 1.");
126
127
128			def __calc_distance_to_nearest_center(self, data, centers):
129			"""!
130			@brief Calculates distance from each data point to nearest center.
131
132			@param[in] data (list): List of points where each point is represented by list of coordinates.
133			@param[in] centers (list): List of points that represents centers and where each center is represented by list of coordinates.
134
135			@return (list) List of distances to closest center for each data point.
136
137			"""
138
139			# Initialize
140			distance_data = [];
141
142			# For each data point x, compute D(x), the distance between x and the nearest center
143			for _point in data:
144
145			# Min dist to nearest center
146			min_dist = float('inf');
147
148			# For each center
149			for _center in centers:
150			min_dist = min(min_dist, euclidean_distance(_center, _point));
151
152			# Add distance to nearest center into result list
153			distance_data.append(min_dist);
154
155			return distance_data;
156
157
158			def initialize(self):
159			"""!
160			@brief Calculates initial centers using K-Means++ method.
161
162			@return (list) List of initialized initial centers.
163
164			"""
165			# Initialize result list by the first centers
166			index_center = random.randint(0, len(self.__data) - 1);
167			centers = [ self.__data[ index_center ] ];
168
169			# For each next center
170			for _ in range(1, self.__amount):
171			# Calc Distance for each data
172			distance_data = self.__calc_distance_to_nearest_center(data = self.__data, centers = centers);
173
174			center_index = 0;
175			longest_distance = 0.0;
176			for index_distance in range(len(distance_data)):
177			if (longest_distance < distance_data[index_distance]):
178			center_index = index_distance;
179			longest_distance = distance_data[index_distance];
180
181			centers.append(self.__data[center_index]);
182
183			return centers;

annoviko / pyclustering

Push — 0.8.dev ( 56e704...0601a0 )

kmeans_plusplus_initializer.initialize() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like