Completed
Push — 0.8.dev ( ce998b...a514fe )
by Andrei
01:02
created

kmeans_plusplus_initializer.__check_parameters()   B

Complexity

Conditions 7

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
dl 0
loc 14
rs 7.3333
c 0
b 0
f 0
1
"""!
2
3
@brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
4
@details Implementations based on articles:
5
         - K-Means++: The Advantages of careful seeding. D. Arthur, S. Vassilvitskii. 2007.
6
         
7
@authors Andrei Novikov, Aleksey Kukushkin ([email protected])
8
@date 2014-2018
9
@copyright GNU Public License
10
11
@see kmeans
12
@see xmeans
13
14
@cond GNU_PUBLIC_LICENSE
15
    PyClustering is free software: you can redistribute it and/or modify
16
    it under the terms of the GNU General Public License as published by
17
    the Free Software Foundation, either version 3 of the License, or
18
    (at your option) any later version.
19
    
20
    PyClustering is distributed in the hope that it will be useful,
21
    but WITHOUT ANY WARRANTY; without even the implied warranty of
22
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
    GNU General Public License for more details.
24
    
25
    You should have received a copy of the GNU General Public License
26
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
27
@endcond
28
29
"""
30
31
32
import math;
0 ignored issues
show
Unused Code introduced by
The import math seems to be unused.
Loading history...
33
import numpy;
0 ignored issues
show
Configuration introduced by
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
34
import random;
35
36
from pyclustering.utils import euclidean_distance;
0 ignored issues
show
Unused Code introduced by
Unused euclidean_distance imported from pyclustering.utils
Loading history...
37
38
39
class random_center_initializer:
40
    """!
41
    @brief Random center initializer is for generation specified amount of random of centers for specified data.
42
    
43
    """
44
45
    def __init__(self, data, amount_centers):
46
        """!
47
        @brief Creates instance of random center initializer.
48
        
49
        @param[in] data (list): List of points where each point is represented by list of coordinates.
50
        @param[in] amount_centers (unit): Amount of centers that should be initialized.
51
        
52
        """
53
        
54
        self.__data = data;
55
        self.__amount = amount_centers;
56
57
        if self.__amount <= 0:
58
            raise AttributeError("Amount of cluster centers should be at least 1.");
59
60
61
    def initialize(self):
62
        """!
63
        @brief Generates random centers in line with input parameters.
64
        
65
        @return (list) List of centers where each center is represented by list of coordinates.
66
        
67
        """
68
        return [ self.__create_center() for _ in range(self.__amount) ];
69
70
71
    def __create_center(self):
72
        """!
73
        @brief Generates and returns random center.
74
        
75
        """
76
        return [ random.random() for _ in range(len(self.__data[0])) ];
77
78
79
80
class kmeans_plusplus_initializer:
81
    """!
82
    @brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
83
    @details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on
84
              initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find
85
              out optimal initial centers. There is an example of initial centers that were calculated by the
86
              K-Means++ method:
87
    
88
    @image html kmeans_plusplus_initializer_results.png
89
    
90
    Code example:
91
    @code
92
        # Read data 'SampleSimple3' from Simple Sample collection.
93
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
94
        
95
        # Calculate initial centers using K-Means++ method.
96
        centers = kmeans_plusplus_initializer(sample, 4).initialize();
97
        
98
        # Display initial centers.
99
        visualizer = cluster_visualizer();
100
        visualizer.append_cluster(sample);
101
        visualizer.append_cluster(centers, marker = '*', markersize = 10);
102
        visualizer.show();
103
        
104
        # Perform cluster analysis using K-Means algorithm with initial centers.
105
        kmeans_instance = kmeans(sample, centers);
106
        
107
        # Run clustering process and obtain result.
108
        kmeans_instance.process();
109
        clusters = kmeans_instance.get_clusters();
110
    @endcode
111
    
112
    """
113
114
115
    ## Constant denotes that only points with highest probabilities should be considered as centers.
116
    FARTHEST_CENTER_CANDIDATE = "farthest";
117
118
119
    def __init__(self, data, amount_centers, amount_candidates = 2):
120
        """!
121
        @brief Creates K-Means++ center initializer instance.
122
        
123
        @param[in] data (array_like): List of points where each point is represented by list of coordinates.
124
        @param[in] amount_centers (uint): Amount of centers that should be initialized.
125
        @param[in] amount_candidates (uint): Amount of candidates that is considered as a center, if the farthest points (with the highest probability) should
126
                    be considered as centers then special constant should be used 'FARTHEST_CENTER_CANDIDATE'.
127
128
        @see FARTHEST_CENTER_CANDIDATE
129
130
        """
131
        
132
        self.__data = numpy.array(data);
133
        self.__amount = amount_centers;
134
        self.__candidates = amount_candidates;
135
136
        self.__check_parameters();
137
138
139
    def __check_parameters(self):
140
        """!
141
        @brief Checks input parameters of the algorithm and if something wrong then corresponding exception is thrown.
142
143
        """
144
        if (self.__amount <= 0) or (self.__amount > len(self.__data)):
145
            raise AttributeError("Amount of cluster centers should be at least 1 and should be less or equal to amount of points in data.");
146
147
        if self.__candidates != kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE:
148
            if (self.__candidates <= 0) or (self.__candidates > len(self.__data)):
149
                raise AttributeError("Amount of candidates centers should be at least 1 and should be less or equal to amount of points in data.");
150
151
        if len(self.__data) == 0:
152
            raise AttributeError("Data is empty.")
153
154
155
    def __calc_distance_to_nearest_center(self, data, centers):
156
        """!
157
        @brief Calculates distance from each data point to nearest center.
158
        
159
        @param[in] data (numpy.array): Array of points for that initialization is performed.
160
        @param[in] centers (numpy.array): Array of points that represents centers.
161
        
162
        @return (numpy.array) List of distances to closest center for each data point.
163
        
164
        """
165
166
        dataset_differences = numpy.zeros((len(centers), len(data)));
167
        for index_center in range(len(centers)):
168
            dataset_differences[index_center] = numpy.sum(
169
                numpy.square(data - centers[index_center]), axis=1).T;
170
171
        shortest_distances = numpy.min(dataset_differences, axis=0);
172
        return shortest_distances;
173
174
175
    def __get_next_center(self, centers):
176
        """!
177
        @brief Calculates the next center for the data.
178
179
        @param[in] centers (list): Current initialized centers.
180
181
        @return (list) Next initialized center.
182
183
        """
184
185
        distance_data = self.__calc_distance_to_nearest_center(data=self.__data, centers=centers);
186
        center_index = numpy.argmax(distance_data);
187
188
        return self.__data[center_index];
189
190
191
    def initialize(self):
192
        """!
193
        @brief Calculates initial centers using K-Means++ method.
194
        
195
        @return (list) List of initialized initial centers.
196
        
197
        """
198
199
        # Initialize result list by the first centers
200
        index_center = random.randint(0, len(self.__data) - 1);
201
        centers = [ self.__data[ index_center ] ];
202
203
        # For each next center
204
        for _ in range(1, self.__amount):
205
            next_center = self.__get_next_center(centers);
206
            centers.append(next_center);
207
208
        return centers;