Completed
Push — 0.8.dev ( 56e704...0601a0 )
by Andrei
51s
created

kmeans_plusplus_initializer.__get_first_center()   A

Complexity

Conditions 2

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
dl 0
loc 14
rs 9.4285
c 0
b 0
f 0
1
"""!
2
3
@brief Collection of center initializers for algorithm that uses initial centers, for example, for K-Means or X-Means.
4
@details Implementations based on articles:
5
         - K-Means++: The Advantages of careful seeding. D. Arthur, S. Vassilvitskii. 2007.
6
         
7
@authors Andrei Novikov, Aleksey Kukushkin ([email protected])
8
@date 2014-2018
9
@copyright GNU Public License
10
11
@see kmeans
12
@see xmeans
13
14
@cond GNU_PUBLIC_LICENSE
15
    PyClustering is free software: you can redistribute it and/or modify
16
    it under the terms of the GNU General Public License as published by
17
    the Free Software Foundation, either version 3 of the License, or
18
    (at your option) any later version.
19
    
20
    PyClustering is distributed in the hope that it will be useful,
21
    but WITHOUT ANY WARRANTY; without even the implied warranty of
22
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
    GNU General Public License for more details.
24
    
25
    You should have received a copy of the GNU General Public License
26
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
27
@endcond
28
29
"""
30
31
import random;
32
import copy
0 ignored issues
show
Unused Code introduced by
The import copy seems to be unused.
Loading history...
33
34
from pyclustering.utils import euclidean_distance;
35
36
37
class random_center_initializer:
38
    """!
39
    @brief Random center initializer is for generation specified amount of random of centers for specified data.
40
    
41
    """
42
43
    def __init__(self, data, amount_centers):
44
        """!
45
        @brief Creates instance of random center initializer.
46
        
47
        @param[in] data (list): List of points where each point is represented by list of coordinates.
48
        @param[in] amount_centers (unit): Amount of centers that should be initialized.
49
        
50
        """
51
        
52
        self.__data = data;
53
        self.__amount = amount_centers;
54
55
        if self.__amount <= 0:
56
            raise AttributeError("Amount of cluster centers should be at least 1.");
57
58
59
    def initialize(self):
60
        """!
61
        @brief Generates random centers in line with input parameters.
62
        
63
        @return (list) List of centers where each center is represented by list of coordinates.
64
        
65
        """
66
        return [ self.__create_center() for _ in range(self.__amount) ];
67
68
69
    def __create_center(self):
70
        """!
71
        @brief Generates and returns random center.
72
        
73
        """
74
        return [ random.random() for _ in range(len(self.__data[0])) ];
75
76
77
78
class kmeans_plusplus_initializer:
79
    """!
80
    @brief K-Means++ is an algorithm for choosing the initial centers for algorithms like K-Means or X-Means.
81
    @details K-Means++ algorithm guarantees an approximation ratio O(log k). Clustering results are depends on
82
              initial centers in case of K-Means algorithm and even in case of X-Means. This method is used to find
83
              out optimal initial centers. There is an example of initial centers that were calculated by the
84
              K-Means++ method:
85
    
86
    @image html kmeans_plusplus_initializer_results.png
87
    
88
    Code example:
89
    @code
90
        # Read data 'SampleSimple3' from Simple Sample collection.
91
        sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3);
92
        
93
        # Calculate initial centers using K-Means++ method.
94
        centers = kmeans_plusplus_initializer(sample, 4).initialize();
95
        
96
        # Display initial centers.
97
        visualizer = cluster_visualizer();
98
        visualizer.append_cluster(sample);
99
        visualizer.append_cluster(centers, marker = '*', markersize = 10);
100
        visualizer.show();
101
        
102
        # Perform cluster analysis using K-Means algorithm with initial centers.
103
        kmeans_instance = kmeans(sample, centers);
104
        
105
        # Run clustering process and obtain result.
106
        kmeans_instance.process();
107
        clusters = kmeans_instance.get_clusters();
108
    @endcode
109
    
110
    """
111
    
112
    def __init__(self, data, amount_centers):
113
        """!
114
        @brief Creates K-Means++ center initializer instance.
115
        
116
        @param[in] data (list): List of points where each point is represented by list of coordinates.
117
        @param[in] amount_centers (unit): Amount of centers that should be initialized.
118
        
119
        """
120
        
121
        self.__data = data;
122
        self.__amount = amount_centers;
123
        
124
        if self.__amount <= 0:
125
            raise AttributeError("Amount of cluster centers should be at least 1.");
126
127
128
    def __calc_distance_to_nearest_center(self, data, centers):
129
        """!
130
        @brief Calculates distance from each data point to nearest center.
131
        
132
        @param[in] data (list): List of points where each point is represented by list of coordinates.
133
        @param[in] centers (list): List of points that represents centers and where each center is represented by list of coordinates.
134
        
135
        @return (list) List of distances to closest center for each data point.
136
        
137
        """
138
139
        # Initialize
140
        distance_data = [];
141
142
        # For each data point x, compute D(x), the distance between x and the nearest center
143
        for _point in data:
144
145
            # Min dist to nearest center
146
            min_dist = float('inf');
147
148
            # For each center
149
            for _center in centers:
150
                min_dist = min(min_dist, euclidean_distance(_center, _point));
151
152
            # Add distance to nearest center into result list
153
            distance_data.append(min_dist);
154
155
        return distance_data;
156
157
158
    def initialize(self):
159
        """!
160
        @brief Calculates initial centers using K-Means++ method.
161
        
162
        @return (list) List of initialized initial centers.
163
        
164
        """
165
        # Initialize result list by the first centers
166
        index_center = random.randint(0, len(self.__data) - 1);
167
        centers = [ self.__data[ index_center ] ];
168
169
        # For each next center
170
        for _ in range(1, self.__amount):
171
            # Calc Distance for each data
172
            distance_data = self.__calc_distance_to_nearest_center(data = self.__data, centers = centers);
173
174
            center_index = 0;
175
            longest_distance = 0.0;
176
            for index_distance in range(len(distance_data)):
177
                if (longest_distance < distance_data[index_distance]):
178
                    center_index = index_distance;
179
                    longest_distance = distance_data[index_distance];
180
181
            centers.append(self.__data[center_index]);
182
183
        return centers;