Completed
Push — master ( 6ed467...f3f897 )
by Andrei
01:29
created

rock.__create_adjacency_matrix()   B

Complexity

Conditions 6

Size

Total Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
dl 0
loc 15
rs 8
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: ROCK
4
@details Based on article description:
5
         - S.Guha, R.Rastogi, K.Shim. ROCK: A Robust Clustering Algorithm for Categorical Attributes. 1999.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
from pyclustering.cluster.encoder import type_encoding;
30
31
from pyclustering.utils import euclidean_distance;
32
33
import pyclustering.core.rock_wrapper as wrapper;
34
35
36
class rock:
37
    """!
38
    @brief Class represents clustering algorithm ROCK.
39
40
    Example:
41
    @code
42
        # Read sample for clustering from some file
43
        sample = read_sample(path_to_sample);
44
        
45
        # Create instance of ROCK algorithm for cluster analysis
46
        # Five clusters should be allocated
47
        rock_instance = rock(sample, 1.0, 5);
48
        
49
        # Run cluster analysis
50
        rock_instance.process();
51
        
52
        # Obtain results of clustering
53
        clusters = rock_instance.get_clusters();  
54
    @endcode
55
       
56
    """
57
    
58
    def __init__(self, data, eps, number_clusters, threshold = 0.5, ccore = False):
59
        """!
60
        @brief Constructor of clustering algorithm ROCK.
61
        
62
        @param[in] data (list): Input data - list of points where each point is represented by list of coordinates.
63
        @param[in] eps (double): Connectivity radius (similarity threshold), points are neighbors if distance between them is less than connectivity radius.
64
        @param[in] number_clusters (uint): Defines number of clusters that should be allocated from the input data set.
65
        @param[in] threshold (double): Value that defines degree of normalization that influences on choice of clusters for merging during processing.
66
        @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
67
        
68
        """
69
        
70
        self.__pointer_data = data;
71
        self.__eps = eps;
72
        self.__number_clusters = number_clusters;
73
        self.__threshold = threshold;
74
        
75
        self.__clusters = None;
76
        
77
        self.__ccore = ccore;
78
        
79
        self.__degree_normalization = 1.0 + 2.0 * ( (1.0 - threshold) / (1.0 + threshold) );
80
        
81
        self.__adjacency_matrix = None;
82
        self.__create_adjacency_matrix();
83
        
84
        
85
    def process(self):
86
        """!
87
        @brief Performs cluster analysis in line with rules of ROCK algorithm.
88
        
89
        @remark Results of clustering can be obtained using corresponding get methods.
90
        
91
        @see get_clusters()
92
        
93
        """
94
        
95
        # TODO: (Not related to specification, just idea) First iteration should be investigated. Euclidean distance should be used for clustering between two 
96
        # points and rock algorithm between clusters because we consider non-categorical samples. But it is required more investigations.
97
        
98
        if (self.__ccore is True):
99
            self.__clusters = wrapper.rock(self.__pointer_data, self.__eps, self.__number_clusters, self.__threshold);
100
        
101
        else:  
102
            self.__clusters = [[index] for index in range(len(self.__pointer_data))];
103
            
104
            while (len(self.__clusters) > self.__number_clusters):
105
                indexes = self.__find_pair_clusters(self.__clusters);
106
                
107
                if (indexes != [-1, -1]):
108
                    self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
109
                    self.__clusters.pop(indexes[1]);   # remove merged cluster.
110
                else:
111
                    break;  # totally separated clusters have been allocated
112
    
113
    
114
    def get_clusters(self):
115
        """!
116
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
117
        
118
        @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
119
        
120
        @see process()
121
        
122
        """
123
        
124
        return self.__clusters;
125
126
127
    def get_cluster_encoding(self):
128
        """!
129
        @brief Returns clustering result representation type that indicate how clusters are encoded.
130
        
131
        @return (type_encoding) Clustering result representation.
132
        
133
        @see get_clusters()
134
        
135
        """
136
        
137
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
138
139
140
    def __find_pair_clusters(self, clusters):
141
        """!
142
        @brief Returns pair of clusters that are best candidates for merging in line with goodness measure.
143
               The pair of clusters for which the above goodness measure is maximum is the best pair of clusters to be merged.
144
               
145
        @param[in] clusters (list): List of clusters that have been allocated during processing, each cluster is represented by list of indexes of points from the input data set.
146
        
147
        @return (list) List that contains two indexes of clusters (from list 'clusters') that should be merged on this step.
148
                It can be equals to [-1, -1] when no links between clusters.
149
        
150
        """
151
        
152
        maximum_goodness = 0.0;
153
        cluster_indexes = [-1, -1];
154
        
155
        for i in range(0, len(clusters)):
156
            for j in range(i + 1, len(clusters)):
157
                goodness = self.__calculate_goodness(clusters[i], clusters[j]);
158
                if (goodness > maximum_goodness):
159
                    maximum_goodness = goodness;
160
                    cluster_indexes = [i, j];
161
        
162
        return cluster_indexes;
163
164
165
    def __calculate_links(self, cluster1, cluster2):
166
        """!
167
        @brief Returns number of link between two clusters. 
168
        @details Link between objects (points) exists only if distance between them less than connectivity radius.
169
        
170
        @param[in] cluster1 (list): The first cluster.
171
        @param[in] cluster2 (list): The second cluster.
172
        
173
        @return (uint) Number of links between two clusters.
174
        
175
        """
176
        
177
        number_links = 0;
178
        
179
        for index1 in cluster1:
180
            for index2 in cluster2:
181
                number_links += self.__adjacency_matrix[index1][index2];
182
                
183
        return number_links;
184
            
185
186
    def __create_adjacency_matrix(self):
187
        """!
188
        @brief Creates 2D adjacency matrix (list of lists) where each element described existence of link between points (means that points are neighbors).
189
        
190
        """
191
        
192
        size_data = len(self.__pointer_data);
193
        
194
        self.__adjacency_matrix = [ [ 0 for i in range(size_data) ] for j in range(size_data) ];
195
        for i in range(0, size_data):
196
            for j in range(i + 1, size_data):
197
                distance = euclidean_distance(self.__pointer_data[i], self.__pointer_data[j]);
198
                if (distance <= self.__eps):
199
                    self.__adjacency_matrix[i][j] = 1;
200
                    self.__adjacency_matrix[j][i] = 1;
201
        
202
    
203
204
    def __calculate_goodness(self, cluster1, cluster2):
205
        """!
206
        @brief Calculates coefficient 'goodness measurement' between two clusters. The coefficient defines level of suitability of clusters for merging.
207
        
208
        @param[in] cluster1 (list): The first cluster.
209
        @param[in] cluster2 (list): The second cluster.
210
        
211
        @return Goodness measure between two clusters.
212
        
213
        """
214
        
215
        number_links = self.__calculate_links(cluster1, cluster2);
216
        devider = (len(cluster1) + len(cluster2)) ** self.__degree_normalization - len(cluster1) ** self.__degree_normalization - len(cluster2) ** self.__degree_normalization;
217
        
218
        return (number_links / devider);
219