Completed
Push — master ( aa7a06...786fcc )
by Andrei
01:40
created

rock.__init__()   B

Complexity

Conditions 1

Size

Total Lines 25

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 1
dl 0
loc 25
rs 8.8571
1
"""!
2
3
@brief Cluster analysis algorithm: ROCK
4
@details Based on article description:
5
         - S.Guha, R.Rastogi, K.Shim. ROCK: A Robust Clustering Algorithm for Categorical Attributes. 1999.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
from pyclustering.utils import euclidean_distance;
29
30
import pyclustering.core.wrapper as wrapper;
31
32
33
class rock:
34
    """!
35
    @brief Class represents clustering algorithm ROCK.
36
37
    Example:
38
    @code
39
        # Read sample for clustering from some file
40
        sample = read_sample(path_to_sample);
41
        
42
        # Create instance of ROCK algorithm for cluster analysis
43
        # Five clusters should be allocated
44
        rock_instance = rock(sample, 1.0, 5);
45
        
46
        # Run cluster analysis
47
        rock_instance.process();
48
        
49
        # Obtain results of clustering
50
        clusters = rock_instance.get_clusters();  
51
    @endcode
52
       
53
    """
54
    
55
    def __init__(self, data, eps, number_clusters, threshold = 0.5, ccore = False):
56
        """!
57
        @brief Constructor of clustering algorithm ROCK.
58
        
59
        @param[in] data (list): Input data - list of points where each point is represented by list of coordinates.
60
        @param[in] eps (double): Connectivity radius (similarity threshold), points are neighbors if distance between them is less than connectivity radius.
61
        @param[in] number_clusters (uint): Defines number of clusters that should be allocated from the input data set.
62
        @param[in] threshold (double): Value that defines degree of normalization that influences on choice of clusters for merging during processing.
63
        @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
64
        
65
        """
66
        
67
        self.__pointer_data = data;
68
        self.__eps = eps;
69
        self.__number_clusters = number_clusters;
70
        self.__threshold = threshold;
71
        
72
        self.__clusters = None;
73
        
74
        self.__ccore = ccore;
75
        
76
        self.__degree_normalization = 1.0 + 2.0 * ( (1.0 - threshold) / (1.0 + threshold) );
77
        
78
        self.__adjacency_matrix = None;
79
        self.__create_adjacency_matrix();
80
        
81
        
82
    def process(self):
83
        """!
84
        @brief Performs cluster analysis in line with rules of ROCK algorithm.
85
        
86
        @remark Results of clustering can be obtained using corresponding get methods.
87
        
88
        @see get_clusters()
89
        
90
        """
91
        
92
        # TODO: (Not related to specification, just idea) First iteration should be investigated. Euclidean distance should be used for clustering between two 
93
        # points and rock algorithm between clusters because we consider non-categorical samples. But it is required more investigations.
94
        
95
        if (self.__ccore is True):
96
            self.__clusters = wrapper.rock(self.__pointer_data, self.__eps, self.__number_clusters, self.__threshold);
97
        
98
        else:  
99
            self.__clusters = [[index] for index in range(len(self.__pointer_data))];
100
            
101
            while (len(self.__clusters) > self.__number_clusters):
102
                indexes = self.__find_pair_clusters(self.__clusters);
103
                
104
                if (indexes != [-1, -1]):
105
                    self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
106
                    self.__clusters.pop(indexes[1]);   # remove merged cluster.
107
                else:
108
                    break;  # totally separated clusters have been allocated
109
    
110
    
111
    def get_clusters(self):
112
        """!
113
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
114
        
115
        @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
116
        
117
        @see process()
118
        
119
        """
120
        
121
        return self.__clusters;
122
123
124
    def __find_pair_clusters(self, clusters):
125
        """!
126
        @brief Returns pair of clusters that are best candidates for merging in line with goodness measure.
127
               The pair of clusters for which the above goodness measure is maximum is the best pair of clusters to be merged.
128
               
129
        @param[in] clusters (list): List of clusters that have been allocated during processing, each cluster is represented by list of indexes of points from the input data set.
130
        
131
        @return (list) List that contains two indexes of clusters (from list 'clusters') that should be merged on this step.
132
                It can be equals to [-1, -1] when no links between clusters.
133
        
134
        """
135
        
136
        maximum_goodness = 0.0;
137
        cluster_indexes = [-1, -1];
138
        
139
        for i in range(0, len(clusters)):
140
            for j in range(i + 1, len(clusters)):
141
                goodness = self.__calculate_goodness(clusters[i], clusters[j]);
142
                if (goodness > maximum_goodness):
143
                    maximum_goodness = goodness;
144
                    cluster_indexes = [i, j];
145
        
146
        return cluster_indexes;
147
148
149
    def __calculate_links(self, cluster1, cluster2):
150
        """!
151
        @brief Returns number of link between two clusters. Link between objects (points) exists only if distance between them less than connectivity radius.
152
        
153
        @param[in] cluster1 (list): The first cluster.
154
        @param[in] cluster2 (list): The second cluster.
155
        
156
        @return (uint) Number of links between two clusters.
157
        
158
        """
159
        
160
        number_links = 0;
161
        
162
        for index1 in cluster1:
163
            for index2 in cluster2:
164
                number_links += self.__adjacency_matrix[index1][index2];
165
                
166
        return number_links;
167
            
168
169
    def __create_adjacency_matrix(self):
170
        """!
171
        @brief Creates 2D adjacency matrix (list of lists) where each element described existence of link between points (means that points are neighbors).
172
        
173
        """
174
        
175
        size_data = len(self.__pointer_data);
176
        
177
        self.__adjacency_matrix = [ [ 0 for i in range(size_data) ] for j in range(size_data) ];
178
        for i in range(0, size_data):
179
            for j in range(i + 1, size_data):
180
                distance = euclidean_distance(self.__pointer_data[i], self.__pointer_data[j]);
181
                if (distance <= self.__eps):
182
                    self.__adjacency_matrix[i][j] = 1;
183
                    self.__adjacency_matrix[j][i] = 1;
184
        
185
    
186
187
    def __calculate_goodness(self, cluster1, cluster2):
188
        """!
189
        @brief Calculates coefficient 'goodness measurement' between two clusters. The coefficient defines level of suitability of clusters for merging.
190
        
191
        @param[in] cluster1 (list): The first cluster.
192
        @param[in] cluster2 (list): The second cluster.
193
        
194
        @return Goodness measure between two clusters.
195
        
196
        """
197
        
198
        number_links = self.__calculate_links(cluster1, cluster2);
199
        devider = (len(cluster1) + len(cluster2)) ** self.__degree_normalization - len(cluster1) ** self.__degree_normalization - len(cluster2) ** self.__degree_normalization;
200
        
201
        return (number_links / devider);
202