rock.__init__() - Code Metrics - Inspection of "Merge branch 'master' of https://github.com/annovi..." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( aa7a06...786fcc )

by Andrei

created 2016-04-15 13:19 UTC

rock.init() B

↳ Parent: rock

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Metric	Value
cc	1
dl	0
loc	25
rs	8.8571

"""!

@brief Cluster analysis algorithm: ROCK
@details Based on article description:
         - S.Guha, R.Rastogi, K.Shim. ROCK: A Robust Clustering Algorithm for Categorical Attributes. 1999.

@authors Andrei Novikov ([email protected])
@date 2014-2016
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""

from pyclustering.utils import euclidean_distance;

import pyclustering.core.wrapper as wrapper;


class rock:
    """!
    @brief Class represents clustering algorithm ROCK.

    Example:
    @code
        # Read sample for clustering from some file
        sample = read_sample(path_to_sample);
        
        # Create instance of ROCK algorithm for cluster analysis
        # Five clusters should be allocated
        rock_instance = rock(sample, 1.0, 5);
        
        # Run cluster analysis
        rock_instance.process();
        
        # Obtain results of clustering
        clusters = rock_instance.get_clusters();  
    @endcode
       
    """
    
    def __init__(self, data, eps, number_clusters, threshold = 0.5, ccore = False):
        """!
        @brief Constructor of clustering algorithm ROCK.
        
        @param[in] data (list): Input data - list of points where each point is represented by list of coordinates.
        @param[in] eps (double): Connectivity radius (similarity threshold), points are neighbors if distance between them is less than connectivity radius.
        @param[in] number_clusters (uint): Defines number of clusters that should be allocated from the input data set.
        @param[in] threshold (double): Value that defines degree of normalization that influences on choice of clusters for merging during processing.
        @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
        
        """
        
        self.__pointer_data = data;
        self.__eps = eps;
        self.__number_clusters = number_clusters;
        self.__threshold = threshold;
        
        self.__clusters = None;
        
        self.__ccore = ccore;
        
        self.__degree_normalization = 1.0 + 2.0 * ( (1.0 - threshold) / (1.0 + threshold) );
        
        self.__adjacency_matrix = None;
        self.__create_adjacency_matrix();
        
        
    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of ROCK algorithm.
        
        @remark Results of clustering can be obtained using corresponding get methods.
        
        @see get_clusters()
        
        """
        
        # TODO: (Not related to specification, just idea) First iteration should be investigated. Euclidean distance should be used for clustering between two 
        # points and rock algorithm between clusters because we consider non-categorical samples. But it is required more investigations.
        
        if (self.__ccore is True):
            self.__clusters = wrapper.rock(self.__pointer_data, self.__eps, self.__number_clusters, self.__threshold);
        
        else:  
            self.__clusters = [[index] for index in range(len(self.__pointer_data))];
            
            while (len(self.__clusters) > self.__number_clusters):
                indexes = self.__find_pair_clusters(self.__clusters);
                
                if (indexes != [-1, -1]):
                    self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
                    self.__clusters.pop(indexes[1]);   # remove merged cluster.
                else:
                    break;  # totally separated clusters have been allocated
    
    
    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @see process()
        
        """
        
        return self.__clusters;


    def __find_pair_clusters(self, clusters):
        """!
        @brief Returns pair of clusters that are best candidates for merging in line with goodness measure.
               The pair of clusters for which the above goodness measure is maximum is the best pair of clusters to be merged.
               
        @param[in] clusters (list): List of clusters that have been allocated during processing, each cluster is represented by list of indexes of points from the input data set.
        
        @return (list) List that contains two indexes of clusters (from list 'clusters') that should be merged on this step.
                It can be equals to [-1, -1] when no links between clusters.
        
        """
        
        maximum_goodness = 0.0;
        cluster_indexes = [-1, -1];
        
        for i in range(0, len(clusters)):
            for j in range(i + 1, len(clusters)):
                goodness = self.__calculate_goodness(clusters[i], clusters[j]);
                if (goodness > maximum_goodness):
                    maximum_goodness = goodness;
                    cluster_indexes = [i, j];
        
        return cluster_indexes;


    def __calculate_links(self, cluster1, cluster2):
        """!
        @brief Returns number of link between two clusters. Link between objects (points) exists only if distance between them less than connectivity radius.
        
        @param[in] cluster1 (list): The first cluster.
        @param[in] cluster2 (list): The second cluster.
        
        @return (uint) Number of links between two clusters.
        
        """
        
        number_links = 0;
        
        for index1 in cluster1:
            for index2 in cluster2:
                number_links += self.__adjacency_matrix[index1][index2];
                
        return number_links;
            

    def __create_adjacency_matrix(self):
        """!
        @brief Creates 2D adjacency matrix (list of lists) where each element described existence of link between points (means that points are neighbors).
        
        """
        
        size_data = len(self.__pointer_data);
        
        self.__adjacency_matrix = [ [ 0 for i in range(size_data) ] for j in range(size_data) ];
        for i in range(0, size_data):
            for j in range(i + 1, size_data):
                distance = euclidean_distance(self.__pointer_data[i], self.__pointer_data[j]);
                if (distance <= self.__eps):
                    self.__adjacency_matrix[i][j] = 1;
                    self.__adjacency_matrix[j][i] = 1;
        
    

    def __calculate_goodness(self, cluster1, cluster2):
        """!
        @brief Calculates coefficient 'goodness measurement' between two clusters. The coefficient defines level of suitability of clusters for merging.
        
        @param[in] cluster1 (list): The first cluster.
        @param[in] cluster2 (list): The second cluster.
        
        @return Goodness measure between two clusters.
        
        """
        
        number_links = self.__calculate_links(cluster1, cluster2);
        devider = (len(cluster1) + len(cluster2)) ** self.__degree_normalization - len(cluster1) ** self.__degree_normalization - len(cluster2) ** self.__degree_normalization;
        
        return (number_links / devider);


1			"""!
2
3			@brief Cluster analysis algorithm: ROCK
4			@details Based on article description:
5			- S.Guha, R.Rastogi, K.Shim. ROCK: A Robust Clustering Algorithm for Categorical Attributes. 1999.
6
7			@authors Andrei Novikov ([email protected])
8			@date 2014-2016
9			@copyright GNU Public License
10
11			@cond GNU_PUBLIC_LICENSE
12			PyClustering is free software: you can redistribute it and/or modify
13			it under the terms of the GNU General Public License as published by
14			the Free Software Foundation, either version 3 of the License, or
15			(at your option) any later version.
16
17			PyClustering is distributed in the hope that it will be useful,
18			but WITHOUT ANY WARRANTY; without even the implied warranty of
19			MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20			GNU General Public License for more details.
21
22			You should have received a copy of the GNU General Public License
23			along with this program. If not, see <http://www.gnu.org/licenses/>.
24			@endcond
25
26			"""
27
28			from pyclustering.utils import euclidean_distance;
29
30			import pyclustering.core.wrapper as wrapper;
31
32
33			class rock:
34			"""!
35			@brief Class represents clustering algorithm ROCK.
36
37			Example:
38			@code
39			# Read sample for clustering from some file
40			sample = read_sample(path_to_sample);
41
42			# Create instance of ROCK algorithm for cluster analysis
43			# Five clusters should be allocated
44			rock_instance = rock(sample, 1.0, 5);
45
46			# Run cluster analysis
47			rock_instance.process();
48
49			# Obtain results of clustering
50			clusters = rock_instance.get_clusters();
51			@endcode
52
53			"""
54
55			def __init__(self, data, eps, number_clusters, threshold = 0.5, ccore = False):
56			"""!
57			@brief Constructor of clustering algorithm ROCK.
58
59			@param[in] data (list): Input data - list of points where each point is represented by list of coordinates.
60			@param[in] eps (double): Connectivity radius (similarity threshold), points are neighbors if distance between them is less than connectivity radius.
61			@param[in] number_clusters (uint): Defines number of clusters that should be allocated from the input data set.
62			@param[in] threshold (double): Value that defines degree of normalization that influences on choice of clusters for merging during processing.
63			@param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
64
65			"""
66
67			self.__pointer_data = data;
68			self.__eps = eps;
69			self.__number_clusters = number_clusters;
70			self.__threshold = threshold;
71
72			self.__clusters = None;
73
74			self.__ccore = ccore;
75
76			self.__degree_normalization = 1.0 + 2.0 * ( (1.0 - threshold) / (1.0 + threshold) );
77
78			self.__adjacency_matrix = None;
79			self.__create_adjacency_matrix();
80
81
82			def process(self):
83			"""!
84			@brief Performs cluster analysis in line with rules of ROCK algorithm.
85
86			@remark Results of clustering can be obtained using corresponding get methods.
87
88			@see get_clusters()
89
90			"""
91
92			# TODO: (Not related to specification, just idea) First iteration should be investigated. Euclidean distance should be used for clustering between two
93			# points and rock algorithm between clusters because we consider non-categorical samples. But it is required more investigations.
94
95			if (self.__ccore is True):
96			self.__clusters = wrapper.rock(self.__pointer_data, self.__eps, self.__number_clusters, self.__threshold);
97
98			else:
99			self.__clusters = [[index] for index in range(len(self.__pointer_data))];
100
101			while (len(self.__clusters) > self.__number_clusters):
102			indexes = self.__find_pair_clusters(self.__clusters);
103
104			if (indexes != [-1, -1]):
105			self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
106			self.__clusters.pop(indexes[1]); # remove merged cluster.
107			else:
108			break; # totally separated clusters have been allocated
109
110
111			def get_clusters(self):
112			"""!
113			@brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
114
115			@return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
116
117			@see process()
118
119			"""
120
121			return self.__clusters;
122
123
124			def __find_pair_clusters(self, clusters):
125			"""!
126			@brief Returns pair of clusters that are best candidates for merging in line with goodness measure.
127			The pair of clusters for which the above goodness measure is maximum is the best pair of clusters to be merged.
128
129			@param[in] clusters (list): List of clusters that have been allocated during processing, each cluster is represented by list of indexes of points from the input data set.
130
131			@return (list) List that contains two indexes of clusters (from list 'clusters') that should be merged on this step.
132			It can be equals to [-1, -1] when no links between clusters.
133
134			"""
135
136			maximum_goodness = 0.0;
137			cluster_indexes = [-1, -1];
138
139			for i in range(0, len(clusters)):
140			for j in range(i + 1, len(clusters)):
141			goodness = self.__calculate_goodness(clusters[i], clusters[j]);
142			if (goodness > maximum_goodness):
143			maximum_goodness = goodness;
144			cluster_indexes = [i, j];
145
146			return cluster_indexes;
147
148
149			def __calculate_links(self, cluster1, cluster2):
150			"""!
151			@brief Returns number of link between two clusters. Link between objects (points) exists only if distance between them less than connectivity radius.
152
153			@param[in] cluster1 (list): The first cluster.
154			@param[in] cluster2 (list): The second cluster.
155
156			@return (uint) Number of links between two clusters.
157
158			"""
159
160			number_links = 0;
161
162			for index1 in cluster1:
163			for index2 in cluster2:
164			number_links += self.__adjacency_matrix[index1][index2];
165
166			return number_links;
167
168
169			def __create_adjacency_matrix(self):
170			"""!
171			@brief Creates 2D adjacency matrix (list of lists) where each element described existence of link between points (means that points are neighbors).
172
173			"""
174
175			size_data = len(self.__pointer_data);
176
177			self.__adjacency_matrix = [ [ 0 for i in range(size_data) ] for j in range(size_data) ];
178			for i in range(0, size_data):
179			for j in range(i + 1, size_data):
180			distance = euclidean_distance(self.__pointer_data[i], self.__pointer_data[j]);
181			if (distance <= self.__eps):
182			self.__adjacency_matrix[i][j] = 1;
183			self.__adjacency_matrix[j][i] = 1;
184
185
186
187			def __calculate_goodness(self, cluster1, cluster2):
188			"""!
189			@brief Calculates coefficient 'goodness measurement' between two clusters. The coefficient defines level of suitability of clusters for merging.
190
191			@param[in] cluster1 (list): The first cluster.
192			@param[in] cluster2 (list): The second cluster.
193
194			@return Goodness measure between two clusters.
195
196			"""
197
198			number_links = self.__calculate_links(cluster1, cluster2);
199			devider = (len(cluster1) + len(cluster2)) self.__degree_normalization - len(cluster1) self.__degree_normalization - len(cluster2) ** self.__degree_normalization;
200
201			return (number_links / devider);
202

annoviko / pyclustering

Push — master ( aa7a06...786fcc )

rock.__init__() B

Complexity

Size

Duplication

Duplication Side-by-Side

Filter issues like

rock.init() B