Completed
Push — master ( 55f4eb...4814fa )
by Andrei
03:31
created

dbscan.__expand_cluster()   D

Complexity

Conditions 8

Size

Total Lines 36

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
dl 0
loc 36
rs 4
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: DBSCAN.
4
@details Implementation based on article:
5
         - M.Ester, H.Kriegel, J.Sander, X.Xiaowei. A density-based algorithm for discovering clusters in large spatial databases with noise. 1996.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
from pyclustering.container.kdtree import kdtree;
30
31
from pyclustering.cluster.encoder import type_encoding;
32
33
import pyclustering.core.dbscan_wrapper as wrapper;
34
35
36
class dbscan:
37
    """!
38
    @brief Class represents clustering algorithm DBSCAN.
39
    @details This DBSCAN algorithm is KD-tree optimized.
40
             
41
             CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
42
    
43
    Example:
44
    @code
45
        # sample for cluster analysis (represented by list)
46
        sample = read_sample(path_to_sample);
47
        
48
        # create object that uses CCORE for processing
49
        dbscan_instance = dbscan(sample, 0.5, 3, True);
50
        
51
        # cluster analysis
52
        dbscan_instance.process();
53
        
54
        # obtain results of clustering
55
        clusters = dbscan_instance.get_clusters();
56
        noise = dbscan_instance.get_noise();
57
    @endcode
58
    
59
    """
60
    
61
    def __init__(self, data, eps, neighbors, ccore = False):
62
        """!
63
        @brief Constructor of clustering algorithm DBSCAN.
64
        
65
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
66
        @param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less then the radius.
67
        @param[in] neighbors (uint): minimum number of shared neighbors that is required for establish links between points.
68
        @param[in] ccore (bool): if True than DLL CCORE (C++ solution) will be used for solving the problem.
69
        
70
        """
71
        
72
        self.__pointer_data = data;
73
        self.__kdtree = None;
74
        self.__eps = eps;
75
        self.__sqrt_eps = eps * eps;
76
        self.__neighbors = neighbors;
77
        
78
        self.__visited = [False] * len(self.__pointer_data);
79
        self.__belong = [False] * len(self.__pointer_data);
80
        
81
        self.__clusters = [];
82
        self.__noise = [];
83
        
84
        self.__ccore = ccore;
85
86
87
    def process(self):
88
        """!
89
        @brief Performs cluster analysis in line with rules of DBSCAN algorithm.
90
        
91
        @see get_clusters()
92
        @see get_noise()
93
        
94
        """
95
        
96
        if (self.__ccore is True):
97
            (self.__clusters, self.__noise) = wrapper.dbscan(self.__pointer_data, self.__eps, self.__neighbors, True);
98
            
99
        else:
100
            self.__kdtree = kdtree(self.__pointer_data, range(len(self.__pointer_data)));
101
            for i in range(0, len(self.__pointer_data)):
102
                if (self.__visited[i] == False):
103
                     
104
                    cluster = self.__expand_cluster(i);    # Fast mode
105
                    if (cluster != None):
106
                        self.__clusters.append(cluster);
107
                    else:
108
                        self.__noise.append(i);
109
                        self.__belong[i] = True;
110
111
112
    def get_clusters(self):
113
        """!
114
        @brief Returns allocated clusters.
115
        
116
        @remark Allocated clusters can be returned only after data processing (use method process()). Otherwise empty list is returned.
117
        
118
        @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
119
        
120
        @see process()
121
        @see get_noise()
122
        
123
        """
124
        
125
        return self.__clusters;
126
127
128
    def get_noise(self):
129
        """!
130
        @brief Returns allocated noise.
131
        
132
        @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
133
        
134
        @return (list) List of indexes that are marked as a noise.
135
        
136
        @see process()
137
        @see get_clusters()
138
        
139
        """
140
141
        return self.__noise;
142
143
144
    def get_cluster_encoding(self):
145
        """!
146
        @brief Returns clustering result representation type that indicate how clusters are encoded.
147
        
148
        @return (type_encoding) Clustering result representation.
149
        
150
        @see get_clusters()
151
        
152
        """
153
        
154
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
155
156
157
    def __expand_cluster(self, point):
158
        """!
159
        @brief Expands cluster from specified point in the input data space.
160
        
161
        @param[in] point (list): Index of a point from the data.
162
163
        @return (list) Return tuple of list of indexes that belong to the same cluster and list of points that are marked as noise: (cluster, noise), or None if nothing has been expanded.
164
        
165
        """
166
        
167
        cluster = None;
168
        self.__visited[point] = True;
169
        neighbors = self.__neighbor_indexes(point);
170
         
171
        if (len(neighbors) >=self.__neighbors):
172
             
173
            cluster = [];
174
            cluster.append(point);
175
             
176
            self.__belong[point] = True;
177
             
178
            for i in neighbors:
179
                if (self.__visited[i] == False):
180
                    self.__visited[i] = True;
181
                    next_neighbors = self.__neighbor_indexes(i);
182
                     
183
                    if (len(next_neighbors) >= self.__neighbors):
184
                        # if some node has less then minimal number of neighbors than we shouldn't look at them
185
                        # because maybe it's a noise.
186
                        neighbors += [k for k in next_neighbors if ( (k in neighbors) == False)];
187
                 
188
                if (self.__belong[i] == False):
189
                    cluster.append(i);
190
                    self.__belong[i] = True;
191
             
192
        return cluster;
193
194
    def __neighbor_indexes(self, index_point):
195
        """!
196
        @brief Return list of indexes of neighbors of specified point for the data.
197
        
198
        @param[in] index_point (list): An index of a point for which potential neighbors should be returned in line with connectivity radius.
199
        
200
        @return (list) Return list of indexes of neighbors in line the connectivity radius.
201
        
202
        """
203
        
204
        kdnodes = self.__kdtree.find_nearest_dist_nodes(self.__pointer_data[index_point], self.__eps);
205
        return [node_tuple[1].payload for node_tuple in kdnodes if node_tuple[1].payload != index_point];
206
207
        # return [i for i in range(0, len(self.__pointer_data)) if euclidean_distance_sqrt(self.__pointer_data[index_point], self.__pointer_data[i]) <= self.__sqrt_eps and (i != index_point) ]; # Fast mode
208