Completed
Push — 0.7.dev ( 8f39e7...242f19 )
by Andrei
53s
created

dbscan.get_clusters()   A

Complexity

Conditions 1

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 14
rs 9.4285
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: DBSCAN.
4
@details Implementation based on article:
5
         - M.Ester, H.Kriegel, J.Sander, X.Xiaowei. A density-based algorithm for discovering clusters in large spatial databases with noise. 1996.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
from pyclustering.container.kdtree import kdtree;
30
31
from pyclustering.cluster.encoder import type_encoding;
32
33
import pyclustering.core.dbscan_wrapper as wrapper;
34
35
36
class dbscan:
37
    """!
38
    @brief Class represents clustering algorithm DBSCAN.
39
    
40
    Example:
41
    @code
42
        # sample for cluster analysis (represented by list)
43
        sample = read_sample(path_to_sample);
44
        
45
        # create object that uses CCORE for processing
46
        dbscan_instance = dbscan(sample, 0.5, 3, True);
47
        
48
        # cluster analysis
49
        dbscan_instance.process();
50
        
51
        # obtain results of clustering
52
        clusters = dbscan_instance.get_clusters();
53
        noise = dbscan_instance.get_noise();
54
    @endcode
55
    
56
    """
57
    
58
    def __init__(self, data, eps, neighbors, ccore = False):
59
        """!
60
        @brief Constructor of clustering algorithm DBSCAN.
61
        
62
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
63
        @param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less then the radius.
64
        @param[in] neighbors (uint): minimum number of shared neighbors that is required for establish links between points.
65
        @param[in] ccore (bool): if True than DLL CCORE (C++ solution) will be used for solving the problem.
66
        
67
        """
68
        
69
        self.__pointer_data = data;
70
        self.__kdtree = None;
71
        self.__eps = eps;
72
        self.__sqrt_eps = eps * eps;
73
        self.__neighbors = neighbors;
74
        
75
        self.__visited = [False] * len(self.__pointer_data);
76
        self.__belong = [False] * len(self.__pointer_data);
77
        
78
        self.__clusters = [];
79
        self.__noise = [];
80
        
81
        self.__ccore = ccore;
82
83
84
    def process(self):
85
        """!
86
        @brief Performs cluster analysis in line with rules of DBSCAN algorithm.
87
        
88
        @see get_clusters()
89
        @see get_noise()
90
        
91
        """
92
        
93
        if (self.__ccore is True):
94
            (self.__clusters, self.__noise) = wrapper.dbscan(self.__pointer_data, self.__eps, self.__neighbors, True);
95
            
96
        else:
97
            self.__kdtree = kdtree(self.__pointer_data, range(len(self.__pointer_data)));
98
            for i in range(0, len(self.__pointer_data)):
99
                if (self.__visited[i] == False):
100
                     
101
                    cluster = self.__expand_cluster(i);    # Fast mode
102
                    if (cluster != None):
103
                        self.__clusters.append(cluster);
104
                    else:
105
                        self.__noise.append(i);
106
                        self.__belong[i] = True;
107
108
109
    def get_clusters(self):
110
        """!
111
        @brief Returns allocated clusters.
112
        
113
        @remark Allocated clusters can be returned only after data processing (use method process()). Otherwise empty list is returned.
114
        
115
        @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
116
        
117
        @see process()
118
        @see get_noise()
119
        
120
        """
121
        
122
        return self.__clusters;
123
124
125
    def get_noise(self):
126
        """!
127
        @brief Returns allocated noise.
128
        
129
        @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
130
        
131
        @return (list) List of indexes that are marked as a noise.
132
        
133
        @see process()
134
        @see get_clusters()
135
        
136
        """
137
138
        return self.__noise;
139
140
141
    def get_cluster_encoding(self):
142
        """!
143
        @brief Returns clustering result representation type that indicate how clusters are encoded.
144
        
145
        @return (type_encoding) Clustering result representation.
146
        
147
        @see get_clusters()
148
        
149
        """
150
        
151
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
152
153
154
    def __expand_cluster(self, point):
155
        """!
156
        @brief Expands cluster from specified point in the input data space.
157
        
158
        @param[in] point (list): Index of a point from the data.
159
160
        @return (list) Return tuple of list of indexes that belong to the same cluster and list of points that are marked as noise: (cluster, noise), or None if nothing has been expanded.
161
        
162
        """
163
        
164
        cluster = None;
165
        self.__visited[point] = True;
166
        neighbors = self.__neighbor_indexes(point);
167
         
168
        if (len(neighbors) >=self.__neighbors):
169
             
170
            cluster = [];
171
            cluster.append(point);
172
             
173
            self.__belong[point] = True;
174
             
175
            for i in neighbors:
176
                if (self.__visited[i] == False):
177
                    self.__visited[i] = True;
178
                    next_neighbors = self.__neighbor_indexes(i);
179
                     
180
                    if (len(next_neighbors) >= self.__neighbors):
181
                        # if some node has less then minimal number of neighbors than we shouldn't look at them
182
                        # because maybe it's a noise.
183
                        neighbors += [k for k in next_neighbors if ( (k in neighbors) == False)];
184
                 
185
                if (self.__belong[i] == False):
186
                    cluster.append(i);
187
                    self.__belong[i] = True;
188
             
189
        return cluster;
190
191
    def __neighbor_indexes(self, index_point):
192
        """!
193
        @brief Return list of indexes of neighbors of specified point for the data.
194
        
195
        @param[in] index_point (list): An index of a point for which potential neighbors should be returned in line with connectivity radius.
196
        
197
        @return (list) Return list of indexes of neighbors in line the connectivity radius.
198
        
199
        """
200
        
201
        kdnodes = self.__kdtree.find_nearest_dist_nodes(self.__pointer_data[index_point], self.__eps);
202
        return [node_tuple[1].payload for node_tuple in kdnodes if node_tuple[1].payload != index_point];
203
204
        #return [i for i in range(0, len(self.__pointer_data)) if euclidean_distance_sqrt(self.__pointer_data[index_point], self.__pointer_data[i]) <= self.__sqrt_eps and (i != index_point) ]; # Fast mode
205