|
1
|
|
|
"""! |
|
2
|
|
|
|
|
3
|
|
|
@brief Cluster analysis algorithm: BSAS (Basic Sequential Algorithmic Scheme). |
|
4
|
|
|
@details Implementation based on book: |
|
5
|
|
|
- Theodoridis, Koutroumbas, Konstantinos. Elsevier Academic Press - Pattern Recognition - 2nd Edition. 2003. |
|
6
|
|
|
|
|
7
|
|
|
@authors Andrei Novikov ([email protected]) |
|
8
|
|
|
@date 2014-2018 |
|
9
|
|
|
@copyright GNU Public License |
|
10
|
|
|
|
|
11
|
|
|
@cond GNU_PUBLIC_LICENSE |
|
12
|
|
|
PyClustering is free software: you can redistribute it and/or modify |
|
13
|
|
|
it under the terms of the GNU General Public License as published by |
|
14
|
|
|
the Free Software Foundation, either version 3 of the License, or |
|
15
|
|
|
(at your option) any later version. |
|
16
|
|
|
|
|
17
|
|
|
PyClustering is distributed in the hope that it will be useful, |
|
18
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
19
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
20
|
|
|
GNU General Public License for more details. |
|
21
|
|
|
|
|
22
|
|
|
You should have received a copy of the GNU General Public License |
|
23
|
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>. |
|
24
|
|
|
@endcond |
|
25
|
|
|
|
|
26
|
|
|
""" |
|
27
|
|
|
|
|
28
|
|
|
|
|
29
|
|
|
from pyclustering.core.wrapper import ccore_library; |
|
30
|
|
|
from pyclustering.core.bsas_wrapper import bsas as bsas_wrapper; |
|
31
|
|
|
from pyclustering.core.metric_wrapper import metric_wrapper; |
|
32
|
|
|
|
|
33
|
|
|
from pyclustering.cluster import cluster_visualizer; |
|
34
|
|
|
from pyclustering.cluster.encoder import type_encoding; |
|
35
|
|
|
|
|
36
|
|
|
from pyclustering.utils.metric import type_metric, distance_metric; |
|
37
|
|
|
|
|
38
|
|
|
|
|
39
|
|
|
class bsas_visualizer: |
|
40
|
|
|
"""! |
|
41
|
|
|
@brief Visualizer of BSAS algorithm's results. |
|
42
|
|
|
@details BSAS visualizer provides visualization services that are specific for BSAS algorithm. |
|
43
|
|
|
|
|
44
|
|
|
""" |
|
45
|
|
|
|
|
46
|
|
|
@staticmethod |
|
47
|
|
|
def show_clusters(sample, clusters, representatives, **kwargs): |
|
48
|
|
|
"""! |
|
49
|
|
|
@brief Display BSAS clustering results. |
|
50
|
|
|
|
|
51
|
|
|
@param[in] sample (list): Dataset that was used for clustering. |
|
52
|
|
|
@param[in] clusters (array_like): Clusters that were allocated by the algorithm. |
|
53
|
|
|
@param[in] representatives (array_like): Allocated representatives correspond to clusters. |
|
54
|
|
|
@param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'figure', 'display'). |
|
55
|
|
|
|
|
56
|
|
|
Keyword Args: |
|
57
|
|
|
figure (figure): If 'None' then new is figure is created, otherwise specified figure is used for visualization. |
|
58
|
|
|
display (bool): If 'True' then figure will be shown by the method, otherwise it should be shown manually using matplotlib function 'plt.show()'. |
|
59
|
|
|
offset (uint): Specify axes index on the figure where results should be drawn (only if argument 'figure' is specified). |
|
60
|
|
|
|
|
61
|
|
|
@return (figure) Figure where clusters were drawn. |
|
62
|
|
|
|
|
63
|
|
|
""" |
|
64
|
|
|
|
|
65
|
|
|
figure = kwargs.get('figure', None); |
|
66
|
|
|
display = kwargs.get('display', True); |
|
67
|
|
|
offset = kwargs.get('offset', 0); |
|
68
|
|
|
|
|
69
|
|
|
visualizer = cluster_visualizer(); |
|
70
|
|
|
visualizer.append_clusters(clusters, sample, canvas=offset); |
|
71
|
|
|
|
|
72
|
|
|
for cluster_index in range(len(clusters)): |
|
73
|
|
|
visualizer.append_cluster_attribute(offset, cluster_index, [representatives[cluster_index]], '*', 10); |
|
74
|
|
|
|
|
75
|
|
|
return visualizer.show(figure=figure, display=display); |
|
76
|
|
|
|
|
77
|
|
|
|
|
78
|
|
|
class bsas: |
|
79
|
|
|
"""! |
|
|
|
|
|
|
80
|
|
|
@brief Class represents BSAS clustering algorithm - basic sequential algorithmic scheme. |
|
81
|
|
|
@details Algorithm has two mandatory parameters: maximum allowable number of clusters and threshold |
|
82
|
|
|
of dissimilarity or in other words maximum distance between points. Distance metric also can |
|
83
|
|
|
be specified using 'metric' parameters, by default 'Manhattan' distance is used. |
|
84
|
|
|
BSAS using following rule for updating cluster representative: |
|
85
|
|
|
|
|
86
|
|
|
\f[ |
|
87
|
|
|
\vec{m}_{C_{k}}^{new}=\frac{ \left ( n_{C_{k}^{new}} - 1 \right )\vec{m}_{C_{k}}^{old} + \vec{x} }{n_{C_{k}^{new}}} |
|
88
|
|
|
\f] |
|
89
|
|
|
|
|
90
|
|
|
Clustering results of this algorithm depends on objects order in input data. |
|
91
|
|
|
|
|
92
|
|
|
Example: |
|
93
|
|
|
@code |
|
94
|
|
|
# Read data sample from 'Simple02.data'. |
|
95
|
|
|
sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE2); |
|
96
|
|
|
|
|
97
|
|
|
# Prepare algorithm's parameters. |
|
98
|
|
|
max_clusters = 2; |
|
99
|
|
|
threshold = 1.0; |
|
100
|
|
|
|
|
101
|
|
|
# Create instance of BSAS algorithm. |
|
102
|
|
|
bsas_instance = bsas(sample, max_clusters, threshold); |
|
103
|
|
|
bsas_instance.process(); |
|
104
|
|
|
|
|
105
|
|
|
# Get clustering results. |
|
106
|
|
|
clusters = bsas_instance.get_clusters(); |
|
107
|
|
|
representatives = bsas_instance.get_representatives(); |
|
108
|
|
|
|
|
109
|
|
|
# Display results. |
|
110
|
|
|
bsas_visualizer.show_clusters(sample, clusters, representatives); |
|
111
|
|
|
@endcode |
|
112
|
|
|
|
|
113
|
|
|
@see pyclustering.cluster.mbsas, pyclustering.cluster.ttsas |
|
114
|
|
|
|
|
115
|
|
|
""" |
|
116
|
|
|
|
|
117
|
|
|
def __init__(self, data, maximum_clusters, threshold, ccore=True, **kwargs): |
|
118
|
|
|
"""! |
|
119
|
|
|
@brief Creates classical BSAS algorithm. |
|
120
|
|
|
|
|
121
|
|
|
@param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple. |
|
122
|
|
|
@param[in] maximum_clusters: Maximum allowable number of clusters that can be allocated during processing. |
|
123
|
|
|
@param[in] threshold: Threshold of dissimilarity (maximum distance) between points. |
|
124
|
|
|
@param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving. |
|
125
|
|
|
@param[in] **kwargs: Arbitrary keyword arguments (available arguments: 'metric'). |
|
126
|
|
|
|
|
127
|
|
|
Keyword Args: |
|
128
|
|
|
metric (distance_metric): Metric that is used for distance calculation between two points. |
|
129
|
|
|
|
|
130
|
|
|
""" |
|
131
|
|
|
|
|
132
|
|
|
self._data = data; |
|
133
|
|
|
self._amount = maximum_clusters; |
|
134
|
|
|
self._threshold = threshold; |
|
135
|
|
|
self._metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN)); |
|
136
|
|
|
self._ccore = ccore and self._metric.get_type() != type_metric.USER_DEFINED; |
|
137
|
|
|
|
|
138
|
|
|
self._clusters = []; |
|
139
|
|
|
self._representatives = []; |
|
140
|
|
|
|
|
141
|
|
|
if self._ccore is True: |
|
142
|
|
|
self._ccore = ccore_library.workable(); |
|
143
|
|
|
|
|
144
|
|
|
|
|
145
|
|
|
def process(self): |
|
146
|
|
|
"""! |
|
147
|
|
|
@brief Performs cluster analysis in line with rules of BSAS algorithm. |
|
148
|
|
|
|
|
149
|
|
|
@remark Results of clustering can be obtained using corresponding get methods. |
|
150
|
|
|
|
|
151
|
|
|
@see get_clusters() |
|
152
|
|
|
@see get_representatives() |
|
153
|
|
|
|
|
154
|
|
|
""" |
|
155
|
|
|
|
|
156
|
|
|
if self._ccore is True: |
|
157
|
|
|
self.__process_by_ccore(); |
|
158
|
|
|
else: |
|
159
|
|
|
self.__prcess_by_python(); |
|
160
|
|
|
|
|
161
|
|
|
|
|
162
|
|
|
def __process_by_ccore(self): |
|
163
|
|
|
ccore_metric = metric_wrapper.create_instance(self._metric); |
|
164
|
|
|
self._clusters, self._representatives = bsas_wrapper(self._data, self._amount, self._threshold, ccore_metric.get_pointer()); |
|
165
|
|
|
|
|
166
|
|
|
|
|
167
|
|
|
def __prcess_by_python(self): |
|
168
|
|
|
self._clusters.append([0]); |
|
169
|
|
|
self._representatives.append(self._data[0]); |
|
170
|
|
|
|
|
171
|
|
|
for i in range(1, len(self._data)): |
|
172
|
|
|
point = self._data[i]; |
|
173
|
|
|
index_cluster, distance = self._find_nearest_cluster(point); |
|
174
|
|
|
|
|
175
|
|
|
if (distance > self._threshold) and (len(self._clusters) < self._amount): |
|
176
|
|
|
self._representatives.append(point); |
|
177
|
|
|
self._clusters.append([i]); |
|
178
|
|
|
else: |
|
179
|
|
|
self._clusters[index_cluster].append(i); |
|
180
|
|
|
self._update_representative(index_cluster, point); |
|
181
|
|
|
|
|
182
|
|
|
|
|
183
|
|
|
def get_clusters(self): |
|
184
|
|
|
"""! |
|
185
|
|
|
@brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data. |
|
186
|
|
|
|
|
187
|
|
|
@see process() |
|
188
|
|
|
@see get_representatives() |
|
189
|
|
|
|
|
190
|
|
|
""" |
|
191
|
|
|
return self._clusters; |
|
192
|
|
|
|
|
193
|
|
|
|
|
194
|
|
|
def get_representatives(self): |
|
195
|
|
|
"""! |
|
196
|
|
|
@brief Returns list of representatives of allocated clusters. |
|
197
|
|
|
|
|
198
|
|
|
@see process() |
|
199
|
|
|
@see get_clusters() |
|
200
|
|
|
|
|
201
|
|
|
""" |
|
202
|
|
|
return self._representatives; |
|
203
|
|
|
|
|
204
|
|
|
|
|
205
|
|
|
def get_cluster_encoding(self): |
|
206
|
|
|
"""! |
|
207
|
|
|
@brief Returns clustering result representation type that indicate how clusters are encoded. |
|
208
|
|
|
|
|
209
|
|
|
@return (type_encoding) Clustering result representation. |
|
210
|
|
|
|
|
211
|
|
|
@see get_clusters() |
|
212
|
|
|
|
|
213
|
|
|
""" |
|
214
|
|
|
|
|
215
|
|
|
return type_encoding.CLUSTER_INDEX_LIST_SEPARATION; |
|
216
|
|
|
|
|
217
|
|
|
|
|
218
|
|
|
def _find_nearest_cluster(self, point): |
|
219
|
|
|
"""! |
|
220
|
|
|
@brief Find nearest cluster to the specified point. |
|
221
|
|
|
|
|
222
|
|
|
@param[in] point (list): Point from dataset. |
|
223
|
|
|
|
|
224
|
|
|
@return (uint, double) Index of nearest cluster and distance to it. |
|
225
|
|
|
|
|
226
|
|
|
""" |
|
227
|
|
|
index_cluster = -1; |
|
228
|
|
|
nearest_distance = float('inf'); |
|
229
|
|
|
|
|
230
|
|
|
for index in range(len(self._representatives)): |
|
231
|
|
|
distance = self._metric(point, self._representatives[index]); |
|
232
|
|
|
if distance < nearest_distance: |
|
233
|
|
|
index_cluster = index; |
|
234
|
|
|
nearest_distance = distance; |
|
235
|
|
|
|
|
236
|
|
|
return index_cluster, nearest_distance; |
|
237
|
|
|
|
|
238
|
|
|
|
|
239
|
|
|
def _update_representative(self, index_cluster, point): |
|
240
|
|
|
"""! |
|
241
|
|
|
@brief Update cluster representative in line with new cluster size and added point to it. |
|
242
|
|
|
|
|
243
|
|
|
@param[in] index_cluster (uint): Index of cluster whose representative should be updated. |
|
244
|
|
|
@param[in] point (list): Point that was added to cluster. |
|
245
|
|
|
|
|
246
|
|
|
""" |
|
247
|
|
|
length = len(self._clusters[index_cluster]); |
|
248
|
|
|
rep = self._representatives[index_cluster]; |
|
249
|
|
|
|
|
250
|
|
|
for dimension in range(len(rep)): |
|
251
|
|
|
rep[dimension] = ( (length - 1) * rep[dimension] + point[dimension] ) / length; |
Escape sequences in Python are generally interpreted according to rules similar to standard C. Only if strings are prefixed with
rorRare they interpreted as regular expressions.The escape sequence that was used indicates that you might have intended to write a regular expression.
Learn more about the available escape sequences. in the Python documentation.