1 | <?php |
||
2 | declare(strict_types=1); |
||
3 | /** |
||
4 | * @copyright Copyright (c) 2023, Matias De lellis |
||
5 | * |
||
6 | * @author Matias De lellis <[email protected]> |
||
7 | * |
||
8 | * @license AGPL-3.0-or-later |
||
9 | * |
||
10 | * This code is free software: you can redistribute it and/or modify |
||
11 | * it under the terms of the GNU Affero General Public License, version 3, |
||
12 | * as published by the Free Software Foundation. |
||
13 | * |
||
14 | * This program is distributed in the hope that it will be useful, |
||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
17 | * GNU Affero General Public License for more details. |
||
18 | * |
||
19 | * You should have received a copy of the GNU Affero General Public License, version 3, |
||
20 | * along with this program. If not, see <http://www.gnu.org/licenses/> |
||
21 | * |
||
22 | */ |
||
23 | |||
24 | namespace OCA\FaceRecognition\Clusterer; |
||
25 | |||
26 | |||
27 | /** |
||
28 | * This class implements the graph clustering algorithm described in the |
||
29 | * paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its |
||
30 | * Application to Natural Language Processing Problems by Chris Biemann. |
||
31 | * |
||
32 | * In particular, it tries to be a shameless copy of the original dlib |
||
33 | * implementation. |
||
34 | * - https://github.com/davisking/dlib/blob/master/dlib/clustering/chinese_whispers.h |
||
35 | */ |
||
36 | class ChineseWhispers { |
||
37 | |||
38 | /** |
||
39 | * Cluster the dataset by assigning a label to each sample.from the edges |
||
40 | */ |
||
41 | static public function predict(array &$edges, array &$labels, int $num_iterations = 100) |
||
42 | { |
||
43 | // To improve the stability of the clusters, we must |
||
44 | // iterate the neighbors in a pseudo-random way. |
||
45 | mt_srand(2023); |
||
46 | |||
47 | $labels = []; |
||
48 | if (count($edges) == 0) |
||
49 | return 0; |
||
50 | |||
51 | $neighbors = []; |
||
52 | self::find_neighbor_ranges($edges, $neighbors); |
||
53 | |||
54 | // Initialize the labels, each node gets a different label. |
||
55 | for ($i = 0; $i < count($neighbors); ++$i) |
||
0 ignored issues
–
show
|
|||
56 | $labels[$i] = $i; |
||
57 | |||
58 | for ($iter = 0; $iter < count($neighbors)*$num_iterations; ++$iter) |
||
59 | { |
||
60 | // Pick a random node. |
||
61 | $idx = mt_rand()%count($neighbors); |
||
62 | |||
63 | // Count how many times each label happens amongst our neighbors. |
||
64 | $labels_to_counts = []; |
||
65 | $end = $neighbors[$idx][1]; |
||
66 | |||
67 | for ($i = $neighbors[$idx][0]; $i != $end; ++$i) |
||
68 | { |
||
69 | $iLabelFirst = $edges[$i][1]; |
||
70 | $iLabel = $labels[$iLabelFirst]; |
||
71 | if (isset($labels_to_counts[$iLabel])) |
||
72 | $labels_to_counts[$iLabel]++; |
||
73 | else |
||
74 | $labels_to_counts[$iLabel] = 1; |
||
75 | } |
||
76 | |||
77 | // find the most common label |
||
78 | // std::map<unsigned long, double>::iterator i; |
||
79 | $best_score = PHP_INT_MIN; |
||
80 | $best_label = $labels[$idx]; |
||
81 | foreach ($labels_to_counts as $key => $value) |
||
82 | { |
||
83 | if ($value > $best_score) |
||
84 | { |
||
85 | $best_score = $value; |
||
86 | $best_label = $key; |
||
87 | } |
||
88 | } |
||
89 | |||
90 | $labels[$idx] = $best_label; |
||
91 | } |
||
92 | |||
93 | // Remap the labels into a contiguous range. First we find the |
||
94 | // mapping. |
||
95 | $label_remap = []; |
||
96 | for ($i = 0; $i < count($labels); ++$i) |
||
0 ignored issues
–
show
It seems like you are calling the size function
count() as part of the test condition. You might want to compute the size beforehand, and not on each iteration.
If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: for ($i=0; $i<count($array); $i++) { // calls count() on each iteration
}
// Better
for ($i=0, $c=count($array); $i<$c; $i++) { // calls count() just once
}
![]() |
|||
97 | { |
||
98 | $next_id = count($label_remap); |
||
99 | if (!isset($label_remap[$labels[$i]])) |
||
100 | $label_remap[$labels[$i]] = $next_id; |
||
101 | } |
||
102 | // now apply the mapping to all the labels. |
||
103 | for ($i = 0; $i < count($labels); ++$i) |
||
0 ignored issues
–
show
It seems like you are calling the size function
count() as part of the test condition. You might want to compute the size beforehand, and not on each iteration.
If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: for ($i=0; $i<count($array); $i++) { // calls count() on each iteration
}
// Better
for ($i=0, $c=count($array); $i<$c; $i++) { // calls count() just once
}
![]() |
|||
104 | { |
||
105 | $labels[$i] = $label_remap[$labels[$i]]; |
||
106 | } |
||
107 | |||
108 | return count($label_remap); |
||
109 | } |
||
110 | |||
111 | static function find_neighbor_ranges (&$edges, &$neighbors) { |
||
0 ignored issues
–
show
|
|||
112 | // setup neighbors so that [neighbors[i].first, neighbors[i].second) is the range |
||
113 | // within edges that contains all node i's edges. |
||
114 | $num_nodes = self::max_index_plus_one($edges); |
||
115 | for ($i = 0; $i < $num_nodes; ++$i) $neighbors[$i] = [0, 0]; |
||
116 | $cur_node = 0; |
||
117 | $start_idx = 0; |
||
118 | for ($i = 0; $i < count($edges); ++$i) |
||
0 ignored issues
–
show
It seems like you are calling the size function
count() as part of the test condition. You might want to compute the size beforehand, and not on each iteration.
If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: for ($i=0; $i<count($array); $i++) { // calls count() on each iteration
}
// Better
for ($i=0, $c=count($array); $i<$c; $i++) { // calls count() just once
}
![]() |
|||
119 | { |
||
120 | if ($edges[$i][0] != $cur_node) |
||
121 | { |
||
122 | $neighbors[$cur_node] = [$start_idx, $i]; |
||
123 | $start_idx = $i; |
||
124 | $cur_node = $edges[$i][0]; |
||
125 | } |
||
126 | } |
||
127 | if (count($neighbors) !== 0) |
||
128 | $neighbors[$cur_node] = [$start_idx, count($edges)]; |
||
129 | } |
||
130 | |||
131 | static function max_index_plus_one ($pairs): int { |
||
0 ignored issues
–
show
|
|||
132 | if (count($pairs) === 0) |
||
133 | { |
||
134 | return 0; |
||
135 | } |
||
136 | else { |
||
137 | $max_idx = 0; |
||
138 | for ($i = 0; $i < count($pairs); ++$i) |
||
0 ignored issues
–
show
It seems like you are calling the size function
count() as part of the test condition. You might want to compute the size beforehand, and not on each iteration.
If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: for ($i=0; $i<count($array); $i++) { // calls count() on each iteration
}
// Better
for ($i=0, $c=count($array); $i<$c; $i++) { // calls count() just once
}
![]() |
|||
139 | { |
||
140 | if ($pairs[$i][0] > $max_idx) |
||
141 | $max_idx = $pairs[$i][0]; |
||
142 | if ($pairs[$i][1] > $max_idx) |
||
143 | $max_idx = $pairs[$i][1]; |
||
144 | } |
||
145 | return $max_idx + 1; |
||
146 | } |
||
147 | } |
||
148 | |||
149 | static function convert_unordered_to_ordered (&$edges, &$out_edges) |
||
0 ignored issues
–
show
|
|||
150 | { |
||
151 | $out_edges = []; |
||
152 | for ($i = 0; $i < count($edges); ++$i) |
||
0 ignored issues
–
show
It seems like you are calling the size function
count() as part of the test condition. You might want to compute the size beforehand, and not on each iteration.
If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: for ($i=0; $i<count($array); $i++) { // calls count() on each iteration
}
// Better
for ($i=0, $c=count($array); $i<$c; $i++) { // calls count() just once
}
![]() |
|||
153 | { |
||
154 | $out_edges[] = [$edges[$i][0], $edges[$i][1]]; |
||
155 | if ($edges[$i][0] != $edges[$i][1]) |
||
156 | $out_edges[] = [$edges[$i][1], $edges[$i][0]]; |
||
157 | } |
||
158 | } |
||
159 | } |
||
160 |
If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: