ChineseWhispers - Code Metrics - matiasdelellis/facerecognition - Measure and Improve Code Quality continuously with Scrutinizer

ChineseWhispers A
last analyzed 2025-04-04 20:37 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	121
Duplicated Lines	0 %

Test Coverage

Coverage

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
eloc	61
c	1
b	0
f	0
dl	0
loc	121
ccs	0
cts	62
cp	0
rs	10
wmc	24

4 Methods

Rating	Name	Size	Complexity
A	max_index_plus_one()	15	5
A	convert_unordered_to_ordered()	8	3
B	predict()	68	11
A	find_neighbor_ranges()	18	5

<?php
declare(strict_types=1);
/**
 * @copyright Copyright (c) 2023, Matias De lellis
 *
 * @author Matias De lellis <[email protected]>
 *
 * @license AGPL-3.0-or-later
 *
 * This code is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License, version 3,
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 *
 */

namespace OCA\FaceRecognition\Clusterer;


/**
 * This class implements the graph clustering algorithm described in the
 * paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its
 * Application to Natural Language Processing Problems by Chris Biemann.
 *
 * In particular, it tries to be a shameless copy of the original dlib
 * implementation.
 *  - https://github.com/davisking/dlib/blob/master/dlib/clustering/chinese_whispers.h
 */
class ChineseWhispers {

	/**
	 * Cluster the nodes of a graph by assigning a label to each node from the edges.
	 *
	 * Every node starts with its own label. Then, for a fixed number of
	 * steps, a pseudo-randomly chosen node adopts the label that occurs most
	 * often amongst its neighbors (the first one found wins on ties; a node
	 * without edges keeps its label). Finally the labels are renumbered into
	 * the contiguous range 0..(clusters - 1), in order of first appearance.
	 *
	 * NOTE(review): the neighbor lookup treats each run of consecutive edges
	 * with the same first node as that node's neighbor list, so $edges is
	 * presumably expected to be sorted by first node - confirm with callers.
	 *
	 * @param array $edges          List of [nodeA, nodeB] pairs with integer node ids. Not modified.
	 * @param array $labels         Output. Reset and filled with one cluster label per node id.
	 * @param int   $num_iterations Number of label updates performed per node on average.
	 *
	 * @return int Number of clusters found (0 when there are no edges).
	 */
	public static function predict(array &$edges, array &$labels, int $num_iterations = 100)
	{
		// To improve the stability of the clusters, we must
		// iterate the neighbors in a pseudo-random way.
		// The fixed seed makes the result reproducible between runs.
		mt_srand(2023);

		$labels = [];
		if (count($edges) === 0)
			return 0;

		$neighbors = [];
		self::find_neighbor_ranges($edges, $neighbors);
		$num_nodes = count($neighbors);

		// Initialize the labels, each node gets a different label.
		for ($i = 0; $i < $num_nodes; ++$i)
			$labels[$i] = $i;

		$total_steps = $num_nodes * $num_iterations;
		for ($iter = 0; $iter < $total_steps; ++$iter)
		{
			// Pick a random node. The modulo form is kept on purpose: changing
			// how the number is drawn would change the resulting clusters.
			$idx = mt_rand() % $num_nodes;

			// Count how many times each label happens amongst our neighbors.
			$labels_to_counts = [];
			$end = $neighbors[$idx][1];

			for ($i = $neighbors[$idx][0]; $i !== $end; ++$i)
			{
				$neighbor_label = $labels[$edges[$i][1]];
				$labels_to_counts[$neighbor_label] = ($labels_to_counts[$neighbor_label] ?? 0) + 1;
			}

			// Find the most common label. The strict comparison keeps the
			// first label seen when several share the highest count.
			$best_score = PHP_INT_MIN;
			$best_label = $labels[$idx];
			foreach ($labels_to_counts as $label => $score)
			{
				if ($score > $best_score)
				{
					$best_score = $score;
					$best_label = $label;
				}
			}

			$labels[$idx] = $best_label;
		}

		// Remap the labels into a contiguous range.  First we find the
		// mapping.
		$label_remap = [];
		$num_labels = count($labels);
		for ($i = 0; $i < $num_labels; ++$i)
		{
			if (!isset($label_remap[$labels[$i]]))
				$label_remap[$labels[$i]] = count($label_remap);
		}
		// now apply the mapping to all the labels.
		for ($i = 0; $i < $num_labels; ++$i)
		{
			$labels[$i] = $label_remap[$labels[$i]];
		}

		return count($label_remap);
	}

	/**
	 * Compute, for every node, the range of positions in $edges holding its edges.
	 *
	 * After the call, [$neighbors[i][0], $neighbors[i][1]) is the half-open
	 * range within $edges recorded for node i. Nodes that never appear as the
	 * first element of an edge keep the empty range [0, 0].
	 *
	 * A new range starts whenever the first element of an edge differs from
	 * the previous one, so edges of the same node must be contiguous.
	 *
	 * @param array $edges     List of [nodeA, nodeB] pairs. Not modified.
	 * @param array $neighbors Output. Entries 0..(max node id) are (re)written.
	 */
	public static function find_neighbor_ranges (&$edges, &$neighbors) {

		// setup neighbors so that [neighbors[i].first, neighbors[i].second) is the range
		// within edges that contains all node i's edges.
		$num_nodes = self::max_index_plus_one($edges);
		for ($i = 0; $i < $num_nodes; ++$i)
			$neighbors[$i] = [0, 0];

		$num_edges = count($edges);
		$cur_node = 0;
		$start_idx = 0;
		for ($i = 0; $i < $num_edges; ++$i)
		{
			if ($edges[$i][0] != $cur_node)
			{
				// The run of $cur_node just ended: close its range and open the next one.
				$neighbors[$cur_node] = [$start_idx, $i];
				$start_idx = $i;
				$cur_node = $edges[$i][0];
			}
		}
		// Close the range of the last run, which the loop above never terminates.
		if (count($neighbors) !== 0)
			$neighbors[$cur_node] = [$start_idx, $num_edges];
	}

	/**
	 * Return the largest node id found in a list of pairs, plus one.
	 *
	 * @param array $pairs List of [a, b] pairs.
	 *
	 * @return int 0 for an empty list, otherwise max(a, b over all pairs, 0) + 1.
	 */
	public static function max_index_plus_one ($pairs): int {

		$num_pairs = count($pairs);
		if ($num_pairs === 0)
		{
			return 0;
		}

		$max_idx = 0;
		for ($i = 0; $i < $num_pairs; ++$i)
		{
			if ($pairs[$i][0] > $max_idx)
				$max_idx = $pairs[$i][0];
			if ($pairs[$i][1] > $max_idx)
				$max_idx = $pairs[$i][1];
		}
		return $max_idx + 1;
	}

	/**
	 * Expand a list of undirected edges into directed edges.
	 *
	 * Each [a, b] is copied to the output followed by its reverse [b, a];
	 * self loops (a == b) are emitted only once. The output keeps the input
	 * order and is not sorted.
	 *
	 * @param array $edges     List of [a, b] pairs. Not modified.
	 * @param array $out_edges Output. Reset and filled with the directed edges.
	 */
	public static function convert_unordered_to_ordered (&$edges, &$out_edges)
	{
		$out_edges = [];
		$num_edges = count($edges);
		for ($i = 0; $i < $num_edges; ++$i)
		{
			$out_edges[] = [$edges[$i][0], $edges[$i][1]];
			if ($edges[$i][0] != $edges[$i][1])
				$out_edges[] = [$edges[$i][1], $edges[$i][0]];
		}
	}
}


1			<?php
2			declare(strict_types=1);
3			/**
4			* @copyright Copyright (c) 2023, Matias De lellis
5			*
6			* @author Matias De lellis <[email protected]>
7			*
8			* @license AGPL-3.0-or-later
9			*
10			* This code is free software: you can redistribute it and/or modify
11			* it under the terms of the GNU Affero General Public License, version 3,
12			* as published by the Free Software Foundation.
13			*
14			* This program is distributed in the hope that it will be useful,
15			* but WITHOUT ANY WARRANTY; without even the implied warranty of
16			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17			* GNU Affero General Public License for more details.
18			*
19			* You should have received a copy of the GNU Affero General Public License, version 3,
20			* along with this program. If not, see <http://www.gnu.org/licenses/>
21			*
22			*/
23
24			namespace OCA\FaceRecognition\Clusterer;
25
26
27			/**
28			* This class implements the graph clustering algorithm described in the
29			* paper: Chinese Whispers - an Efficient Graph Clustering Algorithm and its
30			* Application to Natural Language Processing Problems by Chris Biemann.
31			*
32			* In particular, it tries to be a shameless copy of the original dlib
33			* implementation.
34			* - https://github.com/davisking/dlib/blob/master/dlib/clustering/chinese_whispers.h
35			*/
36			class ChineseWhispers {
37
38			/**
39			* Cluster the dataset by assigning a label to each sample from the edges
40			*/
41			static public function predict(array &$edges, array &$labels, int $num_iterations = 100)
42			{
43			// To improve the stability of the clusters, we must
44			// iterate the neighbors in a pseudo-random way.
45			mt_srand(2023);
46
47			$labels = [];
48			if (count($edges) == 0)
49			return 0;
50
51			$neighbors = [];
52			self::find_neighbor_ranges($edges, $neighbors);
53
54			// Initialize the labels, each node gets a different label.
55			for ($i = 0; $i < count($neighbors); ++$i)
			0 ignored issues – show Performance Best Practice introduced 2023-08-22 23:32 UTC by Report Bug Copy Issue Report It seems like you are calling the size function `count()` as part of the test condition. You might want to compute the size beforehand, and not on each iteration. If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: for ($i=0; $i<count($array); $i++) { // calls count() on each iteration } // Better for ($i=0, $c=count($array); $i<$c; $i++) { // calls count() just once } Loading history...
56			$labels[$i] = $i;
57
58			for ($iter = 0; $iter < count($neighbors)*$num_iterations; ++$iter)
59			{
60			// Pick a random node.
61			$idx = mt_rand()%count($neighbors);
62
63			// Count how many times each label happens amongst our neighbors.
64			$labels_to_counts = [];
65			$end = $neighbors[$idx][1];
66
67			for ($i = $neighbors[$idx][0]; $i != $end; ++$i)
68			{
69			$iLabelFirst = $edges[$i][1];
70			$iLabel = $labels[$iLabelFirst];
71			if (isset($labels_to_counts[$iLabel]))
72			$labels_to_counts[$iLabel]++;
73			else
74			$labels_to_counts[$iLabel] = 1;
75			}
76
77			// find the most common label
78			// std::map<unsigned long, double>::iterator i;
79			$best_score = PHP_INT_MIN;
80			$best_label = $labels[$idx];
81			foreach ($labels_to_counts as $key => $value)
82			{
83			if ($value > $best_score)
84			{
85			$best_score = $value;
86			$best_label = $key;
87			}
88			}
89
90			$labels[$idx] = $best_label;
91			}
92
93			// Remap the labels into a contiguous range. First we find the
94			// mapping.
95			$label_remap = [];
96			for ($i = 0; $i < count($labels); ++$i)
			0 ignored issues – show Performance Best Practice introduced 2023-08-22 23:32 UTC by Report Bug Copy Issue Report It seems like you are calling the size function `count()` as part of the test condition. You might want to compute the size beforehand, and not on each iteration. If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: for ($i=0; $i<count($array); $i++) { // calls count() on each iteration } // Better for ($i=0, $c=count($array); $i<$c; $i++) { // calls count() just once } Loading history...
97			{
98			$next_id = count($label_remap);
99			if (!isset($label_remap[$labels[$i]]))
100			$label_remap[$labels[$i]] = $next_id;
101			}
102			// now apply the mapping to all the labels.
103			for ($i = 0; $i < count($labels); ++$i)
			0 ignored issues – show Performance Best Practice introduced 2023-08-22 23:32 UTC by Report Bug Copy Issue Report It seems like you are calling the size function `count()` as part of the test condition. You might want to compute the size beforehand, and not on each iteration. If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: for ($i=0; $i<count($array); $i++) { // calls count() on each iteration } // Better for ($i=0, $c=count($array); $i<$c; $i++) { // calls count() just once } Loading history...
104			{
105			$labels[$i] = $label_remap[$labels[$i]];
106			}
107
108			return count($label_remap);
109			}
110
111			static function find_neighbor_ranges (&$edges, &$neighbors) {
			0 ignored issues – show Best Practice introduced 2023-08-22 23:32 UTC by Report Bug Copy Issue Report It is generally recommended to explicitly declare the visibility for methods. Adding explicit visibility (`private`, `protected`, or `public`) is generally recommended to communicate to other developers how, and from where this method is intended to be used. Loading history...
112			// setup neighbors so that [neighbors[i].first, neighbors[i].second) is the range
113			// within edges that contains all node i's edges.
114			$num_nodes = self::max_index_plus_one($edges);
115			for ($i = 0; $i < $num_nodes; ++$i) $neighbors[$i] = [0, 0];
116			$cur_node = 0;
117			$start_idx = 0;
118			for ($i = 0; $i < count($edges); ++$i)
			0 ignored issues – show Performance Best Practice introduced 2023-08-22 23:32 UTC by Report Bug Copy Issue Report It seems like you are calling the size function `count()` as part of the test condition. You might want to compute the size beforehand, and not on each iteration. If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: for ($i=0; $i<count($array); $i++) { // calls count() on each iteration } // Better for ($i=0, $c=count($array); $i<$c; $i++) { // calls count() just once } Loading history...
119			{
120			if ($edges[$i][0] != $cur_node)
121			{
122			$neighbors[$cur_node] = [$start_idx, $i];
123			$start_idx = $i;
124			$cur_node = $edges[$i][0];
125			}
126			}
127			if (count($neighbors) !== 0)
128			$neighbors[$cur_node] = [$start_idx, count($edges)];
129			}
130
131			static function max_index_plus_one ($pairs): int {
			0 ignored issues – show Best Practice introduced 2023-08-22 23:32 UTC by Report Bug Copy Issue Report It is generally recommended to explicitly declare the visibility for methods. Adding explicit visibility (`private`, `protected`, or `public`) is generally recommended to communicate to other developers how, and from where this method is intended to be used. Loading history...
132			if (count($pairs) === 0)
133			{
134			return 0;
135			}
136			else {
137			$max_idx = 0;
138			for ($i = 0; $i < count($pairs); ++$i)
			0 ignored issues – show Performance Best Practice introduced 2023-08-22 23:32 UTC by Report Bug Copy Issue Report It seems like you are calling the size function `count()` as part of the test condition. You might want to compute the size beforehand, and not on each iteration. If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: for ($i=0; $i<count($array); $i++) { // calls count() on each iteration } // Better for ($i=0, $c=count($array); $i<$c; $i++) { // calls count() just once } Loading history...
139			{
140			if ($pairs[$i][0] > $max_idx)
141			$max_idx = $pairs[$i][0];
142			if ($pairs[$i][1] > $max_idx)
143			$max_idx = $pairs[$i][1];
144			}
145			return $max_idx + 1;
146			}
147			}
148
149			static function convert_unordered_to_ordered (&$edges, &$out_edges)
			0 ignored issues – show Best Practice introduced 2023-08-22 23:32 UTC by Report Bug Copy Issue Report It is generally recommended to explicitly declare the visibility for methods. Adding explicit visibility (`private`, `protected`, or `public`) is generally recommended to communicate to other developers how, and from where this method is intended to be used. Loading history...
150			{
151			$out_edges = [];
152			for ($i = 0; $i < count($edges); ++$i)
			0 ignored issues – show Performance Best Practice introduced 2023-08-22 23:32 UTC by Report Bug Copy Issue Report It seems like you are calling the size function `count()` as part of the test condition. You might want to compute the size beforehand, and not on each iteration. If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: for ($i=0; $i<count($array); $i++) { // calls count() on each iteration } // Better for ($i=0, $c=count($array); $i<$c; $i++) { // calls count() just once } Loading history...
153			{
154			$out_edges[] = [$edges[$i][0], $edges[$i][1]];
155			if ($edges[$i][0] != $edges[$i][1])
156			$out_edges[] = [$edges[$i][1], $edges[$i][0]];
157			}
158			}
159			}
160

matiasdelellis / facerecognition

ChineseWhispers A last analyzed 2025-04-04 20:37 UTC

Complexity

Size/Duplication

Test Coverage

Importance

4 Methods

Duplication Side-by-Side

Filter issues like

ChineseWhispers A
last analyzed 2025-04-04 20:37 UTC