@@ -100,9 +100,9 @@ discard block |
||
100 | 100 | ) { |
101 | 101 | if (!empty($firstString) || !empty($secondString)) { |
102 | 102 | $this->compOne = $firstString; |
103 | - $this->compOneLength = (int)mb_strlen($this->compOne, 'UTF-8'); |
|
103 | + $this->compOneLength = (int) mb_strlen($this->compOne, 'UTF-8'); |
|
104 | 104 | $this->compTwo = $secondString; |
105 | - $this->compTwoLength = (int)mb_strlen($this->compTwo, 'UTF-8'); |
|
105 | + $this->compTwoLength = (int) mb_strlen($this->compTwo, 'UTF-8'); |
|
106 | 106 | } |
107 | 107 | |
108 | 108 | $this->insCost = $insCost; |
@@ -164,10 +164,10 @@ discard block |
||
164 | 164 | |
165 | 165 | for ($i = 1; $i <= $oneSize; $i += 1) { |
166 | 166 | // Curchar for the first string |
167 | - $cOne = (string)mb_substr($this->compOne, $i - 1, 1, 'UTF-8'); |
|
167 | + $cOne = (string) mb_substr($this->compOne, $i - 1, 1, 'UTF-8'); |
|
168 | 168 | for ($j = 1; $j <= $twoSize; $j += 1) { |
169 | 169 | // Curchar for the second string |
170 | - $cTwo = (string)mb_substr($this->compTwo, $j - 1, 1, 'UTF-8'); |
|
170 | + $cTwo = (string) mb_substr($this->compTwo, $j - 1, 1, 'UTF-8'); |
|
171 | 171 | |
172 | 172 | // Compute substitution cost |
173 | 173 | if ($this->compare($cOne, $cTwo) === 0) { |
@@ -196,9 +196,9 @@ discard block |
||
196 | 196 | if ($i > 1 && $j > 1) { |
197 | 197 | // Last two |
198 | 198 | // @phan-suppress-next-line PhanPartialTypeMismatchArgumentInternal |
199 | - $ccOne = (string)mb_substr($this->compOne, $i - 2, 1, 'UTF-8'); |
|
199 | + $ccOne = (string) mb_substr($this->compOne, $i - 2, 1, 'UTF-8'); |
|
200 | 200 | // @phan-suppress-next-line PhanPartialTypeMismatchArgumentInternal |
201 | - $ccTwo = (string)mb_substr($this->compTwo, $j - 2, 1, 'UTF-8'); |
|
201 | + $ccTwo = (string) mb_substr($this->compTwo, $j - 2, 1, 'UTF-8'); |
|
202 | 202 | |
203 | 203 | if ($this->compare($cOne, $ccTwo) === 0 && $this->compare($ccOne, $cTwo) === 0) { |
204 | 204 | // Transposition cost is computed as minimal of two |
@@ -244,7 +244,7 @@ discard block |
||
244 | 244 | $maxCost += $extraSize * $this->insCost; |
245 | 245 | } |
246 | 246 | |
247 | - return (int)$maxCost; |
|
247 | + return (int) $maxCost; |
|
248 | 248 | } |
249 | 249 | |
250 | 250 | /** |
@@ -258,7 +258,7 @@ discard block |
||
258 | 258 | $this->setupMatrix(); |
259 | 259 | } |
260 | 260 | |
261 | - return (float)(1 - ($this->getSimilarity() / $this->getMaximalDistance())); |
|
261 | + return (float) (1 - ($this->getSimilarity() / $this->getMaximalDistance())); |
|
262 | 262 | } |
263 | 263 | |
264 | 264 | /** |
@@ -287,12 +287,12 @@ discard block |
||
287 | 287 | $oneSize = $this->compOneLength; |
288 | 288 | $twoSize = $this->compTwoLength; |
289 | 289 | |
290 | - $out = ' ' . $this->compOne . PHP_EOL; |
|
290 | + $out = ' '.$this->compOne.PHP_EOL; |
|
291 | 291 | for ($y = 0; $y <= $twoSize; $y += 1) { |
292 | 292 | if ($y - 1 < 0) { |
293 | 293 | $out .= ' '; |
294 | 294 | } else { |
295 | - $out .= (string)mb_substr($this->compTwo, $y - 1, 1, 'UTF-8'); |
|
295 | + $out .= (string) mb_substr($this->compTwo, $y - 1, 1, 'UTF-8'); |
|
296 | 296 | } |
297 | 297 | |
298 | 298 | for ($x = 0; $x <= $oneSize; $x += 1) { |
@@ -10,387 +10,387 @@ |
||
10 | 10 | class DamerauLevenshtein |
11 | 11 | { |
12 | 12 | |
13 | - /** |
|
14 | - * First string. |
|
15 | - * |
|
16 | - * @var String |
|
17 | - */ |
|
18 | - private $compOne; |
|
19 | - |
|
20 | - /** |
|
21 | - * Second string. |
|
22 | - * |
|
23 | - * @var String |
|
24 | - */ |
|
25 | - private $compTwo; |
|
26 | - |
|
27 | - /** |
|
28 | - * Length of first string. |
|
29 | - * |
|
30 | - * @var int |
|
31 | - */ |
|
32 | - private $compOneLength = 0; |
|
33 | - |
|
34 | - /** |
|
35 | - * Length of second string. |
|
36 | - * |
|
37 | - * @var int |
|
38 | - */ |
|
39 | - private $compTwoLength = 0; |
|
40 | - |
|
41 | - /** |
|
42 | - * Matrix for Damerau Levenshtein distance dynamic programming computation. |
|
43 | - * |
|
44 | - * @var int[][] |
|
45 | - */ |
|
46 | - private $matrix; |
|
47 | - |
|
48 | - /** |
|
49 | - * Boolean flag indicating whether the matrix has been computed for the input strings. |
|
50 | - * |
|
51 | - * @var bool |
|
52 | - */ |
|
53 | - private $calculated = false; |
|
54 | - |
|
55 | - /** |
|
56 | - * Cost of character insertion (to first string to match second string). |
|
57 | - * |
|
58 | - * @var int |
|
59 | - */ |
|
60 | - private $insCost = 1; |
|
61 | - |
|
62 | - /** |
|
63 | - * Cost of character deletion (from first string to match second string). |
|
64 | - * |
|
65 | - * @var int |
|
66 | - */ |
|
67 | - private $delCost = 1; |
|
68 | - |
|
69 | - /** |
|
70 | - * Substitution cost. |
|
71 | - * |
|
72 | - * @var int |
|
73 | - */ |
|
74 | - private $subCost = 1; |
|
75 | - |
|
76 | - /** |
|
77 | - * Transposition cost. |
|
78 | - * |
|
79 | - * @var int |
|
80 | - */ |
|
81 | - private $transCost = 1; |
|
82 | - |
|
83 | - /** |
|
84 | - * Constructor. |
|
85 | - * |
|
86 | - * @param string $firstString first string to compute distance |
|
87 | - * @param string $secondString second string to compute distance |
|
88 | - * @param int $insCost Cost of character insertion |
|
89 | - * @param int $delCost Cost of character deletion |
|
90 | - * @param int $subCost Substitution cost |
|
91 | - * @param int $transCost Transposition cost |
|
92 | - */ |
|
93 | - public function __construct( |
|
94 | - string $firstString, |
|
95 | - string $secondString, |
|
96 | - int $insCost = 1, |
|
97 | - int $delCost = 1, |
|
98 | - int $subCost = 1, |
|
99 | - int $transCost = 1 |
|
100 | - ) { |
|
101 | - if (!empty($firstString) || !empty($secondString)) { |
|
102 | - $this->compOne = $firstString; |
|
103 | - $this->compOneLength = (int)mb_strlen($this->compOne, 'UTF-8'); |
|
104 | - $this->compTwo = $secondString; |
|
105 | - $this->compTwoLength = (int)mb_strlen($this->compTwo, 'UTF-8'); |
|
106 | - } |
|
107 | - |
|
108 | - $this->insCost = $insCost; |
|
109 | - $this->delCost = $delCost; |
|
110 | - $this->subCost = $subCost; |
|
111 | - $this->transCost = $transCost; |
|
112 | - } |
|
113 | - |
|
114 | - /** |
|
115 | - * Returns computed matrix for given input strings. |
|
116 | - * |
|
117 | - * @return int[][] matrix |
|
118 | - */ |
|
119 | - public function getMatrix(): array |
|
120 | - { |
|
121 | - if (!$this->calculated) { |
|
122 | - $this->setupMatrix(); |
|
123 | - } |
|
124 | - |
|
125 | - return $this->matrix; |
|
126 | - } |
|
127 | - |
|
128 | - /** |
|
129 | - * Returns similarity of strings, absolute number = Damerau Levenshtein distance. |
|
130 | - * |
|
131 | - * @return int |
|
132 | - */ |
|
133 | - public function getSimilarity(): int |
|
134 | - { |
|
135 | - if (!$this->calculated) { |
|
136 | - $this->setupMatrix(); |
|
137 | - } |
|
138 | - |
|
139 | - return $this->matrix[$this->compOneLength][$this->compTwoLength]; |
|
140 | - } |
|
141 | - |
|
142 | - /** |
|
143 | - * Procedure to compute matrix for given input strings. |
|
144 | - * |
|
145 | - * @return void |
|
146 | - * @SuppressWarnings(PHPMD.CyclomaticComplexity) |
|
147 | - */ |
|
148 | - private function setupMatrix(): void |
|
149 | - { |
|
150 | - $this->matrix = [[]]; |
|
151 | - |
|
152 | - $oneSize = $this->compOneLength; |
|
153 | - $twoSize = $this->compTwoLength; |
|
154 | - |
|
155 | - for ($i = 0; $i <= $oneSize; $i += 1) { |
|
156 | - // @phan-suppress-next-line PhanTypeInvalidDimOffset |
|
157 | - $this->matrix[$i][0] = $i > 0 ? $this->matrix[$i - 1][0] + $this->delCost : 0; |
|
158 | - } |
|
159 | - |
|
160 | - for ($i = 0; $i <= $twoSize; $i += 1) { |
|
161 | - // Insertion actually |
|
162 | - $this->matrix[0][$i] = $i > 0 ? $this->matrix[0][$i - 1] + $this->insCost : 0; |
|
163 | - } |
|
164 | - |
|
165 | - for ($i = 1; $i <= $oneSize; $i += 1) { |
|
166 | - // Curchar for the first string |
|
167 | - $cOne = (string)mb_substr($this->compOne, $i - 1, 1, 'UTF-8'); |
|
168 | - for ($j = 1; $j <= $twoSize; $j += 1) { |
|
169 | - // Curchar for the second string |
|
170 | - $cTwo = (string)mb_substr($this->compTwo, $j - 1, 1, 'UTF-8'); |
|
171 | - |
|
172 | - // Compute substitution cost |
|
173 | - if ($this->compare($cOne, $cTwo) === 0) { |
|
174 | - $cost = 0; |
|
175 | - $trans = 0; |
|
176 | - } else { |
|
177 | - $cost = $this->subCost; |
|
178 | - $trans = $this->transCost; |
|
179 | - } |
|
180 | - |
|
181 | - // Deletion cost |
|
182 | - // @phan-suppress-next-line PhanTypeInvalidDimOffset, PhanTypeInvalidLeftOperandOfAdd |
|
183 | - $del = $this->matrix[$i - 1][$j] + $this->delCost; |
|
184 | - |
|
185 | - // Insertion cost |
|
186 | - // @phan-suppress-next-line PhanTypeArraySuspiciousNull, PhanTypeInvalidDimOffset PhanTypeInvalidLeftOperandOfAdd |
|
187 | - $ins = $this->matrix[$i][$j - 1] + $this->insCost; |
|
188 | - |
|
189 | - // Substitution cost, 0 if same |
|
190 | - $sub = $this->matrix[$i - 1][$j - 1] + $cost; |
|
191 | - |
|
192 | - // Compute optimal |
|
193 | - $this->matrix[$i][$j] = min($del, $ins, $sub); |
|
194 | - |
|
195 | - // Transposition cost |
|
196 | - if ($i > 1 && $j > 1) { |
|
197 | - // Last two |
|
198 | - // @phan-suppress-next-line PhanPartialTypeMismatchArgumentInternal |
|
199 | - $ccOne = (string)mb_substr($this->compOne, $i - 2, 1, 'UTF-8'); |
|
200 | - // @phan-suppress-next-line PhanPartialTypeMismatchArgumentInternal |
|
201 | - $ccTwo = (string)mb_substr($this->compTwo, $j - 2, 1, 'UTF-8'); |
|
202 | - |
|
203 | - if ($this->compare($cOne, $ccTwo) === 0 && $this->compare($ccOne, $cTwo) === 0) { |
|
204 | - // Transposition cost is computed as minimal of two |
|
205 | - // @phan-suppress-next-line PhanPartialTypeMismatchArgumentInternal |
|
206 | - $this->matrix[$i][$j] = min($this->matrix[$i][$j], $this->matrix[$i - 2][$j - 2] + $trans); |
|
207 | - } |
|
208 | - } |
|
209 | - } |
|
210 | - } |
|
211 | - |
|
212 | - $this->calculated = true; |
|
213 | - } |
|
214 | - |
|
215 | - /** |
|
216 | - * Returns maximal possible edit Damerau Levenshtein distance between texts. |
|
217 | - * |
|
218 | - * On common substring of same length perform substitution / insert + delete |
|
219 | - * (depends on what is cheaper), then on extra characters perform insertion / deletion |
|
220 | - * |
|
221 | - * @return int |
|
222 | - */ |
|
223 | - public function getMaximalDistance(): int |
|
224 | - { |
|
225 | - $oneSize = $this->compOneLength; |
|
226 | - $twoSize = $this->compTwoLength; |
|
227 | - |
|
228 | - // Is substitution cheaper than delete + insert? |
|
229 | - $subCost = min($this->subCost, $this->delCost + $this->insCost); |
|
230 | - |
|
231 | - // Get common size |
|
232 | - $minSize = min($oneSize, $twoSize); |
|
233 | - $maxSize = max($oneSize, $twoSize); |
|
234 | - $extraSize = $maxSize - $minSize; |
|
235 | - |
|
236 | - // On common size perform substitution / delete + insert, whichever is cheaper |
|
237 | - $maxCost = $subCost * $minSize; |
|
238 | - |
|
239 | - // On resulting do insert/delete |
|
240 | - if ($oneSize > $twoSize) { |
|
241 | - // Delete extra characters |
|
242 | - $maxCost += $extraSize * $this->delCost; |
|
243 | - } else { |
|
244 | - // Insert extra characters |
|
245 | - $maxCost += $extraSize * $this->insCost; |
|
246 | - } |
|
247 | - |
|
248 | - return (int)$maxCost; |
|
249 | - } |
|
250 | - |
|
251 | - /** |
|
252 | - * Returns relative distance of input strings (computed with maximal possible distance). |
|
253 | - * |
|
254 | - * @return float |
|
255 | - */ |
|
256 | - public function getRelativeDistance(): float |
|
257 | - { |
|
258 | - if (!$this->calculated) { |
|
259 | - $this->setupMatrix(); |
|
260 | - } |
|
261 | - |
|
262 | - return (float)(1 - ($this->getSimilarity() / $this->getMaximalDistance())); |
|
263 | - } |
|
264 | - |
|
265 | - /** |
|
266 | - * Compares two characters from string (this method may be overridden in child class). |
|
267 | - * |
|
268 | - * @param string $firstCharacter First character |
|
269 | - * @param string $secondCharacter Second character |
|
270 | - * @return int |
|
271 | - */ |
|
272 | - protected function compare(string $firstCharacter, string $secondCharacter): int |
|
273 | - { |
|
274 | - return strcmp($firstCharacter, $secondCharacter); |
|
275 | - } |
|
276 | - |
|
277 | - /** |
|
278 | - * Returns computed matrix for given input strings (For debugging purposes). |
|
279 | - * |
|
280 | - * @return string |
|
281 | - */ |
|
282 | - public function displayMatrix(): string |
|
283 | - { |
|
284 | - if (!$this->calculated) { |
|
285 | - $this->setupMatrix(); |
|
286 | - } |
|
287 | - |
|
288 | - $oneSize = $this->compOneLength; |
|
289 | - $twoSize = $this->compTwoLength; |
|
290 | - |
|
291 | - $out = ' ' . $this->compOne . PHP_EOL; |
|
292 | - for ($y = 0; $y <= $twoSize; $y += 1) { |
|
293 | - if ($y - 1 < 0) { |
|
294 | - $out .= ' '; |
|
295 | - } else { |
|
296 | - $out .= (string)mb_substr($this->compTwo, $y - 1, 1, 'UTF-8'); |
|
297 | - } |
|
298 | - |
|
299 | - for ($x = 0; $x <= $oneSize; $x += 1) { |
|
300 | - $out .= $this->matrix[$x][$y]; |
|
301 | - } |
|
302 | - |
|
303 | - $out .= PHP_EOL; |
|
304 | - } |
|
305 | - |
|
306 | - return $out; |
|
307 | - } |
|
308 | - |
|
309 | - /** |
|
310 | - * Returns current cost of insertion operation. |
|
311 | - * |
|
312 | - * @return int |
|
313 | - */ |
|
314 | - public function getInsCost(): int |
|
315 | - { |
|
316 | - return $this->insCost; |
|
317 | - } |
|
318 | - |
|
319 | - /** |
|
320 | - * Sets cost of insertion operation (insert characters to first string to match second string). |
|
321 | - * |
|
322 | - * @param int $insCost Cost of character insertion |
|
323 | - * @return void |
|
324 | - */ |
|
325 | - public function setInsCost(int $insCost): void |
|
326 | - { |
|
327 | - $this->calculated = $insCost === $this->insCost ? $this->calculated : false; |
|
328 | - $this->insCost = $insCost; |
|
329 | - } |
|
330 | - |
|
331 | - /** |
|
332 | - * Returns current cost of deletion operation. |
|
333 | - * |
|
334 | - * @return int |
|
335 | - */ |
|
336 | - public function getDelCost(): int |
|
337 | - { |
|
338 | - return $this->delCost; |
|
339 | - } |
|
340 | - |
|
341 | - /** |
|
342 | - * Sets cost of deletion operation (delete characters from first string to match second string). |
|
343 | - * |
|
344 | - * @param int $delCost Cost of character deletion |
|
345 | - * @return void |
|
346 | - */ |
|
347 | - public function setDelCost(int $delCost): void |
|
348 | - { |
|
349 | - $this->calculated = $delCost === $this->delCost ? $this->calculated : false; |
|
350 | - $this->delCost = $delCost; |
|
351 | - } |
|
352 | - |
|
353 | - /** |
|
354 | - * Returns current cost of substitution operation. |
|
355 | - * |
|
356 | - * @return int |
|
357 | - */ |
|
358 | - public function getSubCost(): int |
|
359 | - { |
|
360 | - return $this->subCost; |
|
361 | - } |
|
362 | - |
|
363 | - /** |
|
364 | - * Sets cost of substitution operation. |
|
365 | - * |
|
366 | - * @param int $subCost Cost of character substitution |
|
367 | - * @return void |
|
368 | - */ |
|
369 | - public function setSubCost(int $subCost): void |
|
370 | - { |
|
371 | - $this->calculated = $subCost === $this->subCost ? $this->calculated : false; |
|
372 | - $this->subCost = $subCost; |
|
373 | - } |
|
374 | - |
|
375 | - /** |
|
376 | - * Returns current cost of transposition operation. |
|
377 | - * |
|
378 | - * @return int |
|
379 | - */ |
|
380 | - public function getTransCost(): int |
|
381 | - { |
|
382 | - return $this->transCost; |
|
383 | - } |
|
384 | - |
|
385 | - /** |
|
386 | - * Sets cost of transposition operation. |
|
387 | - * |
|
388 | - * @param int $transCost Cost of character transposition |
|
389 | - * @return void |
|
390 | - */ |
|
391 | - public function setTransCost(int $transCost): void |
|
392 | - { |
|
393 | - $this->calculated = $transCost === $this->transCost ? $this->calculated : false; |
|
394 | - $this->transCost = $transCost; |
|
395 | - } |
|
13 | + /** |
|
14 | + * First string. |
|
15 | + * |
|
16 | + * @var String |
|
17 | + */ |
|
18 | + private $compOne; |
|
19 | + |
|
20 | + /** |
|
21 | + * Second string. |
|
22 | + * |
|
23 | + * @var String |
|
24 | + */ |
|
25 | + private $compTwo; |
|
26 | + |
|
27 | + /** |
|
28 | + * Length of first string. |
|
29 | + * |
|
30 | + * @var int |
|
31 | + */ |
|
32 | + private $compOneLength = 0; |
|
33 | + |
|
34 | + /** |
|
35 | + * Length of second string. |
|
36 | + * |
|
37 | + * @var int |
|
38 | + */ |
|
39 | + private $compTwoLength = 0; |
|
40 | + |
|
41 | + /** |
|
42 | + * Matrix for Damerau Levenshtein distance dynamic programming computation. |
|
43 | + * |
|
44 | + * @var int[][] |
|
45 | + */ |
|
46 | + private $matrix; |
|
47 | + |
|
48 | + /** |
|
49 | + * Boolean flag indicating whether the matrix has been computed for the input strings. |
|
50 | + * |
|
51 | + * @var bool |
|
52 | + */ |
|
53 | + private $calculated = false; |
|
54 | + |
|
55 | + /** |
|
56 | + * Cost of character insertion (to first string to match second string). |
|
57 | + * |
|
58 | + * @var int |
|
59 | + */ |
|
60 | + private $insCost = 1; |
|
61 | + |
|
62 | + /** |
|
63 | + * Cost of character deletion (from first string to match second string). |
|
64 | + * |
|
65 | + * @var int |
|
66 | + */ |
|
67 | + private $delCost = 1; |
|
68 | + |
|
69 | + /** |
|
70 | + * Substitution cost. |
|
71 | + * |
|
72 | + * @var int |
|
73 | + */ |
|
74 | + private $subCost = 1; |
|
75 | + |
|
76 | + /** |
|
77 | + * Transposition cost. |
|
78 | + * |
|
79 | + * @var int |
|
80 | + */ |
|
81 | + private $transCost = 1; |
|
82 | + |
|
83 | + /** |
|
84 | + * Constructor. |
|
85 | + * |
|
86 | + * @param string $firstString first string to compute distance |
|
87 | + * @param string $secondString second string to compute distance |
|
88 | + * @param int $insCost Cost of character insertion |
|
89 | + * @param int $delCost Cost of character deletion |
|
90 | + * @param int $subCost Substitution cost |
|
91 | + * @param int $transCost Transposition cost |
|
92 | + */ |
|
93 | + public function __construct( |
|
94 | + string $firstString, |
|
95 | + string $secondString, |
|
96 | + int $insCost = 1, |
|
97 | + int $delCost = 1, |
|
98 | + int $subCost = 1, |
|
99 | + int $transCost = 1 |
|
100 | + ) { |
|
101 | + if (!empty($firstString) || !empty($secondString)) { |
|
102 | + $this->compOne = $firstString; |
|
103 | + $this->compOneLength = (int)mb_strlen($this->compOne, 'UTF-8'); |
|
104 | + $this->compTwo = $secondString; |
|
105 | + $this->compTwoLength = (int)mb_strlen($this->compTwo, 'UTF-8'); |
|
106 | + } |
|
107 | + |
|
108 | + $this->insCost = $insCost; |
|
109 | + $this->delCost = $delCost; |
|
110 | + $this->subCost = $subCost; |
|
111 | + $this->transCost = $transCost; |
|
112 | + } |
|
113 | + |
|
114 | + /** |
|
115 | + * Returns computed matrix for given input strings. |
|
116 | + * |
|
117 | + * @return int[][] matrix |
|
118 | + */ |
|
119 | + public function getMatrix(): array |
|
120 | + { |
|
121 | + if (!$this->calculated) { |
|
122 | + $this->setupMatrix(); |
|
123 | + } |
|
124 | + |
|
125 | + return $this->matrix; |
|
126 | + } |
|
127 | + |
|
128 | + /** |
|
129 | + * Returns similarity of strings, absolute number = Damerau Levenshtein distance. |
|
130 | + * |
|
131 | + * @return int |
|
132 | + */ |
|
133 | + public function getSimilarity(): int |
|
134 | + { |
|
135 | + if (!$this->calculated) { |
|
136 | + $this->setupMatrix(); |
|
137 | + } |
|
138 | + |
|
139 | + return $this->matrix[$this->compOneLength][$this->compTwoLength]; |
|
140 | + } |
|
141 | + |
|
142 | + /** |
|
143 | + * Procedure to compute matrix for given input strings. |
|
144 | + * |
|
145 | + * @return void |
|
146 | + * @SuppressWarnings(PHPMD.CyclomaticComplexity) |
|
147 | + */ |
|
148 | + private function setupMatrix(): void |
|
149 | + { |
|
150 | + $this->matrix = [[]]; |
|
151 | + |
|
152 | + $oneSize = $this->compOneLength; |
|
153 | + $twoSize = $this->compTwoLength; |
|
154 | + |
|
155 | + for ($i = 0; $i <= $oneSize; $i += 1) { |
|
156 | + // @phan-suppress-next-line PhanTypeInvalidDimOffset |
|
157 | + $this->matrix[$i][0] = $i > 0 ? $this->matrix[$i - 1][0] + $this->delCost : 0; |
|
158 | + } |
|
159 | + |
|
160 | + for ($i = 0; $i <= $twoSize; $i += 1) { |
|
161 | + // Insertion actually |
|
162 | + $this->matrix[0][$i] = $i > 0 ? $this->matrix[0][$i - 1] + $this->insCost : 0; |
|
163 | + } |
|
164 | + |
|
165 | + for ($i = 1; $i <= $oneSize; $i += 1) { |
|
166 | + // Curchar for the first string |
|
167 | + $cOne = (string)mb_substr($this->compOne, $i - 1, 1, 'UTF-8'); |
|
168 | + for ($j = 1; $j <= $twoSize; $j += 1) { |
|
169 | + // Curchar for the second string |
|
170 | + $cTwo = (string)mb_substr($this->compTwo, $j - 1, 1, 'UTF-8'); |
|
171 | + |
|
172 | + // Compute substitution cost |
|
173 | + if ($this->compare($cOne, $cTwo) === 0) { |
|
174 | + $cost = 0; |
|
175 | + $trans = 0; |
|
176 | + } else { |
|
177 | + $cost = $this->subCost; |
|
178 | + $trans = $this->transCost; |
|
179 | + } |
|
180 | + |
|
181 | + // Deletion cost |
|
182 | + // @phan-suppress-next-line PhanTypeInvalidDimOffset, PhanTypeInvalidLeftOperandOfAdd |
|
183 | + $del = $this->matrix[$i - 1][$j] + $this->delCost; |
|
184 | + |
|
185 | + // Insertion cost |
|
186 | + // @phan-suppress-next-line PhanTypeArraySuspiciousNull, PhanTypeInvalidDimOffset PhanTypeInvalidLeftOperandOfAdd |
|
187 | + $ins = $this->matrix[$i][$j - 1] + $this->insCost; |
|
188 | + |
|
189 | + // Substitution cost, 0 if same |
|
190 | + $sub = $this->matrix[$i - 1][$j - 1] + $cost; |
|
191 | + |
|
192 | + // Compute optimal |
|
193 | + $this->matrix[$i][$j] = min($del, $ins, $sub); |
|
194 | + |
|
195 | + // Transposition cost |
|
196 | + if ($i > 1 && $j > 1) { |
|
197 | + // Last two |
|
198 | + // @phan-suppress-next-line PhanPartialTypeMismatchArgumentInternal |
|
199 | + $ccOne = (string)mb_substr($this->compOne, $i - 2, 1, 'UTF-8'); |
|
200 | + // @phan-suppress-next-line PhanPartialTypeMismatchArgumentInternal |
|
201 | + $ccTwo = (string)mb_substr($this->compTwo, $j - 2, 1, 'UTF-8'); |
|
202 | + |
|
203 | + if ($this->compare($cOne, $ccTwo) === 0 && $this->compare($ccOne, $cTwo) === 0) { |
|
204 | + // Transposition cost is computed as minimal of two |
|
205 | + // @phan-suppress-next-line PhanPartialTypeMismatchArgumentInternal |
|
206 | + $this->matrix[$i][$j] = min($this->matrix[$i][$j], $this->matrix[$i - 2][$j - 2] + $trans); |
|
207 | + } |
|
208 | + } |
|
209 | + } |
|
210 | + } |
|
211 | + |
|
212 | + $this->calculated = true; |
|
213 | + } |
|
214 | + |
|
215 | + /** |
|
216 | + * Returns maximal possible edit Damerau Levenshtein distance between texts. |
|
217 | + * |
|
218 | + * On common substring of same length perform substitution / insert + delete |
|
219 | + * (depends on what is cheaper), then on extra characters perform insertion / deletion |
|
220 | + * |
|
221 | + * @return int |
|
222 | + */ |
|
223 | + public function getMaximalDistance(): int |
|
224 | + { |
|
225 | + $oneSize = $this->compOneLength; |
|
226 | + $twoSize = $this->compTwoLength; |
|
227 | + |
|
228 | + // Is substitution cheaper than delete + insert? |
|
229 | + $subCost = min($this->subCost, $this->delCost + $this->insCost); |
|
230 | + |
|
231 | + // Get common size |
|
232 | + $minSize = min($oneSize, $twoSize); |
|
233 | + $maxSize = max($oneSize, $twoSize); |
|
234 | + $extraSize = $maxSize - $minSize; |
|
235 | + |
|
236 | + // On common size perform substitution / delete + insert, whichever is cheaper |
|
237 | + $maxCost = $subCost * $minSize; |
|
238 | + |
|
239 | + // On resulting do insert/delete |
|
240 | + if ($oneSize > $twoSize) { |
|
241 | + // Delete extra characters |
|
242 | + $maxCost += $extraSize * $this->delCost; |
|
243 | + } else { |
|
244 | + // Insert extra characters |
|
245 | + $maxCost += $extraSize * $this->insCost; |
|
246 | + } |
|
247 | + |
|
248 | + return (int)$maxCost; |
|
249 | + } |
|
250 | + |
|
251 | + /** |
|
252 | + * Returns relative distance of input strings (computed with maximal possible distance). |
|
253 | + * |
|
254 | + * @return float |
|
255 | + */ |
|
256 | + public function getRelativeDistance(): float |
|
257 | + { |
|
258 | + if (!$this->calculated) { |
|
259 | + $this->setupMatrix(); |
|
260 | + } |
|
261 | + |
|
262 | + return (float)(1 - ($this->getSimilarity() / $this->getMaximalDistance())); |
|
263 | + } |
|
264 | + |
|
265 | + /** |
|
266 | + * Compares two characters from string (this method may be overridden in child class). |
|
267 | + * |
|
268 | + * @param string $firstCharacter First character |
|
269 | + * @param string $secondCharacter Second character |
|
270 | + * @return int |
|
271 | + */ |
|
272 | + protected function compare(string $firstCharacter, string $secondCharacter): int |
|
273 | + { |
|
274 | + return strcmp($firstCharacter, $secondCharacter); |
|
275 | + } |
|
276 | + |
|
277 | + /** |
|
278 | + * Returns computed matrix for given input strings (For debugging purposes). |
|
279 | + * |
|
280 | + * @return string |
|
281 | + */ |
|
282 | + public function displayMatrix(): string |
|
283 | + { |
|
284 | + if (!$this->calculated) { |
|
285 | + $this->setupMatrix(); |
|
286 | + } |
|
287 | + |
|
288 | + $oneSize = $this->compOneLength; |
|
289 | + $twoSize = $this->compTwoLength; |
|
290 | + |
|
291 | + $out = ' ' . $this->compOne . PHP_EOL; |
|
292 | + for ($y = 0; $y <= $twoSize; $y += 1) { |
|
293 | + if ($y - 1 < 0) { |
|
294 | + $out .= ' '; |
|
295 | + } else { |
|
296 | + $out .= (string)mb_substr($this->compTwo, $y - 1, 1, 'UTF-8'); |
|
297 | + } |
|
298 | + |
|
299 | + for ($x = 0; $x <= $oneSize; $x += 1) { |
|
300 | + $out .= $this->matrix[$x][$y]; |
|
301 | + } |
|
302 | + |
|
303 | + $out .= PHP_EOL; |
|
304 | + } |
|
305 | + |
|
306 | + return $out; |
|
307 | + } |
|
308 | + |
|
309 | + /** |
|
310 | + * Returns current cost of insertion operation. |
|
311 | + * |
|
312 | + * @return int |
|
313 | + */ |
|
314 | + public function getInsCost(): int |
|
315 | + { |
|
316 | + return $this->insCost; |
|
317 | + } |
|
318 | + |
|
319 | + /** |
|
320 | + * Sets cost of insertion operation (insert characters to first string to match second string). |
|
321 | + * |
|
322 | + * @param int $insCost Cost of character insertion |
|
323 | + * @return void |
|
324 | + */ |
|
325 | + public function setInsCost(int $insCost): void |
|
326 | + { |
|
327 | + $this->calculated = $insCost === $this->insCost ? $this->calculated : false; |
|
328 | + $this->insCost = $insCost; |
|
329 | + } |
|
330 | + |
|
331 | + /** |
|
332 | + * Returns current cost of deletion operation. |
|
333 | + * |
|
334 | + * @return int |
|
335 | + */ |
|
336 | + public function getDelCost(): int |
|
337 | + { |
|
338 | + return $this->delCost; |
|
339 | + } |
|
340 | + |
|
341 | + /** |
|
342 | + * Sets cost of deletion operation (delete characters from first string to match second string). |
|
343 | + * |
|
344 | + * @param int $delCost Cost of character deletion |
|
345 | + * @return void |
|
346 | + */ |
|
347 | + public function setDelCost(int $delCost): void |
|
348 | + { |
|
349 | + $this->calculated = $delCost === $this->delCost ? $this->calculated : false; |
|
350 | + $this->delCost = $delCost; |
|
351 | + } |
|
352 | + |
|
353 | + /** |
|
354 | + * Returns current cost of substitution operation. |
|
355 | + * |
|
356 | + * @return int |
|
357 | + */ |
|
358 | + public function getSubCost(): int |
|
359 | + { |
|
360 | + return $this->subCost; |
|
361 | + } |
|
362 | + |
|
363 | + /** |
|
364 | + * Sets cost of substitution operation. |
|
365 | + * |
|
366 | + * @param int $subCost Cost of character substitution |
|
367 | + * @return void |
|
368 | + */ |
|
369 | + public function setSubCost(int $subCost): void |
|
370 | + { |
|
371 | + $this->calculated = $subCost === $this->subCost ? $this->calculated : false; |
|
372 | + $this->subCost = $subCost; |
|
373 | + } |
|
374 | + |
|
375 | + /** |
|
376 | + * Returns current cost of transposition operation. |
|
377 | + * |
|
378 | + * @return int |
|
379 | + */ |
|
380 | + public function getTransCost(): int |
|
381 | + { |
|
382 | + return $this->transCost; |
|
383 | + } |
|
384 | + |
|
385 | + /** |
|
386 | + * Sets cost of transposition operation. |
|
387 | + * |
|
388 | + * @param int $transCost Cost of character transposition |
|
389 | + * @return void |
|
390 | + */ |
|
391 | + public function setTransCost(int $transCost): void |
|
392 | + { |
|
393 | + $this->calculated = $transCost === $this->transCost ? $this->calculated : false; |
|
394 | + $this->transCost = $transCost; |
|
395 | + } |
|
396 | 396 | } |