Completed
Pull Request — master (#14)
by Mischa ter
01:32
created
src/DamerauLevenshtein.php 1 patch
Indentation   +354 added lines, -354 removed lines patch added patch discarded remove patch
@@ -10,73 +10,73 @@  discard block
 block discarded – undo
10 10
 class DamerauLevenshtein
11 11
 {
12 12
 
13
-    /**
14
-     * First string.
15
-     *
16
-     * @var String
17
-     */
18
-    private $compOne;
19
-
20
-    /**
21
-     * Second string.
22
-     *
23
-     * @var String
24
-     */
25
-    private $compTwo;
26
-
27
-    /**
28
-     * Matrix for Damerau Levenshtein distance dynamic programming computation.
29
-     *
30
-     * @var int[][]
31
-     */
32
-    private $matrix;
33
-
34
-    /**
35
-     * Boolean flag determining whether is matrix computed for input strings.
36
-     *
37
-     * @var bool
38
-     */
39
-    private $calculated = false;
40
-
41
-    /**
42
-     * Cost of character insertion (to first string to match second string).
43
-     *
44
-     * @var int
45
-     */
46
-    private $insCost = 1;
47
-
48
-    /**
49
-     * Cost of character deletion (from first string to match second string).
50
-     *
51
-     * @var int
52
-     */
53
-    private $delCost = 1;
54
-
55
-    /**
56
-     * Substitution cost.
57
-     *
58
-     * @var int
59
-     */
60
-    private $subCost = 1;
61
-
62
-    /**
63
-     * Transposition cost.
64
-     *
65
-     * @var int
66
-     */
67
-    private $transCost = 1;
68
-
69
-    /**
70
-     * Constructor.
71
-     *
72
-     * @param string $firstString first string to compute distance
73
-     * @param string $secondString second string to compute distance
74
-     * @param int $insCost Cost of character insertion
75
-     * @param int $delCost Cost of character deletion
76
-     * @param int $subCost Substitution cost
77
-     * @param int $transCost Transposition cost
78
-     */
79
-    public function __construct(
13
+	/**
14
+	 * First string.
15
+	 *
16
+	 * @var String
17
+	 */
18
+	private $compOne;
19
+
20
+	/**
21
+	 * Second string.
22
+	 *
23
+	 * @var String
24
+	 */
25
+	private $compTwo;
26
+
27
+	/**
28
+	 * Matrix for Damerau Levenshtein distance dynamic programming computation.
29
+	 *
30
+	 * @var int[][]
31
+	 */
32
+	private $matrix;
33
+
34
+	/**
35
+	 * Boolean flag determining whether is matrix computed for input strings.
36
+	 *
37
+	 * @var bool
38
+	 */
39
+	private $calculated = false;
40
+
41
+	/**
42
+	 * Cost of character insertion (to first string to match second string).
43
+	 *
44
+	 * @var int
45
+	 */
46
+	private $insCost = 1;
47
+
48
+	/**
49
+	 * Cost of character deletion (from first string to match second string).
50
+	 *
51
+	 * @var int
52
+	 */
53
+	private $delCost = 1;
54
+
55
+	/**
56
+	 * Substitution cost.
57
+	 *
58
+	 * @var int
59
+	 */
60
+	private $subCost = 1;
61
+
62
+	/**
63
+	 * Transposition cost.
64
+	 *
65
+	 * @var int
66
+	 */
67
+	private $transCost = 1;
68
+
69
+	/**
70
+	 * Constructor.
71
+	 *
72
+	 * @param string $firstString first string to compute distance
73
+	 * @param string $secondString second string to compute distance
74
+	 * @param int $insCost Cost of character insertion
75
+	 * @param int $delCost Cost of character deletion
76
+	 * @param int $subCost Substitution cost
77
+	 * @param int $transCost Transposition cost
78
+	 */
79
+	public function __construct(
80 80
 			string $firstString,
81 81
 			string $secondString,
82 82
 			int $insCost = 1,
@@ -84,291 +84,291 @@  discard block
 block discarded – undo
84 84
 			int $subCost = 1,
85 85
 			int $transCost = 1
86 86
 		) {
87
-        if (!empty($firstString) || !empty($secondString)) {
88
-            $this->compOne = $firstString;
89
-            $this->compTwo = $secondString;
90
-        }
91
-
92
-        $this->insCost = $insCost;
93
-        $this->delCost = $delCost;
94
-        $this->subCost = $subCost;
95
-        $this->transCost = $transCost;
96
-    }
97
-
98
-    /**
99
-     * Returns computed matrix for given input strings.
100
-     *
101
-     * @return int[][] matrix
102
-     */
103
-    public function getMatrix(): array
104
-    {
105
-        $this->setupMatrix();
106
-
107
-        return $this->matrix;
108
-    }
109
-
110
-    /**
111
-     * Returns similarity of strings, absolute number = Damerau Levenshtein distance.
112
-     *
113
-     * @return int
114
-     */
115
-    public function getSimilarity(): int
116
-    {
117
-        if (!$this->calculated) {
118
-            $this->setupMatrix();
119
-        }
120
-
121
-        return $this->matrix[mb_strlen($this->compOne, 'UTF-8')][mb_strlen($this->compTwo, 'UTF-8')];
122
-    }
123
-
124
-    /**
125
-     * Procedure to compute matrix for given input strings.
126
-     *
127
-     * @return void
128
-     * @SuppressWarnings(PHPMD.CyclomaticComplexity)
129
-     */
130
-    private function setupMatrix()
131
-    {
132
-        $cost = -1;
133
-        $del = 0;
134
-        $sub = 0;
135
-        $ins = 0;
136
-        $trans = 0;
137
-        $this->matrix = [[]];
138
-
139
-        $oneSize = mb_strlen($this->compOne, 'UTF-8');
140
-        $twoSize = mb_strlen($this->compTwo, 'UTF-8');
141
-        for ($i = 0; $i <= $oneSize; $i += 1) {
142
-            $this->matrix[$i][0] = $i > 0 ? $this->matrix[$i - 1][0] + $this->delCost : 0;
143
-        }
144
-
145
-        for ($i = 0; $i <= $twoSize; $i += 1) {
146
-            // Insertion actualy
147
-            $this->matrix[0][$i] = $i > 0 ? $this->matrix[0][$i - 1] + $this->insCost : 0;
148
-        }
149
-
150
-        for ($i = 1; $i <= $oneSize; $i += 1) {
151
-            // Curchar for the first string
152
-            $cOne = mb_substr($this->compOne, $i - 1, 1, 'UTF-8');
153
-            for ($j = 1; $j <= $twoSize; $j += 1) {
154
-                // Curchar for the second string
155
-                $cTwo = mb_substr($this->compTwo, $j - 1, 1, 'UTF-8');
156
-
157
-                // Compute substitution cost
158
-                if ($this->compare($cOne, $cTwo) == 0) {
159
-                    $cost = 0;
160
-                    $trans = 0;
161
-                } else {
162
-                    $cost = $this->subCost;
163
-                    $trans = $this->transCost;
164
-                }
165
-
166
-                // Deletion cost
167
-                $del = $this->matrix[$i - 1][$j] + $this->delCost;
168
-
169
-                // Insertion cost
170
-                $ins = $this->matrix[$i][$j - 1] + $this->insCost;
171
-
172
-                // Substitution cost, 0 if same
173
-                $sub = $this->matrix[$i - 1][$j - 1] + $cost;
174
-
175
-                // Compute optimal
176
-                $this->matrix[$i][$j] = min($del, $ins, $sub);
177
-
178
-                // Transposition cost
179
-                if ($i > 1 && $j > 1) {
180
-                    // Last two
181
-                    $ccOne = mb_substr($this->compOne, $i - 2, 1, 'UTF-8');
182
-                    $ccTwo = mb_substr($this->compTwo, $j - 2, 1, 'UTF-8');
183
-
184
-                    if ($this->compare($cOne, $ccTwo) == 0 && $this->compare($ccOne, $cTwo) == 0) {
185
-                        // Transposition cost is computed as minimal of two
186
-                        $this->matrix[$i][$j] = min($this->matrix[$i][$j], $this->matrix[$i - 2][$j - 2] + $trans);
187
-                    }
188
-                }
189
-            }
190
-        }
191
-
192
-        $this->calculated = true;
193
-    }
194
-
195
-    /**
196
-     * Returns maximal possible edit Damerau Levenshtein distance between texts.
197
-     *
198
-     * On common substring of same length perform substitution / insert + delete
199
-     * (depends on what is cheaper), then on extra characters perform insertion / deletion
200
-     *
201
-     * @return int
202
-     */
203
-    public function getMaximalDistance(): int
204
-    {
205
-        $oneSize = mb_strlen($this->compOne, 'UTF-8');
206
-        $twoSize = mb_strlen($this->compTwo, 'UTF-8');
207
-
208
-        // Is substitution cheaper that delete + insert?
209
-        $subCost = min($this->subCost, $this->delCost + $this->insCost);
210
-
211
-        // Get common size
212
-        $minSize = min($oneSize, $twoSize);
213
-        $maxSize = max($oneSize, $twoSize);
214
-        $extraSize = $maxSize - $minSize;
215
-
216
-        // On common size perform substitution / delete + insert, what is cheaper
217
-        $maxCost = $subCost * $minSize;
218
-
219
-        // On resulting do insert/delete
220
-        if ($oneSize > $twoSize) {
221
-            // Delete extra characters
222
-            $maxCost += $extraSize * $this->delCost;
223
-        } else {
224
-            // Insert extra characters
225
-            $maxCost += $extraSize * $this->insCost;
226
-        }
227
-
228
-        return $maxCost;
229
-    }
230
-
231
-    /**
232
-     * Returns relative distance of input strings (computed with maximal possible distance).
233
-     *
234
-     * @return int
235
-     */
236
-    public function getRelativeDistance(): int
237
-    {
238
-        if (!$this->calculated) {
239
-            $this->setupMatrix();
240
-        }
241
-
242
-        return 1 - ($this->getSimilarity() / $this->getMaximalDistance());
243
-    }
244
-
245
-    /**
246
-     * Compares two characters from string (this method may be overridden in child class).
247
-     *
248
-     * @param string $firstCharacter First character
249
-     * @param string $secondCharacter Second character
250
-     * @return int
251
-     */
252
-    protected function compare(string $firstCharacter, string $secondCharacter): int
253
-    {
254
-        return strcmp($firstCharacter, $secondCharacter);
255
-    }
256
-
257
-    /**
258
-     * Returns computed matrix for given input strings (For debugging purposes).
259
-     *
260
-     * @return string
261
-     */
262
-    public function displayMatrix(): string
263
-    {
264
-        $this->setupMatrix();
265
-
266
-        $oneSize = mb_strlen($this->compOne, 'UTF-8');
267
-        $twoSize = mb_strlen($this->compTwo, 'UTF-8');
268
-
269
-        $out = '  ' . $this->compOne . PHP_EOL;
270
-        for ($y = 0; $y <= $twoSize; $y += 1) {
271
-            if ($y - 1 < 0) {
272
-                $out .= ' ';
273
-            } else {
274
-                $out .= mb_substr($this->compTwo, $y - 1, 1, 'UTF-8');
275
-            }
276
-
277
-            for ($x = 0; $x <= $oneSize; $x += 1) {
278
-                $out .= $this->matrix[$x][$y];
279
-            }
280
-
281
-            $out .= PHP_EOL;
282
-        }
283
-
284
-        return $out;
285
-    }
286
-
287
-    /**
288
-     * Returns current cost of insertion operation.
289
-     *
290
-     * @return int
291
-     */
292
-    public function getInsCost(): int
293
-    {
294
-        return $this->insCost;
295
-    }
296
-
297
-    /**
298
-     * Sets cost of insertion operation (insert characters to first string to match second string).
299
-     *
300
-     * @param int $insCost Cost of character insertion
301
-     * @return void
302
-     */
303
-    public function setInsCost(int $insCost)
304
-    {
305
-        $this->calculated = $insCost == $this->insCost ? $this->calculated : false;
306
-        $this->insCost = $insCost;
307
-    }
308
-
309
-    /**
310
-     * Returns current cost of deletion operation.
311
-     *
312
-     * @return int
313
-     */
314
-    public function getDelCost(): int
315
-    {
316
-        return $this->delCost;
317
-    }
318
-
319
-    /**
320
-     * Sets cost of deletion operation (delete characters from first string to match second string).
321
-     *
322
-     * @param int $delCost Cost of character deletion
323
-     * @return void
324
-     */
325
-    public function setDelCost(int $delCost)
326
-    {
327
-        $this->calculated = $delCost == $this->delCost ? $this->calculated : false;
328
-        $this->delCost = $delCost;
329
-    }
330
-
331
-    /**
332
-     * Returns current cost of substitution operation.
333
-     *
334
-     * @return int
335
-     */
336
-    public function getSubCost(): int
337
-    {
338
-        return $this->subCost;
339
-    }
340
-
341
-    /**
342
-     * Sets cost of substitution operation.
343
-     *
344
-     * @param int $subCost Cost of character substitution
345
-     * @return void
346
-     */
347
-    public function setSubCost(int $subCost)
348
-    {
349
-        $this->calculated = $subCost == $this->subCost ? $this->calculated : false;
350
-        $this->subCost = $subCost;
351
-    }
352
-
353
-    /**
354
-     * Returns current cost of transposition operation.
355
-     *
356
-     * @return int
357
-     */
358
-    public function getTransCost(): int
359
-    {
360
-        return $this->transCost;
361
-    }
362
-
363
-    /**
364
-     * Sets cost of transposition operation.
365
-     *
366
-     * @param int $transCost Cost of character transposition
367
-     * @return void
368
-     */
369
-    public function setTransCost(int $transCost)
370
-    {
371
-        $this->calculated = $transCost == $this->transCost ? $this->calculated : false;
372
-        $this->transCost = $transCost;
373
-    }
87
+		if (!empty($firstString) || !empty($secondString)) {
88
+			$this->compOne = $firstString;
89
+			$this->compTwo = $secondString;
90
+		}
91
+
92
+		$this->insCost = $insCost;
93
+		$this->delCost = $delCost;
94
+		$this->subCost = $subCost;
95
+		$this->transCost = $transCost;
96
+	}
97
+
98
+	/**
99
+	 * Returns computed matrix for given input strings.
100
+	 *
101
+	 * @return int[][] matrix
102
+	 */
103
+	public function getMatrix(): array
104
+	{
105
+		$this->setupMatrix();
106
+
107
+		return $this->matrix;
108
+	}
109
+
110
+	/**
111
+	 * Returns similarity of strings, absolute number = Damerau Levenshtein distance.
112
+	 *
113
+	 * @return int
114
+	 */
115
+	public function getSimilarity(): int
116
+	{
117
+		if (!$this->calculated) {
118
+			$this->setupMatrix();
119
+		}
120
+
121
+		return $this->matrix[mb_strlen($this->compOne, 'UTF-8')][mb_strlen($this->compTwo, 'UTF-8')];
122
+	}
123
+
124
+	/**
125
+	 * Procedure to compute matrix for given input strings.
126
+	 *
127
+	 * @return void
128
+	 * @SuppressWarnings(PHPMD.CyclomaticComplexity)
129
+	 */
130
+	private function setupMatrix()
131
+	{
132
+		$cost = -1;
133
+		$del = 0;
134
+		$sub = 0;
135
+		$ins = 0;
136
+		$trans = 0;
137
+		$this->matrix = [[]];
138
+
139
+		$oneSize = mb_strlen($this->compOne, 'UTF-8');
140
+		$twoSize = mb_strlen($this->compTwo, 'UTF-8');
141
+		for ($i = 0; $i <= $oneSize; $i += 1) {
142
+			$this->matrix[$i][0] = $i > 0 ? $this->matrix[$i - 1][0] + $this->delCost : 0;
143
+		}
144
+
145
+		for ($i = 0; $i <= $twoSize; $i += 1) {
146
+			// Insertion actualy
147
+			$this->matrix[0][$i] = $i > 0 ? $this->matrix[0][$i - 1] + $this->insCost : 0;
148
+		}
149
+
150
+		for ($i = 1; $i <= $oneSize; $i += 1) {
151
+			// Curchar for the first string
152
+			$cOne = mb_substr($this->compOne, $i - 1, 1, 'UTF-8');
153
+			for ($j = 1; $j <= $twoSize; $j += 1) {
154
+				// Curchar for the second string
155
+				$cTwo = mb_substr($this->compTwo, $j - 1, 1, 'UTF-8');
156
+
157
+				// Compute substitution cost
158
+				if ($this->compare($cOne, $cTwo) == 0) {
159
+					$cost = 0;
160
+					$trans = 0;
161
+				} else {
162
+					$cost = $this->subCost;
163
+					$trans = $this->transCost;
164
+				}
165
+
166
+				// Deletion cost
167
+				$del = $this->matrix[$i - 1][$j] + $this->delCost;
168
+
169
+				// Insertion cost
170
+				$ins = $this->matrix[$i][$j - 1] + $this->insCost;
171
+
172
+				// Substitution cost, 0 if same
173
+				$sub = $this->matrix[$i - 1][$j - 1] + $cost;
174
+
175
+				// Compute optimal
176
+				$this->matrix[$i][$j] = min($del, $ins, $sub);
177
+
178
+				// Transposition cost
179
+				if ($i > 1 && $j > 1) {
180
+					// Last two
181
+					$ccOne = mb_substr($this->compOne, $i - 2, 1, 'UTF-8');
182
+					$ccTwo = mb_substr($this->compTwo, $j - 2, 1, 'UTF-8');
183
+
184
+					if ($this->compare($cOne, $ccTwo) == 0 && $this->compare($ccOne, $cTwo) == 0) {
185
+						// Transposition cost is computed as minimal of two
186
+						$this->matrix[$i][$j] = min($this->matrix[$i][$j], $this->matrix[$i - 2][$j - 2] + $trans);
187
+					}
188
+				}
189
+			}
190
+		}
191
+
192
+		$this->calculated = true;
193
+	}
194
+
195
+	/**
196
+	 * Returns maximal possible edit Damerau Levenshtein distance between texts.
197
+	 *
198
+	 * On common substring of same length perform substitution / insert + delete
199
+	 * (depends on what is cheaper), then on extra characters perform insertion / deletion
200
+	 *
201
+	 * @return int
202
+	 */
203
+	public function getMaximalDistance(): int
204
+	{
205
+		$oneSize = mb_strlen($this->compOne, 'UTF-8');
206
+		$twoSize = mb_strlen($this->compTwo, 'UTF-8');
207
+
208
+		// Is substitution cheaper that delete + insert?
209
+		$subCost = min($this->subCost, $this->delCost + $this->insCost);
210
+
211
+		// Get common size
212
+		$minSize = min($oneSize, $twoSize);
213
+		$maxSize = max($oneSize, $twoSize);
214
+		$extraSize = $maxSize - $minSize;
215
+
216
+		// On common size perform substitution / delete + insert, what is cheaper
217
+		$maxCost = $subCost * $minSize;
218
+
219
+		// On resulting do insert/delete
220
+		if ($oneSize > $twoSize) {
221
+			// Delete extra characters
222
+			$maxCost += $extraSize * $this->delCost;
223
+		} else {
224
+			// Insert extra characters
225
+			$maxCost += $extraSize * $this->insCost;
226
+		}
227
+
228
+		return $maxCost;
229
+	}
230
+
231
+	/**
232
+	 * Returns relative distance of input strings (computed with maximal possible distance).
233
+	 *
234
+	 * @return int
235
+	 */
236
+	public function getRelativeDistance(): int
237
+	{
238
+		if (!$this->calculated) {
239
+			$this->setupMatrix();
240
+		}
241
+
242
+		return 1 - ($this->getSimilarity() / $this->getMaximalDistance());
243
+	}
244
+
245
+	/**
246
+	 * Compares two characters from string (this method may be overridden in child class).
247
+	 *
248
+	 * @param string $firstCharacter First character
249
+	 * @param string $secondCharacter Second character
250
+	 * @return int
251
+	 */
252
+	protected function compare(string $firstCharacter, string $secondCharacter): int
253
+	{
254
+		return strcmp($firstCharacter, $secondCharacter);
255
+	}
256
+
257
+	/**
258
+	 * Returns computed matrix for given input strings (For debugging purposes).
259
+	 *
260
+	 * @return string
261
+	 */
262
+	public function displayMatrix(): string
263
+	{
264
+		$this->setupMatrix();
265
+
266
+		$oneSize = mb_strlen($this->compOne, 'UTF-8');
267
+		$twoSize = mb_strlen($this->compTwo, 'UTF-8');
268
+
269
+		$out = '  ' . $this->compOne . PHP_EOL;
270
+		for ($y = 0; $y <= $twoSize; $y += 1) {
271
+			if ($y - 1 < 0) {
272
+				$out .= ' ';
273
+			} else {
274
+				$out .= mb_substr($this->compTwo, $y - 1, 1, 'UTF-8');
275
+			}
276
+
277
+			for ($x = 0; $x <= $oneSize; $x += 1) {
278
+				$out .= $this->matrix[$x][$y];
279
+			}
280
+
281
+			$out .= PHP_EOL;
282
+		}
283
+
284
+		return $out;
285
+	}
286
+
287
+	/**
288
+	 * Returns current cost of insertion operation.
289
+	 *
290
+	 * @return int
291
+	 */
292
+	public function getInsCost(): int
293
+	{
294
+		return $this->insCost;
295
+	}
296
+
297
+	/**
298
+	 * Sets cost of insertion operation (insert characters to first string to match second string).
299
+	 *
300
+	 * @param int $insCost Cost of character insertion
301
+	 * @return void
302
+	 */
303
+	public function setInsCost(int $insCost)
304
+	{
305
+		$this->calculated = $insCost == $this->insCost ? $this->calculated : false;
306
+		$this->insCost = $insCost;
307
+	}
308
+
309
+	/**
310
+	 * Returns current cost of deletion operation.
311
+	 *
312
+	 * @return int
313
+	 */
314
+	public function getDelCost(): int
315
+	{
316
+		return $this->delCost;
317
+	}
318
+
319
+	/**
320
+	 * Sets cost of deletion operation (delete characters from first string to match second string).
321
+	 *
322
+	 * @param int $delCost Cost of character deletion
323
+	 * @return void
324
+	 */
325
+	public function setDelCost(int $delCost)
326
+	{
327
+		$this->calculated = $delCost == $this->delCost ? $this->calculated : false;
328
+		$this->delCost = $delCost;
329
+	}
330
+
331
+	/**
332
+	 * Returns current cost of substitution operation.
333
+	 *
334
+	 * @return int
335
+	 */
336
+	public function getSubCost(): int
337
+	{
338
+		return $this->subCost;
339
+	}
340
+
341
+	/**
342
+	 * Sets cost of substitution operation.
343
+	 *
344
+	 * @param int $subCost Cost of character substitution
345
+	 * @return void
346
+	 */
347
+	public function setSubCost(int $subCost)
348
+	{
349
+		$this->calculated = $subCost == $this->subCost ? $this->calculated : false;
350
+		$this->subCost = $subCost;
351
+	}
352
+
353
+	/**
354
+	 * Returns current cost of transposition operation.
355
+	 *
356
+	 * @return int
357
+	 */
358
+	public function getTransCost(): int
359
+	{
360
+		return $this->transCost;
361
+	}
362
+
363
+	/**
364
+	 * Sets cost of transposition operation.
365
+	 *
366
+	 * @param int $transCost Cost of character transposition
367
+	 * @return void
368
+	 */
369
+	public function setTransCost(int $transCost)
370
+	{
371
+		$this->calculated = $transCost == $this->transCost ? $this->calculated : false;
372
+		$this->transCost = $transCost;
373
+	}
374 374
 }
Please login to merge, or discard this patch.