1
|
|
|
<?php |
2
|
|
|
namespace Fisharebest\Algorithm; |
3
|
|
|
|
4
|
|
|
/** |
5
|
|
|
* @package fisharebest/algorithm |
6
|
|
|
* @author Greg Roach <[email protected]> |
7
|
|
|
* @copyright (c) 2015 Greg Roach <[email protected]> |
8
|
|
|
* @license GPL-3.0+ |
9
|
|
|
* |
10
|
|
|
* This program is free software: you can redistribute it and/or modify |
11
|
|
|
* it under the terms of the GNU General Public License as published by |
12
|
|
|
* the Free Software Foundation, either version 3 of the License, or |
13
|
|
|
* (at your option) any later version. |
14
|
|
|
* This program is distributed in the hope that it will be useful, |
15
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
16
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17
|
|
|
* GNU General Public License for more details. |
18
|
|
|
* You should have received a copy of the GNU General Public License |
19
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>. |
20
|
|
|
*/ |
21
|
|
|
|
22
|
|
|
/** |
23
|
|
|
* Class MyersDiff - find the shortest edit sequence to transform one string into another. |
24
|
|
|
* |
25
|
|
|
* Based on "An O(ND) Difference Algorithm and Its Variations" by Eugene W Myers. |
26
|
|
|
* |
27
|
|
|
* http://www.xmailserver.org/diff2.pdf |
28
|
|
|
* http://www.codeproject.com/Articles/42279/Investigating-Myers-diff-algorithm-Part-of |
29
|
|
|
*/ |
30
|
|
|
class MyersDiff { |
31
|
|
|
/** Instruction to delete a token which only appears in the first sequence */ |
32
|
|
|
const DELETE = -1; |
33
|
|
|
|
34
|
|
|
/** Instruction to keep a token which is common to both sequences */ |
35
|
|
|
const KEEP = 0; |
36
|
|
|
|
37
|
|
|
/** Instruction to insert a token which only appears in the last sequence */ |
38
|
|
|
const INSERT = 1; |
39
|
|
|
|
40
|
|
|
/** |
41
|
|
|
* Backtrack through the intermediate results to extract the "snakes" that |
42
|
|
|
* are visited on the chosen "D-path". |
43
|
|
|
* |
44
|
|
|
* @param string[] $v_save Intermediate results |
45
|
|
|
* @param int $x End position |
46
|
|
|
* @param int $y End position |
47
|
|
|
* |
48
|
|
|
* @return int[][] |
49
|
|
|
*/ |
50
|
|
|
private function extractSnakes(array $v_save, $x, $y) { |
51
|
|
|
$snakes = array(); |
52
|
|
|
for ($d = count($v_save) - 1; $x >= 0 && $y >= 0; --$d) { |
53
|
|
|
array_unshift($snakes, array($x, $y)); |
54
|
|
|
|
55
|
|
|
$v = $v_save[$d]; |
56
|
|
|
$k = $x - $y; |
57
|
|
|
|
58
|
|
|
if ($k === -$d || $k !== $d && $v[$k - 1] < $v[$k + 1]) { |
59
|
|
|
$k_prev = $k + 1; |
60
|
|
|
} else { |
61
|
|
|
$k_prev = $k - 1; |
62
|
|
|
} |
63
|
|
|
|
64
|
|
|
$x = $v[$k_prev]; |
65
|
|
|
$y = $x - $k_prev; |
66
|
|
|
} |
67
|
|
|
|
68
|
|
|
return $snakes; |
69
|
|
|
} |
70
|
|
|
|
71
|
|
|
/** |
72
|
|
|
* Convert a list of "snakes" into a set of insert/keep/delete instructions |
73
|
|
|
* |
74
|
|
|
* @param integer[][] $snakes Common subsequences |
75
|
|
|
* @param string[] $a First sequence |
76
|
|
|
* @param string[] $b Second sequence |
77
|
|
|
* |
78
|
|
|
*/ |
79
|
|
|
private function formatSolution(array $snakes, array $a, array $b) { |
80
|
|
|
$solution = array(); |
81
|
|
|
$x = 0; |
82
|
|
|
$y = 0; |
83
|
|
|
foreach ($snakes as $snake) { |
84
|
|
|
// Horizontals |
85
|
|
|
while ($snake[0] - $snake[1] > $x - $y) { |
86
|
|
|
$solution[] = array($a[$x], self::DELETE); |
87
|
|
|
++$x; |
88
|
|
|
} |
89
|
|
|
// Verticals |
90
|
|
View Code Duplication |
while ($snake[0] - $snake[1] < $x - $y) { |
|
|
|
|
91
|
|
|
$solution[] = array($b[$y], self::INSERT); |
92
|
|
|
++$y; |
93
|
|
|
} |
94
|
|
|
// Diagonals |
95
|
|
View Code Duplication |
while ($x < $snake[0]) { |
|
|
|
|
96
|
|
|
$solution[] = array($a[$x], self::KEEP); |
97
|
|
|
++$x; |
98
|
|
|
++$y; |
99
|
|
|
} |
100
|
|
|
} |
101
|
|
|
|
102
|
|
|
return $solution; |
103
|
|
|
} |
104
|
|
|
|
105
|
|
|
/** |
106
|
|
|
* Calculate the shortest edit sequence to convert $x into $y. |
107
|
|
|
* |
108
|
|
|
* @param string[] $a - tokens (characters, words or lines) |
109
|
|
|
* @param string[] $b - tokens (characters, words or lines) |
110
|
|
|
* |
111
|
|
|
* @return array[] - pairs of token and edit (-1 for delete, 0 for keep, +1 for insert) |
112
|
|
|
*/ |
113
|
|
|
public function calculate(array $a, array $b) { |
114
|
|
|
// The algorithm uses array keys numbered from zero. |
115
|
|
|
$n = count($a); |
116
|
|
|
$m = count($b); |
117
|
|
|
$a = array_values($a); |
118
|
|
|
$b = array_values($b); |
119
|
|
|
$max = $m + $n; |
120
|
|
|
|
121
|
|
|
// Keep a copy of $v after each iteration of $d. |
122
|
|
|
$v_save = array(); |
123
|
|
|
|
124
|
|
|
// Find the shortest "D-path". |
125
|
|
|
$v = array(1 => 0); |
126
|
|
|
for ($d = 0; $d <= $max; ++$d) { |
127
|
|
|
// Examine all possible "K-lines" for this "D-path". |
128
|
|
|
for ($k = -$d; $k <= $d; $k += 2) { |
129
|
|
|
if ($k === -$d || $k !== $d && $v[$k - 1] < $v[$k + 1]) { |
130
|
|
|
// Move down. |
131
|
|
|
$x = $v[$k + 1]; |
132
|
|
|
} else { |
133
|
|
|
// Move right. |
134
|
|
|
$x = $v[$k - 1] + 1; |
135
|
|
|
} |
136
|
|
|
// Derive Y from X. |
137
|
|
|
$y = $x - $k; |
138
|
|
|
// Follow the diagonal. |
139
|
|
|
while ($x < $n && $y < $m && $a[$x] === $b[$y]) { |
140
|
|
|
++$x; |
141
|
|
|
++$y; |
142
|
|
|
} |
143
|
|
|
// Just store X, as we can calculate Y (from X + K). |
144
|
|
|
$v[$k] = $x; |
145
|
|
|
$v_save[$d] = $v; |
146
|
|
|
// Solution found? |
147
|
|
|
if ($x === $n && $y === $m) { |
148
|
|
|
break 2; |
149
|
|
|
} |
150
|
|
|
} |
151
|
|
|
} |
152
|
|
|
|
153
|
|
|
// Extract the solution by back-tracking through the saved results. |
154
|
|
|
$snakes = $this->extractSnakes($v_save, $n, $m); |
155
|
|
|
|
156
|
|
|
// Format the snakes as a set of instructions. |
157
|
|
|
return $this->formatSolution($snakes, $a, $b); |
158
|
|
|
} |
159
|
|
|
} |
160
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.