Completed
Push — master ( b4c29c...43cfef )
by Greg
01:54
created

MyersDiff::degenerateCase()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 8
rs 9.4285
c 0
b 0
f 0
cc 2
eloc 5
nc 2
nop 2
1
<?php
2
namespace Fisharebest\Algorithm;
3
4
/**
5
 * @package   fisharebest/algorithm
6
 * @author    Greg Roach <[email protected]>
7
 * @copyright (c) 2015 Greg Roach <[email protected]>
8
 * @license   GPL-3.0+
9
 *
10
 * This program is free software: you can redistribute it and/or modify
11
 * it under the terms of the GNU General Public License as published by
12
 * the Free Software Foundation, either version 3 of the License, or
13
 * (at your option) any later version.
14
 * This program is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
 * GNU General Public License for more details.
18
 * You should have received a copy of the GNU General Public License
19
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
20
 */
21
22
/**
23
 * Class MyersDiff - find the shortest edit sequence to transform one string into another.
24
 *
25
 * Based on "An O(ND) Difference Algorithm and Its Variations" by Eugene W Myers.
26
 *
27
 * http://www.xmailserver.org/diff2.pdf
28
 * http://www.codeproject.com/Articles/42279/Investigating-Myers-diff-algorithm-Part-of
29
 */
30
class MyersDiff {
31
	/** Instruction to delete a token which only appears in the first sequence */
32
	const DELETE = -1;
33
34
	/** Instruction to keep a token which is common to both sequences */
35
	const KEEP = 0;
36
37
	/** Instruction to insert a token which only appears in the last sequence */
38
	const INSERT = 1;
39
40
	/**
41
	 * Backtrack through the intermediate results to extract the "snakes" that
42
	 * are visited on the chosen "D-path".
43
	 *
44
	 * @param string[] $v_save Intermediate results
45
	 * @param int      $x      End position
46
	 * @param int      $y      End position
47
	 *
48
	 * @return int[][]
49
	 */
50
	private function extractSnakes(array $v_save, $x, $y) {
51
		$snakes = array();
52
		for ($d = count($v_save) - 1; $x >= 0 && $y >= 0; --$d) {
53
			array_unshift($snakes, array($x, $y));
54
55
			$v = $v_save[$d];
56
			$k = $x - $y;
57
58
			if ($k === -$d || $k !== $d && $v[$k - 1] < $v[$k + 1]) {
59
				$k_prev = $k + 1;
60
			} else {
61
				$k_prev = $k - 1;
62
			}
63
64
			$x = $v[$k_prev];
65
			$y = $x - $k_prev;
66
		}
67
68
		return $snakes;
69
	}
70
71
	/**
72
	 * Convert a list of "snakes" into a set of insert/keep/delete instructions
73
	 *
74
	 * @param integer[][] $snakes Common subsequences
75
	 * @param string[]    $a      First sequence
76
	 * @param string[]    $b      Second sequence
77
	 *
78
	 */
79
	private function formatSolution(array $snakes, array $a, array $b) {
80
		$solution = array();
81
		$x = 0;
82
		$y = 0;
83
		foreach ($snakes as $snake) {
84
			// Horizontals
85
			while ($snake[0] - $snake[1] > $x - $y) {
86
				$solution[] = array($a[$x], self::DELETE);
87
				++$x;
88
			}
89
			// Verticals
90 View Code Duplication
			while ($snake[0] - $snake[1] < $x - $y) {
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
91
				$solution[] = array($b[$y], self::INSERT);
92
				++$y;
93
			}
94
			// Diagonals
95 View Code Duplication
			while ($x < $snake[0]) {
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
96
				$solution[] = array($a[$x], self::KEEP);
97
				++$x;
98
				++$y;
99
			}
100
		}
101
102
		return $solution;
103
	}
104
105
	/**
106
	 * Calculate the shortest edit sequence to convert $x into $y.
107
	 *
108
	 * @param string[] $a - tokens (characters, words or lines)
109
	 * @param string[] $b - tokens (characters, words or lines)
110
	 *
111
	 * @return array[] - pairs of token and edit (-1 for delete, 0 for keep, +1 for insert)
112
	 */
113
	public function calculate(array $a, array $b) {
114
		// The algorithm uses array keys numbered from zero.
115
		$n = count($a);
116
		$m = count($b);
117
		$a = array_values($a);
118
		$b = array_values($b);
119
		$max = $m + $n;
120
121
		// Keep a copy of $v after each iteration of $d.
122
		$v_save = array();
123
124
		// Find the shortest "D-path".
125
		$v = array(1 => 0);
126
		for ($d = 0; $d <= $max; ++$d) {
127
			// Examine all possible "K-lines" for this "D-path".
128
			for ($k = -$d; $k <= $d; $k += 2) {
129
				if ($k === -$d || $k !== $d && $v[$k - 1] < $v[$k + 1]) {
130
					// Move down.
131
					$x = $v[$k + 1];
132
				} else {
133
					// Move right.
134
					$x = $v[$k - 1] + 1;
135
				}
136
				// Derive Y from X.
137
				$y = $x - $k;
138
				// Follow the diagonal.
139
				while ($x < $n && $y < $m && $a[$x] === $b[$y]) {
140
					++$x;
141
					++$y;
142
				}
143
				// Just store X, as we can calculate Y (from X + K).
144
				$v[$k] = $x;
145
				$v_save[$d] = $v;
146
				// Solution found?
147
				if ($x === $n && $y === $m) {
148
					break 2;
149
				}
150
			}
151
		}
152
153
		// Extract the solution by back-tracking through the saved results.
154
		$snakes = $this->extractSnakes($v_save, $n, $m);
155
156
		// Format the snakes as a set of instructions.
157
		return $this->formatSolution($snakes, $a, $b);
158
	}
159
}
160