Total Complexity | 64 |
Total Lines | 392 |
Duplicated Lines | 0 % |
Changes | 0 |
Complex classes like AbstractLCS often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use AbstractLCS, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
13 | abstract class AbstractLCS |
||
14 | { |
||
15 | /** @const float 10^8, the value of N*M when to start bindnig the run time. */ |
||
16 | private const TOO_LONG = 100000000.0; |
||
17 | |||
18 | /** @const float Limit the time to D^POW_LIMIT */ |
||
19 | private const POW_LIMIT = 1.5; |
||
20 | |||
21 | /** @var int */ |
||
22 | private $maxDifferences = 0; |
||
23 | |||
24 | /** @var int */ |
||
25 | private $length = 0; |
||
26 | |||
27 | /** |
||
28 | * @return int |
||
29 | */ |
||
30 | public function getLength(): int |
||
31 | { |
||
32 | return $this->length; |
||
33 | } |
||
34 | |||
35 | /** |
||
36 | * Myers' algorithm for longest common subsequence. O((M + N)D) worst case time, O(M + N + D^2) expected time, O(M |
||
37 | * + N) space (http://citeseer.ist.psu.edu/myers86ond.html) |
||
38 | * |
||
39 | * Note: Beyond implementing the algorithm as described in the paper I have added diagonal range compression which |
||
40 | * helps when finding the LCS of a very long and a very short sequence, also bound the running time to (N + M)^1.5 |
||
41 | * when both sequences are very long. |
||
42 | * |
||
43 | * After this method is called, the longest common subsequence is available by calling getResult() where result[0] |
||
44 | * is composed of entries from l1 and result[1] is composed of entries from l2 |
||
45 | */ |
||
46 | public function longestCommonSubsequence(): void |
||
47 | { |
||
48 | $length1 = $this->getLength1(); |
||
49 | $length2 = $this->getLength2(); |
||
50 | |||
51 | if (0 === $length1 || 0 === $length2) { |
||
52 | $this->length = 0; |
||
53 | |||
54 | return; |
||
55 | } |
||
56 | |||
57 | $this->maxDifferences = (int)(($length1 + $length2 + 1) / 2); |
||
58 | |||
59 | if ($length1 * $length2 > self::TOO_LONG) { |
||
60 | // Limit complexity to D^POW_LIMIT for long sequences. |
||
61 | $this->maxDifferences = (int)\pow($this->maxDifferences, self::POW_LIMIT - 1.0); |
||
62 | } |
||
63 | |||
64 | $this->initializeLcs($length1); |
||
65 | |||
66 | // The common prefixes and suffixes are always part of some LCS, include them now to reduce our search space. |
||
67 | $max = \min($length1, $length2); |
||
68 | |||
69 | for ( |
||
70 | $forwardBound = 0; |
||
71 | $forwardBound < $max && $this->isRangeEqual($forwardBound, $forwardBound); |
||
72 | $forwardBound++ |
||
73 | ) { |
||
74 | $this->setLcs($forwardBound, $forwardBound); |
||
75 | } |
||
76 | |||
77 | $backBoundL1 = $length1 - 1; |
||
78 | $backBoundL2 = $length2 - 1; |
||
79 | |||
80 | while ($backBoundL1 >= $forwardBound && $backBoundL2 >= $forwardBound && |
||
81 | $this->isRangeEqual($backBoundL1, $backBoundL2)) { |
||
82 | $this->setLcs($backBoundL1, $backBoundL2); |
||
83 | $backBoundL1--; |
||
84 | $backBoundL2--; |
||
85 | } |
||
86 | |||
87 | $V = \array_fill(0, 2, \array_fill(0, $length1 + $length2 + 1, 0)); |
||
88 | $snake = [0, 0, 0]; |
||
89 | $lcsRec = $this->lcsRec($forwardBound, $backBoundL1, $forwardBound, $backBoundL2, $V, $snake); |
||
90 | |||
91 | $this->length = $forwardBound + $length1 - $backBoundL1 - 1 + $lcsRec; |
||
92 | } |
||
93 | |||
94 | /** |
||
95 | * The recursive helper function for Myers' LCS. Computes the LCS of $l1[$bottomL1 .. $topL1] and $l2[$bottomL2 .. |
||
96 | * $topL2] fills in the appropriate location in lcs and returns the length. |
||
97 | * |
||
98 | * @param int $bottomL1 The first sequence |
||
99 | * @param int $topL1 Index in the first sequence to start from (inclusive) |
||
100 | * @param int $bottomL2 The second sequence |
||
101 | * @param int $topL2 Index in the second sequence to start from (inclusive) |
||
102 | * @param array $V Furthest reaching D-paths |
||
103 | * @param array $snake Beginning x, y coordinates and the length of the middle snake |
||
104 | * @return int Length of the lcs |
||
105 | */ |
||
106 | private function lcsRec(int $bottomL1, int $topL1, int $bottomL2, int $topL2, array &$V, array &$snake): int |
||
107 | { |
||
108 | // Check that both sequences are non-empty. |
||
109 | if ($bottomL1 > $topL1 || $bottomL2 > $topL2) { |
||
110 | return 0; |
||
111 | } |
||
112 | |||
113 | $d = $this->findMiddleSnake($bottomL1, $topL1, $bottomL2, $topL2, $V, $snake); |
||
114 | |||
115 | // Need to restore these so we don't lose them when they're overwritten by the recursion. |
||
116 | $len = $snake[2]; |
||
117 | $startX = $snake[0]; |
||
118 | $startY = $snake[1]; |
||
119 | |||
120 | // The middle snake is part of the LCS, store it. |
||
121 | for ($i = 0; $i < $len; $i++) { |
||
122 | $this->setLcs($startX + $i, $startY + $i); |
||
123 | } |
||
124 | |||
125 | if ($d > 1) { |
||
126 | return $len + |
||
127 | $this->lcsRec($bottomL1, $startX - 1, $bottomL2, $startY - 1, $V, $snake) + |
||
128 | $this->lcsRec($startX + $len, $topL1, $startY + $len, $topL2, $V, $snake); |
||
129 | } elseif ($d === 1) { |
||
130 | // In this case the sequences differ by exactly 1 line. We have already saved all the lines after the |
||
131 | // difference in the for loop above, now we need to save all the lines before the difference. |
||
132 | $max = \min($startX - $bottomL1, $startY - $bottomL2); |
||
133 | |||
134 | for ($i = 0; $i < $max; $i++) { |
||
135 | $this->setLcs($bottomL1 + $i, $bottomL2 + $i); |
||
136 | } |
||
137 | |||
138 | return $max + $len; |
||
139 | } |
||
140 | |||
141 | return $len; |
||
142 | } |
||
143 | |||
144 | /** |
||
145 | * Helper function for Myers' LCS algorithm to find the middle snake for $l1[$bottomL1 .. $topL1] and $l2[$bottomL2 |
||
146 | * .. $topL2] The x, y coordinates of the start of the middle snake are saved in $snake[0], $snake[1] respectively |
||
147 | * and the length of the snake is saved in $snake[2]. |
||
148 | * |
||
149 | * @param int $bottomL1 Index in the first sequence to start from (inclusive) |
||
150 | * @param int $topL1 Index in the first sequence to end from (inclusive) |
||
151 | * @param int $bottomL2 Index in the second sequence to start from (inclusive) |
||
152 | * @param int $topL2 Index in the second sequence to end from (inclusive) |
||
153 | * @param int[][] $V Array storing the furthest reaching D-paths for the LCS computation |
||
154 | * @param int[] $snake Beginning x, y coordinates and the length of the middle snake |
||
155 | * @return int |
||
156 | */ |
||
157 | private function findMiddleSnake( |
||
158 | int $bottomL1, |
||
159 | int $topL1, |
||
160 | int $bottomL2, |
||
161 | int $topL2, |
||
162 | array &$V, |
||
163 | array &$snake |
||
164 | ): int { |
||
165 | $N = $topL1 - $bottomL1 + 1; |
||
166 | $M = $topL2 - $bottomL2 + 1; |
||
167 | |||
168 | $delta = $N - $M; |
||
169 | $isEven = ($delta & 1) === 1 ? false : true; |
||
170 | |||
171 | $limit = \min($this->maxDifferences, (int)(($N + $M + 1) / 2)); |
||
172 | |||
173 | // Offset to make it odd/even. |
||
174 | // a 0 or 1 that we add to the start offset to make it odd/even |
||
175 | $valueToAddForward = ($M & 1) === 1 ? 1 : 0; |
||
176 | $valueToAddBackward = ($N & 1) === 1 ? 1 : 0; |
||
177 | |||
178 | $startForward = -$M; |
||
179 | $endForward = $N; |
||
180 | $startBackward = -$N; |
||
181 | $endBackward = $M; |
||
182 | |||
183 | $V[0][$limit + 1] = 0; |
||
184 | $V[1][$limit - 1] = $N; |
||
185 | |||
186 | for ($d = 0; $d <= $limit; $d++) { |
||
187 | $startDiag = \max($valueToAddForward + $startForward, -$d); |
||
188 | $endDiag = \min($endForward, $d); |
||
189 | $valueToAddForward = 1 - $valueToAddForward; |
||
190 | |||
191 | // Compute forward furthest reaching paths. |
||
192 | for ($k = $startDiag; $k <= $endDiag; $k += 2) { |
||
193 | if ($k === -$d || ($k < $d && $V[0][$limit + $k - 1] < $V[0][$limit + $k + 1])) { |
||
194 | $x = $V[0][$limit + $k + 1]; |
||
195 | } else { |
||
196 | $x = $V[0][$limit + $k - 1] + 1; |
||
197 | } |
||
198 | |||
199 | $y = $x - $k; |
||
200 | |||
201 | $snake[0] = $x + $bottomL1; |
||
202 | $snake[1] = $y + $bottomL2; |
||
203 | $snake[2] = 0; |
||
204 | |||
205 | while ($x < $N && $y < $M && $this->isRangeEqual($x + $bottomL1, $y + $bottomL2)) { |
||
206 | $x++; |
||
207 | $y++; |
||
208 | $snake[2]++; |
||
209 | } |
||
210 | |||
211 | $V[0][$limit + $k] = $x; |
||
212 | |||
213 | if (!$isEven && $k >= $delta - $d + 1 && $k <= $delta + $d - 1 && $x >= $V[1][$limit + $k - $delta]) { |
||
214 | return 2 * $d - 1; |
||
215 | } |
||
216 | |||
217 | // Check to see if we can cut down the diagonal range. |
||
218 | if ($x >= $N && $endForward > $k - 1) { |
||
219 | $endForward = $k - 1; |
||
220 | } elseif ($y >= $M) { |
||
221 | $startForward = $k + 1; |
||
222 | $valueToAddForward = 0; |
||
223 | } |
||
224 | } |
||
225 | |||
226 | $startDiag = \max($valueToAddBackward + $startBackward, -$d); |
||
227 | $endDiag = \min($endBackward, $d); |
||
228 | $valueToAddBackward = 1 - $valueToAddBackward; |
||
229 | |||
230 | // Compute backward furthest reaching paths. |
||
231 | for ($k = $startDiag; $k <= $endDiag; $k += 2) { |
||
232 | if ($k === $d || ($k !== -$d && $V[1][$limit + $k - 1] < $V[1][$limit + $k + 1])) { |
||
233 | $x = $V[1][$limit + $k - 1]; |
||
234 | } else { |
||
235 | $x = $V[1][$limit + $k + 1] - 1; |
||
236 | } |
||
237 | |||
238 | $y = $x - $k - $delta; |
||
239 | $snake[2] = 0; |
||
240 | |||
241 | while ($x > 0 && $y > 0 && $this->isRangeEqual($x - 1 + $bottomL1, $y - 1 + $bottomL2)) { |
||
242 | $x--; |
||
243 | $y--; |
||
244 | $snake[2]++; |
||
245 | } |
||
246 | |||
247 | $V[1][$limit + $k] = $x; |
||
248 | |||
249 | if ($isEven && $k >= -$delta - $d && $k <= $d - $delta && $x <= $V[0][$limit + $k + $delta]) { |
||
250 | $snake[0] = $bottomL1 + $x; |
||
251 | $snake[1] = $bottomL2 + $y; |
||
252 | |||
253 | return 2 * $d; |
||
254 | } |
||
255 | |||
256 | // Check to see if we can cut down our diagonal range. |
||
257 | if ($x <= 0) { |
||
258 | $startBackward = $k + 1; |
||
259 | $valueToAddBackward = 0; |
||
260 | } elseif ($y <= 0 && $endBackward > $k - 1) { |
||
261 | $endBackward = $k - 1; |
||
262 | } |
||
263 | } |
||
264 | } |
||
265 | |||
266 | // Computing the true LCS is too expensive, instead find the diagonal with the most progress and pretend a |
||
267 | // middle snake of length 0 occurs there. |
||
268 | $mostProgress = $this->findMostProgress($M, $N, $limit, $V); |
||
269 | |||
270 | $snake[0] = $bottomL1 + $mostProgress[0]; |
||
271 | $snake[1] = $bottomL2 + $mostProgress[1]; |
||
272 | $snake[2] = 0; |
||
273 | |||
274 | /* |
||
275 | * HACK: Since we didn't really finish the LCS computation we don't really know the length of the SES. We don't |
||
276 | * do anything with the result anyway, unless it's <=1. We know for a fact SES > 1 so 5 is as good a number as |
||
277 | * any to return here. |
||
278 | */ |
||
279 | |||
280 | return 5; |
||
281 | } |
||
282 | |||
283 | /** |
||
284 | * Takes the array with furthest reaching D-paths from an LCS computation and returns the x,y coordinates and |
||
285 | * progress made in the middle diagonal among those with maximum progress, both from the front and from the back. |
||
286 | * |
||
287 | * @param int $M Length of the first sequence for which LCS is being computed |
||
288 | * @param int $N Length of the second sequence for which LCS is being computed |
||
289 | * @param int $limit Number of steps made in an attempt to find the LCS from the front and back |
||
290 | * @param int[][] $V Array storing the furthest reaching D-paths for the LCS computation |
||
291 | * |
||
292 | * @return int[] Array of 3 integers where $result[0] is the x coordinate of the current location in the diagonal |
||
293 | * with the most progress, $result[1] is the y coordinate of the current location in the diagonal with |
||
294 | * the most progress and $result[2] is the amount of progress made in that diagonal. |
||
295 | */ |
||
296 | private function findMostProgress(int $M, int $N, int $limit, array &$V): array |
||
376 | } |
||
377 | |||
378 | /** |
||
379 | * @return int |
||
380 | */ |
||
381 | abstract protected function getLength1(): int; |
||
382 | |||
383 | /** |
||
384 | * @return int |
||
385 | */ |
||
386 | abstract protected function getLength2(): int; |
||
387 | |||
388 | /** |
||
389 | * @param int $i1 |
||
390 | * @param int $i2 |
||
391 | * @return bool |
||
392 | */ |
||
393 | abstract protected function isRangeEqual(int $i1, int $i2): bool; |
||
394 | |||
395 | /** |
||
396 | * @param int $lcsLength |
||
397 | */ |
||
398 | abstract protected function initializeLcs(int $lcsLength): void; |
||
399 | |||
400 | /** |
||
401 | * @param int $sl1 |
||
402 | * @param int $sl2 |
||
403 | */ |
||
404 | abstract protected function setLcs(int $sl1, int $sl2): void; |
||
405 | } |
||
406 |