Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems, and corresponding solutions are:
Complex classes like Text_Diff_Engine_native often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes. You can also have a look at the cohesion graph to spot any un-connected, or weakly-connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Text_Diff_Engine_native, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 337 | * @param $from_lines |
||
| 338 | * @param $to_lines |
||
| 339 | * @return array |
||
| 340 | */ |
||
| 341 | public function diff($from_lines, $to_lines) |
||
| 342 | { |
||
| 343 | $n_from = count($from_lines); |
||
| 344 | $n_to = count($to_lines); |
||
| 345 | |||
| 346 | $this->xchanged = $this->ychanged = []; |
||
| 347 | $this->xv = $this->yv = []; |
||
| 348 | $this->xind = $this->yind = []; |
||
| 349 | unset($this->seq, $this->in_seq, $this->lcs); |
||
| 350 | |||
| 351 | // Skip leading common lines. |
||
| 352 | for ($skip = 0; $skip < $n_from && $skip < $n_to; ++$skip) { |
||
| 353 | if ($from_lines[$skip] != $to_lines[$skip]) { |
||
| 354 | break; |
||
| 355 | } |
||
| 356 | $this->xchanged[$skip] = $this->ychanged[$skip] = false; |
||
| 357 | } |
||
| 358 | |||
| 359 | // Skip trailing common lines. |
||
| 360 | $xi = $n_from; |
||
| 361 | $yi = $n_to; |
||
| 362 | for ($endskip = 0; --$xi > $skip && --$yi > $skip; ++$endskip) { |
||
| 363 | if ($from_lines[$xi] != $to_lines[$yi]) { |
||
| 364 | break; |
||
| 365 | } |
||
| 366 | $this->xchanged[$xi] = $this->ychanged[$yi] = false; |
||
| 367 | } |
||
| 368 | |||
| 369 | // Ignore lines which do not exist in both files. |
||
| 370 | for ($xi = $skip; $xi < $n_from - $endskip; ++$xi) { |
||
| 371 | $xhash[$from_lines[$xi]] = 1; |
||
| 372 | } |
||
| 373 | for ($yi = $skip; $yi < $n_to - $endskip; ++$yi) { |
||
| 374 | $line = $to_lines[$yi]; |
||
| 375 | $this->ychanged[$yi] = empty($xhash[$line]); |
||
| 376 | if ($this->ychanged[$yi]) { |
||
| 377 | continue; |
||
| 378 | } |
||
| 379 | $yhash[$line] = 1; |
||
| 380 | $this->yv[] = $line; |
||
| 381 | $this->yind[] = $yi; |
||
| 382 | } |
||
| 383 | for ($xi = $skip; $xi < $n_from - $endskip; ++$xi) { |
||
| 384 | $line = $from_lines[$xi]; |
||
| 385 | $this->xchanged[$xi] = empty($yhash[$line]); |
||
| 386 | if ($this->xchanged[$xi]) { |
||
| 387 | continue; |
||
| 388 | } |
||
| 389 | $this->xv[] = $line; |
||
| 390 | $this->xind[] = $xi; |
||
| 391 | } |
||
| 392 | |||
| 393 | // Find the LCS. |
||
| 394 | $this->_compareseq(0, count($this->xv), 0, count($this->yv)); |
||
| 395 | |||
| 396 | // Merge edits when possible. |
||
| 397 | $this->_shiftBoundaries($from_lines, $this->xchanged, $this->ychanged); |
||
| 398 | $this->_shiftBoundaries($to_lines, $this->ychanged, $this->xchanged); |
||
| 399 | |||
| 400 | // Compute the edit operations. |
||
| 401 | $edits = []; |
||
| 402 | $xi = $yi = 0; |
||
| 403 | while ($xi < $n_from || $yi < $n_to) { |
||
| 404 | assert($yi < $n_to || $this->xchanged[$xi]); |
||
| 405 | assert($xi < $n_from || $this->ychanged[$yi]); |
||
| 406 | |||
| 407 | // Skip matching "snake". |
||
| 408 | $copy = []; |
||
| 409 | while ($xi < $n_from && $yi < $n_to && !$this->xchanged[$xi] && !$this->ychanged[$yi]) { |
||
| 410 | $copy[] = $from_lines[$xi++]; |
||
| 411 | ++$yi; |
||
| 412 | } |
||
| 413 | if ($copy) { |
||
| 414 | $edits[] = new Text_Diff_Op_copy($copy); |
||
| 415 | } |
||
| 416 | |||
| 417 | // Find deletes & adds. |
||
| 418 | $delete = []; |
||
| 419 | while ($xi < $n_from && $this->xchanged[$xi]) { |
||
| 420 | $delete[] = $from_lines[$xi++]; |
||
| 421 | } |
||
| 422 | |||
| 423 | $add = []; |
||
| 424 | while ($yi < $n_to && $this->ychanged[$yi]) { |
||
| 425 | $add[] = $to_lines[$yi++]; |
||
| 426 | } |
||
| 427 | |||
| 428 | if ($delete && $add) { |
||
| 429 | $edits[] = new Text_Diff_Op_change($delete, $add); |
||
| 430 | } elseif ($delete) { |
||
| 431 | $edits[] = new Text_Diff_Op_delete($delete); |
||
| 432 | } elseif ($add) { |
||
| 433 | $edits[] = new Text_Diff_Op_add($add); |
||
| 434 | } |
||
| 435 | } |
||
| 436 | |||
| 437 | return $edits; |
||
| 438 | } |
||
| 439 | |||
| 440 | /** |
||
| 441 | * Divides the Largest Common Subsequence (LCS) of the sequences (XOFF, |
||
| 442 | * XLIM) and (YOFF, YLIM) into NCHUNKS approximately equally sized |
||
| 443 | * segments. |
||
| 444 | * |
||
| 445 | * Returns (LCS, PTS). LCS is the length of the LCS. PTS is an array of |
||
| 446 | * NCHUNKS+1 (X, Y) indexes giving the diving points between sub |
||
| 447 | * sequences. The first sub-sequence is contained in (X0, X1), (Y0, Y1), |
||
| 448 | * the second in (X1, X2), (Y1, Y2) and so on. Note that (X0, Y0) == |
||
| 449 | * (XOFF, YOFF) and (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM). |
||
| 450 | * |
||
| 451 | * This function assumes that the first lines of the specified portions of |
||
| 452 | * the two files do not match, and likewise that the last lines do not |
||
| 453 | * match. The caller must trim matching lines from the beginning and end |
||
| 454 | * of the portions it is going to specify. |
||
| 455 | * @param $xoff |
||
| 456 | * @param $xlim |
||
| 457 | * @param $yoff |
||
| 458 | * @param $ylim |
||
| 459 | * @param $nchunks |
||
| 460 | * @return array |
||
| 461 | */ |
||
| 462 | public function _diag($xoff, $xlim, $yoff, $ylim, $nchunks) |
||
| 463 | { |
||
| 464 | $flip = false; |
||
| 465 | |||
| 466 | if ($xlim - $xoff > $ylim - $yoff) { |
||
| 467 | /* Things seems faster (I'm not sure I understand why) when the |
||
| 468 | * shortest sequence is in X. */ |
||
| 469 | $flip = true; |
||
| 470 | list($xoff, $xlim, $yoff, $ylim) = [$yoff, $ylim, $xoff, $xlim]; |
||
| 471 | } |
||
| 472 | |||
| 473 | if ($flip) { |
||
| 474 | for ($i = $ylim - 1; $i >= $yoff; $i--) { |
||
| 475 | $ymatches[$this->xv[$i]][] = $i; |
||
| 476 | } |
||
| 477 | } else { |
||
| 478 | for ($i = $ylim - 1; $i >= $yoff; $i--) { |
||
| 479 | $ymatches[$this->yv[$i]][] = $i; |
||
| 480 | } |
||
| 481 | } |
||
| 482 | |||
| 483 | $this->lcs = 0; |
||
| 484 | $this->seq[0] = $yoff - 1; |
||
| 485 | $this->in_seq = []; |
||
| 486 | $ymids[0] = []; |
||
| 487 | |||
| 488 | $numer = $xlim - $xoff + $nchunks - 1; |
||
| 489 | $x = $xoff; |
||
| 490 | for ($chunk = 0; $chunk < $nchunks; ++$chunk) { |
||
| 491 | if ($chunk > 0) { |
||
| 492 | for ($i = 0; $i <= $this->lcs; ++$i) { |
||
| 493 | $ymids[$i][$chunk - 1] = $this->seq[$i]; |
||
| 494 | } |
||
| 495 | } |
||
| 496 | |||
| 497 | $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $chunk) / $nchunks); |
||
| 498 | for (; $x < $x1; ++$x) { |
||
| 499 | $line = $flip ? $this->yv[$x] : $this->xv[$x]; |
||
| 500 | if (empty($ymatches[$line])) { |
||
| 501 | continue; |
||
| 502 | } |
||
| 503 | $matches = $ymatches[$line]; |
||
| 504 | foreach ($matches as $y) { |
||
| 505 | if (empty($this->in_seq[$y])) { |
||
| 506 | $k = $this->_lcsPos($y); |
||
| 507 | assert($k > 0); |
||
| 508 | $ymids[$k] = $ymids[$k - 1]; |
||
| 509 | break; |
||
| 510 | } |
||
| 511 | } |
||
| 512 | |||
| 513 | // while (list($junk, $y) = each($matches)) { |
||
| 514 | foreach ($matches as $junk => $y) { |
||
| 515 | if ($y > $this->seq[$k - 1]) { |
||
| 516 | assert($y < $this->seq[$k]); |
||
| 517 | /* Optimization: this is a common case: next match is |
||
| 518 | * just replacing previous match. */ |
||
| 519 | $this->in_seq[$this->seq[$k]] = false; |
||
| 520 | $this->seq[$k] = $y; |
||
| 521 | $this->in_seq[$y] = 1; |
||
| 522 | } elseif (empty($this->in_seq[$y])) { |
||
| 523 | $k = $this->_lcsPos($y); |
||
| 524 | assert($k > 0); |
||
| 525 | $ymids[$k] = $ymids[$k - 1]; |
||
| 526 | } |
||
| 527 | } |
||
| 528 | } |
||
| 529 | } |
||
| 530 | |||
| 531 | $seps[] = $flip ? [$yoff, $xoff] : [$xoff, $yoff]; |
||
| 532 | $ymid = $ymids[$this->lcs]; |
||
| 533 | for ($n = 0; $n < $nchunks - 1; ++$n) { |
||
| 534 | $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $n) / $nchunks); |
||
| 535 | $y1 = $ymid[$n] + 1; |
||
| 536 | $seps[] = $flip ? [$y1, $x1] : [$x1, $y1]; |
||
| 537 | } |
||
| 538 | |||
| 539 | $seps[] = $flip ? [$ylim, $xlim] : [$xlim, $ylim]; |
||
| 540 | |||
| 541 | return [$this->lcs, $seps]; |
||
| 542 | } |
||
| 543 | |||
| 544 | /** |
||
| 545 | * @param $ypos |
||
| 546 | * @return int |
||
| 547 | */ |
||
| 548 | public function _lcsPos($ypos) |
||
| 549 | { |
||
| 550 | $end = $this->lcs; |
||
| 551 | if (0 == $end || $ypos > $this->seq[$end]) { |
||
| 552 | $this->seq[++$this->lcs] = $ypos; |
||
| 553 | $this->in_seq[$ypos] = 1; |
||
| 554 | |||
| 555 | return $this->lcs; |
||
| 556 | } |
||
| 557 | |||
| 558 | $beg = 1; |
||
| 559 | while ($beg < $end) { |
||
| 560 | $mid = (int)(($beg + $end) / 2); |
||
| 561 | if ($ypos > $this->seq[$mid]) { |
||
| 562 | $beg = $mid + 1; |
||
| 563 | } else { |
||
| 564 | $end = $mid; |
||
| 565 | } |
||
| 566 | } |
||
| 567 | |||
| 568 | assert($ypos != $this->seq[$end]); |
||
| 569 | |||
| 570 | $this->in_seq[$this->seq[$end]] = false; |
||
| 571 | $this->seq[$end] = $ypos; |
||
| 572 | $this->in_seq[$ypos] = 1; |
||
| 573 | |||
| 574 | return $end; |
||
| 575 | } |
||
| 576 | |||
| 577 | /** |
||
| 578 | * Finds LCS of two sequences. |
||
| 579 | * |
||
| 580 | * The results are recorded in the vectors $this->{x,y}changed[], by |
||
| 581 | * storing a 1 in the element for each line that is an insertion or |
||
| 582 | * deletion (ie. is not in the LCS). |
||
| 583 | * |
||
| 584 | * The subsequence of file 0 is (XOFF, XLIM) and likewise for file 1. |
||
| 585 | * |
||
| 586 | * Note that XLIM, YLIM are exclusive bounds. All line numbers are |
||
| 587 | * origin-0 and discarded lines are not counted. |
||
| 588 | * @param $xoff |
||
| 589 | * @param $xlim |
||
| 590 | * @param $yoff |
||
| 591 | * @param $ylim |
||
| 592 | */ |
||
| 593 | public function _compareseq($xoff, $xlim, $yoff, $ylim) |
||
| 633 | } |
||
| 634 | } |
||
| 635 | } |
||
| 636 | |||
| 637 | /** |
||
| 638 | * Adjusts inserts/deletes of identical lines to join changes as much as |
||
| 639 | * possible. |
||
| 640 | * |
||
| 641 | * We do something when a run of changed lines include a line at one end |
||
| 642 | * and has an excluded, identical line at the other. We are free to |
||
| 643 | * choose which identical line is included. `compareseq' usually chooses |
||
| 644 | * the one at the beginning, but usually it is cleaner to consider the |
||
| 645 | * following identical line to be the "change". |
||
| 646 | * |
||
| 647 | * This is extracted verbatim from analyze.c (GNU diffutils-2.7). |
||
| 648 | * @param $lines |
||
| 649 | * @param $changed |
||
| 650 | * @param $other_changed |
||
| 651 | */ |
||
| 652 | public function _shiftBoundaries($lines, &$changed, $other_changed) |
||
| 757 | } |
||
| 758 | } |
||
| 759 | } |
||
| 760 | } |
||
| 761 | |||
| 762 | /** |
||
| 763 | * @package Text_Diff |
||
| 907 |
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.