Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems, and corresponding solutions are:
Complex classes like Text_Diff_Engine_native often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes. You can also have a look at the cohesion graph to spot any un-connected, or weakly-connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Text_Diff_Engine_native, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
337 | * @param $from_lines |
||
338 | * @param $to_lines |
||
339 | * @return array |
||
340 | */ |
||
341 | public function diff($from_lines, $to_lines) |
||
342 | { |
||
343 | $n_from = count($from_lines); |
||
344 | $n_to = count($to_lines); |
||
345 | |||
346 | $this->xchanged = $this->ychanged = []; |
||
347 | $this->xv = $this->yv = []; |
||
348 | $this->xind = $this->yind = []; |
||
349 | unset($this->seq, $this->in_seq, $this->lcs); |
||
350 | |||
351 | // Skip leading common lines. |
||
352 | for ($skip = 0; $skip < $n_from && $skip < $n_to; ++$skip) { |
||
353 | if ($from_lines[$skip] != $to_lines[$skip]) { |
||
354 | break; |
||
355 | } |
||
356 | $this->xchanged[$skip] = $this->ychanged[$skip] = false; |
||
357 | } |
||
358 | |||
359 | // Skip trailing common lines. |
||
360 | $xi = $n_from; |
||
361 | $yi = $n_to; |
||
362 | for ($endskip = 0; --$xi > $skip && --$yi > $skip; ++$endskip) { |
||
363 | if ($from_lines[$xi] != $to_lines[$yi]) { |
||
364 | break; |
||
365 | } |
||
366 | $this->xchanged[$xi] = $this->ychanged[$yi] = false; |
||
367 | } |
||
368 | |||
369 | // Ignore lines which do not exist in both files. |
||
370 | for ($xi = $skip; $xi < $n_from - $endskip; ++$xi) { |
||
371 | $xhash[$from_lines[$xi]] = 1; |
||
372 | } |
||
373 | for ($yi = $skip; $yi < $n_to - $endskip; ++$yi) { |
||
374 | $line = $to_lines[$yi]; |
||
375 | $this->ychanged[$yi] = empty($xhash[$line]); |
||
376 | if ($this->ychanged[$yi]) { |
||
377 | continue; |
||
378 | } |
||
379 | $yhash[$line] = 1; |
||
380 | $this->yv[] = $line; |
||
381 | $this->yind[] = $yi; |
||
382 | } |
||
383 | for ($xi = $skip; $xi < $n_from - $endskip; ++$xi) { |
||
384 | $line = $from_lines[$xi]; |
||
385 | $this->xchanged[$xi] = empty($yhash[$line]); |
||
386 | if ($this->xchanged[$xi]) { |
||
387 | continue; |
||
388 | } |
||
389 | $this->xv[] = $line; |
||
390 | $this->xind[] = $xi; |
||
391 | } |
||
392 | |||
393 | // Find the LCS. |
||
394 | $this->_compareseq(0, count($this->xv), 0, count($this->yv)); |
||
395 | |||
396 | // Merge edits when possible. |
||
397 | $this->_shiftBoundaries($from_lines, $this->xchanged, $this->ychanged); |
||
398 | $this->_shiftBoundaries($to_lines, $this->ychanged, $this->xchanged); |
||
399 | |||
400 | // Compute the edit operations. |
||
401 | $edits = []; |
||
402 | $xi = $yi = 0; |
||
403 | while ($xi < $n_from || $yi < $n_to) { |
||
404 | assert($yi < $n_to || $this->xchanged[$xi]); |
||
405 | assert($xi < $n_from || $this->ychanged[$yi]); |
||
406 | |||
407 | // Skip matching "snake". |
||
408 | $copy = []; |
||
409 | while ($xi < $n_from && $yi < $n_to && !$this->xchanged[$xi] && !$this->ychanged[$yi]) { |
||
410 | $copy[] = $from_lines[$xi++]; |
||
411 | ++$yi; |
||
412 | } |
||
413 | if ($copy) { |
||
414 | $edits[] = new Text_Diff_Op_copy($copy); |
||
415 | } |
||
416 | |||
417 | // Find deletes & adds. |
||
418 | $delete = []; |
||
419 | while ($xi < $n_from && $this->xchanged[$xi]) { |
||
420 | $delete[] = $from_lines[$xi++]; |
||
421 | } |
||
422 | |||
423 | $add = []; |
||
424 | while ($yi < $n_to && $this->ychanged[$yi]) { |
||
425 | $add[] = $to_lines[$yi++]; |
||
426 | } |
||
427 | |||
428 | if ($delete && $add) { |
||
429 | $edits[] = new Text_Diff_Op_change($delete, $add); |
||
430 | } elseif ($delete) { |
||
431 | $edits[] = new Text_Diff_Op_delete($delete); |
||
432 | } elseif ($add) { |
||
433 | $edits[] = new Text_Diff_Op_add($add); |
||
434 | } |
||
435 | } |
||
436 | |||
437 | return $edits; |
||
438 | } |
||
439 | |||
440 | /** |
||
441 | * Divides the Largest Common Subsequence (LCS) of the sequences (XOFF, |
||
442 | * XLIM) and (YOFF, YLIM) into NCHUNKS approximately equally sized |
||
443 | * segments. |
||
444 | * |
||
445 | * Returns (LCS, PTS). LCS is the length of the LCS. PTS is an array of |
||
446 | * NCHUNKS+1 (X, Y) indexes giving the diving points between sub |
||
447 | * sequences. The first sub-sequence is contained in (X0, X1), (Y0, Y1), |
||
448 | * the second in (X1, X2), (Y1, Y2) and so on. Note that (X0, Y0) == |
||
449 | * (XOFF, YOFF) and (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM). |
||
450 | * |
||
451 | * This function assumes that the first lines of the specified portions of |
||
452 | * the two files do not match, and likewise that the last lines do not |
||
453 | * match. The caller must trim matching lines from the beginning and end |
||
454 | * of the portions it is going to specify. |
||
455 | * @param $xoff |
||
456 | * @param $xlim |
||
457 | * @param $yoff |
||
458 | * @param $ylim |
||
459 | * @param $nchunks |
||
460 | * @return array |
||
461 | */ |
||
462 | public function _diag($xoff, $xlim, $yoff, $ylim, $nchunks) |
||
463 | { |
||
464 | $flip = false; |
||
465 | |||
466 | if ($xlim - $xoff > $ylim - $yoff) { |
||
467 | /* Things seems faster (I'm not sure I understand why) when the |
||
468 | * shortest sequence is in X. */ |
||
469 | $flip = true; |
||
470 | list($xoff, $xlim, $yoff, $ylim) = [$yoff, $ylim, $xoff, $xlim]; |
||
471 | } |
||
472 | |||
473 | if ($flip) { |
||
474 | for ($i = $ylim - 1; $i >= $yoff; $i--) { |
||
475 | $ymatches[$this->xv[$i]][] = $i; |
||
476 | } |
||
477 | } else { |
||
478 | for ($i = $ylim - 1; $i >= $yoff; $i--) { |
||
479 | $ymatches[$this->yv[$i]][] = $i; |
||
480 | } |
||
481 | } |
||
482 | |||
483 | $this->lcs = 0; |
||
484 | $this->seq[0] = $yoff - 1; |
||
485 | $this->in_seq = []; |
||
486 | $ymids[0] = []; |
||
487 | |||
488 | $numer = $xlim - $xoff + $nchunks - 1; |
||
489 | $x = $xoff; |
||
490 | for ($chunk = 0; $chunk < $nchunks; ++$chunk) { |
||
491 | if ($chunk > 0) { |
||
492 | for ($i = 0; $i <= $this->lcs; ++$i) { |
||
493 | $ymids[$i][$chunk - 1] = $this->seq[$i]; |
||
494 | } |
||
495 | } |
||
496 | |||
497 | $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $chunk) / $nchunks); |
||
498 | for (; $x < $x1; ++$x) { |
||
499 | $line = $flip ? $this->yv[$x] : $this->xv[$x]; |
||
500 | if (empty($ymatches[$line])) { |
||
501 | continue; |
||
502 | } |
||
503 | $matches = $ymatches[$line]; |
||
504 | foreach ($matches as $y) { |
||
505 | if (empty($this->in_seq[$y])) { |
||
506 | $k = $this->_lcsPos($y); |
||
507 | assert($k > 0); |
||
508 | $ymids[$k] = $ymids[$k - 1]; |
||
509 | break; |
||
510 | } |
||
511 | } |
||
512 | |||
513 | // while (list($junk, $y) = each($matches)) { |
||
514 | foreach ($matches as $junk => $y) { |
||
515 | if ($y > $this->seq[$k - 1]) { |
||
516 | assert($y < $this->seq[$k]); |
||
517 | /* Optimization: this is a common case: next match is |
||
518 | * just replacing previous match. */ |
||
519 | $this->in_seq[$this->seq[$k]] = false; |
||
520 | $this->seq[$k] = $y; |
||
521 | $this->in_seq[$y] = 1; |
||
522 | } elseif (empty($this->in_seq[$y])) { |
||
523 | $k = $this->_lcsPos($y); |
||
524 | assert($k > 0); |
||
525 | $ymids[$k] = $ymids[$k - 1]; |
||
526 | } |
||
527 | } |
||
528 | } |
||
529 | } |
||
530 | |||
531 | $seps[] = $flip ? [$yoff, $xoff] : [$xoff, $yoff]; |
||
532 | $ymid = $ymids[$this->lcs]; |
||
533 | for ($n = 0; $n < $nchunks - 1; ++$n) { |
||
534 | $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $n) / $nchunks); |
||
535 | $y1 = $ymid[$n] + 1; |
||
536 | $seps[] = $flip ? [$y1, $x1] : [$x1, $y1]; |
||
537 | } |
||
538 | |||
539 | $seps[] = $flip ? [$ylim, $xlim] : [$xlim, $ylim]; |
||
540 | |||
541 | return [$this->lcs, $seps]; |
||
542 | } |
||
543 | |||
544 | /** |
||
545 | * @param $ypos |
||
546 | * @return int |
||
547 | */ |
||
548 | public function _lcsPos($ypos) |
||
549 | { |
||
550 | $end = $this->lcs; |
||
551 | if (0 == $end || $ypos > $this->seq[$end]) { |
||
552 | $this->seq[++$this->lcs] = $ypos; |
||
553 | $this->in_seq[$ypos] = 1; |
||
554 | |||
555 | return $this->lcs; |
||
556 | } |
||
557 | |||
558 | $beg = 1; |
||
559 | while ($beg < $end) { |
||
560 | $mid = (int)(($beg + $end) / 2); |
||
561 | if ($ypos > $this->seq[$mid]) { |
||
562 | $beg = $mid + 1; |
||
563 | } else { |
||
564 | $end = $mid; |
||
565 | } |
||
566 | } |
||
567 | |||
568 | assert($ypos != $this->seq[$end]); |
||
569 | |||
570 | $this->in_seq[$this->seq[$end]] = false; |
||
571 | $this->seq[$end] = $ypos; |
||
572 | $this->in_seq[$ypos] = 1; |
||
573 | |||
574 | return $end; |
||
575 | } |
||
576 | |||
577 | /** |
||
578 | * Finds LCS of two sequences. |
||
579 | * |
||
580 | * The results are recorded in the vectors $this->{x,y}changed[], by |
||
581 | * storing a 1 in the element for each line that is an insertion or |
||
582 | * deletion (ie. is not in the LCS). |
||
583 | * |
||
584 | * The subsequence of file 0 is (XOFF, XLIM) and likewise for file 1. |
||
585 | * |
||
586 | * Note that XLIM, YLIM are exclusive bounds. All line numbers are |
||
587 | * origin-0 and discarded lines are not counted. |
||
588 | * @param $xoff |
||
589 | * @param $xlim |
||
590 | * @param $yoff |
||
591 | * @param $ylim |
||
592 | */ |
||
593 | public function _compareseq($xoff, $xlim, $yoff, $ylim) |
||
633 | } |
||
634 | } |
||
635 | } |
||
636 | |||
637 | /** |
||
638 | * Adjusts inserts/deletes of identical lines to join changes as much as |
||
639 | * possible. |
||
640 | * |
||
641 | * We do something when a run of changed lines include a line at one end |
||
642 | * and has an excluded, identical line at the other. We are free to |
||
643 | * choose which identical line is included. `compareseq' usually chooses |
||
644 | * the one at the beginning, but usually it is cleaner to consider the |
||
645 | * following identical line to be the "change". |
||
646 | * |
||
647 | * This is extracted verbatim from analyze.c (GNU diffutils-2.7). |
||
648 | * @param $lines |
||
649 | * @param $changed |
||
650 | * @param $other_changed |
||
651 | */ |
||
652 | public function _shiftBoundaries($lines, &$changed, $other_changed) |
||
757 | } |
||
758 | } |
||
759 | } |
||
760 | } |
||
761 | |||
762 | /** |
||
763 | * @package Text_Diff |
||
907 |
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.