Issues (4)

lib/Caxy/HtmlDiff/ListDiffLines.php (1 issue)

1
<?php
2
3
namespace Caxy\HtmlDiff;
4
5
use Caxy\HtmlDiff\Strategy\ListItemMatchStrategy;
6
use DOMDocument;
7
use DOMElement;
8
use DOMNode;
9
use DOMText;
10
use DOMXPath;
11
use LogicException;
12
13
class ListDiffLines extends AbstractDiff
14
{
15
    private const CLASS_LIST_ITEM_ADDED   = 'normal new';
16
    private const CLASS_LIST_ITEM_DELETED = 'removed';
17
    private const CLASS_LIST_ITEM_CHANGED = 'replacement';
18
    private const CLASS_LIST_ITEM_NONE    = 'normal';
19
20
    protected const LIST_TAG_NAMES = ['ul', 'ol', 'dl'];
21
22
    /**
23
     * List of tags that should be included when retrieving
24
     * text from a single list item that will be used in
25
     * matching logic (and only in matching logic).
26
     *
27
     * @see getRelevantNodeText()
28
     *
29
     * @var array
30
     */
31
    protected static $listContentTags = [
32
        'h1', 'h2', 'h3', 'h4', 'h5', 'pre', 'div', 'br', 'hr', 'code',
33
        'input', 'form', 'img', 'span', 'a', 'i', 'b', 'strong', 'em',
34
        'font', 'big', 'del', 'tt', 'sub', 'sup', 'strike',
35
    ];
36
37
    /**
38
     * @var LcsService
39
     */
40
    protected $lcsService;
41
42
    /**
43
     * @var array<string, DOMElement>
44
     */
45
    private $nodeCache = [];
46
47
    /**
48
     * @param string              $oldText
49
     * @param string              $newText
50
     * @param HtmlDiffConfig|null $config
51
     *
52
     * @return ListDiffLines
53
     */
54 8
    public static function create($oldText, $newText, HtmlDiffConfig $config = null)
55
    {
56 8
        $diff = new self($oldText, $newText);
57
58 8
        if (null !== $config) {
59 8
            $diff->setConfig($config);
60
        }
61
62 8
        return $diff;
63
    }
64
65
    /**
66
     * {@inheritDoc}
67
     */
68 8
    public function build()
69
    {
70 8
        $this->prepare();
71
72 8
        if ($this->hasDiffCache() && $this->getDiffCache()->contains($this->oldText, $this->newText)) {
73
            $this->content = $this->getDiffCache()->fetch($this->oldText, $this->newText);
74
75
            return $this->content;
76
        }
77
78 8
        $this->lcsService = new LcsService(
79 8
            new ListItemMatchStrategy($this->stringUtil, $this->config->getMatchThreshold())
80
        );
81
82 8
        return $this->listByLines($this->oldText, $this->newText);
83
    }
84
85 8
    protected function listByLines(string $old, string $new) : string
86
    {
87 8
        $new = mb_convert_encoding($new, 'HTML-ENTITIES', "UTF-8");
88 8
        $old = mb_convert_encoding($old, 'HTML-ENTITIES', "UTF-8");
89
90 8
        $newDom = new DOMDocument();
91 8
        $newDom->loadHTML($new);
92
93 8
        $oldDom = new DOMDocument();
94 8
        $oldDom->loadHTML($old);
95
96 8
        $newListNode = $this->findListNode($newDom);
97 8
        $oldListNode = $this->findListNode($oldDom);
98
99 8
        $operations = $this->getListItemOperations($oldListNode, $newListNode);
100
101 8
        return $this->processOperations($operations, $oldListNode, $newListNode);
102
    }
103
104 8
    protected function findListNode(DOMDocument $dom) : DOMNode
105
    {
106 8
        $xPathQuery = '//' . implode('|//', self::LIST_TAG_NAMES);
107 8
        $xPath      = new DOMXPath($dom);
108 8
        $listNodes  = $xPath->query($xPathQuery);
109
110 8
        if ($listNodes->length > 0) {
111 8
            return $listNodes->item(0);
112
        }
113
114
        throw new LogicException('Unable to diff list; missing list node');
115
    }
116
117
    /**
118
     * @return Operation[]
119
     */
120 8
    protected function getListItemOperations(DOMElement $oldListNode, DOMElement $newListNode) : array
121
    {
122
        // Prepare arrays of list item content to use in LCS algorithm
123 8
        $oldListText = $this->getListTextArray($oldListNode);
124 8
        $newListText = $this->getListTextArray($newListNode);
125
126 8
        $lcsMatches = $this->lcsService->longestCommonSubsequence($oldListText, $newListText);
127
128 8
        $oldLength = count($oldListText);
129 8
        $newLength = count($newListText);
130
131 8
        $operations = array();
132 8
        $currentLineInOld = 0;
133 8
        $currentLineInNew = 0;
134 8
        $lcsMatches[$oldLength + 1] = $newLength + 1;
135 8
        foreach ($lcsMatches as $matchInOld => $matchInNew) {
136
            // No matching line in new list
137 8
            if ($matchInNew === 0) {
138 8
                continue;
139
            }
140
141 8
            $nextLineInOld = $currentLineInOld + 1;
142 8
            $nextLineInNew = $currentLineInNew + 1;
143
144 8
            if ($matchInNew > $nextLineInNew && $matchInOld > $nextLineInOld) {
145
                // Change
146 1
                $operations[] = new Operation(
147 1
                    Operation::CHANGED,
148
                    $nextLineInOld,
149 1
                    $matchInOld - 1,
150
                    $nextLineInNew,
151 1
                    $matchInNew - 1
152
                );
153 8
            } elseif ($matchInNew > $nextLineInNew && $matchInOld === $nextLineInOld) {
154
                // Add items before this
155 3
                $operations[] = new Operation(
156 3
                    Operation::ADDED,
157
                    $currentLineInOld,
158
                    $currentLineInOld,
159
                    $nextLineInNew,
160 3
                    $matchInNew - 1
161
                );
162 8
            } elseif ($matchInNew === $nextLineInNew && $matchInOld > $nextLineInOld) {
163
                // Delete items before this
164 3
                $operations[] = new Operation(
165 3
                    Operation::DELETED,
166
                    $nextLineInOld,
167 3
                    $matchInOld - 1,
168
                    $currentLineInNew,
169
                    $currentLineInNew
170
                );
171
            }
172
173 8
            $currentLineInNew = $matchInNew;
174 8
            $currentLineInOld = $matchInOld;
175
        }
176
177 8
        return $operations;
178
    }
179
180
    /**
181
     * @return string[]
182
     */
183 8
    protected function getListTextArray(DOMElement $listNode) : array
184
    {
185 8
        $output = [];
186
187 8
        foreach ($listNode->childNodes as $listItem) {
188 8
            if ($listItem instanceof DOMText) {
189 8
                continue;
190
            }
191
192 8
            $output[] = $this->getRelevantNodeText($listItem);
193
        }
194
195 8
        return $output;
196
    }
197
198 8
    protected function getRelevantNodeText(DOMNode $node) : string
199
    {
200 8
        if ($node->hasChildNodes() === false) {
201
            return $node->textContent;
202
        }
203
204 8
        $output = '';
205
206
        /** @var DOMElement $child */
207 8
        foreach ($node->childNodes as $child) {
208 8
            if ($child->hasChildNodes() === false) {
209 8
                $output .= $this->getOuterText($child);
210
211 8
                continue;
212
            }
213
214 6
            if (in_array($child->tagName, static::$listContentTags, true) === true) {
215 4
                $output .= sprintf(
216 4
                    '<%1$s>%2$s</%1$s>',
217 4
                    $child->tagName,
218 4
                    $this->getRelevantNodeText($child)
219
                );
220
            }
221
        }
222
223 8
        return $output;
224
    }
225
226 4
    protected function deleteListItem(DOMElement $li) : string
227
    {
228 4
        $this->wrapNodeContent($li, 'del');
229
230 4
        $this->appendClassToNode($li, self::CLASS_LIST_ITEM_DELETED);
231
232 4
        return $this->getOuterText($li);
233
    }
234
235 4
    protected function addListItem(DOMElement $li, bool $replacement = false) : string
236
    {
237 4
        $this->wrapNodeContent($li, 'ins');
238
239 4
        $this->appendClassToNode(
240 4
            $li,
241 4
            $replacement === true ? self::CLASS_LIST_ITEM_CHANGED : self::CLASS_LIST_ITEM_ADDED
242
        );
243
244 4
        return $this->getOuterText($li);
245
    }
246
247
    /**
248
     * @param Operation[] $operations
249
     */
250 8
    protected function processOperations(array $operations, DOMElement $oldListNode, DOMElement $newListNode) : string
251
    {
252 8
        $output = '';
253
254 8
        $indexInOld = 0;
255 8
        $indexInNew = 0;
256 8
        $lastOperation = null;
257
258 8
        foreach ($operations as $operation) {
259 6
            $replaced = false;
260 6
            while ($operation->startInOld > ($operation->action === Operation::ADDED ? $indexInOld : $indexInOld + 1)) {
261 5
                $li = $this->getChildNodeByIndex($oldListNode, $indexInOld);
262 5
                $matchingLi = null;
263 5
                if ($operation->startInNew > ($operation->action === Operation::DELETED ? $indexInNew
264 5
                        : $indexInNew + 1)
265
                ) {
266 5
                    $matchingLi = $this->getChildNodeByIndex($newListNode, $indexInNew);
267
                }
268
269 5
                if (null !== $matchingLi) {
270 5
                    $htmlDiff = HtmlDiff::create(
271 5
                        $this->getInnerHtml($li),
272 5
                        $this->getInnerHtml($matchingLi),
273 5
                        $this->config
274
                    );
275
276 5
                    $this->setInnerHtml($li, $htmlDiff->build());
277
278 5
                    $indexInNew++;
279
                }
280
281 5
                $class = self::CLASS_LIST_ITEM_NONE;
282
283 5
                if ($lastOperation === Operation::DELETED && !$replaced) {
284 2
                    $class = self::CLASS_LIST_ITEM_CHANGED;
285 2
                    $replaced = true;
286
                }
287
288 5
                $this->appendClassToNode($li, $class);
289
290 5
                $output .= $this->getOuterText($li);
291 5
                $indexInOld++;
292
            }
293
294 6
            switch ($operation->action) {
295
                case Operation::ADDED:
296 3
                    for ($i = $operation->startInNew; $i <= $operation->endInNew; $i++) {
297 3
                        $output .= $this->addListItem(
298 3
                            $this->getChildNodeByIndex($newListNode, $i - 1)
299
                        );
300
                    }
301 3
                    $indexInNew = $operation->endInNew;
302 3
                    break;
303
304
                case Operation::DELETED:
305 3
                    for ($i = $operation->startInOld; $i <= $operation->endInOld; $i++) {
306 3
                        $output .= $this->deleteListItem(
307 3
                            $this->getChildNodeByIndex($oldListNode, $i - 1)
308
                        );
309
                    }
310 3
                    $indexInOld = $operation->endInOld;
311 3
                    break;
312
313
                case Operation::CHANGED:
314 1
                    $changeDelta = 0;
315 1
                    for ($i = $operation->startInOld; $i <= $operation->endInOld; $i++) {
316 1
                        $output .= $this->deleteListItem(
317 1
                            $this->getChildNodeByIndex($oldListNode, $i - 1)
318
                        );
319 1
                        $changeDelta--;
320
                    }
321 1
                    for ($i = $operation->startInNew; $i <= $operation->endInNew; $i++) {
322 1
                        $output .= $this->addListItem(
323 1
                            $this->getChildNodeByIndex($newListNode, $i - 1),
324 1
                            ($changeDelta < 0)
325
                        );
326 1
                        $changeDelta++;
327
                    }
328 1
                    $indexInOld = $operation->endInOld;
329 1
                    $indexInNew = $operation->endInNew;
330 1
                    break;
331
            }
332
333 6
            $lastOperation = $operation->action;
334
        }
335
336 8
        $oldCount = $this->childCountWithoutTextNode($oldListNode);
337 8
        $newCount = $this->childCountWithoutTextNode($newListNode);
338
339 8
        while ($indexInOld < $oldCount) {
340 5
            $li = $this->getChildNodeByIndex($oldListNode, $indexInOld);
341 5
            $matchingLi = null;
342 5
            if ($indexInNew < $newCount) {
343 5
                $matchingLi = $this->getChildNodeByIndex($newListNode, $indexInNew);
344
            }
345
346 5
            if (null !== $matchingLi) {
347 5
                $htmlDiff = HtmlDiff::create(
348 5
                    $this->getInnerHtml($li),
349 5
                    $this->getInnerHtml($matchingLi),
350 5
                    $this->config
351
                );
352
353 5
                $this->setInnerHtml($li, $htmlDiff->build());
354
355 5
                $indexInNew++;
356
            }
357
358 5
            $class = self::CLASS_LIST_ITEM_NONE;
359
360 5
            if ($lastOperation === Operation::DELETED) {
361
                $class = self::CLASS_LIST_ITEM_CHANGED;
362
            }
363
364 5
            $this->appendClassToNode($li, $class);
365
366 5
            $output .= $this->getOuterText($li);
367 5
            $indexInOld++;
368
        }
369
370 8
        $this->setInnerHtml($newListNode, $output);
371 8
        $this->appendClassToNode($newListNode, 'diff-list');
372
373 8
        return $newListNode->ownerDocument->saveHTML($newListNode);
374
    }
375
376 8
    protected function appendClassToNode(DOMElement $node, string $class)
377
    {
378 8
        $node->setAttribute(
379 8
            'class',
380 8
            trim(sprintf('%s %s', $node->getAttribute('class'), $class))
381
        );
382 8
    }
383
384 8
    private function getOuterText(DOMNode $node) : string
385
    {
386 8
        return $node->ownerDocument->saveHTML($node);
387
    }
388
389 8
    private function getInnerHtml(DOMNode $node) : string
390
    {
391 8
        $bufferDom = new DOMDocument('1.0', 'UTF-8');
392
393 8
        foreach($node->childNodes as $childNode)
0 ignored issues
show
Expected 1 space(s) after FOREACH keyword; 0 found
Loading history...
394
        {
395 8
            $bufferDom->appendChild($bufferDom->importNode($childNode, true));
396
        }
397
398 8
        return trim($bufferDom->saveHTML());
399
    }
400
401 8
    private function setInnerHtml(DOMNode $node, string $html) : void
402
    {
403 8
        $html = sprintf('<%s>%s</%s>', 'body', $html, 'body');
404 8
        $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
405
406 8
        $node->nodeValue = '';
407
408 8
        $bufferDom = new DOMDocument('1.0', 'UTF-8');
409 8
        $bufferDom->loadHTML($html);
410
411 8
        $bodyNode = $bufferDom->getElementsByTagName('body')->item(0);
412
413 8
        foreach ($bodyNode->childNodes as $childNode) {
414 8
            $node->appendChild($node->ownerDocument->importNode($childNode, true));
415
        }
416
417 8
        $this->nodeCache = [];
418 8
    }
419
420 6
    private function wrapNodeContent(DOMElement $node, string $tagName) : void
421
    {
422 6
        $childNodes = [];
423
424 6
        foreach ($node->childNodes as $childNode) {
425 6
            $childNodes[] = $childNode;
426
        }
427
428 6
        $wrapNode = $node->ownerDocument->createElement($tagName);
429
430 6
        $node->appendChild($wrapNode);
431
432 6
        foreach ($childNodes as $childNode) {
433 6
            $wrapNode->appendChild($childNode);
434
        }
435 6
    }
436
437 8
    private function childCountWithoutTextNode(DOMNode $node) : int
438
    {
439 8
        $counter = 0;
440
441 8
        foreach ($node->childNodes as $childNode) {
442 8
            if ($childNode instanceof DOMText) {
443 8
                continue;
444
            }
445
446 8
            $counter++;
447
        }
448
449 8
        return $counter;
450
    }
451
452 8
    private function getChildNodeByIndex(DOMNode $node, int $index) : DOMElement
453
    {
454 8
        $nodeHash = spl_object_hash($node);
455
456 8
        if (isset($this->nodeCache[$nodeHash]) === true) {
457 4
            return $this->nodeCache[$nodeHash][$index];
458
        }
459
460 8
        $listCache[$nodeHash] = [];
461
462 8
        foreach ($node->childNodes as $childNode) {
463 8
            if ($childNode instanceof DOMText === false) {
464 8
                $this->nodeCache[$nodeHash][] = $childNode;
465
            }
466
        }
467
468 8
        return $this->nodeCache[$nodeHash][$index];
469
    }
470
}
471