1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Caxy\HtmlDiff; |
4
|
|
|
|
5
|
|
|
use Caxy\HtmlDiff\Util\MbStringUtil; |
6
|
|
|
use HTMLPurifier; |
7
|
|
|
use HTMLPurifier_Config; |
8
|
|
|
|
9
|
|
|
/**
 * Class AbstractDiff.
 *
 * Common base for diff builders. It stores the old and new HTML, the
 * HtmlDiffConfig in use, an optional HTMLPurifier instance used to clean the
 * inputs, and the helpers that split HTML into a flat list of "words"
 * (tags, whitespace, punctuation and text tokens).
 */
abstract class AbstractDiff
{
    /**
     * @var array
     *
     * @deprecated since 0.1.0
     */
    public static $defaultSpecialCaseTags = ['strong', 'b', 'i', 'big', 'small', 'u', 'sub', 'sup', 'strike', 's', 'p'];

    /**
     * @var array
     *
     * @deprecated since 0.1.0
     */
    public static $defaultSpecialCaseChars = ['.', ',', '(', ')', '\''];

    /**
     * @var bool
     *
     * @deprecated since 0.1.0
     */
    public static $defaultGroupDiffs = true;

    /**
     * @var HtmlDiffConfig
     */
    protected $config;

    /**
     * The diff output; an empty string until a subclass fills it.
     *
     * @var string
     */
    protected $content;

    /**
     * @var string
     */
    protected $oldText;

    /**
     * @var string
     */
    protected $newText;

    /**
     * @var array
     */
    protected $oldWords = [];

    /**
     * @var array
     */
    protected $newWords = [];

    /**
     * DiffCache instances keyed by the spl_object_hash() of their cache provider.
     *
     * @var DiffCache[]
     */
    protected $diffCaches = [];

    /**
     * @var HTMLPurifier|null
     */
    protected $purifier;

    /**
     * Optional caller-supplied purifier configuration, see setHTMLPurifierConfig().
     *
     * @var HTMLPurifier_Config|null
     */
    protected $purifierConfig = null;

    /**
     * Set to true whenever the old or new word list is replaced.
     *
     * @see array_slice_cached();
     * @var bool
     */
    protected $resetCache = false;

    /**
     * @var MbStringUtil
     */
    protected $stringUtil;

    /**
     * AbstractDiff constructor.
     *
     * Creates a fresh HtmlDiffConfig with the given encoding and applies the
     * optional overrides to it.
     *
     * @param string     $oldText
     * @param string     $newText
     * @param string     $encoding
     * @param null|array $specialCaseTags Applied to the config only when not null.
     * @param null|bool  $groupDiffs      Applied to the config only when not null.
     */
    public function __construct($oldText, $newText, $encoding = 'UTF-8', $specialCaseTags = null, $groupDiffs = null)
    {
        $this->stringUtil = new MbStringUtil($oldText, $newText);

        $this->setConfig(HtmlDiffConfig::create()->setEncoding($encoding));

        if ($specialCaseTags !== null) {
            $this->config->setSpecialCaseTags($specialCaseTags);
        }

        if ($groupDiffs !== null) {
            $this->config->setGroupDiffs($groupDiffs);
        }

        $this->oldText = $oldText;
        $this->newText = $newText;
        $this->content = '';
    }

    /**
     * @return bool|string
     */
    abstract public function build();

    /**
     * Initializes HTMLPurifier with cache location.
     *
     * Uses the configuration passed to setHTMLPurifierConfig() when there is
     * one, otherwise HTMLPurifier's default configuration. Note that the
     * settings below are written onto whichever configuration object is used.
     *
     * @param null|string $defaultPurifierSerializerCache
     */
    public function initPurifier($defaultPurifierSerializerCache = null)
    {
        if (null !== $this->purifierConfig) {
            $HTMLPurifierConfig = $this->purifierConfig;
        } else {
            $HTMLPurifierConfig = HTMLPurifier_Config::createDefault();
        }

        // Cache.SerializerPath defaults to Null and sets
        // the location to inside the vendor HTMLPurifier library
        // under the DefinitionCache/Serializer folder.
        if (!is_null($defaultPurifierSerializerCache)) {
            $HTMLPurifierConfig->set('Cache.SerializerPath', $defaultPurifierSerializerCache);
        }

        // Cache.SerializerPermissions defaults to 0744.
        // This setting allows the cache files to be deleted by any user, as they are typically
        // created by the web/php user (www-user, php-fpm, etc.)
        $HTMLPurifierConfig->set('Cache.SerializerPermissions', 0777);

        $this->purifier = new HTMLPurifier($HTMLPurifierConfig);
    }

    /**
     * Prepare (purify) the HTML
     *
     * Does nothing when the purifier is disabled in the config; otherwise
     * initializes the purifier and replaces both inputs with their purified form.
     *
     * @return void
     */
    protected function prepare()
    {
        if (false === $this->config->isPurifierEnabled()) {
            return;
        }

        $this->initPurifier($this->config->getPurifierCacheLocation());

        $this->oldText = $this->purifyHtml($this->oldText);
        $this->newText = $this->purifyHtml($this->newText);
    }

    /**
     * Returns the DiffCache bound to the configured cache provider, creating
     * it on first use. Returns null when no cache provider is configured.
     *
     * @return DiffCache|null
     */
    protected function getDiffCache()
    {
        if (!$this->hasDiffCache()) {
            return null;
        }

        $cacheProvider = $this->getConfig()->getCacheProvider();
        $hash = spl_object_hash($cacheProvider);

        if (!array_key_exists($hash, $this->diffCaches)) {
            $this->diffCaches[$hash] = new DiffCache($cacheProvider);
        }

        return $this->diffCaches[$hash];
    }

    /**
     * @return bool True when the config has a cache provider.
     */
    protected function hasDiffCache()
    {
        return null !== $this->getConfig()->getCacheProvider();
    }

    /**
     * @return HtmlDiffConfig
     */
    public function getConfig()
    {
        return $this->config;
    }

    /**
     * @param HtmlDiffConfig $config
     *
     * @return AbstractDiff
     */
    public function setConfig(HtmlDiffConfig $config)
    {
        $this->config = $config;

        return $this;
    }

    /**
     * @return int
     *
     * @deprecated since 0.1.0
     */
    public function getMatchThreshold()
    {
        return $this->config->getMatchThreshold();
    }

    /**
     * @param int $matchThreshold
     *
     * @return AbstractDiff
     *
     * @deprecated since 0.1.0
     */
    public function setMatchThreshold($matchThreshold)
    {
        $this->config->setMatchThreshold($matchThreshold);

        return $this;
    }

    /**
     * @param array $chars
     *
     * @deprecated since 0.1.0
     */
    public function setSpecialCaseChars(array $chars)
    {
        $this->config->setSpecialCaseChars($chars);
    }

    /**
     * @return array|null
     *
     * @deprecated since 0.1.0
     */
    public function getSpecialCaseChars()
    {
        return $this->config->getSpecialCaseChars();
    }

    /**
     * @param string $char
     *
     * @deprecated since 0.1.0
     */
    public function addSpecialCaseChar($char)
    {
        $this->config->addSpecialCaseChar($char);
    }

    /**
     * @param string $char
     *
     * @deprecated since 0.1.0
     */
    public function removeSpecialCaseChar($char)
    {
        $this->config->removeSpecialCaseChar($char);
    }

    /**
     * Replaces the special-case tags on the config.
     *
     * @param array $tags
     *
     * @deprecated since 0.1.0
     */
    public function setSpecialCaseTags(array $tags = [])
    {
        // Forward to the tag setter; the char setter would overwrite the
        // special-case characters with tag names.
        $this->config->setSpecialCaseTags($tags);
    }

    /**
     * @param string $tag
     *
     * @deprecated since 0.1.0
     */
    public function addSpecialCaseTag($tag)
    {
        $this->config->addSpecialCaseTag($tag);
    }

    /**
     * @param string $tag
     *
     * @deprecated since 0.1.0
     */
    public function removeSpecialCaseTag($tag)
    {
        $this->config->removeSpecialCaseTag($tag);
    }

    /**
     * @return array|null
     *
     * @deprecated since 0.1.0
     */
    public function getSpecialCaseTags()
    {
        return $this->config->getSpecialCaseTags();
    }

    /**
     * @return string The old HTML (purified once prepare() has run with the purifier enabled).
     */
    public function getOldHtml()
    {
        return $this->oldText;
    }

    /**
     * @return string The new HTML (purified once prepare() has run with the purifier enabled).
     */
    public function getNewHtml()
    {
        return $this->newText;
    }

    /**
     * @return string
     */
    public function getDifference()
    {
        return $this->content;
    }

    /**
     * Clears the diff content.
     *
     * Resets to an empty string (the same value the constructor uses) so that
     * getDifference() keeps returning a string.
     *
     * @return void
     */
    public function clearContent()
    {
        $this->content = '';
    }

    /**
     * @param bool $boolean
     *
     * @return $this
     *
     * @deprecated since 0.1.0
     */
    public function setGroupDiffs($boolean)
    {
        $this->config->setGroupDiffs($boolean);

        return $this;
    }

    /**
     * @return bool
     *
     * @deprecated since 0.1.0
     */
    public function isGroupDiffs()
    {
        return $this->config->isGroupDiffs();
    }

    /**
     * Supplies the HTMLPurifier configuration that initPurifier() will use
     * instead of the default one.
     *
     * @param HTMLPurifier_Config $config
     */
    public function setHTMLPurifierConfig(HTMLPurifier_Config $config)
    {
        $this->purifierConfig = $config;
    }

    /**
     * Purifies the HTML, or returns it unchanged when no purifier has been initialized.
     *
     * @param string $html
     *
     * @return string
     */
    protected function purifyHtml($html)
    {
        if (null === $this->purifier) {
            return $html;
        }

        return $this->purifier->purify($html);
    }

    /**
     * Splits both inputs into word lists and stores them via the setters
     * (which also flag the cache for reset).
     */
    protected function splitInputsToWords()
    {
        $this->setOldWords($this->convertHtmlToListOfWords($this->oldText));
        $this->setNewWords($this->convertHtmlToListOfWords($this->newText));
    }

    /**
     * @param array $oldWords
     */
    protected function setOldWords(array $oldWords)
    {
        $this->resetCache = true;
        $this->oldWords = $oldWords;
    }

    /**
     * @param array $newWords
     */
    protected function setNewWords(array $newWords)
    {
        $this->resetCache = true;
        $this->newWords = $newWords;
    }

    /**
     * Splits HTML into a flat list of tokens: every tag is kept as one token,
     * and the text between tags is split into words, whitespace and punctuation.
     *
     * @param string $text
     *
     * @return string[]
     */
    protected function convertHtmlToListOfWords(string $text) : array
    {
        $words = [];
        $sentencesAndTags = [];

        // Each configured special-case char is backslash-escaped for use inside a character class.
        $specialCharacters = '';

        foreach ($this->config->getSpecialCaseChars() as $char) {
            $specialCharacters .= '\\' . $char;
        }

        // This regex splits up every word by separating it at every non alpha-numerical, it allows the specialChars
        // in the middle of a word, but not at the beginning or the end of a word.
        // Split regex compiles to this (in default config case);
        // /\s|[\.\,\(\)\']|[a-zA-Z0-9\.\,\(\)'\pL]+[a-zA-Z0-9\pL]|[^\s]/mu
        // It does not depend on the sentence, so it is built once up front.
        $regex = sprintf('/\s|[%s]|[a-zA-Z0-9%s\pL]+[a-zA-Z0-9\pL]|[^\s]/mu', $specialCharacters, $specialCharacters);

        // Normalize no-break-spaces to regular spaces
        $text = str_replace("\xc2\xa0", ' ', $text);

        // Full matches end up in $sentencesAndTags[0]: either a complete tag or a run of text.
        preg_match_all('/<.+?>|[^<]+/mu', $text, $sentencesAndTags, PREG_PATTERN_ORDER);

        foreach ($sentencesAndTags[0] as $sentenceOrHtmlTag) {
            if ($sentenceOrHtmlTag === '') {
                continue;
            }

            // Tags are kept whole.
            if ($sentenceOrHtmlTag[0] === '<') {
                $words[] = $sentenceOrHtmlTag;

                continue;
            }

            $sentenceOrHtmlTag = $this->normalizeWhitespaceInHtmlSentence($sentenceOrHtmlTag);

            $sentenceSplitIntoWords = [];

            preg_match_all(
                $regex,
                $sentenceOrHtmlTag . ' ', // Inject a space at the end to make sure the last word is found by having a space behind it.
                $sentenceSplitIntoWords,
                PREG_PATTERN_ORDER
            );

            // Remove the last space, since that was added by us for the regex matcher
            array_pop($sentenceSplitIntoWords[0]);

            foreach ($sentenceSplitIntoWords[0] as $word) {
                $words[] = $word;
            }
        }

        return $words;
    }

    /**
     * Collapses whitespace runs and line breaks in a text fragment to single
     * spaces, leaving at most one leading and one trailing space. The fragment
     * is returned untouched when the config asks to keep new lines.
     *
     * @param string $sentence
     *
     * @return string
     */
    protected function normalizeWhitespaceInHtmlSentence(string $sentence) : string
    {
        if ($this->config->isKeepNewLines() === true) {
            return $sentence;
        }

        $sentence = preg_replace('/\s\s+|\r+|\n+|\r\n+/', ' ', $sentence);

        $sentenceLength = $this->stringUtil->strlen($sentence);
        $firstCharacter = $this->stringUtil->substr($sentence, 0, 1);
        $lastCharacter = $this->stringUtil->substr($sentence, $sentenceLength - 1, 1);

        if ($firstCharacter === ' ' || $firstCharacter === "\r" || $firstCharacter === "\n") {
            $sentence = ' ' . ltrim($sentence);
        }

        // The length guard keeps a single-space sentence from gaining a second space.
        if ($sentenceLength > 1 && ($lastCharacter === ' ' || $lastCharacter === "\r" || $lastCharacter === "\n")) {
            $sentence = rtrim($sentence) . ' ';
        }

        return $sentence;
    }
}
505
|
|
|
|
Our type inference engine has found a suspicious assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.
For example, imagine you have a variable $accountId that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the id property of an instance of the Account class. This class holds a proper account, so the id value must no longer be false. Either this assignment is in error or a type check should be added for that assignment.