Passed
Push — master ( 6cbc63...08e8a6 )
by Sven
01:10 queued 10s
created

AbstractDiff::convertHtmlToListOfWords()   B

Complexity

Conditions 6
Paths 10

Size

Total Lines 54

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 24
CRAP Score 6.0023

Importance

Changes 0
Metric Value
cc 6
nc 10
nop 1
dl 0
loc 54
ccs 24
cts 25
cp 0.96
crap 6.0023
rs 8.3814
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace Caxy\HtmlDiff;
4
5
use Caxy\HtmlDiff\Util\MbStringUtil;
6
use HTMLPurifier;
7
use HTMLPurifier_Config;
8
9
/**
10
 * Class AbstractDiff.
11
 */
12
abstract class AbstractDiff
13
{
14
    /**
     * Legacy default list of special-case tag names.
     *
     * @var array
     *
     * @deprecated since 0.1.0
     */
    public static $defaultSpecialCaseTags = ['strong', 'b', 'i', 'big', 'small', 'u', 'sub', 'sup', 'strike', 's', 'p'];

    /**
     * Legacy default list of special-case characters.
     *
     * @var array
     *
     * @deprecated since 0.1.0
     */
    public static $defaultSpecialCaseChars = ['.', ',', '(', ')', '\''];

    /**
     * Legacy default for the "group diffs" option.
     *
     * @var bool
     *
     * @deprecated since 0.1.0
     */
    public static $defaultGroupDiffs = true;

    /**
     * Configuration object all option getters/setters of this class delegate to.
     *
     * @var HtmlDiffConfig
     */
    protected $config;

    /**
     * The diff output; returned by getDifference().
     *
     * @var string
     */
    protected $content;

    /**
     * The old HTML (replaced by its purified form in prepare() when the purifier is enabled).
     *
     * @var string
     */
    protected $oldText;

    /**
     * The new HTML (replaced by its purified form in prepare() when the purifier is enabled).
     *
     * @var string
     */
    protected $newText;

    /**
     * Tokens of the old HTML, as assigned through setOldWords().
     *
     * @var array
     */
    protected $oldWords = [];

    /**
     * Tokens of the new HTML, as assigned through setNewWords().
     *
     * @var array
     */
    protected $newWords = [];

    /**
     * DiffCache instances, keyed by the spl_object_hash() of their cache provider.
     *
     * @var DiffCache[]
     */
    protected $diffCaches = [];

    /**
     * Purifier instance; stays null until initPurifier() has been called.
     *
     * @var HTMLPurifier|null
     */
    protected $purifier;

    /**
     * Optional purifier configuration supplied through setHTMLPurifierConfig().
     *
     * @var HTMLPurifier_Config|null
     */
    protected $purifierConfig = null;

    /**
     * Set to true whenever the old or the new word list is replaced.
     *
     * @see array_slice_cached();
     * @var bool
     */
    protected $resetCache = false;

    /**
     * String helper created in the constructor from the old and new text.
     *
     * @var MbStringUtil
     */
    protected $stringUtil;
90
91
    /**
     * AbstractDiff constructor.
     *
     * Creates the string helper and a fresh config carrying the given encoding, applies the
     * optional legacy arguments to that config and stores both input texts. The diff content
     * starts out as an empty string.
     *
     * @param string     $oldText
     * @param string     $newText
     * @param string     $encoding
     * @param null|array $specialCaseTags Only applied to the config when not null.
     * @param null|bool  $groupDiffs      Only applied to the config when not null.
     */
    public function __construct($oldText, $newText, $encoding = 'UTF-8', $specialCaseTags = null, $groupDiffs = null)
    {
        $this->stringUtil = new MbStringUtil($oldText, $newText);

        $freshConfig = HtmlDiffConfig::create()->setEncoding($encoding);
        $this->setConfig($freshConfig);

        // Legacy constructor arguments: null means "keep whatever the config defaults to".
        if (null !== $specialCaseTags) {
            $this->config->setSpecialCaseTags($specialCaseTags);
        }

        if (null !== $groupDiffs) {
            $this->config->setGroupDiffs($groupDiffs);
        }

        $this->oldText = $oldText;
        $this->newText = $newText;
        $this->content = '';
    }
118
119
    /**
     * Builds the diff; the concrete algorithm is supplied by the subclass.
     *
     * @return bool|string
     */
    abstract public function build();
123
124
    /**
     * Initializes HTMLPurifier with cache location.
     *
     * Uses the configuration supplied through setHTMLPurifierConfig() when there is one and a
     * default HTMLPurifier configuration otherwise, then stores a new purifier built from it.
     *
     * @param null|string $defaultPurifierSerializerCache Serializer cache path; null leaves the
     *                                                    configured path untouched.
     */
    public function initPurifier($defaultPurifierSerializerCache = null)
    {
        $purifierConfig = $this->purifierConfig ?? HTMLPurifier_Config::createDefault();

        // Cache.SerializerPath defaults to Null and sets
        // the location to inside the vendor HTMLPurifier library
        // under the DefinitionCache/Serializer folder.
        if (null !== $defaultPurifierSerializerCache) {
            $purifierConfig->set('Cache.SerializerPath', $defaultPurifierSerializerCache);
        }

        // Cache.SerializerPermissions defaults to 0744.
        // This setting allows the cache files to be deleted by any user, as they are typically
        // created by the web/php user (www-user, php-fpm, etc.)
        $purifierConfig->set('Cache.SerializerPermissions', 0777);

        $this->purifier = new HTMLPurifier($purifierConfig);
    }
151
152
    /**
     * Prepare (purify) the HTML
     *
     * Does nothing when the config reports the purifier as disabled (strictly false). Otherwise
     * the purifier is initialised with the configured cache location and both the old and the
     * new text are replaced by their purified versions.
     *
     * @return void
     */
    protected function prepare()
    {
        $purifierEnabled = $this->config->isPurifierEnabled();

        if (false !== $purifierEnabled) {
            $cacheLocation = $this->config->getPurifierCacheLocation();
            $this->initPurifier($cacheLocation);

            $this->oldText = $this->purifyHtml($this->oldText);
            $this->newText = $this->purifyHtml($this->newText);
        }
    }
168
169
    /**
     * Returns the DiffCache belonging to the currently configured cache provider.
     *
     * One DiffCache is created per cache provider object and memoised in $diffCaches under the
     * provider's spl_object_hash().
     *
     * @return DiffCache|null Null when no cache provider is configured.
     */
    protected function getDiffCache()
    {
        if (!$this->hasDiffCache()) {
            return null;
        }

        // Resolve the provider once so that the instance that is checked, hashed and handed to
        // DiffCache is guaranteed to be the same, non-null object.
        $cacheProvider = $this->getConfig()->getCacheProvider();

        if (null === $cacheProvider) {
            return null;
        }

        $hash = spl_object_hash($cacheProvider);

        if (!array_key_exists($hash, $this->diffCaches)) {
            $this->diffCaches[$hash] = new DiffCache($cacheProvider);
        }

        return $this->diffCaches[$hash];
    }
186
187
    /**
     * Tells whether the config carries a cache provider.
     *
     * @return bool True when getCacheProvider() on the config is not null.
     */
    protected function hasDiffCache()
    {
        $cacheProvider = $this->getConfig()->getCacheProvider();

        return $cacheProvider !== null;
    }
194
195
    /**
     * Returns the configuration object used by this diff.
     *
     * @return HtmlDiffConfig
     */
    public function getConfig()
    {
        return $this->config;
    }
202
203
    /**
     * Replaces the configuration object used by this diff.
     *
     * @param HtmlDiffConfig $config
     *
     * @return AbstractDiff This instance, to allow call chaining.
     */
    public function setConfig(HtmlDiffConfig $config)
    {
        $this->config = $config;

        return $this;
    }
214
215
    /**
     * Returns the match threshold; delegates to the config object's getMatchThreshold().
     *
     * @return int
     *
     * @deprecated since 0.1.0
     */
    public function getMatchThreshold()
    {
        return $this->config->getMatchThreshold();
    }
224
225
    /**
     * Sets the match threshold; delegates to the config object's setMatchThreshold().
     *
     * @param int $matchThreshold
     *
     * @return AbstractDiff This instance, to allow call chaining.
     *
     * @deprecated since 0.1.0
     */
    public function setMatchThreshold($matchThreshold)
    {
        $this->config->setMatchThreshold($matchThreshold);

        return $this;
    }
238
239
    /**
     * Replaces the special-case characters; delegates to the config object's setSpecialCaseChars().
     *
     * @param array $chars
     *
     * @deprecated since 0.1.0
     */
    public function setSpecialCaseChars(array $chars)
    {
        $this->config->setSpecialCaseChars($chars);
    }
248
249
    /**
     * Returns the special-case characters; delegates to the config object's getSpecialCaseChars().
     *
     * @return array|null
     *
     * @deprecated since 0.1.0
     */
    public function getSpecialCaseChars()
    {
        return $this->config->getSpecialCaseChars();
    }
258
259
    /**
     * Adds one special-case character; delegates to the config object's addSpecialCaseChar().
     *
     * @param string $char
     *
     * @deprecated since 0.1.0
     */
    public function addSpecialCaseChar($char)
    {
        $this->config->addSpecialCaseChar($char);
    }
268
269
    /**
     * Removes one special-case character; delegates to the config object's removeSpecialCaseChar().
     *
     * @param string $char
     *
     * @deprecated since 0.1.0
     */
    public function removeSpecialCaseChar($char)
    {
        $this->config->removeSpecialCaseChar($char);
    }
278
279
    /**
     * Replaces the special-case tags; delegates to the config object's setSpecialCaseTags().
     *
     * Earlier revisions forwarded the tags to setSpecialCaseChars() by mistake, which
     * overwrote the special-case characters and left the tags untouched.
     *
     * @param array $tags Tag names; defaults to an empty list.
     *
     * @deprecated since 0.1.0
     */
    public function setSpecialCaseTags(array $tags = array())
    {
        $this->config->setSpecialCaseTags($tags);
    }
288
289
    /**
     * Adds one special-case tag; delegates to the config object's addSpecialCaseTag().
     *
     * @param string $tag
     *
     * @deprecated since 0.1.0
     */
    public function addSpecialCaseTag($tag)
    {
        $this->config->addSpecialCaseTag($tag);
    }
298
299
    /**
     * Removes one special-case tag; delegates to the config object's removeSpecialCaseTag().
     *
     * @param string $tag
     *
     * @deprecated since 0.1.0
     */
    public function removeSpecialCaseTag($tag)
    {
        $this->config->removeSpecialCaseTag($tag);
    }
308
309
    /**
     * Returns the special-case tags; delegates to the config object's getSpecialCaseTags().
     *
     * @return array|null
     *
     * @deprecated since 0.1.0
     */
    public function getSpecialCaseTags()
    {
        return $this->config->getSpecialCaseTags();
    }
318
319
    /**
     * Returns the old HTML as currently stored (the purified form once prepare() has purified it).
     *
     * @return string
     */
    public function getOldHtml()
    {
        return $this->oldText;
    }
326
327
    /**
     * Returns the new HTML as currently stored (the purified form once prepare() has purified it).
     *
     * @return string
     */
    public function getNewHtml()
    {
        return $this->newText;
    }
334
335
    /**
     * Returns the diff content currently held by this instance.
     *
     * @return string
     */
    public function getDifference()
    {
        return $this->content;
    }
342
343
    /**
     * Clears the diff content.
     *
     * The content is reset to an empty string, the same value the constructor initialises it
     * with, so that getDifference() keeps returning a string as documented.
     *
     * @return void
     */
    public function clearContent()
    {
        $this->content = '';
    }
352
353
    /**
     * Sets the "group diffs" option; delegates to the config object's setGroupDiffs().
     *
     * @param bool $boolean
     *
     * @return $this
     *
     * @deprecated since 0.1.0
     */
    public function setGroupDiffs($boolean)
    {
        $this->config->setGroupDiffs($boolean);

        return $this;
    }
366
367
    /**
     * Returns the "group diffs" option; delegates to the config object's isGroupDiffs().
     *
     * @return bool
     *
     * @deprecated since 0.1.0
     */
    public function isGroupDiffs()
    {
        return $this->config->isGroupDiffs();
    }
376
377
    /**
     * Stores the purifier configuration that initPurifier() will use instead of a default one.
     *
     * @param HTMLPurifier_Config $config
     */
    public function setHTMLPurifierConfig(HTMLPurifier_Config $config)
    {
        $this->purifierConfig = $config;
    }
384
385
    /**
     * Runs the given HTML through the purifier.
     *
     * @param string $html
     *
     * @return string The purifier's output, or $html unchanged when no purifier has been
     *                initialised yet.
     */
    protected function purifyHtml($html)
    {
        return null === $this->purifier
            ? $html
            : $this->purifier->purify($html);
    }
398
399 18
    /**
     * Tokenises the old and the new text and stores the resulting word lists.
     */
    protected function splitInputsToWords()
    {
        $oldWords = $this->convertHtmlToListOfWords($this->oldText);
        $this->setOldWords($oldWords);

        $newWords = $this->convertHtmlToListOfWords($this->newText);
        $this->setNewWords($newWords);
    }
404
405
    /**
     * Stores the word list of the old text and raises the $resetCache flag.
     *
     * @param array $oldWords
     */
    protected function setOldWords(array $oldWords)
    {
        $this->oldWords   = $oldWords;
        $this->resetCache = true;
    }
413
414
    /**
     * Stores the word list of the new text and raises the $resetCache flag.
     *
     * @param array $newWords
     */
    protected function setNewWords(array $newWords)
    {
        $this->newWords   = $newWords;
        $this->resetCache = true;
    }
422
423
    /**
     * Splits an HTML string into a flat list of tokens.
     *
     * Every HTML tag becomes one token. The text between tags is whitespace-normalised through
     * normalizeWhitespaceInHtmlSentence() and then split into words, single whitespace
     * characters and single punctuation characters (see buildWordSplitRegex()).
     *
     * @param string $text
     *
     * @return string[]
     */
    protected function convertHtmlToListOfWords(string $text) : array
    {
        // Normalize no-break-spaces to regular spaces
        $text = str_replace("\xc2\xa0", ' ', $text);

        // Both alternatives need at least one character, so no match can be an empty string.
        // PREG_PATTERN_ORDER is the flag preg_match_all() actually understands; the previously
        // used PREG_SPLIT_NO_EMPTY belongs to preg_split() and merely shares the same value.
        $sentencesAndTags = [];
        preg_match_all('/<.+?>|[^<]+/mu', $text, $sentencesAndTags, PREG_PATTERN_ORDER);

        // The split pattern only depends on the configuration, so it is built once up front.
        $wordSplitRegex = $this->buildWordSplitRegex();

        $words = [];

        foreach ($sentencesAndTags[0] ?? [] as $sentenceOrHtmlTag) {
            if ($sentenceOrHtmlTag[0] === '<') {
                // HTML tags are kept as one single token.
                $words[] = $sentenceOrHtmlTag;

                continue;
            }

            $sentence = $this->normalizeWhitespaceInHtmlSentence($sentenceOrHtmlTag);

            $matches = [];

            preg_match_all(
                $wordSplitRegex,
                $sentence . ' ', // Inject a space at the end to make sure the last word is found by having a space behind it.
                $matches,
                PREG_PATTERN_ORDER
            );

            $sentenceWords = $matches[0] ?? [];

            // Remove the last space, since that was added by us for the regex matcher
            array_pop($sentenceWords);

            foreach ($sentenceWords as $word) {
                $words[] = $word;
            }
        }

        return $words;
    }

    /**
     * Builds the regular expression that splits a sentence into words.
     *
     * This regex splits up every word by separating it at every non alpha-numerical, it allows
     * the special-case characters in the middle of a word, but not at the beginning or the end
     * of a word. With the default special-case characters it compiles to:
     * /\s|[\.,\(\)']|[a-zA-Z0-9\.,\(\)'\pL]+[a-zA-Z0-9\pL]|[^\s]/mu
     *
     * @return string
     */
    private function buildWordSplitRegex() : string
    {
        $specialCharacters = '';

        // getSpecialCaseChars() is documented as array|null, hence the fallback to an empty list.
        foreach (($this->config->getSpecialCaseChars() ?? []) as $char) {
            // preg_quote() escapes only what needs escaping; blindly prefixing a backslash
            // would turn alphanumeric characters into escape sequences such as \d or \b.
            $specialCharacters .= preg_quote((string) $char, '/');
        }

        // Without special characters the stand-alone class must be left out entirely: an empty
        // "[]" would make PCRE treat the following "]" as a literal member of the class.
        $specialCharacterAlternative = $specialCharacters === ''
            ? ''
            : '[' . $specialCharacters . ']|';

        return '/\s|' . $specialCharacterAlternative
            . '[a-zA-Z0-9' . $specialCharacters . '\pL]+[a-zA-Z0-9\pL]|[^\s]/mu';
    }
480
481 18
    /**
     * Collapses whitespace inside a piece of text found between HTML tags.
     *
     * When the config asks to keep new lines the sentence is returned untouched. Otherwise
     * whitespace runs and line breaks are replaced by a single space, and leading/trailing
     * whitespace is reduced to exactly one space on the respective side.
     *
     * @param string $sentence
     *
     * @return string
     */
    protected function normalizeWhitespaceInHtmlSentence(string $sentence) : string
    {
        if (true === $this->config->isKeepNewLines()) {
            return $sentence;
        }

        $collapsed = preg_replace('/\s\s+|\r+|\n+|\r\n+/', ' ', $sentence);

        // Length and edge characters are taken before any trimming below.
        $length   = $this->stringUtil->strlen($collapsed);
        $leading  = $this->stringUtil->substr($collapsed, 0, 1);
        $trailing = $this->stringUtil->substr($collapsed, $length -1, 1);

        $edgeWhitespace = [' ', "\r", "\n"];

        if (in_array($leading, $edgeWhitespace, true)) {
            $collapsed = ' ' . ltrim($collapsed);
        }

        // A one-character sentence was already handled by the leading-character branch.
        if ($length > 1 && in_array($trailing, $edgeWhitespace, true)) {
            $collapsed = rtrim($collapsed) . ' ';
        }

        return $collapsed;
    }
504
}
505