Passed
Pull Request — master (#102)
by Sven
02:57
created

AbstractDiff::convertHtmlToListOfWords()   C

Complexity

Conditions 12
Paths 22

Size

Total Lines 68

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 29
CRAP Score 12.0053

Importance

Changes 0
Metric Value
cc 12
nc 22
nop 1
dl 0
loc 68
ccs 29
cts 30
cp 0.9667
crap 12.0053
rs 6.2714
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace Caxy\HtmlDiff;
4
5
use Caxy\HtmlDiff\Util\MbStringUtil;
6
use HTMLPurifier;
7
use HTMLPurifier_Config;
8
9
/**
10
 * Class AbstractDiff.
11
 */
12
abstract class AbstractDiff
13
{
14
    /**
15
     * @var array
16
     *
17
     * @deprecated since 0.1.0
18
     */
19
    public static $defaultSpecialCaseTags = array('strong', 'b', 'i', 'big', 'small', 'u', 'sub', 'sup', 'strike', 's', 'p');
20
21
    /**
22
     * @var array
23
     *
24
     * @deprecated since 0.1.0
25
     */
26
    public static $defaultSpecialCaseChars = array('.', ',', '(', ')', '\'');
27
28
    /**
29
     * @var bool
30
     *
31
     * @deprecated since 0.1.0
32
     */
33
    public static $defaultGroupDiffs = true;
34
35
    /**
36
     * @var HtmlDiffConfig
37
     */
38
    protected $config;
39
40
    /**
41
     * @var string
42
     */
43
    protected $content;
44
45
    /**
46
     * @var string
47
     */
48
    protected $oldText;
49
50
    /**
51
     * @var string
52
     */
53
    protected $newText;
54
55
    /**
56
     * @var array
57
     */
58
    protected $oldWords = array();
59
60
    /**
61
     * @var array
62
     */
63
    protected $newWords = array();
64
65
    /**
66
     * @var DiffCache[]
67
     */
68
    protected $diffCaches = array();
69
70
    /**
71
     * @var HTMLPurifier|null
72
     */
73
    protected $purifier;
74
75
    /**
76
     * @var HTMLPurifier_Config|null
77
     */
78
    protected $purifierConfig = null;
79
80
    /**
81
     * @see array_slice_cached();
82
     * @var bool
83
     */
84
    protected $resetCache = false;
85
86
    /**
87
     * @var MbStringUtil
88
     */
89
    protected $stringUtil;
90
91
    /**
     * Creates the diff object for a pair of texts.
     *
     * A fresh HtmlDiffConfig carrying the given encoding is installed first; the
     * optional arguments then override the matching config values when they are
     * not null.
     *
     * @param string     $oldText         Text to diff from.
     * @param string     $newText         Text to diff to.
     * @param string     $encoding        Encoding stored on the config.
     * @param null|array $specialCaseTags Special-case tags for the config; null leaves them untouched.
     * @param null|bool  $groupDiffs      Group-diffs flag for the config; null leaves it untouched.
     */
    public function __construct($oldText, $newText, $encoding = 'UTF-8', $specialCaseTags = null, $groupDiffs = null)
    {
        $this->stringUtil = new MbStringUtil($oldText, $newText);

        $config = HtmlDiffConfig::create()->setEncoding($encoding);
        $this->setConfig($config);

        if (null !== $specialCaseTags) {
            $this->config->setSpecialCaseTags($specialCaseTags);
        }

        if (null !== $groupDiffs) {
            $this->config->setGroupDiffs($groupDiffs);
        }

        $this->oldText = $oldText;
        $this->newText = $newText;
        // The diff output starts out empty.
        $this->content = '';
    }
118
119
    /**
     * Builds the diff; the implementation is supplied by concrete subclasses.
     *
     * @return bool|string
     */
    abstract public function build();
123
124
    /**
     * Creates the HTMLPurifier instance used by this diff.
     *
     * A config previously stored in $purifierConfig is reused (and modified in
     * place); otherwise HTMLPurifier's default config is created.
     *
     * @param null|string $defaultPurifierSerializerCache Serializer cache path; null keeps HTMLPurifier's own setting.
     */
    public function initPurifier($defaultPurifierSerializerCache = null)
    {
        $purifierConfig = null !== $this->purifierConfig
            ? $this->purifierConfig
            : HTMLPurifier_Config::createDefault();

        // Cache.SerializerPath defaults to null, which places the cache inside the
        // vendor HTMLPurifier library under the DefinitionCache/Serializer folder.
        if (null !== $defaultPurifierSerializerCache) {
            $purifierConfig->set('Cache.SerializerPath', $defaultPurifierSerializerCache);
        }

        // Cache.SerializerPermissions defaults to 0744. Using 0777 lets any user delete
        // the cache files, which are typically created by the web/php user
        // (www-user, php-fpm, etc.).
        $purifierConfig->set('Cache.SerializerPermissions', 0777);

        $this->purifier = new HTMLPurifier($purifierConfig);
    }
151
152
    /**
153
     * Prepare (purify) the HTML
154
     *
     * Returns immediately when the purifier is disabled in the config. Otherwise
     * the purifier is initialised with the cache location from the config, and
     * both stored input texts are replaced by their purified versions.
     *
155
     * @return void
156
     */
157 17
    protected function prepare()
158
    {
159 17
        // Purification is optional; leave the texts untouched when it is switched off.
        if (false === $this->config->isPurifierEnabled()) {
160
            return;
161
        }
162
163 17
        // Build the purifier with the cache location configured for this diff.
        $this->initPurifier($this->config->getPurifierCacheLocation());
164
165 17
        // Overwrite the stored old text with its purified form.
        $this->oldText = $this->purifyHtml($this->oldText);
0 ignored issues
show
Documentation Bug introduced by
It seems like $this->purifyHtml($this->oldText) can also be of type false. However, the property $oldText is declared as type string. Maybe add an additional type check?

Our type inference engine has found a suspicous assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.

For example, imagine you have a variable $accountId that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the id property of an instance of the Account class. This class holds a proper account, so the id value must no longer be false.

Either this assignment is in error or a type check should be added for that assignment.

class Id
{
    public $id;

    public function __construct($id)
    {
        $this->id = $id;
    }

}

class Account
{
    /** @var  Id $id */
    public $id;
}

$account_id = false;

if (starsAreRight()) {
    $account_id = new Id(42);
}

$account = new Account();
if ($account instanceof Id)
{
    $account->id = $account_id;
}
Loading history...
166 17
        // Overwrite the stored new text with its purified form.
        $this->newText = $this->purifyHtml($this->newText);
0 ignored issues
show
Documentation Bug introduced by
It seems like $this->purifyHtml($this->newText) can also be of type false. However, the property $newText is declared as type string. Maybe add an additional type check?

Our type inference engine has found a suspicous assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.

For example, imagine you have a variable $accountId that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the id property of an instance of the Account class. This class holds a proper account, so the id value must no longer be false.

Either this assignment is in error or a type check should be added for that assignment.

class Id
{
    public $id;

    public function __construct($id)
    {
        $this->id = $id;
    }

}

class Account
{
    /** @var  Id $id */
    public $id;
}

$account_id = false;

if (starsAreRight()) {
    $account_id = new Id(42);
}

$account = new Account();
if ($account instanceof Id)
{
    $account->id = $account_id;
}
Loading history...
167 17
    }
168
169
    /**
     * Returns the DiffCache that wraps the cache provider of the current config.
     *
     * DiffCache instances are kept in $diffCaches, keyed by the spl_object_hash()
     * of the provider, so one wrapper is created per provider object.
     *
170
     * @return DiffCache|null Null when the config has no cache provider.
171
     */
172
    protected function getDiffCache()
173
    {
174
        // No provider configured means there is nothing to wrap.
        if (!$this->hasDiffCache()) {
175
            return null;
176
        }
177
178
        // hasDiffCache() above has checked that the provider is not null.
        $hash = spl_object_hash($this->getConfig()->getCacheProvider());
179
180
        // Create the wrapper lazily, the first time this provider is seen.
        if (!array_key_exists($hash, $this->diffCaches)) {
181
            $this->diffCaches[$hash] = new DiffCache($this->getConfig()->getCacheProvider());
0 ignored issues
show
Bug introduced by
It seems like $this->getConfig()->getCacheProvider() can be null; however, __construct() does not accept null, maybe add an additional type check?

Unless you are absolutely sure that the expression can never be null because of other conditions, we strongly recommend to add an additional type check to your code:

/** @return stdClass|null */
function mayReturnNull() { }

function doesNotAcceptNull(stdClass $x) { }

// With potential error.
function withoutCheck() {
    $x = mayReturnNull();
    doesNotAcceptNull($x); // Potential error here.
}

// Safe - Alternative 1
function withCheck1() {
    $x = mayReturnNull();
    if ( ! $x instanceof stdClass) {
        throw new \LogicException('$x must be defined.');
    }
    doesNotAcceptNull($x);
}

// Safe - Alternative 2
function withCheck2() {
    $x = mayReturnNull();
    if ($x instanceof stdClass) {
        doesNotAcceptNull($x);
    }
}
Loading history...
182
        }
183
184
        return $this->diffCaches[$hash];
185
    }
186
187
    /**
     * Tells whether the current config supplies a cache provider.
     *
     * @return bool True when getCacheProvider() on the config is not null.
     */
    protected function hasDiffCache()
    {
        $cacheProvider = $this->getConfig()->getCacheProvider();

        return $cacheProvider !== null;
    }
194
195
    /**
     * Returns the configuration object used by this diff.
     *
     * @return HtmlDiffConfig
     */
    public function getConfig()
    {
        return $this->config;
    }
202
203
    /**
     * Replaces the configuration object used by this diff.
     *
     * @param HtmlDiffConfig $config
     *
     * @return AbstractDiff The same instance, so calls can be chained.
     */
    public function setConfig(HtmlDiffConfig $config)
    {
        $this->config = $config;

        return $this;
    }
214
215
    /**
     * Reads the match threshold from the config.
     *
     * @return int
     *
     * @deprecated since 0.1.0
     */
    public function getMatchThreshold()
    {
        $config = $this->config;

        return $config->getMatchThreshold();
    }
224
225
    /**
     * Stores the match threshold on the config.
     *
     * @param int $matchThreshold
     *
     * @return AbstractDiff The same instance, so calls can be chained.
     *
     * @deprecated since 0.1.0
     */
    public function setMatchThreshold($matchThreshold)
    {
        $config = $this->config;
        $config->setMatchThreshold($matchThreshold);

        return $this;
    }
238
239
    /**
     * Replaces the special-case characters held by the config.
     *
     * @param array $chars
     *
     * @deprecated since 0.1.0
     */
    public function setSpecialCaseChars(array $chars)
    {
        $config = $this->config;
        $config->setSpecialCaseChars($chars);
    }
248
249
    /**
     * Reads the special-case characters from the config.
     *
     * @return array|null
     *
     * @deprecated since 0.1.0
     */
    public function getSpecialCaseChars()
    {
        $config = $this->config;

        return $config->getSpecialCaseChars();
    }
258
259
    /**
     * Adds one special-case character to the config.
     *
     * @param string $char
     *
     * @deprecated since 0.1.0
     */
    public function addSpecialCaseChar($char)
    {
        $config = $this->config;
        $config->addSpecialCaseChar($char);
    }
268
269
    /**
     * Removes one special-case character from the config.
     *
     * @param string $char
     *
     * @deprecated since 0.1.0
     */
    public function removeSpecialCaseChar($char)
    {
        $config = $this->config;
        $config->removeSpecialCaseChar($char);
    }
278
279
    /**
     * Replaces the special-case tags held by the config.
     *
     * @param array $tags New list of special-case tags; defaults to an empty list.
     *
     * @deprecated since 0.1.0
     */
    public function setSpecialCaseTags(array $tags = array())
    {
        // Forward to the tag setter. This method used to call setSpecialCaseChars(),
        // which overwrote the special-case characters and left the tags unchanged.
        $this->config->setSpecialCaseTags($tags);
    }
288
289
    /**
     * Adds one special-case tag to the config.
     *
     * @param string $tag
     *
     * @deprecated since 0.1.0
     */
    public function addSpecialCaseTag($tag)
    {
        $config = $this->config;
        $config->addSpecialCaseTag($tag);
    }
298
299
    /**
     * Removes one special-case tag from the config.
     *
     * @param string $tag
     *
     * @deprecated since 0.1.0
     */
    public function removeSpecialCaseTag($tag)
    {
        $config = $this->config;
        $config->removeSpecialCaseTag($tag);
    }
308
309
    /**
     * Reads the special-case tags from the config.
     *
     * @return array|null
     *
     * @deprecated since 0.1.0
     */
    public function getSpecialCaseTags()
    {
        $config = $this->config;

        return $config->getSpecialCaseTags();
    }
318
319
    /**
     * Returns the stored old text.
     *
     * @return string
     */
    public function getOldHtml()
    {
        return $this->oldText;
    }
326
327
    /**
     * Returns the stored new text.
     *
     * @return string
     */
    public function getNewHtml()
    {
        return $this->newText;
    }
334
335
    /**
     * Returns the diff content held by this object.
     *
     * @return string
     */
    public function getDifference()
    {
        return $this->content;
    }
342
343
    /**
     * Clears the diff content.
     *
     * The content is reset to an empty string, the same value the constructor
     * assigns, so that getDifference() keeps returning a string afterwards.
     *
     * @return void
     */
    public function clearContent()
    {
        // Was reset to null before, which contradicted the string type documented
        // on the $content property and on getDifference().
        $this->content = '';
    }
352
353
    /**
     * Stores the group-diffs flag on the config.
     *
     * @param bool $boolean
     *
     * @return $this
     *
     * @deprecated since 0.1.0
     */
    public function setGroupDiffs($boolean)
    {
        $config = $this->config;
        $config->setGroupDiffs($boolean);

        return $this;
    }
366
367
    /**
     * Reads the group-diffs flag from the config.
     *
     * @return bool
     *
     * @deprecated since 0.1.0
     */
    public function isGroupDiffs()
    {
        $config = $this->config;

        return $config->isGroupDiffs();
    }
376
377
    /**
     * Stores a custom HTMLPurifier configuration.
     *
     * initPurifier() uses the stored config in place of HTMLPurifier's default one.
     *
     * @param HTMLPurifier_Config $config
     */
    public function setHTMLPurifierConfig(HTMLPurifier_Config $config)
    {
        $this->purifierConfig = $config;
    }
384
385
    /**
386
     * @param string $html
387
     *
388
     * @return string
0 ignored issues
show
Documentation introduced by
Should the return type not be string|false|null?

This check compares the return type specified in the @return annotation of a function or method doc comment with the types returned by the function and raises an issue if they mismatch.

Loading history...
389
     */
390 17
    protected function purifyHtml($html)
    {
        $purifier = $this->purifier;

        // Without an initialised purifier the markup is handed back untouched.
        return null === $purifier ? $html : $purifier->purify($html);
    }
398
399 17
    /**
     * Tokenises both stored texts and saves the results as the old and new word lists.
     *
     * @return void
     */
    protected function splitInputsToWords()
    {
        $oldWords = $this->convertHtmlToListOfWords($this->oldText);
        $this->setOldWords($oldWords);

        $newWords = $this->convertHtmlToListOfWords($this->newText);
        $this->setNewWords($newWords);
    }
404
405
    /**
     * Stores the old word list and raises the $resetCache flag.
     *
     * @param array $oldWords
     */
    protected function setOldWords(array $oldWords)
    {
        $this->oldWords   = $oldWords;
        $this->resetCache = true;
    }
413
414
    /**
     * Stores the new word list and raises the $resetCache flag.
     *
     * @param array $newWords
     */
    protected function setNewWords(array $newWords)
    {
        $this->newWords   = $newWords;
        $this->resetCache = true;
    }
422
423
    /**
424
     * @param string $text
0 ignored issues
show
Bug introduced by
There is no parameter named $characterString. Was it maybe removed?

This check looks for PHPDoc comments describing methods or function parameters that do not exist on the corresponding method or function.

Consider the following example. The parameter $italy is not defined by the method finale(...).

/**
 * @param array $germany
 * @param array $island
 * @param array $italy
 */
function finale($germany, $island) {
    return "2:1";
}

The most likely cause is that the parameter was removed, but the annotation was not.

Loading history...
425
     *
426
     * @return array
427
     */
428 17
    protected function convertHtmlToListOfWords($text)
    {
        // Splits $text into a flat list of tokens: every HTML tag is one token, and the
        // text between tags is broken into words, single whitespace characters and
        // single leftover characters.
        $words = [];

        // Escape every special-case character for use inside a character class.
        // preg_quote() escapes regex metacharacters only, so a letter or digit stays a
        // literal; prefixing a bare backslash would turn "d" into the \d escape.
        $specialCharacters = '';
        foreach ((array) $this->config->getSpecialCaseChars() as $char) {
            $specialCharacters .= preg_quote($char, '/');
        }

        // The split pattern is the same for every sentence, so build it once.
        // First alternative: one whitespace character.
        // Second: a run of alphanumerics/special characters that is followed by a
        //         special character and whitespace; that trailing special character
        //         is excluded from the match.
        // Third: a run of alphanumerics/special characters.
        // Fourth: catch-all for any other single non-whitespace character.
        $wordPattern = '/\s|[a-zA-Z0-9' . $specialCharacters . '\pL]+(?=[' . $specialCharacters . '][\s\r\n])|[a-zA-Z0-9' . $specialCharacters . '\pL]+|[^\s]/mu';

        // Normalize no-break-spaces to regular spaces
        $text = str_replace("\xc2\xa0", ' ', $text);

        // PREG_PATTERN_ORDER is the preg_match_all() flag; PREG_SPLIT_NO_EMPTY, used
        // here before, belongs to preg_split() and only shares the same value.
        $sentencesAndTags = [];
        preg_match_all('/<.+?>|[^<]+/mu', $text, $sentencesAndTags, PREG_PATTERN_ORDER);

        // A failed match leaves nothing to iterate over; report no words in that case.
        if (!isset($sentencesAndTags[0])) {
            return $words;
        }

        foreach ($sentencesAndTags[0] as $sentenceOrHtmlTag) {
            if ($sentenceOrHtmlTag === '') {
                continue;
            }

            // Tags are kept whole.
            if ($sentenceOrHtmlTag[0] === '<') {
                $words[] = $sentenceOrHtmlTag;

                continue;
            }

            $sentence = preg_replace('/\r+|\n+|\r\n+/', ' ', $sentenceOrHtmlTag);

            // Normalize whitespace: collapse a leading or trailing whitespace run into
            // exactly one space. The last byte is read with substr() because string
            // offsets are byte-based; a character count from $this->stringUtil->strlen()
            // (presumably multibyte-aware, to be confirmed) would address the wrong
            // byte as soon as the sentence contains multibyte characters.
            $firstCharacter = $sentence[0];
            $lastCharacter  = substr($sentence, -1);

            if ($firstCharacter === ' ' || $firstCharacter === "\r" || $firstCharacter === "\n") {
                $sentence = ' ' . ltrim($sentence);
            }

            if ($lastCharacter === ' ' || $lastCharacter === "\r" || $lastCharacter === "\n") {
                $sentence = rtrim($sentence) . ' ';
            }

            $sentenceSplitIntoWords = [];

            preg_match_all(
                $wordPattern,
                $sentence . ' ', // Inject a space at the end to make sure the second alternative always hits.
                $sentenceSplitIntoWords,
                PREG_PATTERN_ORDER
            );

            // Nothing matched (or the match failed): there is no injected space to drop.
            if (empty($sentenceSplitIntoWords[0])) {
                continue;
            }

            // Drop the last element, which is the space injected above.
            array_pop($sentenceSplitIntoWords[0]);

            foreach ($sentenceSplitIntoWords[0] as $word) {
                $words[] = $word;
            }
        }

        return $words;
    }
496
}
497