Completed
Push — master ( 68f73c...f03bb1 )
by Kevin
03:04
created

Element   C

Complexity

Total Complexity 63

Size/Duplication

Total Lines 480
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 6

Test Coverage

Coverage 0%

Importance

Changes 0
Metric Value
wmc 63
lcom 1
cbo 6
dl 0
loc 480
ccs 0
cts 305
cp 0
rs 5.8893
c 0
b 0
f 0

14 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 8 1
C isClosingElementImplied() 0 69 17
B parse() 0 60 8
B parseAttribute() 0 64 7
C parseContents() 0 58 12
A parseElementName() 0 18 3
B parseForeignContents() 0 39 3
A parseNoContents() 0 20 2
A getAttributes() 0 4 1
A hasAttributes() 0 4 1
A getChildren() 0 4 1
A hasChildren() 0 4 1
A getName() 0 4 1
B toArray() 0 25 5

How to fix   Complexity   

Complex Class

Complex classes like Element often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields or methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Element, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace Kevintweber\HtmlTokenizer\Tokens;
4
5
use Kevintweber\HtmlTokenizer\HtmlTokenizer;
6
use Kevintweber\HtmlTokenizer\Exceptions\ParseException;
7
8
class Element extends AbstractToken
9
{
10
    /** @var array */
11
    private $attributes;
12
13
    /** @var Token[] */
14
    private $children;
15
16
    /** @var string */
17
    private $name;
18
19
    /**
     * Creates an empty element token.
     *
     * Registers the token as Token::ELEMENT through the parent constructor
     * and starts out with no name, no attributes and no children.
     *
     * @param Token|null $parent       Parent token, or null when there is none.
     * @param bool       $throwOnError Forwarded unchanged to the parent constructor.
     */
    public function __construct(Token $parent = null, bool $throwOnError = true)
    {
        parent::__construct(Token::ELEMENT, $parent, $throwOnError);

        // Nothing has been parsed yet.
        $this->name = null;
        $this->children = [];
        $this->attributes = [];
    }
27
28
    /**
     * Tells whether the element starting at $html implicitly closes its parent.
     *
     * The answer is always false unless the parent token is itself an
     * Element. Otherwise the (lower-cased) name of the element found in
     * $html is checked against the names that imply a closing tag for the
     * parent's name.
     *
     * @param string $html HTML beginning with the opening tag to inspect.
     *
     * @return boolean True when the parent's closing tag is implied.
     */
    public function isClosingElementImplied(string $html) : bool
    {
        $parent = $this->getParent();
        if (!($parent instanceof self)) {
            // Covers both "no parent" (null) and non-element parents.
            return false;
        }

        $name = $this->parseElementName($html);
        $parentName = $parent->getName();

        // Parent name => child names whose opening tag closes that parent.
        $definitionPair = ['dt', 'dd'];
        $rubyPair = ['rp', 'rt'];
        $impliedClosers = [
            // HEAD: no closing tag.
            'head' => ['body'],
            // P: these elements cannot be children of a paragraph.
            'p' => [
                'address', 'article', 'aside', 'blockquote', 'details',
                'div', 'dl', 'fieldset', 'figcaption', 'figure',
                'footer', 'form', 'h1', 'h2', 'h3',
                'h4', 'h5', 'h6', 'header', 'hgroup',
                'hr', 'main', 'menu', 'nav', 'ol',
                'p', 'pre', 'section', 'table', 'ul',
            ],
            // LI
            'li' => ['li'],
            // DT and DD
            'dt' => $definitionPair,
            'dd' => $definitionPair,
            // RP and RT
            'rp' => $rubyPair,
            'rt' => $rubyPair,
        ];

        if (!isset($impliedClosers[$parentName])) {
            return false;
        }

        return in_array($name, $impliedClosers[$parentName], true);
    }
104
105
    /**
     * Parses this element from the start of the given HTML.
     *
     * Records the token's line/position, reads the element name and its
     * attributes, then either stops at the end of the tag (self-closing and
     * closed-only elements) or goes on to parse the element's contents.
     *
     * @param string $html HTML beginning (after optional whitespace) with the opening tag.
     *
     * @return string Remaining HTML.
     *
     * @throws ParseException When the tag has no closing bracket and errors are thrown.
     */
    public function parse(string $html) : string
    {
        $html = ltrim($html);

        // Remember where this token starts.
        $location = HtmlTokenizer::getPosition($html);
        $this->line = $location['line'];
        $this->position = $location['position'];

        // The name comes first; skip it together with the leading "<".
        $this->name = $this->parseElementName($html);
        $rest = mb_substr($html, mb_strlen($this->name) + 1);

        // Consume attributes until the tag end is next, or no ">" is left at all.
        while (mb_strpos($rest, '>') !== false && preg_match("/^\s*[\/]?>/", $rest) === 0) {
            $rest = $this->parseAttribute($rest);
        }

        // Locate the end of the tag.
        $closeAt = mb_strpos($rest, '>');
        if ($closeAt === false) {
            if ($this->getThrowOnError()) {
                throw new ParseException('Invalid element: missing closing bracket.');
            }

            return '';
        }

        $selfCloseAt = mb_strpos($rest, '/>');
        $afterTag = mb_substr($rest, $closeAt + 1);

        // Self-closing element: the "/" sits directly before the ">".
        // (Note: $this->value is unchanged.)
        if ($selfCloseAt !== false && $selfCloseAt === $closeAt - 1) {
            return $afterTag;
        }

        // Closed-only elements are treated as closed even when written open.
        $closedOnlyElements = [
            'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
            'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
        ];
        if (in_array($this->name, $closedOnlyElements, true)) {
            return $afterTag;
        }

        // Open element.
        return $this->parseContents($afterTag);
    }
172
173
    /**
     * Parses a single attribute from the start of the given HTML.
     *
     * The attribute is stored in $this->attributes under its trimmed name:
     * `true` for a valueless attribute, otherwise the (quoted or unquoted)
     * value as a string.
     *
     * @param string $html HTML positioned inside an opening tag, before an attribute.
     *
     * @return string Remaining HTML, or '' when parsing failed and errors are not thrown.
     *
     * @throws ParseException When no attribute name can be found, or a quoted
     *                        value is not terminated, and errors are thrown.
     */
    private function parseAttribute(string $html) : string
    {
        $remainingHtml = ltrim($html);

        // Will match the first entire name/value attribute pair.
        $nameMatchSuccessful = preg_match(
            "/((([a-z0-9\-_]+:)?[a-z0-9\-_]+)(\s*=\s*)?)/i",
            $remainingHtml,
            $attributeMatches
        );
        if ($nameMatchSuccessful !== 1) {
            // Without a name nothing can be consumed; bail out instead of
            // reading an undefined match and handing the same input back to
            // the caller's loop.
            if ($this->getThrowOnError()) {
                throw new ParseException('Invalid attribute.');
            }

            return '';
        }

        $name = $attributeMatches[2];
        $remainingHtml = mb_substr(mb_strstr($remainingHtml, $name), mb_strlen($name));
        if (preg_match("/^\s*=\s*/", $remainingHtml) === 0) {
            // Valueless attribute.
            $this->attributes[trim($name)] = true;

            return $remainingHtml;
        }

        $remainingHtml = ltrim($remainingHtml, ' =');

        // mb_substr() yields '' for an empty remainder, unlike $remainingHtml[0].
        $firstChar = mb_substr($remainingHtml, 0, 1);
        if ($firstChar === "'" || $firstChar === '"') {
            // Quote enclosed attribute value.
            $valueMatchSuccessful = preg_match(
                "/" . $firstChar . "(.*?(?<!\\\))" . $firstChar . "/s",
                $remainingHtml,
                $valueMatches
            );
            if ($valueMatchSuccessful !== 1) {
                if ($this->getThrowOnError()) {
                    throw new ParseException('Invalid value encapsulation.');
                }

                return '';
            }

            $value = $valueMatches[1];
        } else {
            // No quotes enclosing the attribute value.
            preg_match("/(\s*([^>\s]*(?<!\/)))/", $remainingHtml, $valueMatches);
            $value = $valueMatches[2];
        }

        $this->attributes[trim($name)] = $value;

        // Determine remaining html.
        if ($value === '') {
            // Skip the name plus three more characters (e.g. `=""`).
            $remainingHtml = ltrim(mb_substr(ltrim($html), mb_strlen($name) + 3));
        } else {
            $remainingHtml = ltrim($html);

            // Remove attribute name, then everything up to the end of the value.
            $remainingHtml = mb_substr($remainingHtml, mb_strlen($name));
            $posOfAttributeValue = mb_strpos($remainingHtml, $value);
            $remainingHtml = ltrim(
                mb_substr(
                    $remainingHtml,
                    $posOfAttributeValue + mb_strlen($value)
                )
            );
        }

        // Drop leftover quotes, slashes and spaces before the next attribute.
        return ltrim($remainingHtml, '\'"/ ');
    }
244
245
    /**
     * Parses everything between this element's opening and closing tags.
     *
     * Sets $this->value, delegates "iframe", "script" and "style" to their
     * dedicated parsers, and otherwise builds child tokens one at a time
     * until this element's closing tag is next.
     *
     * @param string $html HTML directly following the opening tag.
     *
     * @return string Remaining HTML.
     */
    private function parseContents(string $html) : string
    {
        if (trim($html) === '') {
            return '';
        }

        // Determine value: the text up to the first closing tag of this
        // element when one matches, otherwise all of the given HTML.
        $hasClosingTag = preg_match("/(.*)<\/\s*" . $this->name . "\s*>/iU", $html, $valueMatches);
        $this->value = ($hasClosingTag === 1) ? $valueMatches[1] : $html;

        // Don't parse contents of "iframe" element.
        if ($this->name === 'iframe') {
            return $this->parseNoContents('iframe', $html);
        }

        // Only TEXT inside "script" and "style" elements.
        if ($this->name === 'script' || $this->name === 'style') {
            return $this->parseForeignContents($this->name, $html);
        }

        // Parse contents one token at a time until the closing tag is next.
        $closingTagAtStart = "/^<\/\s*" . $this->name . "\s*>/is";
        $rest = $html;
        while (preg_match($closingTagAtStart, $rest) === 0) {
            $token = TokenFactory::buildFromHtml($rest, $this, $this->getThrowOnError());

            if (!($token instanceof Token)) {
                return $rest;
            }
            if ($token->isClosingElementImplied($rest)) {
                // The next element closes this one; leave it for the caller.
                return $rest;
            }

            $rest = $token->parse($rest);
            $this->children[] = $token;
        }

        // Remove last token if contains only whitespace.
        if (!empty($this->children)) {
            $tail = array_slice($this->children, -1);
            $lastChild = $tail[0];
            if ($lastChild->isText() && trim($lastChild->getValue()) == '') {
                array_pop($this->children);
            }
        }

        // Remove remaining closing tag.
        $posOfClosingBracket = mb_strpos($rest, '>');

        return mb_substr($rest, $posOfClosingBracket + 1);
    }
310
311
    /**
     * Extracts the element name from the opening tag at the start of $html.
     *
     * Surrounding whitespace is ignored. The name may carry a "prefix:" part
     * and is returned lower-cased.
     *
     * @param $html string
     *
     * @return string The element name, or '' when none is found and errors are not thrown.
     *
     * @throws ParseException When no element name is found and errors are thrown.
     */
    private function parseElementName(string $html) : string
    {
        $trimmed = trim($html);

        $found = preg_match("/^(<(([a-z0-9\-]+:)?[a-z0-9\-]+))/i", $trimmed, $elementMatches);
        if ($found === 1) {
            // Group 2 is the name without the leading "<".
            return mb_strtolower($elementMatches[2]);
        }

        if ($this->getThrowOnError()) {
            throw new ParseException('Invalid element name. Truncated html = ' . mb_substr($trimmed, 0, 20));
        }

        return '';
    }
336
337
    /**
     * Parses the contents of a "script" or "style" element as plain text.
     *
     * Everything up to the first closing $tag (or all of the HTML when there
     * is none) becomes this element's trimmed value and, unless empty, a
     * single Text child.
     *
     * @param $tag  string
     * @param $html string
     *
     * @return string The remaining HTML.
     */
    private function parseForeignContents(string $tag, string $html) : string
    {
        $remainingHtml = ltrim($html);

        // Position of the contents, used for the Text child below.
        $start = HtmlTokenizer::getPosition($remainingHtml);

        // Find the first closing tag, if any.
        $found = preg_match("/(<\/\s*" . $tag . "\s*>)/i", $html, $closingTagMatches);
        if ($found === 0) {
            // Unterminated element: everything left is content.
            $this->value = trim($remainingHtml);
            $remainingHtml = '';
        } else {
            $closingTag = $closingTagMatches[1];
            $contents = mb_substr($remainingHtml, 0, mb_strpos($remainingHtml, $closingTag));
            $fromClosingTag = mb_strstr($remainingHtml, $closingTag);

            $this->value = trim($contents);
            $remainingHtml = mb_substr($fromClosingTag, mb_strlen($closingTag));
        }

        // Only non-empty contents produce a child token.
        if ($this->value !== '') {
            $text = new Text($this, $this->getThrowOnError(), $this->value);
            // NOTE(review): a static-analysis note attached to this code reports
            // `line` and `position` as protected members of AbstractToken.
            $text->line = $start['line'];
            $text->position = $start['position'];
            $this->children[] = $text;
        }

        return $remainingHtml;
    }
384
385
    /**
     * Skips over the contents of an element without tokenizing them.
     *
     * Used for "iframe" elements: the raw text up to the first closing $tag
     * becomes this element's value and no child tokens are created.
     *
     * @param $tag  string
     * @param $html string
     *
     * @return string The remaining HTML, or '' when no closing tag is found.
     */
    private function parseNoContents(string $tag, string $html) : string
    {
        $remainingHtml = ltrim($html);
        $matchingResult = preg_match(
            "/(<\/\s*" . $tag . "\s*>)/i",
            $remainingHtml,
            $closingTagMatches
        );
        if ($matchingResult !== 1) {
            return '';
        }

        $closingTag = $closingTagMatches[1];

        // The offset must be measured in the same (left-trimmed) string that
        // is sliced below; measuring it in the untrimmed input shifts the cut
        // by the amount of leading whitespace.
        $posOfClosingTag = mb_strpos($remainingHtml, $closingTag);
        $this->value = mb_substr($remainingHtml, 0, $posOfClosingTag);

        return mb_substr($remainingHtml, $posOfClosingTag + mb_strlen($closingTag));
    }
415
416
    /**
     * Returns the attributes collected for this element.
     *
     * @return array Attribute values keyed by attribute name; a valueless
     *               attribute is stored as true.
     */
    public function getAttributes() : array
    {
        return $this->attributes;
    }
425
426
    /**
     * Tells whether at least one attribute has been stored on this element.
     *
     * @return boolean
     */
    public function hasAttributes() : bool
    {
        // A non-empty array casts to true, an empty one to false.
        return (bool) $this->attributes;
    }
433
434
    /**
     * Returns the child tokens collected for this element.
     *
     * @return array Child tokens in the order they were added.
     */
    public function getChildren() : array
    {
        return $this->children;
    }
443
444
    /**
     * Tells whether this element holds at least one child token.
     *
     * @return boolean
     */
    public function hasChildren() : bool
    {
        // A non-empty array casts to true, an empty one to false.
        return (bool) $this->children;
    }
451
452
    /**
     * Getter for 'name'.
     *
     * The constructor initialises the name to null, which the declared
     * string return type would reject; an element whose name has not been
     * set therefore reports an empty name.
     *
     * @return string The element name, or '' when no name has been set.
     */
    public function getName() : string
    {
        return $this->name ?? '';
    }
461
462
    /**
     * Converts this element, and recursively its children, to a plain array.
     *
     * The result always has 'type', 'name', 'line' and 'position';
     * 'attributes' and 'children' are present only when non-empty.
     *
     * @return array
     */
    public function toArray() : array
    {
        $result = [
            'type' => 'element',
            'name' => $this->name,
            'line' => $this->getLine(),
            'position' => $this->getPosition(),
        ];

        if (!empty($this->attributes)) {
            // Same keys and values, in the same order.
            $result['attributes'] = $this->attributes;
        }

        if (!empty($this->children)) {
            $result['children'] = array_values(array_map(
                static function ($child) {
                    return $child->toArray();
                },
                $this->children
            ));
        }

        return $result;
    }
487
}
488