Element::parseAttribute()   B
last analyzed

Complexity

Conditions 4
Paths 14

Size

Total Lines 29
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 16
CRAP Score 4

Importance

Changes 0
Metric Value
dl 0
loc 29
ccs 16
cts 16
cp 1
rs 8.5806
c 0
b 0
f 0
cc 4
eloc 17
nc 14
nop 1
crap 4
1
<?php
2
3
namespace Kevintweber\HtmlTokenizer\Tokens;
4
5
use Kevintweber\HtmlTokenizer\Exceptions\ParseException;
6
7
class Element extends AbstractToken
8
{
9
    /** @var array Attribute values keyed by attribute name; valueless attributes are stored as true. */
    private $attributes = [];

    /** @var Token[] Child tokens, in the order they were parsed. */
    private $children = [];

    /** @var string|null Lower-cased element name; null until parse() has assigned it. */
    private $name = null;

    /**
     * Creates an element token with no attributes, no children and no name.
     *
     * @param Token|null $parent       Parent token, forwarded to the base constructor.
     * @param bool       $throwOnError Error-handling flag, forwarded to the base constructor.
     */
    public function __construct(Token $parent = null, bool $throwOnError = true)
    {
        parent::__construct(Token::ELEMENT, $parent, $throwOnError);
    }
26
27
    /**
     * Tells whether the element that starts $html implicitly closes the
     * parent element, i.e. the parent's closing tag may be omitted.
     *
     * Only a parent that is itself an Element can be implicitly closed.
     *
     * @param string $html HTML that begins with the opening tag to inspect.
     *
     * @return boolean True when the parent's closing tag is implied.
     */
    public function isClosingElementImplied(string $html) : bool
    {
        $parent = $this->getParent();
        if (!($parent instanceof self)) {
            // Covers both "no parent" (null) and non-element parents.
            return false;
        }

        $childName = $this->parseElementName($html);
        $parentName = $parent->getName();

        switch ($parentName) {
            case 'head':
                // HEAD: closed by an opening BODY.
                return $childName === 'body';

            case 'p':
                // P: closed by any element that may not be a child of P.
                return in_array(
                    $childName,
                    [
                        'address', 'article', 'aside', 'blockquote', 'details',
                        'div', 'dl', 'fieldset', 'figcaption', 'figure',
                        'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                        'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol',
                        'p', 'pre', 'section', 'table', 'ul',
                    ],
                    true
                );

            case 'li':
                // LI: closed by the next LI.
                return $childName === 'li';

            case 'dt':
            case 'dd':
                // DT and DD close each other.
                return $childName === 'dt' || $childName === 'dd';

            case 'rp':
            case 'rt':
                // RP and RT close each other.
                return $childName === 'rp' || $childName === 'rt';

            default:
                return false;
        }
    }
103
104
    /**
     * Parses this element from the start of $html: its name, its attributes
     * and, unless the element is self-closing or closed-only, its contents.
     *
     * A ParseException raised while parsing is rethrown when getThrowOnError()
     * is true; otherwise it is swallowed and an empty string is returned.
     *
     * @param string $html
     *
     * @return string Remaining HTML.
     */
    public function parse(string $html) : string
    {
        $html = ltrim($html);
        $this->setTokenPosition($html);

        try {
            $this->name = $this->parseElementName($html);
            $afterAttributes = $this->parseAttributes($html);
            $closingBracketPos = $this->getPositionOfElementEndTag($afterAttributes);

            // Both positions refer to $afterAttributes, so look for "/>"
            // before cutting the opening tag off.
            $selfClosingPos = mb_strpos($afterAttributes, '/>');
            $afterOpeningTag = mb_substr($afterAttributes, $closingBracketPos + 1);

            // Self-closing element: the "/" sits directly before the first ">".
            // (Note: $this->value is unchanged.)
            $isSelfClosing = $selfClosingPos !== false
                && $selfClosingPos === $closingBracketPos - 1;

            // Closed-only elements never have contents, even when left open.
            $isClosedOnly = in_array(
                $this->name,
                [
                    'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
                    'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
                ],
                true
            );

            if ($isSelfClosing || $isClosedOnly) {
                return $afterOpeningTag;
            }

            // Open element.
            return $this->parseContents($afterOpeningTag);
        } catch (ParseException $exception) {
            if ($this->getThrowOnError()) {
                throw $exception;
            }
        }

        return '';
    }
160
161
    /**
     * Consumes every attribute of the opening tag at the start of $html.
     *
     * Requires $this->name to be set already: the leading "<" and the name
     * are skipped by length.
     *
     * @param string $html HTML beginning with the opening tag.
     *
     * @return string The HTML left once no further attribute is parsed.
     */
    private function parseAttributes(string $html) : string
    {
        // Skip "<" plus the element name.
        $rest = mb_substr($html, mb_strlen($this->name) + 1);

        while (true) {
            // Without any ">" left there is no tag end to work towards.
            if (mb_strpos($rest, '>') === false) {
                break;
            }

            // Stop once only the end of the tag (">" or "/>") remains.
            if (preg_match("/^\s*[\/]?>/", $rest) !== 0) {
                break;
            }

            $rest = $this->parseAttribute($rest);
        }

        return $rest;
    }
175
176
    /**
     * Parses the next attribute found in $html and stores it in
     * $this->attributes (true for a valueless attribute).
     *
     * A ParseException raised here is rethrown when getThrowOnError() is
     * true; otherwise it is swallowed and an empty string is returned.
     *
     * @param string $html HTML positioned at (or just before) an attribute.
     *
     * @return string Remaining HTML.
     */
    private function parseAttribute(string $html) : string
    {
        $remainingHtml = ltrim($html);

        try {
            // Will match the first entire name/value attribute pair.
            $attributeMatchSuccessful = preg_match(
                "/((([a-z0-9\-_]+:)?[a-z0-9\-_]+)(\s*=\s*)?)/i",
                $remainingHtml,
                $attributeMatches
            );

            // Without a match there is no attribute name to consume. Carrying
            // on would read an undefined index and could hand the input back
            // unchanged, so the calling loop would never advance.
            if ($attributeMatchSuccessful !== 1) {
                throw new ParseException(
                    'Invalid attribute. Truncated html = ' . mb_substr($remainingHtml, 0, 20)
                );
            }

            $attributeName = $attributeMatches[2];
            $remainingHtml = mb_substr(mb_strstr($remainingHtml, $attributeName), mb_strlen($attributeName));
            if ($this->isAttributeValueless($remainingHtml)) {
                $this->attributes[trim($attributeName)] = true;

                return $remainingHtml;
            }

            return $this->parseAttributeValue($html, $remainingHtml, $attributeName);
        } catch (ParseException $e) {
            if ($this->getThrowOnError()) {
                throw $e;
            }
        }

        return '';
    }
212
213 24
    /**
     * Extracts the value of an attribute and stores it under the trimmed
     * attribute name.
     *
     * @param string $html          HTML as it was before the attribute name was consumed.
     * @param string $remainingHtml HTML directly after the attribute name.
     * @param string $attributeName Name of the attribute being parsed.
     *
     * @return string Remaining HTML after the attribute value.
     */
    private function parseAttributeValue(string $html, string $remainingHtml, string $attributeName) : string
    {
        // Strip "=" together with ALL surrounding whitespace. The valueless
        // check accepts any whitespace around "=", so stripping only spaces
        // would leave a tab or newline in front of a quoted value and make it
        // parse as quoteless, with the quotes kept in the value.
        $remainingHtml = ltrim($remainingHtml, " \t\n\r\0\x0B=");
        if ($this->isAttributeValueQuoteEnclosed($remainingHtml)) {
            $attributeValue = $this->extractQuoteEnclosedAttributeValue($remainingHtml);
        } else {
            // No quotes enclosing the attribute value.
            $attributeValue = $this->extractQuotelessAttributeValue($remainingHtml);
        }

        $this->attributes[trim($attributeName)] = $attributeValue;

        return $this->parseAttributeDetermineRemainingHtml($html, $attributeName, $attributeValue);
    }
228
229
    /**
     * Parses everything between this element's opening and closing tags.
     *
     * Sets $this->value, then either delegates ("iframe", "script", "style")
     * or builds child tokens one at a time until this element's closing tag
     * is reached.
     *
     * @param string $html HTML directly after the opening tag.
     *
     * @return string Remaining HTML.
     */
    private function parseContents(string $html) : string
    {
        if (trim($html) === '') {
            return '';
        }

        // Determine value: the text captured before the closing tag, or all
        // of $html when no closing tag is found.
        // NOTE(review): the pattern has no "s" modifier, so "." does not
        // cross newlines; for multi-line contents the capture may hold only
        // the text on the closing tag's line - confirm this is intended.
        $valuePattern = "/(.*)<\/\s*" . $this->name . "\s*>/iU";
        $this->value = preg_match($valuePattern, $html, $valueMatches) === 1
            ? $valueMatches[1]
            : $html;

        // Don't parse contents of "iframe" element.
        if ($this->name === 'iframe') {
            return $this->parseNoContents('iframe', $html);
        }

        // Only TEXT inside a "script" element.
        if ($this->name === 'script') {
            return $this->parseForeignContents('script', $html);
        }

        // Only TEXT inside a "style" element.
        if ($this->name === 'style') {
            return $this->parseForeignContents('style', $html);
        }

        // Parse contents one token at a time.
        $rest = $html;
        while ($this->isAnotherTokenPresent($rest)) {
            $child = TokenFactory::buildFromHtml($rest, $this, $this->getThrowOnError());

            // Stop without consuming anything when no token could be built or
            // when the next element implicitly closes this one.
            if (!($child instanceof Token) || $child->isClosingElementImplied($rest)) {
                return $rest;
            }

            $rest = $child->parse($rest);
            $this->children[] = $child;
        }

        $this->removeLastTokenIfContainsOnlyWhitespace();

        // Remove remaining closing tag.
        return mb_substr($rest, mb_strpos($rest, '>') + 1);
    }
287
288
    /**
     * Reads the element name from an opening tag at the start of $html.
     *
     * The name may carry a "prefix:" part and is returned lower-cased.
     *
     * @param string $html
     *
     * @return string The element name, or '' when the name is invalid and
     *                getThrowOnError() is false.
     *
     * @throws ParseException When the name is invalid and getThrowOnError() is true.
     */
    private function parseElementName(string $html) : string
    {
        $trimmedHtml = trim($html);

        if (preg_match("/^(<(([a-z0-9\-]+:)?[a-z0-9\-]+))/i", $trimmedHtml, $elementMatches) === 1) {
            return mb_strtolower($elementMatches[2]);
        }

        if ($this->getThrowOnError()) {
            throw new ParseException('Invalid element name. Truncated html = ' . mb_substr($trimmedHtml, 0, 20));
        }

        return '';
    }
313
314
    /**
     * Handles elements whose contents are kept as raw text ("script" and
     * "style"): the contents become a single Text child instead of being
     * tokenized.
     *
     * @param string $tag  Element name whose closing tag ends the contents.
     * @param string $html HTML directly after the opening tag.
     *
     * @return string The remaining HTML.
     */
    private function parseForeignContents(string $tag, string $html) : string
    {
        // Sets $this->value to the contents and yields what follows the closing tag.
        $rest = $this->determineRemainingHtmlOfForeignContents($tag, $html, ltrim($html));

        // Only add a Text child when there are contents.
        if ($this->value !== '') {
            $this->children[] = new Text($this, $this->getThrowOnError(), $this->value);
        }

        return $rest;
    }
343
344
    /**
     * Will not parse the contents of an element.
     *
     * Used for "iframe" elements: the raw contents up to the closing tag are
     * stored in $this->value and no child tokens are created.
     *
     * @param string $tag  Element name whose closing tag ends the contents.
     * @param string $html HTML directly after the opening tag.
     *
     * @return string The remaining HTML after the closing tag, or '' when no
     *                closing tag is found.
     */
    private function parseNoContents(string $tag, string $html) : string
    {
        $remainingHtml = ltrim($html);
        $matchingResult = preg_match(
            "/(<\/\s*" . preg_quote($tag, '/') . "\s*>)/i",
            $remainingHtml,
            $closingTagMatches
        );

        // "!== 1" also covers a preg_match() failure (false), which would
        // otherwise fall through to an undefined match index.
        if ($matchingResult !== 1) {
            return '';
        }

        $closingTag = $closingTagMatches[1];

        // Locate the closing tag in the SAME string that is being cut. An
        // offset taken from the untrimmed $html is shifted by the leading
        // whitespace and would pull part of the closing tag into the value.
        $posOfClosingTag = mb_strpos($remainingHtml, $closingTag);
        $this->value = mb_substr($remainingHtml, 0, $posOfClosingTag);

        return mb_substr($remainingHtml, $posOfClosingTag + mb_strlen($closingTag));
    }
374
375
    /**
     * Returns the parsed attributes.
     *
     * @return array Values keyed by attribute name; a valueless attribute maps to true.
     */
    public function getAttributes() : array
    {
        return $this->attributes;
    }
384
385
    /**
     * Tells whether at least one attribute has been stored.
     *
     * @return boolean
     */
    public function hasAttributes() : bool
    {
        return $this->attributes !== [];
    }
392
393
    /**
     * Returns the child tokens collected while parsing the contents.
     *
     * @return array
     */
    public function getChildren() : array
    {
        return $this->children;
    }
402
403
    /**
     * Tells whether at least one child token has been stored.
     *
     * @return boolean
     */
    public function hasChildren() : bool
    {
        return $this->children !== [];
    }
410
411
    /**
     * Getter for 'name'.
     *
     * The name is null until parse() has assigned it. Returning null from a
     * method declared ": string" raises a TypeError, so the value is cast and
     * an element without a name reports an empty string.
     *
     * @return string The element name, or '' when none has been assigned.
     */
    public function getName() : string
    {
        return (string) $this->name;
    }
420
421 30
    /**
     * Exports this element, and recursively its children, as a plain array.
     *
     * The 'attributes' and 'children' keys are only present when the element
     * has attributes or children.
     *
     * @return array
     */
    public function toArray() : array
    {
        $result = [
            'type' => 'element',
            'name' => $this->name,
            'line' => $this->getLine(),
            'position' => $this->getPosition(),
        ];

        if ($this->attributes !== []) {
            $result['attributes'] = $this->attributes;
        }

        if ($this->children !== []) {
            $result['children'] = array_map(
                function ($child) {
                    return $child->toArray();
                },
                $this->children
            );
        }

        return $result;
    }
446
447 21
    /**
     * Drops an attribute name and its value from the front of $html.
     *
     * The name is removed by length from the left-trimmed HTML; everything up
     * to and including the first occurrence of $value after it is then cut.
     *
     * @param string $html  HTML starting (after optional whitespace) with the attribute name.
     * @param string $name  Attribute name.
     * @param string $value Attribute value, without enclosing quotes.
     *
     * @return string Left-trimmed HTML that follows the value.
     */
    private function determineRemainingHtmlByRemovingAttributeName(string $html, string $name, string $value) : string
    {
        $afterName = mb_substr(ltrim($html), mb_strlen($name));
        $endOfValue = mb_strpos($afterName, $value) + mb_strlen($value);

        return ltrim(mb_substr($afterName, $endOfValue));
    }
462
463 23
    /**
     * Works out the HTML that remains after an attribute and its value.
     *
     * @param string $html          HTML starting (after optional whitespace) with the attribute name.
     * @param string $attributeName Attribute name.
     * @param string $value         Attribute value, without enclosing quotes.
     *
     * @return string Remaining HTML with leading quotes, slashes and spaces removed.
     */
    private function parseAttributeDetermineRemainingHtml(string $html, string $attributeName, string $value) : string
    {
        if ($value === '') {
            // Empty value: skip the name, then "=" with any surrounding
            // whitespace and an optional pair of empty quotes. A fixed offset
            // only fits the exact form name="" and, for forms such as
            // name = "" or name=>, would swallow the closing bracket or the
            // markup that follows.
            $afterName = mb_substr(ltrim($html), mb_strlen($attributeName));
            $remainingHtml = ltrim(
                (string) preg_replace('/^\s*=\s*(?:""|\'\')?/', '', $afterName)
            );
        } else {
            $remainingHtml = $this->determineRemainingHtmlByRemovingAttributeName($html, $attributeName, $value);
        }

        return ltrim($remainingHtml, '\'"/ ');
    }
473
474 26
    /**
     * Tells whether the attribute whose name was just consumed has no value,
     * i.e. the remaining HTML does not start with "=" (optionally preceded by
     * whitespace).
     *
     * @param string $remainingHtml HTML directly after the attribute name.
     *
     * @return boolean
     */
    private function isAttributeValueless(string $remainingHtml) : bool
    {
        $equalsSignFound = preg_match("/^\s*=\s*/", $remainingHtml);

        return $equalsSignFound === 0;
    }
478
479 55
    /**
     * Finds the first ">" in the given HTML.
     *
     * @param string $remainingHtml
     *
     * @return int Character offset of the first ">".
     *
     * @throws ParseException When the HTML contains no ">".
     */
    private function getPositionOfElementEndTag(string $remainingHtml) : int
    {
        $position = mb_strpos($remainingHtml, '>');
        if ($position !== false) {
            return $position;
        }

        throw new ParseException('Invalid element: missing closing bracket.');
    }
488
489 24
    /**
     * Drops the last child when it is a text token whose value is empty or
     * whitespace only.
     */
    private function removeLastTokenIfContainsOnlyWhitespace()
    {
        if ($this->children === []) {
            return;
        }

        $lastChild = end($this->children);
        if ($lastChild->isText() && trim($lastChild->getValue()) === '') {
            array_pop($this->children);
        }
    }
499
500 25
    /**
     * Tells whether more content precedes this element's closing tag, i.e.
     * the remaining HTML does not start with that closing tag.
     *
     * @param string $remainingHtml
     *
     * @return boolean
     */
    private function isAnotherTokenPresent($remainingHtml) : bool
    {
        $closingTagAtStart = "/^<\/\s*" . $this->name . "\s*>/is";

        return preg_match($closingTagAtStart, $remainingHtml) === 0;
    }
504
505 21
    /**
     * Extracts an attribute value enclosed in the quote character that
     * $remainingHtml starts with. A quote preceded by a backslash does not
     * end the value.
     *
     * @param string $remainingHtml HTML starting with the opening quote.
     *
     * @return string The value without its enclosing quotes.
     *
     * @throws ParseException When no matching closing quote is found.
     */
    private function extractQuoteEnclosedAttributeValue(string $remainingHtml) : string
    {
        $quote = $remainingHtml[0];
        $pattern = '/' . $quote . "(.*?(?<!\\\))" . $quote . "/s";

        if (preg_match($pattern, $remainingHtml, $valueMatches) === 1) {
            return $valueMatches[1];
        }

        throw new ParseException('Invalid quote enclosed attribute value encapsulation.');
    }
519
520 7
    /**
     * Extracts an attribute value that is not enclosed in quotes: the run of
     * characters up to the next whitespace or ">", not ending in "/".
     *
     * @param string $remainingHtml HTML positioned at the value.
     *
     * @return string The value (may be empty).
     *
     * @throws ParseException When the pattern does not match.
     */
    private function extractQuotelessAttributeValue(string $remainingHtml) : string
    {
        if (preg_match("/(\s*([^>\s]*(?<!\/)))/", $remainingHtml, $valueMatches) === 1) {
            return $valueMatches[2];
        }

        throw new ParseException('Invalid quoteless attribute value encapsulation.');
    }
529
530 24
    /**
     * Tells whether the attribute value at the start of $remainingHtml is
     * enclosed in single or double quotes.
     *
     * @param string $remainingHtml HTML positioned at the value.
     *
     * @return boolean False for an empty string.
     */
    private function isAttributeValueQuoteEnclosed(string $remainingHtml) : bool
    {
        // Guard the offset read: indexing an empty string raises an
        // "uninitialized string offset" notice/warning.
        if ($remainingHtml === '') {
            return false;
        }

        return $remainingHtml[0] === "'" || $remainingHtml[0] === '"';
    }
534
535 7 View Code Duplication
    /**
     * Stores the raw contents of a "script"/"style" style element in
     * $this->value (trimmed) and returns what follows its closing tag.
     *
     * @param string $tag           Element name whose closing tag ends the contents.
     * @param string $html          HTML searched for the closing tag.
     * @param string $remainingHtml HTML the contents are cut from.
     *
     * @return string HTML after the closing tag, or '' when $html holds no
     *                closing tag (everything is then treated as contents).
     */
    private function determineRemainingHtmlOfForeignContents(string $tag, string $html, string $remainingHtml) : string
    {
        $closingTagPattern = "/(<\/\s*" . $tag . "\s*>)/i";

        if (preg_match($closingTagPattern, $html, $endOfScriptMatches) === 0) {
            $this->value = trim($remainingHtml);

            return '';
        }

        // Use the closing tag exactly as written (case and inner whitespace).
        $closingTag = $endOfScriptMatches[1];
        $contents = mb_substr($remainingHtml, 0, mb_strpos($remainingHtml, $closingTag));
        $this->value = trim($contents);

        $fromClosingTag = mb_strstr($remainingHtml, $closingTag);

        return mb_substr($fromClosingTag, mb_strlen($closingTag));
    }
558
}
559