Completed
Push — master ( 15134a...d6e1fc )
by Kevin
02:45
created

Element::getAttributes()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 4
c 0
b 0
f 0
ccs 2
cts 2
cp 1
rs 10
cc 1
eloc 2
nc 1
nop 0
crap 1
1
<?php
2
3
namespace Kevintweber\HtmlTokenizer\Tokens;
4
5
use Kevintweber\HtmlTokenizer\HtmlTokenizer;
6
use Kevintweber\HtmlTokenizer\Exceptions\ParseException;
7
8
class Element extends AbstractToken
9
{
10
    /** @var array */
11
    private $attributes;
12
13
    /** @var Token[] */
14
    private $children;
15
16
    /** @var string */
17
    private $name;
18
19 71
    public function __construct(Token $parent = null, bool $throwOnError = true)
    {
        // Register this token as an ELEMENT with the base class, forwarding
        // the optional parent and the error-handling mode untouched.
        parent::__construct(Token::ELEMENT, $parent, $throwOnError);

        // A new element has no name until parse() assigns one, and starts
        // with empty attribute and child collections.
        $this->name = null;
        $this->children = [];
        $this->attributes = [];
    }
27
28
    /**
29
     * Does the parent have an implied closing tag?
30
     *
31
     * @param string $html
32
     *
33
     * @return boolean
34
     */
35 27
    public function isClosingElementImplied(string $html) : bool
    {
        // Only an element whose parent is itself an element can imply that
        // the parent is closed (a null parent is never an instance of self).
        $parent = $this->getParent();
        if (!($parent instanceof self)) {
            return false;
        }

        // Name of the element that starts at the beginning of $html, and the
        // name of the element that currently contains it.
        $name = $this->parseElementName($html);
        $parentName = $parent->getName();

        switch ($parentName) {
            case 'head':
                // HEAD: no closing tag needed once "body" opens.
                return $name === 'body';

            case 'p':
                // P: these elements may not be children of a paragraph.
                $elementsNotChildrenOfP = [
                    'address', 'article', 'aside', 'blockquote', 'details',
                    'div', 'dl', 'fieldset', 'figcaption', 'figure',
                    'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                    'header', 'hgroup', 'hr', 'main', 'menu', 'nav', 'ol',
                    'p', 'pre', 'section', 'table', 'ul',
                ];

                return in_array($name, $elementsNotChildrenOfP, true);

            case 'li':
                // LI: a new list item closes the previous one.
                return $name === 'li';

            case 'dt':
            case 'dd':
                // DT and DD close each other.
                return $name === 'dt' || $name === 'dd';

            case 'rp':
            case 'rt':
                // RP and RT close each other.
                return $name === 'rp' || $name === 'rt';

            default:
                return false;
        }
    }
104
105
    /**
106
     * Will parse this element.
107
     *
108
     * @param string $html
109
     *
110
     * @return string Remaining HTML.
111
     */
112 55
    public function parse(string $html) : string
    {
        $html = ltrim($html);

        // Record where this token starts.
        $location = HtmlTokenizer::getPosition($html);
        $this->line = $location['line'];
        $this->position = $location['position'];

        $this->name = $this->parseElementName($html);

        // Skip the opening bracket and the name, then consume attributes one
        // at a time until the tag end ("…>" or "…/>") is next, or until no
        // closing bracket is left at all.
        $rest = mb_substr($html, mb_strlen($this->name) + 1);
        while (true) {
            if (mb_strpos($rest, '>') === false) {
                break;
            }
            if (preg_match("/^\s*[\/]?>/", $rest) !== 0) {
                break;
            }
            $rest = $this->parseAttribute($rest);
        }

        // Without a closing bracket the element is invalid.
        $bracketPos = mb_strpos($rest, '>');
        if ($bracketPos === false) {
            if ($this->getThrowOnError()) {
                throw new ParseException('Invalid element: missing closing bracket.');
            }

            return '';
        }

        // The tag is self-closing when "/>" ends exactly at the closing bracket.
        $slashPos = mb_strpos($rest, '/>');
        $isSelfClosing = $slashPos !== false && $slashPos === $bracketPos - 1;
        $afterTag = mb_substr($rest, $bracketPos + 1);

        // Elements that never have contents are treated as closed even when
        // they were written without the trailing slash.
        $closedOnlyElements = [
            'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
            'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
        ];

        // Self-closing or closed-only: nothing to descend into.
        // (Note: $this->value is unchanged.)
        if ($isSelfClosing || in_array($this->name, $closedOnlyElements, true)) {
            return $afterTag;
        }

        // Open element: parse whatever follows as its contents.
        return $this->parseContents($afterTag);
    }
172
173
    /**
174
     * Will parse attributes.
175
     *
176
     * @param string $html
177
     *
178
     * @return string Remaining HTML.
179
     */
180 26
    private function parseAttribute(string $html) : string
    {
        $remainingHtml = ltrim($html);

        // Find the first attribute name (optionally namespace-prefixed) and
        // its byte offset, so the name can be skipped without searching for
        // it a second time.
        $nameFound = preg_match(
            '/(?:[a-z0-9\-_]+:)?[a-z0-9\-_]+/i',
            $remainingHtml,
            $nameMatches,
            PREG_OFFSET_CAPTURE
        );
        if ($nameFound !== 1) {
            // No attribute name left. Nothing is recorded; the empty
            // remainder makes parse() report the missing closing bracket.
            return '';
        }

        $name = $nameMatches[0][0];
        $remainingHtml = (string) substr($remainingHtml, $nameMatches[0][1] + strlen($name));

        if (preg_match('/^\s*=\s*/', $remainingHtml) === 0) {
            // Valueless attribute.
            $this->attributes[$name] = true;

            return $remainingHtml;
        }

        // Drop the assignment: any whitespace (not only spaces) and "=" in
        // front of the value, matching what the check above accepted.
        $remainingHtml = ltrim($remainingHtml, " \t\n\r\x0B\x0C=");
        $firstChar = $remainingHtml !== '' ? $remainingHtml[0] : '';

        if ($firstChar === "'" || $firstChar === '"') {
            // Quote enclosed attribute value; a quote preceded by a backslash
            // does not terminate the value.
            $valueFound = preg_match(
                '/^' . $firstChar . '(.*?(?<!\\\\))' . $firstChar . '/s',
                $remainingHtml,
                $valueMatches
            );
            if ($valueFound !== 1) {
                if ($this->getThrowOnError()) {
                    throw new ParseException('Invalid value encapsulation.');
                }

                return '';
            }

            $this->attributes[$name] = $valueMatches[1];

            // Continue right after the closing quote. Measuring the match
            // (instead of searching for the value text, or assuming the
            // attribute is exactly name="") keeps the closing bracket intact.
            $remainingHtml = (string) substr($remainingHtml, strlen($valueMatches[0]));

            // As before, stray quotes, slashes and spaces that directly
            // follow the value are discarded.
            return ltrim($remainingHtml, '\'"/ ');
        }

        // No quotes enclosing the attribute value: it runs up to the next
        // whitespace or ">", and never ends in "/" (so "/>" is not absorbed
        // into the value). It may be empty, e.g. in "href=>".
        preg_match('/^[^>\s]*(?<!\/)/', $remainingHtml, $valueMatches);
        $value = $valueMatches[0];
        $this->attributes[$name] = $value;

        // Consume exactly the value, then whitespace, then (as before) any
        // stray quotes, slashes and spaces.
        $remainingHtml = ltrim((string) substr($remainingHtml, strlen($value)));

        return ltrim($remainingHtml, '\'"/ ');
    }
244
245
    /**
246
     * Will parse the contents of this element.
247
     *
248
     * @param string $html
249
     *
250
     * @return string Remaining HTML.
251
     */
252 44
    private function parseContents(string $html) : string
    {
        // Only whitespace left: the element has no contents.
        if (trim($html) === '') {
            return '';
        }

        // Pattern fragment for this element's closing tag, e.g. "</ div >".
        $closingTag = "<\/\s*" . $this->name . "\s*>";

        // Determine value: the text captured in front of a closing tag when
        // the pattern matches, otherwise everything that is left.
        $foundClosingTag = preg_match("/(.*)" . $closingTag . "/iU", $html, $valueMatches) === 1;
        $this->value = $foundClosingTag ? $valueMatches[1] : $html;

        // Don't parse contents of "iframe" element.
        if ($this->name === 'iframe') {
            return $this->parseNoContents('iframe', $html);
        }

        // Only TEXT inside "script" and "style" elements.
        if (in_array($this->name, ['script', 'style'], true)) {
            return $this->parseForeignContents($this->name, $html);
        }

        // Parse contents one token at a time until this element's closing
        // tag is the next thing in the input.
        $remainingHtml = $html;
        $closingTagAtStart = "/^" . $closingTag . "/is";
        while (preg_match($closingTagAtStart, $remainingHtml) === 0) {
            $token = TokenFactory::buildFromHtml(
                $remainingHtml,
                $this,
                $this->getThrowOnError()
            );

            // Stop when no token could be built ...
            if (!$token instanceof Token) {
                return $remainingHtml;
            }

            // ... or when the upcoming token implies this element is closed;
            // the unconsumed HTML is handed back to the caller.
            if ($token->isClosingElementImplied($remainingHtml)) {
                return $remainingHtml;
            }

            $remainingHtml = $token->parse($remainingHtml);
            $this->children[] = $token;
        }

        // Remove last token if contains only whitespace.
        $lastChild = end($this->children);
        if ($lastChild !== false && $lastChild->isText() && trim($lastChild->getValue()) === '') {
            array_pop($this->children);
        }

        // Remove remaining closing tag.
        return mb_substr($remainingHtml, mb_strpos($remainingHtml, '>') + 1);
    }
310
311
    /**
312
     * Will get the element name from the html string.
313
     *
314
     * @param string $html
315
     *
316
     * @return string The element name.
317
     */
318 69
    private function parseElementName(string $html) : string
    {
        $html = trim($html);

        // An element starts with "<" followed by a name that may carry a
        // namespace prefix ("prefix:name"); group 2 is the name without "<".
        $matched = preg_match(
            "/^(<(([a-z0-9\-]+:)?[a-z0-9\-]+))/i",
            $html,
            $parts
        );

        if ($matched === 1) {
            // Names are reported in lower case.
            return mb_strtolower($parts[2]);
        }

        // No valid name: either raise, or report an empty name when errors
        // are not to be thrown.
        if ($this->getThrowOnError()) {
            throw new ParseException('Invalid element name. Truncated html = ' . mb_substr($html, 0, 20));
        }

        return '';
    }
336
337
    /**
338
     * Will parse the script and style contents correctly.
339
     *
340
     * @param string $tag
341
     * @param string $html
342
     *
343
     * @return string The remaining HTML.
344
     */
345 7
    private function parseForeignContents(string $tag, string $html) : string
    {
        // The contents are handled as one piece of text, so leading
        // whitespace is dropped up front.
        $remainingHtml = ltrim($html);

        // Find the first closing tag for $tag (case-insensitive, whitespace
        // allowed inside the tag).
        $matchingResult = preg_match(
            "/(<\/\s*" . $tag . "\s*>)/i",
            $html,
            $endOfScriptMatches
        );
        if ($matchingResult === 0) {
            // No closing tag: everything that is left belongs to this element.
            $this->value = trim($remainingHtml);
            $remainingHtml = '';
        } else {
            // The value is the text in front of the closing tag; the
            // remainder starts right after the closing tag.
            $closingTag = $endOfScriptMatches[1];
            $this->value = trim(
                mb_substr($remainingHtml, 0, mb_strpos($remainingHtml, $closingTag))
            );
            $remainingHtml = mb_substr(
                mb_strstr($remainingHtml, $closingTag),
                mb_strlen($closingTag)
            );
        }

        // Handle no contents: no text child is created for an empty value.
        if ($this->value == '') {
            return $remainingHtml;
        }

        // The whole value becomes a single text child of this element.
        $this->children[] = new Text($this, $this->getThrowOnError(), $this->value);

        return $remainingHtml;
    }
382
383
    /**
384
     * Will not parse the contents of an element.
385
     *
386
     * "iframe" elements.
387
     *
388
     * @param string $tag
389
     * @param string $html
390
     *
391
     * @return string The remaining HTML.
392
     */
393 3
    private function parseNoContents(string $tag, string $html) : string
    {
        $remainingHtml = ltrim($html);

        // Find the first closing tag for $tag (case-insensitive, whitespace
        // allowed inside the tag).
        $matchingResult = preg_match(
            "/(<\/\s*" . $tag . "\s*>)/i",
            $html,
            $endOfScriptMatches
        );
        if ($matchingResult === 0) {
            // No closing tag: nothing is left to hand back to the caller.
            return '';
        }

        $closingTag = $endOfScriptMatches[1];

        // The value is the raw text in front of the closing tag. The offset
        // must be measured in the same (left-trimmed) string it is applied
        // to; measuring it in $html shifted the cut by the amount of leading
        // whitespace and pulled part of the closing tag into the value.
        $this->value = mb_substr($remainingHtml, 0, mb_strpos($remainingHtml, $closingTag));

        // Continue right after the closing tag.
        return mb_substr(
            mb_strstr($remainingHtml, $closingTag),
            mb_strlen($closingTag)
        );
    }
413
414
    /**
415
     * Getter for 'attributes'.
416
     *
417
     * @return array
418
     */
419 1
    public function getAttributes() : array
    {
        // Attributes are keyed by attribute name; valueless attributes are
        // stored with the value true.
        $attributes = $this->attributes;

        return $attributes;
    }
423
424
    /**
425
     * @return boolean
426
     */
427 1
    public function hasAttributes() : bool
    {
        // True as soon as at least one attribute has been recorded.
        return count($this->attributes) > 0;
    }
431
432
    /**
433
     * Getter for 'children'.
434
     *
435
     * @return array
436
     */
437 1
    public function getChildren() : array
    {
        // Child tokens in the order in which they were parsed.
        $children = $this->children;

        return $children;
    }
441
442
    /**
443
     * @return boolean
444
     */
445 1
    public function hasChildren() : bool
    {
        // True as soon as at least one child token has been recorded.
        return count($this->children) > 0;
    }
449
450
    /**
451
     * Getter for 'name'.
452
     *
453
     * @return string
454
     */
455 43
    public function getName() : string
    {
        // The constructor leaves the name null until parse() assigns it.
        // Casting keeps the declared string return type valid for an element
        // that has not been parsed yet (an empty name, the same value
        // parseElementName() reports for an unparsable name) instead of
        // failing with a TypeError.
        return (string) $this->name;
    }
459
460 30
    public function toArray() : array
    {
        // Fixed part of the representation; "attributes" and "children" are
        // only added when there is something to report.
        $result = [
            'type' => 'element',
            'name' => $this->name,
            'line' => $this->getLine(),
            'position' => $this->getPosition(),
        ];

        if (count($this->attributes) > 0) {
            // Name => value map, copied as is.
            $result['attributes'] = $this->attributes;
        }

        if (count($this->children) > 0) {
            // Each child contributes its own array form, in order.
            $result['children'] = array_map(
                static function ($child) {
                    return $child->toArray();
                },
                array_values($this->children)
            );
        }

        return $result;
    }
485
}
486