Completed
Push — master ( 3a1632...359021 )
by Kevin
02:23
created

Element::getAttributes()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 1
Metric Value
c 1
b 0
f 1
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
cc 1
eloc 2
nc 1
nop 0
crap 1
1
<?php
2
3
namespace Kevintweber\HtmlTokenizer\Tokens;
4
5
use Kevintweber\HtmlTokenizer\Exceptions\ParseException;
6
7
class Element extends AbstractToken
8
{
9
    /** @var array */
10
    private $attributes;
11
12
    /** @var Token[] */
13
    private $children;
14
15
    /** @var string|null Null until the element has been parsed. */
16
    private $name;
17
18 65
    /**
     * Creates an element token that has no name, attributes or children yet.
     *
     * @param Token|null $parent       Token that contains this element, if any.
     * @param boolean    $throwOnError Passed on to the parent constructor along
     *                                 with the Token::ELEMENT type.
     */
    public function __construct(Token $parent = null, $throwOnError = false)
    {
        parent::__construct(Token::ELEMENT, $parent, $throwOnError);

        // The name stays null until an opening tag has been parsed.
        $this->name = null;
        $this->children = array();
        $this->attributes = array();
    }
26
27
    /**
     * Tells whether starting this element implicitly closes its parent element.
     *
     * The decision uses the name of the element that opens $html and the name
     * of the parent element.
     *
     * @param string $html HTML that begins with this element's opening tag.
     *
     * @return boolean True when the parent's closing tag is implied.
     */
    public function isClosingElementImplied($html)
    {
        $parent = $this->getParent();
        if (!($parent instanceof self)) {
            // No parent at all, or a parent that is not an element.
            return false;
        }

        $name = $this->parseElementName($html);
        $parentName = $parent->getName();

        // HEAD: an opening BODY ends it.
        if ($parentName === 'head' && $name === 'body') {
            return true;
        }

        // P: any of these elements ends an open paragraph.
        if ($parentName === 'p') {
            $elementsNotChildrenOfP = array(
                'address', 'article', 'aside', 'blockquote', 'details',
                'div', 'dl', 'fieldset', 'figcaption', 'figure',
                'footer', 'form', 'h1', 'h2', 'h3',
                'h4', 'h5', 'h6', 'header', 'hgroup',
                'hr', 'main', 'menu', 'nav', 'ol',
                'p', 'pre', 'section', 'table', 'ul'
            );
            if (in_array($name, $elementsNotChildrenOfP, true)) {
                return true;
            }
        }

        // LI, DT/DD and RP/RT: an element from the same group ends the parent.
        $mutuallyClosingGroups = array(
            array('li'),
            array('dt', 'dd'),
            array('rp', 'rt')
        );
        foreach ($mutuallyClosingGroups as $group) {
            if (in_array($parentName, $group, true) && in_array($name, $group, true)) {
                return true;
            }
        }

        return false;
    }
103
104
    /**
     * Parses this element: its name, its attributes and, for open elements,
     * its contents.
     *
     * @param string $html HTML that begins (after optional whitespace) with
     *                     this element's opening tag.
     *
     * @return string Remaining HTML; an empty string when the opening tag has
     *                no closing bracket and errors are not thrown.
     *
     * @throws ParseException If throwing is enabled and the opening tag has no
     *                        closing bracket.
     */
    public function parse($html)
    {
        $html = ltrim($html);
        $this->name = $this->parseElementName($html);

        // Skip "<" and the element name, then consume attributes until the
        // tag end ("/>" or ">") is next or no ">" is left at all.
        $tail = mb_substr($html, 1 + mb_strlen($this->name));
        while (mb_strpos($tail, '>') !== false && preg_match("/^\s*[\/]?>/", $tail) === 0) {
            $tail = $this->parseAttribute($tail);
        }

        // Locate the end of the opening tag.
        $bracketAt = mb_strpos($tail, '>');
        if ($bracketAt === false) {
            if ($this->getThrowOnError()) {
                throw new ParseException('Invalid element: missing closing bracket.');
            }

            return '';
        }

        // The tag is self-closing when "/" sits directly before the first ">".
        $isSelfClosing = $bracketAt > 0 && mb_substr($tail, $bracketAt - 1, 1) === '/';
        $tail = mb_substr($tail, $bracketAt + 1);
        if ($isSelfClosing) {
            return $tail;
        }

        // Elements that never have contents are closed here even though the
        // markup left them open.
        $closedOnlyElements = array(
            'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
            'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'
        );
        if (in_array($this->name, $closedOnlyElements, true)) {
            return $tail;
        }

        // Open element: continue with its contents.
        return $this->parseContents($tail);
    }
164
165
    /**
     * Parses the first attribute found in $html and stores it on this element.
     *
     * A valueless attribute is stored as boolean true. Otherwise the value is
     * stored as a string, without its enclosing quotes when it was quoted.
     *
     * @param string $html HTML positioned inside the opening tag, after the
     *                     element name or a previously parsed attribute.
     *
     * @return string Remaining HTML; an empty string when the attribute cannot
     *                be parsed and errors are not thrown.
     *
     * @throws ParseException If throwing is enabled and either no attribute
     *                        name is found or a quoted value is not terminated.
     */
    private function parseAttribute($html)
    {
        $remainingHtml = ltrim($html);

        // Will match the first entire name/value attribute pair.
        $nameMatchSuccessful = preg_match(
            "/((([a-z0-9\-_]+:)?[a-z0-9\-_]+)(\s*=\s*)?)/i",
            $remainingHtml,
            $attributeMatches
        );
        if ($nameMatchSuccessful !== 1) {
            // Without a name nothing can be consumed; returning the input
            // unchanged would keep parse()'s attribute loop from ever ending.
            if ($this->getThrowOnError()) {
                throw new ParseException('Invalid attribute name.');
            }

            return '';
        }

        $name = $attributeMatches[2];
        $remainingHtml = mb_substr(mb_strstr($remainingHtml, $name), mb_strlen($name));
        if (preg_match("/^\s*=\s*/", $remainingHtml) === 0) {
            // Valueless attribute.
            $this->attributes[trim($name)] = true;

            return $remainingHtml;
        }

        // Drop "=" with every kind of whitespace around it, so a quoted value
        // that follows a tab or newline is still recognised as quoted.
        $remainingHtml = ltrim($remainingHtml, " \t\n\r\0\x0B=");
        $quote = $remainingHtml === '' ? '' : $remainingHtml[0];
        $isQuoted = $quote === "'" || $quote === '"';
        if ($isQuoted) {
            // Quote enclosed attribute value; a quote preceded by a backslash
            // does not end the value.
            $valueMatchSuccessful = preg_match(
                "/^" . $quote . "(.*?(?<!\\\))" . $quote . "/s",
                $remainingHtml,
                $valueMatches
            );
            if ($valueMatchSuccessful !== 1) {
                if ($this->getThrowOnError()) {
                    throw new ParseException('Invalid value encapsulation.');
                }

                return '';
            }

            $value = $valueMatches[1];
        } else {
            // No quotes enclosing the attribute value. The pattern can match
            // the empty string, so it always succeeds at offset 0.
            preg_match("/^(\s*([^>\s]*(?<!\/)))/", $remainingHtml, $valueMatches);
            $value = $valueMatches[2];
        }

        $this->attributes[trim($name)] = $value;

        // Both patterns are anchored, so $valueMatches[0] is exactly the text
        // consumed for the value. Cutting by its length avoids searching for
        // the value again, which could hit the "=" or whitespace before it.
        $remainingHtml = mb_substr($remainingHtml, mb_strlen($valueMatches[0]));

        // Whitespace directly after a non-empty quoted value is left in place,
        // which keeps this method's established output for that input.
        if (!$isQuoted || $value === '') {
            $remainingHtml = ltrim($remainingHtml);
        }

        // NOTE(review): this also strips a "/" that directly follows the
        // value, so parse() no longer sees "/>" there; confirm that is intended.
        return ltrim($remainingHtml, '\'"/ ');
    }
236
237
    /**
     * Parses everything between this element's opening and closing tags.
     *
     * "iframe" contents are skipped, "script" and "style" contents are kept as
     * text, and any other contents are tokenized one child at a time.
     *
     * @param string $html HTML that follows this element's opening tag.
     *
     * @return string Remaining HTML.
     */
    private function parseContents($html)
    {
        if (trim($html) === '') {
            return '';
        }

        switch ($this->name) {
            case 'iframe':
                // Contents of an "iframe" are not parsed.
                return $this->parseNoContents('iframe', $html);

            case 'script':
                // Only TEXT inside a "script" element.
                return $this->parseForeignContents('script', $html);

            case 'style':
                // Only TEXT inside a "style" element.
                return $this->parseForeignContents('style', $html);
        }

        // Build child tokens until this element's own closing tag is next.
        $closingTagPattern = "/^<\/\s*" . $this->name . "\s*>/is";
        $remainingHtml = $html;
        while (preg_match($closingTagPattern, $remainingHtml) === 0) {
            $token = TokenFactory::buildFromHtml(
                $remainingHtml,
                $this,
                $this->getThrowOnError()
            );

            // Stop when no token was built or the next element implies that
            // this element is closed; that HTML is handed back unparsed.
            if ($token === false || $token->isClosingElementImplied($remainingHtml)) {
                return $remainingHtml;
            }

            $remainingHtml = $token->parse($remainingHtml);
            $this->children[] = $token;
        }

        // A trailing child that holds only whitespace is dropped.
        if (count($this->children) > 0) {
            list($lastChild) = array_slice($this->children, -1);
            if ($lastChild->isText() && trim($lastChild->getValue()) == '') {
                array_pop($this->children);
            }
        }

        // Cut off the closing tag that ended the loop.
        return mb_substr($remainingHtml, mb_strpos($remainingHtml, '>') + 1);
    }
296
297
    /**
     * Extracts the lower-cased element name from the start of an opening tag.
     *
     * The name may carry a "prefix:" part and must directly follow "<".
     *
     * @param string $html HTML expected to start with "<name".
     *
     * @return string The element name, or an empty string when none is found
     *                and errors are not thrown.
     *
     * @throws ParseException If throwing is enabled and no name is found.
     */
    private function parseElementName($html)
    {
        $matched = preg_match(
            "/^(<(([a-z0-9\-]+:)?[a-z0-9\-]+))/i",
            $html,
            $elementMatches
        );
        if ($matched === 1) {
            // Group 2 is the name without the leading "<".
            return mb_strtolower($elementMatches[2]);
        }

        if ($this->getThrowOnError()) {
            throw new ParseException('Invalid element name.');
        }

        return '';
    }
321
322
    /**
     * Stores the contents of a "script" or "style" element as one text child.
     *
     * Everything up to the first closing tag for $tag becomes the text. When
     * there is no closing tag, all of $html becomes the text. Text that is
     * empty after trimming adds no child.
     *
     * @param string $tag  Element name whose closing tag ends the contents.
     * @param string $html HTML that follows the opening tag.
     *
     * @return string The remaining HTML.
     */
    private function parseForeignContents($tag, $html)
    {
        $remainingHtml = ltrim($html);
        $closingTagFound = preg_match(
            "/(<\/\s*" . $tag . "\s*>)/i",
            $html,
            $closingTagMatches
        );

        if ($closingTagFound !== 0) {
            // Split around the closing tag exactly as it appears in the HTML.
            $closingTag = $closingTagMatches[1];
            $closingTagAt = mb_strpos($remainingHtml, $closingTag);
            $value = trim(mb_substr($remainingHtml, 0, $closingTagAt));
            $remainingHtml = mb_substr(
                $remainingHtml,
                $closingTagAt + mb_strlen($closingTag)
            );
        } else {
            // No closing tag: everything that is left is the contents.
            $value = trim($remainingHtml);
            $remainingHtml = '';
        }

        // Handle no contents.
        if ($value === '') {
            return $remainingHtml;
        }

        $this->children[] = new Text($this, $this->getThrowOnError(), $value);

        return $remainingHtml;
    }
362
363
    /**
     * Skips the contents of an element without creating any child tokens.
     *
     * Used for "iframe" elements.
     *
     * @param string $tag  Element name whose closing tag ends the contents.
     * @param string $html HTML that follows the opening tag.
     *
     * @return string The HTML after the first closing tag for $tag, or an
     *                empty string when there is no such closing tag.
     */
    private function parseNoContents($tag, $html)
    {
        $remainingHtml = ltrim($html);
        $closingTagFound = preg_match(
            "/(<\/\s*" . $tag . "\s*>)/i",
            $html,
            $closingTagMatches
        );
        if ($closingTagFound === 0) {
            return '';
        }

        // Resume right after the closing tag as it appears in the HTML.
        $closingTag = $closingTagMatches[1];
        $closingTagAt = mb_strpos($remainingHtml, $closingTag);

        return mb_substr($remainingHtml, $closingTagAt + mb_strlen($closingTag));
    }
392
393
    /**
     * Returns the parsed attributes keyed by attribute name.
     *
     * Valueless attributes have the value true; all others hold a string.
     *
     * @return array
     */
    public function getAttributes()
    {
        return $this->attributes;
    }
402
403
    /**
     * Tells whether at least one attribute has been stored on this element.
     *
     * @return boolean
     */
    public function hasAttributes()
    {
        return count($this->attributes) > 0;
    }
410
411
    /**
     * Returns the child tokens in the order they were added.
     *
     * @return array
     */
    public function getChildren()
    {
        return $this->children;
    }
420
421
    /**
     * Tells whether this element holds at least one child token.
     *
     * @return boolean
     */
    public function hasChildren()
    {
        return count($this->children) > 0;
    }
428
429
    /**
     * Returns the element name.
     *
     * @return string|null Null as long as the element has not been parsed.
     */
    public function getName()
    {
        return $this->name;
    }
438
439 30
    /**
     * Converts this element and its children into a nested array.
     *
     * The result always has the keys 'type' and 'name'. The keys 'attributes'
     * and 'children' are present only when the element has any.
     *
     * @return array
     */
    public function toArray()
    {
        $result = array(
            'type' => 'element',
            'name' => $this->name
        );

        if (count($this->attributes) > 0) {
            $result['attributes'] = $this->attributes;
        }

        if (count($this->children) > 0) {
            $result['children'] = array();
            foreach ($this->children as $childToken) {
                $result['children'][] = $childToken->toArray();
            }
        }

        return $result;
    }
462
}
463