Completed
Push — master ( f4f956...ee0d4f )
by Kevin
02:18
created

Element::parseNoContents()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 20
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 13
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 20
ccs 13
cts 13
cp 1
rs 9.4285
cc 2
eloc 12
nc 2
nop 2
crap 2
1
<?php
2
3
namespace Kevintweber\HtmlTokenizer\Tokens;
4
5
use Kevintweber\HtmlTokenizer\Exceptions\ParseException;
6
7
/**
 * Token for an HTML element: its name, its attributes and its child tokens.
 *
 * An instance is populated by parse(), which consumes the element's opening
 * tag (and, for open elements, its contents and closing tag) from the front
 * of an HTML string and hands back whatever HTML is left.
 */
class Element extends AbstractToken
{
    /** @var array Attribute values keyed by attribute name; valueless attributes are stored as true. */
    private $attributes;

    /** @var Token[] Child tokens, in the order they were parsed. */
    private $children;

    /** @var string|null Lower-cased element name; null until parse() has been called. */
    private $name;

    /**
     * Constructor.
     *
     * @param Token|null $parent       The enclosing token, if any.
     * @param boolean    $throwOnError Passed to the parent constructor; the parsing
     *                                 methods consult getThrowOnError() to decide between
     *                                 throwing a ParseException and recovering silently.
     */
    public function __construct(Token $parent = null, $throwOnError = false)
    {
        parent::__construct(Token::ELEMENT, $parent, $throwOnError);

        $this->attributes = array();
        $this->children = array();
        $this->name = null;
    }

    /**
     * Does the parent have an implied closing tag?
     *
     * Answers whether the element starting at $html, when met inside this
     * token's parent, implicitly closes that parent (e.g. a second <li>
     * inside an <li>).
     *
     * @param string $html HTML beginning with this element's opening tag.
     *
     * @return boolean
     *
     * @throws ParseException If the element name is invalid and errors are thrown.
     */
    public function isClosingElementImplied($html)
    {
        $parent = $this->getParent();
        if (!($parent instanceof self)) {
            // Covers both "no parent" and "parent is not an element".
            return false;
        }

        $name = $this->parseElementName($html);
        $parentName = $parent->getName();

        // Parent element name => names of child elements whose start tag
        // implies that the parent has ended.
        $impliedClosers = array(
            // HEAD: no closing tag.
            'head' => array('body'),
            // P
            'p' => array(
                'address',
                'article',
                'aside',
                'blockquote',
                'details',
                'div',
                'dl',
                'fieldset',
                'figcaption',
                'figure',
                'footer',
                'form',
                'h1',
                'h2',
                'h3',
                'h4',
                'h5',
                'h6',
                'header',
                'hgroup',
                'hr',
                'main',
                'menu',
                'nav',
                'ol',
                'p',
                'pre',
                'section',
                'table',
                'ul'
            ),
            // LI
            'li' => array('li'),
            // DT and DD
            'dt' => array('dt', 'dd'),
            'dd' => array('dt', 'dd'),
            // RP and RT
            'rp' => array('rp', 'rt'),
            'rt' => array('rp', 'rt')
        );

        // The parent's name is null when it has not been parsed yet.
        return is_string($parentName)
            && isset($impliedClosers[$parentName])
            && in_array($name, $impliedClosers[$parentName], true);
    }

    /**
     * Will parse this element.
     *
     * Reads the element name and attributes from the opening tag. Self-closing
     * and void elements stop there; any other element goes on to parse its
     * contents.
     *
     * @param string $html HTML beginning with this element's opening tag.
     *
     * @return string Remaining HTML.
     *
     * @throws ParseException If the tag is malformed and errors are thrown.
     */
    public function parse($html)
    {
        $this->name = $this->parseElementName($html);

        // Parse attributes. Skip the leading "<" plus the element name, then
        // consume one attribute per iteration until the end of the tag is next.
        $remainingHtml = mb_substr($html, mb_strlen($this->name) + 1);
        while (mb_strpos($remainingHtml, '>') !== false && preg_match("/^\s*[\/]?>/", $remainingHtml) === 0) {
            $remainingHtml = $this->parseAttribute(trim($remainingHtml));
        }

        // Find position of end of tag.
        $posOfClosingBracket = mb_strpos($remainingHtml, '>');
        if ($posOfClosingBracket === false) {
            if ($this->getThrowOnError()) {
                throw new ParseException('Invalid element: missing closing bracket.');
            }

            return '';
        }

        // Is self-closing? Only when "/>" ends exactly at the closing bracket.
        $posOfSelfClosingBracket = mb_strpos($remainingHtml, '/>');
        $isSelfClosing = $posOfSelfClosingBracket !== false
            && $posOfSelfClosingBracket === $posOfClosingBracket - 1;

        $remainingHtml = trim(mb_substr($remainingHtml, $posOfClosingBracket + 1));
        if ($isSelfClosing) {
            return $remainingHtml;
        }

        // Lets close those closed-only elements that are left open. These are
        // the HTML void elements: they never have contents or an end tag, so
        // treating them as open would swallow their following siblings.
        $closedOnlyElements = array(
            'area',
            'base',
            'br',
            'col',
            'embed',
            'hr',
            'img',
            'input',
            'link',
            'meta',
            'param',
            'source',
            'track',
            'wbr'
        );
        if (in_array($this->name, $closedOnlyElements, true)) {
            return $remainingHtml;
        }

        // Open element.
        return $this->parseContents($remainingHtml);
    }

    /**
     * Will parse attributes.
     *
     * Consumes the first attribute (name, and value if present) found in
     * $html and records it on this element.
     *
     * @param string $html HTML positioned inside an opening tag.
     *
     * @return string Remaining HTML; an empty string if the attribute could
     *                not be parsed and errors are not thrown.
     *
     * @throws ParseException If the attribute is malformed and errors are thrown.
     */
    private function parseAttribute($html)
    {
        $remainingHtml = trim($html);

        // Will match the first entire name/value attribute pair.
        $attributeMatchSuccessful = preg_match(
            "/((([a-z0-9\-_]+:)?[a-z0-9\-_]+)(\s*=\s*)?)/i",
            $remainingHtml,
            $attributeMatches
        );
        if ($attributeMatchSuccessful !== 1) {
            // Nothing that looks like an attribute name: bail out rather than
            // hand the same text back to the caller, which would loop on it.
            if ($this->getThrowOnError()) {
                throw new ParseException('Invalid attribute.');
            }

            return '';
        }

        // Drop everything up to and including the attribute name.
        $name = $attributeMatches[2];
        $remainingHtml = mb_substr(mb_strstr($remainingHtml, $name), mb_strlen($name));
        if (preg_match("/^\s*=\s*/", $remainingHtml) === 0) {
            // Valueless attribute.
            $this->attributes[$name] = true;

            return $remainingHtml;
        }

        // Strip the "=" together with ANY surrounding whitespace (the check
        // above accepts tabs and newlines too), so the value starts at offset 0.
        $remainingHtml = ltrim($remainingHtml, " \t\n\r\0\x0B=");

        // mb_substr() instead of $remainingHtml[0]: the string may be empty.
        $quote = mb_substr($remainingHtml, 0, 1);
        if ($quote === "'" || $quote === '"') {
            // Quote enclosed attribute value. The lookbehind skips closing
            // quotes that are escaped with a backslash.
            $valueMatchSuccessful = preg_match(
                "/" . $quote . "(.*?(?<!\\\))" . $quote . "/s",
                $remainingHtml,
                $valueMatches
            );
            if ($valueMatchSuccessful !== 1) {
                if ($this->getThrowOnError()) {
                    throw new ParseException('Invalid value encapsulation.');
                }

                return '';
            }

            $value = $valueMatches[1];
        } else {
            // No quotes enclosing the attribute value. The lookbehind keeps a
            // trailing "/" (as in "/>") out of the value. This pattern can
            // match the empty string, so it always succeeds at offset 0.
            preg_match("/(\s*([^>\s]*(?<!\/)))/", $remainingHtml, $valueMatches);
            $value = $valueMatches[2];
        }

        $this->attributes[$name] = $value;

        // Determine remaining html: skip exactly the text the value pattern
        // consumed. (Searching $html for $value instead would find a value
        // that also occurs inside the attribute name, and cannot locate an
        // empty value at all.)
        $remainingHtml = trim(mb_substr($remainingHtml, mb_strlen($valueMatches[0])));

        // NOTE(review): this also strips a "/" that directly follows the
        // value, so parse() does not see "/>" after an attribute value.
        // Kept as-is for backward compatibility.
        return ltrim($remainingHtml, '\'"/ ');
    }

    /**
     * Will parse the contents of this element.
     *
     * Builds child tokens one at a time until this element's closing tag, an
     * element that implies the closing tag, or unparseable HTML is reached.
     *
     * @param string $html HTML following this element's opening tag.
     *
     * @return string Remaining HTML.
     */
    private function parseContents($html)
    {
        $remainingHtml = trim($html);
        if ($remainingHtml === '') {
            return '';
        }

        // Don't parse contents of "iframe" element.
        if ($this->name === 'iframe') {
            return $this->parseNoContents('iframe', $remainingHtml);
        }

        // Only TEXT inside a "script" or "style" element.
        if ($this->name === 'script' || $this->name === 'style') {
            return $this->parseForeignContents($this->name, $remainingHtml);
        }

        // Parse contents one token at a time.
        $closingTagPattern = "/^<\/\s*" . preg_quote($this->name, '/') . "\s*>/is";
        while (preg_match($closingTagPattern, $remainingHtml) === 0) {
            $token = TokenFactory::buildFromHtml(
                $remainingHtml,
                $this,
                $this->getThrowOnError()
            );

            // Stop without consuming anything when no token can be built, or
            // when the next element implicitly closes this one.
            if ($token === false || $token->isClosingElementImplied($remainingHtml)) {
                return $remainingHtml;
            }

            $remainingHtml = trim($token->parse($remainingHtml));
            $this->children[] = $token;
        }

        // Remove remaining closing tag.
        return mb_substr($remainingHtml, mb_strpos($remainingHtml, '>') + 1);
    }

    /**
     * Will get the element name from the html string.
     *
     * @param string $html HTML beginning with "<" and the element name
     *                     (optionally namespace-prefixed, e.g. "svg:rect").
     *
     * @return string The lower-cased element name; an empty string if the
     *                name is invalid and errors are not thrown.
     *
     * @throws ParseException If the name is invalid and errors are thrown.
     */
    private function parseElementName($html)
    {
        $elementMatchSuccessful = preg_match(
            "/^(<(([a-z0-9\-]+:)?[a-z0-9\-]+))/i",
            $html,
            $elementMatches
        );
        if ($elementMatchSuccessful === 1) {
            return mb_strtolower($elementMatches[2]);
        }

        if ($this->getThrowOnError()) {
            throw new ParseException('Invalid element name.');
        }

        return '';
    }

    /**
     * Will parse the script and style contents correctly.
     *
     * Everything up to the first closing tag of $tag becomes a single Text
     * child. Without a closing tag, all of the given HTML is treated as the
     * contents.
     *
     * @param string $tag  Element name whose closing tag ends the contents.
     * @param string $html HTML following the element's opening tag.
     *
     * @return string The remaining HTML.
     */
    private function parseForeignContents($tag, $html)
    {
        $remainingHtml = trim($html);

        $parts = $this->splitAtClosingTag($tag, $remainingHtml);
        if ($parts === null) {
            $value = $remainingHtml;
            $remainingHtml = '';
        } else {
            $value = trim($parts[0]);
            $remainingHtml = $parts[1];
        }

        // Handle no contents: only add a child when there is text to hold.
        if ($value !== '') {
            $this->children[] = new Text($this, $this->getThrowOnError(), $value);
        }

        return $remainingHtml;
    }

    /**
     * Will not parse the contents of an element.
     *
     * "iframe" elements.
     *
     * The contents up to the first closing tag of $tag are discarded.
     *
     * @param string $tag  Element name whose closing tag ends the contents.
     * @param string $html HTML following the element's opening tag.
     *
     * @return string The remaining HTML; an empty string if there is no closing tag.
     */
    private function parseNoContents($tag, $html)
    {
        $parts = $this->splitAtClosingTag($tag, trim($html));

        return $parts === null ? '' : $parts[1];
    }

    /**
     * Will split the html at the first closing tag of the given element.
     *
     * The closing tag is matched case-insensitively and may contain
     * whitespace, e.g. "</SCRIPT >".
     *
     * @param string $tag  Element name.
     * @param string $html HTML to search.
     *
     * @return array|null The text before the closing tag and the text after
     *                    it, or null if there is no closing tag.
     */
    private function splitAtClosingTag($tag, $html)
    {
        $parts = preg_split("/<\/\s*" . preg_quote($tag, '/') . "\s*>/i", $html, 2);
        if (!is_array($parts) || count($parts) < 2) {
            return null;
        }

        return $parts;
    }

    /**
     * Getter for 'attributes'.
     *
     * @return array Attribute values keyed by name; valueless attributes are true.
     */
    public function getAttributes()
    {
        return $this->attributes;
    }

    /**
     * @return boolean True if at least one attribute has been parsed.
     */
    public function hasAttributes()
    {
        return !empty($this->attributes);
    }

    /**
     * Getter for 'children'.
     *
     * @return Token[] Child tokens, in the order they were parsed.
     */
    public function getChildren()
    {
        return $this->children;
    }

    /**
     * @return boolean True if at least one child token has been parsed.
     */
    public function hasChildren()
    {
        return !empty($this->children);
    }

    /**
     * Getter for 'name'.
     *
     * @return string|null The lower-cased element name, or null before parse().
     */
    public function getName()
    {
        return $this->name;
    }

    /**
     * Will convert this element, and recursively its children, to an array.
     *
     * @return array Always has 'type' and 'name'; 'attributes' and 'children'
     *               are present only when non-empty.
     */
    public function toArray()
    {
        $result = array(
            'type' => 'element',
            'name' => $this->name
        );

        if (!empty($this->attributes)) {
            $result['attributes'] = $this->attributes;
        }

        if (!empty($this->children)) {
            $result['children'] = array();
            foreach ($this->children as $child) {
                $result['children'][] = $child->toArray();
            }
        }

        return $result;
    }
}
434