Completed
Push — master ( a6e461...7a71e4 )
by Kevin
02:10
created

Element::parse()   C

Complexity

Conditions 8
Paths 10

Size

Total Lines 43
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 23
CRAP Score 8

Importance

Changes 7
Bugs 3 Features 5
Metric Value
c 7
b 3
f 5
dl 0
loc 43
ccs 23
cts 23
cp 1
rs 5.3846
cc 8
eloc 23
nc 10
nop 1
crap 8
1
<?php
2
3
namespace Kevintweber\HtmlTokenizer\Tokens;
4
5
use Kevintweber\HtmlTokenizer\Exceptions\ParseException;
6
7
class Element extends AbstractToken
8
{
9
    /** @var array<string, string|bool> Attribute values keyed by attribute name; valueless attributes are stored as true. */
10
    private $attributes;
11
12
    /** @var Token[] Child tokens, in the order they were parsed. */
13
    private $children;
14
15
    /** @var string|null Lower-cased element name; null until parse() has run. */
16
    private $name;
17
18 51
    /**
     * Creates an element token with no name, no attributes and no children.
     *
     * @param Token|null $parent       Token that contains this element, if any.
     * @param boolean    $throwOnError Passed through to the parent constructor.
     */
    public function __construct(Token $parent = null, $throwOnError = false)
    {
        parent::__construct(Token::ELEMENT, $parent, $throwOnError);

        // Nothing is known about the element until parse() fills these in.
        $this->name = null;
        $this->children = array();
        $this->attributes = array();
    }
26
27
    /**
     * Tells whether the element that starts at $html implicitly closes its parent.
     *
     * The answer can only be true when the parent token is itself an element.
     * Handled cases: BODY following HEAD, one of a fixed list of elements
     * following P, LI following LI, DT/DD following DT/DD and RP/RT following
     * RP/RT.
     *
     * @param string $html HTML from which this element's name is read.
     *
     * @return boolean
     */
    public function isClosingElementImplied($html)
    {
        $parentToken = $this->getParent();
        if (!($parentToken instanceof self)) {
            // No parent, or a parent that is not an element.
            return false;
        }

        $name = $this->parseElementName($html);
        $parentName = $parentToken->getName();

        // HEAD: no closing tag.
        if ($parentName === 'head' && $name === 'body') {
            return true;
        }

        // P: an open paragraph ends as soon as one of these elements starts.
        if ($parentName === 'p') {
            $elementsNotChildrenOfP = array(
                'address',
                'article',
                'aside',
                'blockquote',
                'details',
                'div',
                'dl',
                'fieldset',
                'figcaption',
                'figure',
                'footer',
                'form',
                'h1',
                'h2',
                'h3',
                'h4',
                'h5',
                'h6',
                'header',
                'hgroup',
                'hr',
                'main',
                'menu',
                'nav',
                'ol',
                'p',
                'pre',
                'section',
                'table',
                'ul'
            );
            if (in_array($name, $elementsNotChildrenOfP)) {
                return true;
            }
        }

        // LI, DT/DD and RP/RT: a member of a group closes an open member of
        // the same group.
        $siblingGroups = array(
            array('li'),
            array('dt', 'dd'),
            array('rp', 'rt')
        );
        foreach ($siblingGroups as $group) {
            if (in_array($parentName, $group) && in_array($name, $group)) {
                return true;
            }
        }

        return false;
    }
103
104
    /**
     * Will parse this element.
     *
     * Reads the element name, then its attributes, then decides how the element
     * ends: explicitly self-closed ("/>"), a void element that never has
     * contents, or an open element whose contents are parsed recursively.
     *
     * @param string $html HTML starting at this element's opening "<".
     *
     * @return string Remaining HTML.
     *
     * @throws ParseException If the opening tag has no closing bracket and
     *                        getThrowOnError() is true. Exceptions thrown by
     *                        the helper methods called here also propagate.
     */
    public function parse($html)
    {
        $this->name = $this->parseElementName($html);

        // Parse attributes. Skip "<" plus the name, then consume one attribute
        // per iteration until the remainder starts with ">" or "/>", or until
        // no ">" is left at all.
        $remainingHtml = substr($html, strlen($this->name) + 1);
        while (strpos($remainingHtml, '>') !== false && preg_match("/^\s*[\/]?>/", $remainingHtml) === 0) {
            $remainingHtml = $this->parseAttribute(trim($remainingHtml));
        }

        // Find position of end of tag.
        $posOfClosingBracket = strpos($remainingHtml, '>');
        if ($posOfClosingBracket === false) {
            if ($this->getThrowOnError()) {
                throw new ParseException('Invalid element: missing closing bracket.');
            }

            return '';
        }

        // Is self-closing? The "/>" must be the one that ends this very tag,
        // i.e. sit directly in front of the first ">".
        $posOfSelfClosingBracket = strpos($remainingHtml, '/>');
        $remainingHtml = trim(substr($remainingHtml, $posOfClosingBracket + 1));
        if ($posOfSelfClosingBracket !== false && $posOfSelfClosingBracket == $posOfClosingBracket - 1) {
            // Self-closing element.
            return $remainingHtml;
        }

        // Void elements never have contents or a closing tag, so they are
        // closed here even when written without "/>". The list follows the
        // HTML Standard, plus "keygen" and "param", which HTML parsers still
        // treat the same way. Without the full list an unclosed <img> or
        // <input> would swallow everything after it as children.
        $voidElements = array(
            'area',
            'base',
            'br',
            'col',
            'embed',
            'hr',
            'img',
            'input',
            'keygen',
            'link',
            'meta',
            'param',
            'source',
            'track',
            'wbr'
        );
        if (in_array($this->name, $voidElements, true)) {
            return $remainingHtml;
        }

        // Open element.
        return $this->parseContents($remainingHtml);
    }
154
155
    /**
     * Will parse a single attribute and store it in $this->attributes.
     *
     * A valueless attribute is stored as true; otherwise the (quoted or
     * unquoted) value is stored as a string.
     *
     * @param string $html HTML starting at (or just before) an attribute name.
     *
     * @return string Remaining HTML. An empty string is returned when the
     *                attribute is malformed and getThrowOnError() is false.
     *
     * @throws ParseException If no attribute name can be found, or a quoted
     *                        value is never closed, and getThrowOnError() is
     *                        true.
     */
    private function parseAttribute($html)
    {
        $remainingHtml = trim($html);

        // Will match the first attribute name (optionally namespace-prefixed).
        $nameMatchSuccessful = preg_match(
            "/((([a-z0-9\-_]+:)?[a-z0-9\-_]+)(\s*=\s*)?)/i",
            $remainingHtml,
            $attributeMatches
        );
        if ($nameMatchSuccessful !== 1) {
            // Nothing that looks like an attribute name is left (for example a
            // stray quote). Carrying on would consume no input, so the caller's
            // loop could never finish.
            if ($this->getThrowOnError()) {
                throw new ParseException('Invalid attribute name.');
            }

            return '';
        }

        $name = $attributeMatches[2];
        $remainingHtml = substr(strstr($remainingHtml, $name), strlen($name));
        if (preg_match("/^\s*=\s*/", $remainingHtml) === 0) {
            // Valueless attribute.
            $this->attributes[trim($name)] = true;

            return $remainingHtml;
        }

        // Drop the "=" and ALL whitespace around it (not just spaces), so that
        // a quoted value after a tab or newline is still recognised as quoted.
        $remainingHtml = ltrim($remainingHtml, " \t\n\r\0\x0B=");

        // substr() rather than [0] so an empty remainder cannot raise an
        // offset error.
        $quote = substr($remainingHtml, 0, 1);
        if ($quote === "'" || $quote === '"') {
            // Quote enclosed attribute value; a quote preceded by a backslash
            // does not end the value.
            $valueMatchSuccessful = preg_match(
                '/^' . $quote . '(.*?(?<!\\\\))' . $quote . '/s',
                $remainingHtml,
                $valueMatches
            );
            if ($valueMatchSuccessful !== 1) {
                if ($this->getThrowOnError()) {
                    throw new ParseException('Invalid value encapsulation.');
                }

                return '';
            }

            $value = $valueMatches[1];
        } else {
            // No quotes enclosing the attribute value: it runs up to the next
            // whitespace or ">", but never ends in "/". This pattern can match
            // an empty value, so it always succeeds.
            preg_match("/^(\s*([^>\s]*(?<!\/)))/", $remainingHtml, $valueMatches);
            $value = $valueMatches[2];
        }

        $this->attributes[trim($name)] = $value;

        // Determine remaining html by skipping exactly what the value pattern
        // consumed. Searching for the value text instead would pick the wrong
        // offset when the value also occurs in the attribute name
        // (class="a"), and would consume nothing for an empty value (alt="").
        $remainingHtml = trim(substr($remainingHtml, strlen($valueMatches[0])));

        // Leading quotes, slashes and spaces are discarded, so a "/>" that
        // directly follows a value reaches the caller as ">".
        return ltrim($remainingHtml, '\'"/ ');
    }
214
215
    /**
     * Will parse the contents of this element, one child token at a time.
     *
     * Parsing stops at this element's closing tag, when no further token can
     * be built, or when the next token implies that this element is closed.
     *
     * @param string $html HTML following this element's opening tag.
     *
     * @return string Remaining HTML.
     */
    private function parseContents($html)
    {
        $rest = trim($html);
        if ($rest == '') {
            return '';
        }

        // Nothing to parse inside a script tag.
        if ($this->name == 'script') {
            return $this->parseScriptContents($rest);
        }

        // Keep building child tokens until our own closing tag is next.
        $closingTagPattern = "/^<\/\s*" . $this->name . "\s*>/is";
        while (preg_match($closingTagPattern, $rest) === 0) {
            $child = TokenFactory::buildFromHtml(
                $rest,
                $this,
                $this->getThrowOnError()
            );

            // Hand the untouched remainder back when there is nothing to build
            // or when the next token implicitly closes this element.
            if ($child === false || $child->isClosingElementImplied($rest)) {
                return $rest;
            }

            $rest = trim($child->parse($rest));
            $this->children[] = $child;
        }

        // Remove remaining closing tag.
        return substr($rest, strpos($rest, '>') + 1);
    }
255
256
    /**
     * Will get the element name from the html string.
     *
     * The name may carry a namespace prefix ("svg:rect") and is returned in
     * lower case.
     *
     * @param string $html HTML expected to start with "<" followed by the name.
     *
     * @return string The element name, or an empty string when none is found
     *                and getThrowOnError() is false.
     *
     * @throws ParseException If no name is found and getThrowOnError() is true.
     */
    private function parseElementName($html)
    {
        $found = preg_match(
            "/^(<(([a-z0-9\-]+:)?[a-z0-9\-]+))/i",
            $html,
            $matches
        );
        if ($found === 1) {
            // Group 2 is the name without the leading "<".
            return strtolower($matches[2]);
        }

        if ($this->getThrowOnError()) {
            throw new ParseException('Invalid element name.');
        }

        return '';
    }
280
281
    /**
     * Will parse the script contents correctly.
     *
     * Everything up to the first closing script tag is kept as one Text child
     * instead of being tokenized. Without a closing tag the whole input is
     * treated as script text.
     *
     * @param string $html HTML following the opening script tag.
     *
     * @return string The remaining HTML.
     */
    private function parseScriptContents($html)
    {
        $trimmedHtml = trim($html);

        if (preg_match("/(<\/\s*script\s*>)/i", $html, $endOfScriptMatches) === 0) {
            // No closing tag: all of it is script text and nothing remains.
            $scriptText = $trimmedHtml;
            $rest = '';
        } else {
            $closingTag = $endOfScriptMatches[1];
            $closingTagOffset = strpos($trimmedHtml, $closingTag);

            $scriptText = trim(substr($trimmedHtml, 0, $closingTagOffset));
            $rest = substr($trimmedHtml, $closingTagOffset + strlen($closingTag));
        }

        // An empty script gets no Text child.
        if ($scriptText != '') {
            $this->children[] = new Text($this, $this->getThrowOnError(), $scriptText);
        }

        return $rest;
    }
317
318
    /**
     * Returns the parsed attributes, keyed by attribute name.
     *
     * Valueless attributes carry the value true.
     *
     * @return array
     */
    public function getAttributes()
    {
        return $this->attributes;
    }
327
328
    /**
     * Tells whether at least one attribute has been parsed.
     *
     * @return boolean
     */
    public function hasAttributes()
    {
        // An empty array casts to false, a non-empty one to true.
        return (bool) $this->attributes;
    }
335
336
    /**
     * Returns the child tokens in the order they were parsed.
     *
     * @return array
     */
    public function getChildren()
    {
        return $this->children;
    }
345
346
    /**
     * Tells whether at least one child token has been parsed.
     *
     * @return boolean
     */
    public function hasChildren()
    {
        // An empty array casts to false, a non-empty one to true.
        return (bool) $this->children;
    }
353
354
    /**
     * Returns the lower-cased element name.
     *
     * @return string|null Null as long as parse() has not been called.
     */
    public function getName()
    {
        return $this->name;
    }
363
364 20
    /**
     * Converts this element and all of its descendants to a nested array.
     *
     * The result always has the keys 'type' and 'name'; 'attributes' and
     * 'children' are only present when the element has any.
     *
     * @return array
     */
    public function toArray()
    {
        $data = array(
            'type' => 'element',
            'name' => $this->name
        );

        if (!empty($this->attributes)) {
            // Arrays are copied on assignment, so the caller gets its own copy.
            $data['attributes'] = $this->attributes;
        }

        if (!empty($this->children)) {
            $data['children'] = array_values(
                array_map(
                    function ($child) {
                        return $child->toArray();
                    },
                    $this->children
                )
            );
        }

        return $data;
    }
387
}
388