Passed
Push — master ( 668c77...c11634 )
by Gilles
02:19
created

Parser::parseTag()   B

Complexity

Conditions 11
Paths 22

Size

Total Lines 66
Code Lines 39

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 36
CRAP Score 11.0176

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 11
eloc 39
c 1
b 0
f 0
nc 22
nop 3
dl 0
loc 66
ccs 36
cts 38
cp 0.9474
crap 11.0176
rs 7.3166

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a self-contained part of the method body into its own, well-named method.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace PHPHtmlParser\Dom;
6
7
use PHPHtmlParser\Content;
8
use PHPHtmlParser\Contracts\Dom\ParserInterface;
9
use PHPHtmlParser\Dom\Node\AbstractNode;
10
use PHPHtmlParser\Dom\Node\HtmlNode;
11
use PHPHtmlParser\Dom\Node\TextNode;
12
use PHPHtmlParser\DTO\TagDTO;
13
use PHPHtmlParser\Enum\StringToken;
14
use PHPHtmlParser\Exceptions\ChildNotFoundException;
15
use PHPHtmlParser\Exceptions\CircularException;
16
use PHPHtmlParser\Exceptions\ContentLengthException;
17
use PHPHtmlParser\Exceptions\LogicalException;
18
use PHPHtmlParser\Exceptions\StrictException;
19
use PHPHtmlParser\Options;
20
use stringEncode\Encode;
21
22
class Parser implements ParserInterface
23
{
24
    /**
     * Attempts to parse the html in content.
     *
     * Creates a synthetic 'root' node and then walks the content, alternating
     * between copying text up to the next tag and handing the tag to parseTag().
     * Nodes returned by parseTag() become children of the currently active node,
     * closing tags move the active node back up the tree, and text runs become
     * TextNode children. Parsing stops when parseTag() reports a non-successful
     * status or when the active node runs out (a closing tag matched the root).
     *
     * @param Options $options supplies the cleanup, whitespace and decoding switches read below
     * @param Content $content the html being consumed
     * @param int     $size    passed through unchanged to parseTag()
     *
     * @return AbstractNode the synthetic root node
     *
     * @throws ChildNotFoundException
     * @throws CircularException
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    public function parse(Options $options, Content $content, int $size): AbstractNode
    {
        // add the root node
        $root = new HtmlNode('root');
        $root->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
        $activeNode = $root;
        while ($activeNode !== null) {
            // While inside a <script> (and input cleanup is off) only '</' ends
            // the text run, so a bare '<' in the script body is kept as text.
            // getTag() matches every other tag lookup in this class.
            // NOTE(review): the magic `->tag` property read used here before is
            // presumably resolved through __get and could be shadowed by an html
            // attribute named "tag" - confirm against AbstractNode.
            $inRawScript = $activeNode->getTag()->name() === 'script'
                && $options->isCleanupInput() !== true;
            $str = $content->copyUntil($inRawScript ? '</' : '<');

            if ($str == '') {
                $tagDTO = $this->parseTag($options, $content, $size);
                if (!$tagDTO->isStatus()) {
                    // we are done here
                    break;
                }

                // check if it was a closing tag
                if ($tagDTO->isClosing()) {
                    // Walk up from the active node to the node this tag closes.
                    // If no ancestor matches, the closing tag is ignored and the
                    // active node stays where it was.
                    $closedNode = $activeNode;
                    while ($closedNode !== null
                        && $closedNode->getTag()->name() != $tagDTO->getTag()
                    ) {
                        $closedNode = $closedNode->getParent();
                    }
                    if ($closedNode !== null) {
                        $activeNode = $closedNode->getParent();
                    }
                    continue;
                }

                /** @var AbstractNode|null $node */
                $node = $tagDTO->getNode();
                if ($node === null) {
                    // successful status without a node: nothing to attach
                    continue;
                }
                $activeNode->addChild($node);

                // only a tag that is not self closing can receive children
                if (!$node->getTag()->isSelfClosing()) {
                    $activeNode = $node;
                }
            } elseif ($options->isWhitespaceTextNode() ||
                \trim($str) != ''
            ) {
                // we found text we care about
                $textNode = new TextNode($str, $options->isRemoveDoubleSpace());
                $textNode->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
                $activeNode->addChild($textNode);
            }
        }

        return $root;
    }
98
99
    /**
     * Attempts to detect the charset that the html was sent in.
     *
     * Looks for a `meta[http-equiv=Content-Type]` element first and, when there
     * is none, falls back to detectHTML5Charset(). Whenever no charset can be
     * determined, an encoder configured from and to $defaultCharset is
     * propagated to the tree instead.
     *
     * @param Options      $options        read for the enforced encoding
     * @param string       $defaultCharset charset used when nothing is detected
     * @param AbstractNode $root           tree that is searched and that receives the encoder
     *
     * @return bool true when a charset declaration was found and applied;
     *              false when an encoding is enforced or nothing was detected
     *
     * @throws ChildNotFoundException
     */
    public function detectCharset(Options $options, string $defaultCharset, AbstractNode $root): bool
    {
        // set the default
        $encode = new Encode();
        $encode->from($defaultCharset);
        $encode->to($defaultCharset);

        $enforceEncoding = $options->getEnforceEncoding();
        if ($enforceEncoding !== null) {
            //  they want to enforce the given encoding
            // NOTE(review): this encoder is local and is not propagated to
            // $root in this branch - confirm that is intended.
            $encode->from($enforceEncoding);
            $encode->to($enforceEncoding);

            return false;
        }

        /** @var AbstractNode|null $meta */
        $meta = $root->find('meta[http-equiv=Content-Type]', 0);
        if ($meta == null) {
            // detectHTML5Charset() propagates the encoder itself on success
            if ($this->detectHTML5Charset($encode, $root)) {
                return true;
            }
        } else {
            $content = $meta->getAttribute('content');
            $matches = [];
            // The parameter name is matched case-insensitively so that
            // "Charset=" / "CHARSET=" declarations are honoured as well.
            if ($content !== null && \preg_match('/charset=([^;]+)/i', $content, $matches)) {
                $encode->from(\trim($matches[1]));
                $root->propagateEncoding($encode);

                return true;
            }
        }

        // no charset found: fall back to the default encoder
        $root->propagateEncoding($encode);

        return false;
    }
152
153
    /**
     * Attempt to parse a tag out of the content.
     *
     * Returns a TagDTO constructed without data when the cursor is not on '<',
     * when the content ends right after '<', or when '<' is not followed by a
     * tag name. A '</' is delegated to makeEndTag(). Otherwise an HtmlNode is
     * built, its attributes are read, the self closing rules are applied and
     * the node is returned in a TagDTO with status true.
     *
     * @param Options $options read for the self closing / no-slash lists and strict mode
     * @param Content $content the html being consumed
     * @param int     $size    passed through unchanged to setUpAttributes()
     *
     * @throws StrictException
     * @throws ContentLengthException
     * @throws LogicalException
     */
    private function parseTag(Options $options, Content $content, int $size): TagDTO
    {
        if ($content->char() != '<') {
            // we are not at the beginning of a tag
            return new TagDTO();
        }

        // step over the '<'
        try {
            $content->fastForward(1);
        } catch (ContentLengthException $exception) {
            // we are at the end of the file
            return new TagDTO();
        }

        // check if this is a closing tag
        if ($content->char() == '/') {
            return $this->makeEndTag($content, $options);
        }

        if ($content->char() == '?') {
            // special setting tag: rendered as '<?' ... ' ?>' and always self closing
            $name = $content->fastForward(1)
                ->copyByToken(StringToken::SLASH(), true);
            $tag = (new Tag($name))
                ->setOpening('<?')
                ->setClosing(' ?>')
                ->selfClosing();
        } else {
            $tag = \strtolower($content->copyByToken(StringToken::SLASH(), true));
            if (\trim($tag) == '') {
                // no tag found, invalid < found
                return new TagDTO();
            }
        }

        $node = new HtmlNode($tag);
        $node->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
        $this->setUpAttributes($content, $size, $node, $options, $tag);
        $this->applySelfClosingRules($content, $options, $node);

        // step over the final character of the tag when there is more content
        if ($content->canFastForward(1)) {
            $content->fastForward(1);
        }

        return new TagDTO(['status' => true, 'node' => $node]);
    }

    /**
     * Marks the node's tag as self closing when the markup or the options say so.
     *
     * Skips blanks first. A '/' at the cursor marks the tag self closing and is
     * stepped over. Otherwise, a tag whose name is in the options' self closing
     * list is forced to be self closing, unless strict mode is on, in which
     * case the missing slash is an error.
     *
     * @throws StrictException
     * @throws ContentLengthException
     */
    private function applySelfClosingRules(Content $content, Options $options, HtmlNode $node): void
    {
        $content->skipByToken(StringToken::BLANK());
        if ($content->char() == '/') {
            // self closing tag
            $node->getTag()->selfClosing();
            $content->fastForward(1);

            return;
        }

        if (!\in_array($node->getTag()->name(), $options->getSelfClosing(), true)) {
            return;
        }

        // Should be a self closing tag, check if we are strict
        if ($options->isStrict()) {
            $character = $content->getPosition();
            throw new StrictException("Tag '" . $node->getTag()->name() . "' is not self closing! (character #$character)");
        }

        // We force self closing on this tag.
        $node->getTag()->selfClosing();

        // Should this tag use a trailing slash?
        if (\in_array($node->getTag()->name(), $options->getNoSlash(), true)) {
            $node->getTag()->noTrailingSlash();
        }
    }
228
229
    /**
     * Looks for an html5 style `<meta charset="...">` declaration.
     *
     * When one with a usable value is found, its trimmed value becomes the
     * encoder's source charset and the encoder is propagated to the tree.
     *
     * @param Encode       $encode encoder whose source charset is updated on success
     * @param AbstractNode $root   tree that is searched and that receives the encoder
     *
     * @return bool true when a charset was found and applied, false otherwise
     *
     * @throws ChildNotFoundException
     */
    private function detectHTML5Charset(Encode $encode, AbstractNode $root): bool
    {
        /** @var AbstractNode|null $meta */
        $meta = $root->find('meta[charset]', 0);
        if ($meta == null) {
            return false;
        }

        // A value-less attribute (`<meta charset>`) is stored as null by
        // setUpAttributes(); passing that to trim() would raise a TypeError
        // under strict_types. A blank value names no encoding either, so both
        // cases are reported as "not found".
        $charset = $meta->getAttribute('charset');
        if (!\is_string($charset)) {
            return false;
        }
        $charset = \trim($charset);
        if ($charset === '') {
            return false;
        }

        $encode->from($charset);
        $root->propagateEncoding($encode);

        return true;
    }
245
246
    /**
     * Consumes a closing tag and describes it in a TagDTO.
     *
     * Called with the cursor on the '/' of '</'. The tag name is read, the
     * cursor is moved past the next '>', and the lower-cased name is checked
     * against the options' self closing list: a match yields a DTO carrying
     * only a true status, anything else yields a DTO flagged as closing that
     * carries the lower-cased tag name.
     *
     * @throws ContentLengthException
     * @throws LogicalException
     */
    private function makeEndTag(Content $content, Options $options): TagDTO
    {
        $rawName = $content->fastForward(1)->copyByToken(StringToken::SLASH(), true);

        // move to end of tag
        $content->copyUntil('>');
        $content->fastForward(1);

        $name = \strtolower($rawName);

        // check if this closing tag counts
        if (\in_array($name, $options->getSelfClosing(), true)) {
            return new TagDTO(['status' => true]);
        }

        return new TagDTO([
            'status' => true,
            'closing' => true,
            'tag' => $name,
        ]);
    }
272
273
    /**
     * Reads the attributes of the tag under the cursor into the node's tag.
     *
     * Loops until the cursor reaches '>' or '/'. Each attribute name is copied
     * up to the EQUAL token; a following '=' introduces a double quoted, single
     * quoted or unquoted value. A name without '=' is stored with a null value,
     * or rejected with a StrictException in strict mode.
     *
     * @param Content    $content the html being consumed
     * @param int        $size    upper bound for the cursor while reading quoted values
     * @param HtmlNode   $node    node whose tag receives the attributes
     * @param Options    $options read for strict mode
     * @param string|Tag $tag     tag (or tag name) used in the strict-mode error message
     *
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    private function setUpAttributes(Content $content, int $size, HtmlNode $node, Options $options, $tag): void
    {
        while (
            $content->char() != '>' &&
            $content->char() != '/'
        ) {
            $space = $content->skipByToken(StringToken::BLANK(), true);
            if (empty($space)) {
                // no blank before the next token: step one character and retry
                try {
                    $content->fastForward(1);
                } catch (ContentLengthException $exception) {
                    // reached the end of the content
                    break;
                }
                continue;
            }

            $name = $content->copyByToken(StringToken::EQUAL(), true);
            if ($name == '/') {
                break;
            }

            // NOTE(review): empty() also treats the name "0" as missing - confirm
            // whether an attribute literally named "0" needs to be supported.
            if (empty($name)) {
                $content->skipByToken(StringToken::BLANK());
                continue;
            }

            $content->skipByToken(StringToken::BLANK());
            if ($content->char() == '=') {
                $content->fastForward(1)
                    ->skipByToken(StringToken::BLANK());
                switch ($content->char()) {
                    case '"':
                        $value = $this->readQuotedAttributeValue($content, $size, '"');
                        $node->getTag()->setAttribute($name, $value);
                        break;
                    case "'":
                        $value = $this->readQuotedAttributeValue($content, $size, "'");
                        $node->getTag()->setAttribute($name, $value, false);
                        break;
                    default:
                        $node->getTag()->setAttribute($name, $content->copyByToken(StringToken::ATTR(), true));
                        break;
                }
            } else {
                // no value attribute
                if ($options->isStrict()) {
                    // can't have this in strict html
                    $character = $content->getPosition();
                    // $tag may be a Tag object (parseTag() passes one for '<?'
                    // tags); resolve its name explicitly instead of relying on
                    // an implicit object-to-string conversion.
                    $tagName = $tag instanceof Tag ? $tag->name() : $tag;
                    throw new StrictException("Tag '$tagName' has an attribute '$name' with out a value! (character #$character)");
                }
                $node->getTag()->setAttribute($name, null);
                // step back one character unless the tag ends here, so the next
                // iteration starts from the character just consumed
                if ($content->char() != '>') {
                    $content->rewind(1);
                }
            }
        }
    }

    /**
     * Reads a quoted attribute value; the cursor must be on the opening quote.
     *
     * Steps over the opening quote, copies up to the next quote and then keeps
     * appending whatever copyUntilUnless() returns until it yields an empty
     * string or the cursor reaches $size. Finally steps over one more
     * character (the closing quote).
     *
     * @param string $quote the quote character delimiting the value
     *
     * @throws ContentLengthException
     * @throws LogicalException
     */
    private function readQuotedAttributeValue(Content $content, int $size, string $quote): string
    {
        $content->fastForward(1);
        $value = $content->copyUntil($quote, true);
        do {
            $chunk = $content->copyUntilUnless($quote, '=>');
            $value .= $chunk;
        } while (\strlen($chunk) > 0 && $content->getPosition() < $size);
        $content->fastForward(1);

        return $value;
    }
350
}
351