Parser::parseTag()   C
last analyzed

Complexity

Conditions 12
Paths 31

Size

Total Lines 70
Code Lines 43

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 40
CRAP Score 12.0155

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 12
eloc 43
nc 31
nop 3
dl 0
loc 70
ccs 40
cts 42
cp 0.9524
crap 12.0155
rs 6.9666
c 1
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the method body into its own, well-named method.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace PHPHtmlParser\Dom;
6
7
use PHPHtmlParser\Content;
8
use PHPHtmlParser\Contracts\Dom\ParserInterface;
9
use PHPHtmlParser\Dom\Node\AbstractNode;
10
use PHPHtmlParser\Dom\Node\HtmlNode;
11
use PHPHtmlParser\Dom\Node\TextNode;
12
use PHPHtmlParser\DTO\TagDTO;
13
use PHPHtmlParser\Enum\StringToken;
14
use PHPHtmlParser\Exceptions\ChildNotFoundException;
15
use PHPHtmlParser\Exceptions\CircularException;
16
use PHPHtmlParser\Exceptions\ContentLengthException;
17
use PHPHtmlParser\Exceptions\LogicalException;
18
use PHPHtmlParser\Exceptions\StrictException;
19
use PHPHtmlParser\Options;
20
use stringEncode\Encode;
21
22
class Parser implements ParserInterface
23
{
24
    /**
     * Attempts to parse the html in content.
     *
     * Builds a tree below a synthetic 'root' HtmlNode. Text found between tags
     * becomes TextNode children of the currently open node; each successfully
     * parsed opening tag becomes a child and, unless it is self closing, the
     * new open node. Parsing stops when parseTag() reports a DTO whose
     * isStatus() is false, or when a closing tag unwinds past the root.
     *
     * @param Options $options parser options consulted while building the tree
     * @param Content $content the content cursor that is consumed while parsing
     * @param int     $size    forwarded unchanged to parseTag()
     *
     * @return AbstractNode the root node of the parsed tree
     *
     * @throws ChildNotFoundException
     * @throws CircularException
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    public function parse(Options $options, Content $content, int $size): AbstractNode
    {
        // add the root node
        $root = new HtmlNode('root');
        $root->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
        $activeNode = $root;
        while ($activeNode !== null) {
            // Inside a <script> (when input cleanup is off) a bare '<' is part
            // of the script body, so only '</' may end the text run.
            // The tag is read through getTag(), like everywhere else in this
            // method, instead of through the `->tag` property.
            if ($activeNode->getTag()->name() === 'script'
                && $options->isCleanupInput() !== true
            ) {
                $str = $content->copyUntil('</');
            } else {
                $str = $content->copyUntil('<');
            }
            if ($str == '') {
                // no text before the cursor: try to read a tag
                $tagDTO = $this->parseTag($options, $content, $size);
                if (!$tagDTO->isStatus()) {
                    // we are done here
                    $activeNode = null;
                    continue;
                }

                // check if it was a closing tag
                if ($tagDTO->isClosing()) {
                    $foundOpeningTag = true;
                    $originalNode = $activeNode;
                    // walk up the ancestors looking for the matching opening tag
                    while ($activeNode->getTag()->name() != $tagDTO->getTag()) {
                        $activeNode = $activeNode->getParent();
                        if ($activeNode === null) {
                            // we could not find opening tag; ignore this
                            // closing tag and stay where we were
                            $activeNode = $originalNode;
                            $foundOpeningTag = false;
                            break;
                        }
                    }
                    if ($foundOpeningTag) {
                        // the matched element is now closed, continue in its parent
                        $activeNode = $activeNode->getParent();
                    }
                    continue;
                }

                if ($tagDTO->getNode() === null) {
                    // a tag was consumed but produced no node
                    continue;
                }

                /** @var AbstractNode $node */
                $node = $tagDTO->getNode();
                $activeNode->addChild($node);

                // only descend into the node when it can have children
                if (!$node->getTag()->isSelfClosing()) {
                    $activeNode = $node;
                }
            } elseif ($options->isWhitespaceTextNode() ||
                \trim($str) != ''
            ) {
                // we found text we care about
                $textNode = new TextNode($str, $options->isRemoveDoubleSpace());
                $textNode->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
                $activeNode->addChild($textNode);
            }
        }

        return $root;
    }
98
99
    /**
     * Attempts to detect the charset that the html was sent in.
     *
     * Looks for a `meta[http-equiv=Content-Type]` element first and falls back
     * to detectHTML5Charset() when none exists. Whenever the lookup gets as far
     * as the meta tags, the resulting encoding (detected or default) is
     * propagated to $root.
     *
     * @param Options      $options        supplies the optional enforced encoding
     * @param string       $defaultCharset charset used as both source and target by default
     * @param AbstractNode $root           tree that is searched and receives the encoding
     *
     * @return bool true when a charset was read from a meta tag, false otherwise
     *
     * @throws ChildNotFoundException
     */
    public function detectCharset(Options $options, string $defaultCharset, AbstractNode $root): bool
    {
        // start out converting from the default charset to itself
        $encode = new Encode();
        $encode->from($defaultCharset);
        $encode->to($defaultCharset);

        $enforced = $options->getEnforceEncoding();
        if ($enforced !== null) {
            // the caller wants to enforce the given encoding
            // NOTE(review): $encode is configured here but never propagated to
            // $root before returning - confirm this is intended.
            $encode->from($enforced);
            $encode->to($enforced);

            return false;
        }

        /** @var AbstractNode $meta */
        $meta = $root->find('meta[http-equiv=Content-Type]', 0);
        if ($meta == null) {
            // no http-equiv meta tag, try the HTML5 `<meta charset>` form
            $found = $this->detectHTML5Charset($encode, $root);
            if (!$found) {
                // could not find any meta tag, keep the default
                $root->propagateEncoding($encode);
            }

            return $found;
        }

        $found = false;
        $contentAttribute = $meta->getAttribute('content');
        if (!\is_null($contentAttribute)) {
            $captured = [];
            if (\preg_match('/charset=([^;]+)/', $contentAttribute, $captured)) {
                $encode->from(\trim($captured[1]));
                $found = true;
            }
        }

        // propagate either the detected charset or the untouched default
        $root->propagateEncoding($encode);

        return $found;
    }
152
153
    /**
     * Attempt to parse a tag out of the content.
     *
     * Returns a TagDTO created without arguments when no tag can be read here:
     * the cursor is not on '<', the content ends right after '<', or the tag
     * name is blank. A '</' sequence is delegated to makeEndTag(). Otherwise
     * the DTO carries a new HtmlNode with its attributes and self-closing
     * state set up.
     *
     * @param Options $options parser options (strict mode, self closing lists, ...)
     * @param Content $content the content cursor, advanced past whatever is consumed
     * @param int     $size    forwarded unchanged to setUpAttributes()
     *
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    private function parseTag(Options $options, Content $content, int $size): TagDTO
    {
        if ($content->char() != '<') {
            // we are not at the beginning of a tag
            return TagDTO::makeFromPrimitives();
        }

        // step over the '<'
        try {
            $content->fastForward(1);
        } catch (ContentLengthException $exception) {
            // we are at the end of the file
            return TagDTO::makeFromPrimitives();
        }

        // check if this is a closing tag
        if ($content->char() == '/') {
            return $this->makeEndTag($content, $options);
        }

        $tag = $this->readOpeningTag($content);
        if ($tag === null) {
            // no tag found, invalid < found
            return TagDTO::makeFromPrimitives();
        }

        $node = new HtmlNode($tag);
        $node->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
        $this->setUpAttributes($content, $size, $node, $options, $tag);

        $content->skipByToken(StringToken::BLANK());
        $this->applySelfClosingRules($content, $options, $node);

        // step over the character that ends the tag, if there is one
        if ($content->canFastForward(1)) {
            $content->fastForward(1);
        }

        return TagDTO::makeFromPrimitives(true, false, $node);
    }

    /**
     * Reads the tag identifier that follows '<' at the current position.
     *
     * '<?' and '<!--' produce a pre-configured self closing Tag object; any
     * other tag produces its lower-cased name.
     *
     * @return string|Tag|null null when the name is blank (a stray '<')
     */
    private function readOpeningTag(Content $content)
    {
        if ($content->char() == '?') {
            // special setting tag
            $name = $content->fastForward(1)
                ->copyByToken(StringToken::SLASH(), true);

            return (new Tag($name))
                ->setOpening('<?')
                ->setClosing(' ?>')
                ->selfClosing();
        }

        if ($content->string(3) == '!--') {
            // comment tag
            $name = $content->fastForward(3)
                ->copyByToken(StringToken::CLOSECOMMENT(), true);

            return (new Tag($name))
                ->setOpening('<!--')
                ->setClosing('-->')
                ->selfClosing();
        }

        $name = \strtolower($content->copyByToken(StringToken::SLASH(), true));
        if (\trim($name) == '') {
            return null;
        }

        return $name;
    }

    /**
     * Marks $node as self closing when the markup ends in '/' or when its name
     * is listed in the self closing option.
     *
     * @throws ContentLengthException
     * @throws StrictException when strict mode is on and a tag from the self
     *                         closing list was not written as self closing
     */
    private function applySelfClosingRules(Content $content, Options $options, HtmlNode $node): void
    {
        if ($content->char() == '/') {
            // self closing tag
            $node->getTag()->selfClosing();
            $content->fastForward(1);

            return;
        }

        if (!\in_array($node->getTag()->name(), $options->getSelfClosing(), true)) {
            return;
        }

        // Should be a self closing tag, check if we are strict
        if ($options->isStrict()) {
            $character = $content->getPosition();
            throw new StrictException("Tag '" . $node->getTag()->name() . "' is not self closing! (character #$character)");
        }

        // We force self closing on this tag.
        $node->getTag()->selfClosing();

        // Should this tag use a trailing slash?
        if (\in_array($node->getTag()->name(), $options->getNoSlash(), true)) {
            $node->getTag()->noTrailingSlash();
        }
    }
232
233
    /**
     * Attempts to read the charset from an HTML5 style `<meta charset="...">` tag.
     *
     * On success the charset becomes the source encoding of $encode, which is
     * then propagated to $root. Nothing is propagated when detection fails;
     * that is left to the caller.
     *
     * @param Encode       $encode encoder whose source charset is updated on success
     * @param AbstractNode $root   tree that is searched for the meta tag
     *
     * @return bool true when a charset value was found and applied
     *
     * @throws ChildNotFoundException
     */
    private function detectHTML5Charset(Encode $encode, AbstractNode $root): bool
    {
        /** @var AbstractNode|null $meta */
        $meta = $root->find('meta[charset]', 0);
        if ($meta == null) {
            return false;
        }

        // A value-less attribute (`<meta charset>`) is stored as null; passing
        // that to trim() would raise a TypeError under strict_types, so treat
        // it as "no charset found".
        $charset = $meta->getAttribute('charset');
        if (\is_null($charset)) {
            return false;
        }

        $encode->from(\trim($charset));
        $root->propagateEncoding($encode);

        return true;
    }
249
250
    /**
     * Consumes a closing tag and describes it as a TagDTO.
     *
     * Expects the cursor on the '/' of '</name>'. The cursor ends up one
     * character past the position copyUntil('>') stops at. Closing tags whose
     * lower-cased name is in the self closing option are reported without a
     * closing flag or tag name; all others carry the lower-cased name.
     *
     * @throws ContentLengthException
     * @throws LogicalException
     */
    private function makeEndTag(Content $content, Options $options): TagDTO
    {
        $rawName = $content->fastForward(1)
            ->copyByToken(StringToken::SLASH(), true);

        // discard the rest of the tag and step past its end
        $content->copyUntil('>');
        $content->fastForward(1);

        $name = \strtolower($rawName);

        // closing tags of self closing elements do not count
        $isSelfClosingName = \in_array($name, $options->getSelfClosing(), true);

        return $isSelfClosingName
            ? TagDTO::makeFromPrimitives(true)
            : TagDTO::makeFromPrimitives(true, true, null, $name);
    }
270
271
    /**
     * Reads the attributes of the tag under the cursor and stores them on $node.
     *
     * Consumes content until the cursor reaches '>' or '/', an attribute name
     * of '/' is read, or the content runs out.
     *
     * @param Content    $content the content cursor, positioned after the tag name
     * @param int        $size    upper bound for the position while reading quoted values
     * @param HtmlNode   $node    node whose tag receives the attributes
     * @param Options    $options parser options; strict mode rejects value-less attributes
     * @param string|Tag $tag     only used in the strict-mode error message
     *
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    private function setUpAttributes(Content $content, int $size, HtmlNode $node, Options $options, $tag): void
    {
        while (!($content->char() == '>' || $content->char() == '/')) {
            $blank = $content->skipByToken(StringToken::BLANK(), true);
            if (empty($blank)) {
                // nothing was skipped: move one character on and try again
                try {
                    $content->fastForward(1);
                } catch (ContentLengthException $exception) {
                    // reached the end of the content
                    break;
                }
                continue;
            }

            $name = $content->copyByToken(StringToken::EQUAL(), true);
            if ($name == '/') {
                break;
            }

            $content->skipByToken(StringToken::BLANK());
            if (empty($name)) {
                continue;
            }

            if ($content->char() != '=') {
                // no value attribute
                if ($options->isStrict()) {
                    // can't have this in strict html
                    $character = $content->getPosition();
                    throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
                }
                $node->getTag()->setAttribute($name, null);
                if ($content->char() != '>') {
                    $content->rewind(1);
                }
                continue;
            }

            // step over '=' and any blanks in front of the value
            $content->fastForward(1)
                ->skipByToken(StringToken::BLANK());

            $delimiter = $content->char();
            if ($delimiter == '"') {
                $value = $this->readQuotedAttributeValue($content, $size, '"');
                $node->getTag()->setAttribute($name, $value);
            } elseif ($delimiter == "'") {
                $value = $this->readQuotedAttributeValue($content, $size, "'");
                // third argument false for single-quoted values
                $node->getTag()->setAttribute($name, $value, false);
            } else {
                // unquoted value
                $node->getTag()->setAttribute($name, $content->copyByToken(StringToken::ATTR(), true));
            }
        }
    }

    /**
     * Reads an attribute value that is wrapped in $quote.
     *
     * Expects the cursor on the opening quote and moves it one character past
     * the point where reading stopped.
     *
     * @throws ContentLengthException
     * @throws LogicalException
     */
    private function readQuotedAttributeValue(Content $content, int $size, string $quote): string
    {
        $content->fastForward(1);
        $value = $content->copyUntil($quote, true);
        // keep appending for as long as copyUntilUnless() yields more text
        // and the position stays below $size
        do {
            $chunk = $content->copyUntilUnless($quote, '=>');
            $value .= $chunk;
        } while (\strlen($chunk) > 0 && $content->getPosition() < $size);
        $content->fastForward(1);

        return $value;
    }
348
}
349