Parser::setUpAttributes()   C
last analyzed

Complexity

Conditions 16
Paths 11

Size

Total Lines 66
Code Lines 51

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 45
CRAP Score 16.0026

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 16
eloc 51
c 1
b 0
f 0
nc 11
nop 5
dl 0
loc 66
ccs 45
cts 46
cp 0.9783
crap 16.0026
rs 5.5666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method, i.e. moving a cohesive part of the method body into its own well-named method.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace PHPHtmlParser\Dom;
6
7
use PHPHtmlParser\Content;
8
use PHPHtmlParser\Contracts\Dom\ParserInterface;
9
use PHPHtmlParser\Dom\Node\AbstractNode;
10
use PHPHtmlParser\Dom\Node\HtmlNode;
11
use PHPHtmlParser\Dom\Node\TextNode;
12
use PHPHtmlParser\DTO\TagDTO;
13
use PHPHtmlParser\Enum\StringToken;
14
use PHPHtmlParser\Exceptions\ChildNotFoundException;
15
use PHPHtmlParser\Exceptions\CircularException;
16
use PHPHtmlParser\Exceptions\ContentLengthException;
17
use PHPHtmlParser\Exceptions\LogicalException;
18
use PHPHtmlParser\Exceptions\StrictException;
19
use PHPHtmlParser\Options;
20
use stringEncode\Encode;
21
22
class Parser implements ParserInterface
23
{
24
    /**
     * Parses the html held by $content into a node tree and returns its root.
     *
     * The content is consumed in alternating steps: text is copied up to the
     * next '<' (or up to the next '</' while the active node is a script tag
     * and input cleanup is not enabled); when no text precedes the cursor, a
     * tag is parsed instead and the active node is moved down into the new
     * node or back up to the parent of the node being closed.
     *
     * @param Options $options parser options consulted while building nodes
     * @param Content $content cursor over the html being parsed
     * @param int     $size    forwarded unchanged to the tag parser
     *
     * @return AbstractNode the synthetic 'root' node that holds the parsed tree
     *
     * @throws ChildNotFoundException
     * @throws CircularException
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    public function parse(Options $options, Content $content, int $size): AbstractNode
    {
        // add the root node
        $root = new HtmlNode('root');
        $root->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());

        $activeNode = $root;
        while ($activeNode !== null) {
            // Inside a script tag whose input was not cleaned up, a bare '<'
            // belongs to the script source, so only '</' may end the text run.
            $insideRawScript = $activeNode->getTag()->name() === 'script'
                && $options->isCleanupInput() !== true;
            $str = $content->copyUntil($insideRawScript ? '</' : '<');

            if ($str == '') {
                $tagDTO = $this->parseTag($options, $content, $size);
                if (!$tagDTO->isStatus()) {
                    // no tag could be parsed: we are done here
                    break;
                }

                if ($tagDTO->isClosing()) {
                    // walk up the ancestors until the node this tag closes is found
                    $ancestor = $activeNode;
                    while ($ancestor !== null && $ancestor->getTag()->name() != $tagDTO->getTag()) {
                        $ancestor = $ancestor->getParent();
                    }
                    if ($ancestor !== null) {
                        // the matching node is closed; parsing continues in its parent
                        $activeNode = $ancestor->getParent();
                    }
                    // when no opening tag matched, the active node is left untouched
                    continue;
                }

                /** @var AbstractNode|null $node */
                $node = $tagDTO->getNode();
                if ($node === null) {
                    // the tag produced no node, nothing to attach
                    continue;
                }
                $activeNode->addChild($node);

                // only a node that is not self closing can receive children
                if (!$node->getTag()->isSelfClosing()) {
                    $activeNode = $node;
                }
            } elseif ($options->isWhitespaceTextNode() || \trim($str) != '') {
                // we found text we care about
                $textNode = new TextNode($str, $options->isRemoveDoubleSpace());
                $textNode->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
                $activeNode->addChild($textNode);
            }
        }

        return $root;
    }
98
99
    /**
     * Attempts to detect the charset that the html was sent in and propagates
     * the resulting encoding to the tree under $root.
     *
     * Lookup order: an enforced encoding from the options, then a
     * `meta[http-equiv=Content-Type]` element, then (when that element is
     * missing) the HTML5 `meta[charset]` form. The `charset=` parameter is
     * matched case-insensitively and surrounding quotes are stripped from its
     * value.
     *
     * @param Options      $options        source of the optional enforced encoding
     * @param string       $defaultCharset charset used when nothing is detected
     * @param AbstractNode $root           root of the parsed tree
     *
     * @return bool true when a charset was found in the document, false otherwise
     *
     * @throws ChildNotFoundException
     */
    public function detectCharset(Options $options, string $defaultCharset, AbstractNode $root): bool
    {
        // set the default
        $encode = new Encode();
        $encode->from($defaultCharset);
        $encode->to($defaultCharset);

        $enforceEncoding = $options->getEnforceEncoding();
        if ($enforceEncoding !== null) {
            // they want to enforce the given encoding
            // NOTE(review): $encode is not propagated to $root on this path; confirm this is intended.
            $encode->from($enforceEncoding);
            $encode->to($enforceEncoding);

            return false;
        }

        /** @var AbstractNode|null $meta */
        $meta = $root->find('meta[http-equiv=Content-Type]', 0);
        if ($meta == null) {
            if (!$this->detectHTML5Charset($encode, $root)) {
                // could not find any meta tag, keep the default
                $root->propagateEncoding($encode);

                return false;
            }

            return true;
        }

        $content = $meta->getAttribute('content');
        $matches = [];
        if (!\is_null($content) && \preg_match('/charset=([^;]+)/i', $content, $matches)) {
            // the value may be wrapped in quotes, e.g. content='text/html; charset="utf-8"'
            $charset = \trim($matches[1], " \t\n\r\0\x0B\"'");
            if ($charset !== '') {
                $encode->from($charset);
                $root->propagateEncoding($encode);

                return true;
            }
        }

        // no content attribute or no usable charset found
        $root->propagateEncoding($encode);

        return false;
    }
152
153
    /**
     * Attempt to parse a tag out of the content at the current cursor.
     *
     * Returns a DTO built with no arguments when the cursor is not on '<',
     * when the content ends right after it, or when no tag name follows.
     * Closing tags are delegated to makeEndTag(). Otherwise a new HtmlNode is
     * created, its attributes are read, and it is marked self closing when it
     * ends in '/' or when its name is listed in the self-closing option.
     *
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    private function parseTag(Options $options, Content $content, int $size): TagDTO
    {
        if ($content->char() != '<') {
            // we are not at the beginning of a tag
            return TagDTO::makeFromPrimitives();
        }

        try {
            $content->fastForward(1);
        } catch (ContentLengthException $endOfContent) {
            // the '<' was the last character of the content
            return TagDTO::makeFromPrimitives();
        }

        // check if this is a closing tag
        if ($content->char() == '/') {
            return $this->makeEndTag($content, $options);
        }

        if ($content->char() == '?') {
            // special setting tag
            $rawTag = $content->fastForward(1)
                ->copyByToken(StringToken::SLASH(), true);
            $tag = (new Tag($rawTag))
                ->setOpening('<?')
                ->setClosing(' ?>')
                ->selfClosing();
        } elseif ($content->string(3) == '!--') {
            // comment tag
            $rawTag = $content->fastForward(3)
                ->copyByToken(StringToken::CLOSECOMMENT(), true);
            $tag = (new Tag($rawTag))
                ->setOpening('<!--')
                ->setClosing('-->')
                ->selfClosing();
        } else {
            // regular tag: names are handled in lower case
            $tag = \strtolower($content->copyByToken(StringToken::SLASH(), true));
            if (\trim($tag) == '') {
                // no tag found, invalid < found
                return TagDTO::makeFromPrimitives();
            }
        }

        $node = new HtmlNode($tag);
        $node->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
        $this->setUpAttributes($content, $size, $node, $options, $tag);

        $content->skipByToken(StringToken::BLANK());
        if ($content->char() == '/') {
            // explicitly self closing tag
            $node->getTag()->selfClosing();
            $content->fastForward(1);
        } else {
            $tagName = $node->getTag()->name();
            if (\in_array($tagName, $options->getSelfClosing(), true)) {
                // Should be a self closing tag, check if we are strict
                if ($options->isStrict()) {
                    $position = $content->getPosition();
                    throw new StrictException("Tag '" . $tagName . "' is not self closing! (character #$position)");
                }

                // We force self closing on this tag.
                $node->getTag()->selfClosing();

                // Should this tag use a trailing slash?
                if (\in_array($node->getTag()->name(), $options->getNoSlash(), true)) {
                    $node->getTag()->noTrailingSlash();
                }
            }
        }

        // step over the character ending the tag when there is content left
        if ($content->canFastForward(1)) {
            $content->fastForward(1);
        }

        return TagDTO::makeFromPrimitives(true, false, $node);
    }
232
233
    /**
     * Looks for the HTML5 `meta[charset]` element and, when it carries a
     * non-empty value, uses it as the source charset and propagates the
     * encoding to the tree under $root.
     *
     * @param Encode       $encode encoding object to update with the detected charset
     * @param AbstractNode $root   root of the parsed tree
     *
     * @return bool true when a usable charset was found, false otherwise
     *
     * @throws ChildNotFoundException
     */
    private function detectHTML5Charset(Encode $encode, AbstractNode $root): bool
    {
        /** @var AbstractNode|null $meta */
        $meta = $root->find('meta[charset]', 0);
        if ($meta == null) {
            return false;
        }

        // A value-less attribute (<meta charset>) yields null; passing that to
        // trim() would raise a TypeError because this file declares strict_types.
        $charset = $meta->getAttribute('charset');
        if (!\is_string($charset)) {
            return false;
        }

        $charset = \trim($charset);
        if ($charset === '') {
            // an empty charset carries no information
            return false;
        }

        $encode->from($charset);
        $root->propagateEncoding($encode);

        return true;
    }
249
250
    /**
     * Reads a closing tag at the cursor and describes it as a DTO.
     *
     * The cursor is moved past the tag's '>'. When the lower-cased tag name is
     * listed in the self-closing option, the DTO is built from the status flag
     * only; otherwise it is built as a closing tag carrying that name.
     *
     * @throws ContentLengthException
     * @throws LogicalException
     */
    private function makeEndTag(Content $content, Options $options): TagDTO
    {
        // step over the '/' and copy the tag name
        $rawName = $content->fastForward(1)
            ->copyByToken(StringToken::SLASH(), true);

        // move to end of tag
        $content->copyUntil('>');
        $content->fastForward(1);

        // check if this closing tag counts
        $closingName = \strtolower($rawName);
        $isConfiguredSelfClosing = \in_array($closingName, $options->getSelfClosing(), true);
        if (!$isConfiguredSelfClosing) {
            return TagDTO::makeFromPrimitives(true, true, null, $closingName);
        }

        return TagDTO::makeFromPrimitives(true);
    }
270
271
    /**
     * Reads the attributes of the tag at the cursor and stores them on the
     * node's tag, stopping at '>' or '/' or when the content runs out.
     *
     * @param Content    $content cursor over the html being parsed
     * @param int        $size    upper bound for the cursor position while reading quoted values
     * @param HtmlNode   $node    node whose tag receives the attributes
     * @param Options    $options parser options (strict mode is consulted here)
     * @param string|Tag $tag     tag being parsed, used only in the strict-mode error message
     *
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    private function setUpAttributes(Content $content, int $size, HtmlNode $node, Options $options, $tag): void
    {
        while (
            $content->char() != '>' &&
            $content->char() != '/'
        ) {
            $space = $content->skipByToken(StringToken::BLANK(), true);
            if (empty($space)) {
                // nothing was skipped before this character: step over it
                try {
                    $content->fastForward(1);
                } catch (ContentLengthException $exception) {
                    // reached the end of the content
                    break;
                }
                continue;
            }

            $name = $content->copyByToken(StringToken::EQUAL(), true);
            if ($name == '/') {
                break;
            }

            // NOTE(review): empty() also treats the name "0" as missing; confirm whether that is intended.
            if (empty($name)) {
                $content->skipByToken(StringToken::BLANK());
                continue;
            }

            $content->skipByToken(StringToken::BLANK());
            if ($content->char() == '=') {
                $content->fastForward(1)
                    ->skipByToken(StringToken::BLANK());
                switch ($content->char()) {
                    case '"':
                        $value = $this->copyQuotedAttributeValue($content, $size, '"');
                        $node->getTag()->setAttribute($name, $value);
                        break;
                    case "'":
                        $value = $this->copyQuotedAttributeValue($content, $size, "'");
                        // third argument false: the value was not written in double quotes
                        $node->getTag()->setAttribute($name, $value, false);
                        break;
                    default:
                        // unquoted value
                        $node->getTag()->setAttribute($name, $content->copyByToken(StringToken::ATTR(), true));
                        break;
                }
            } else {
                // no value attribute
                if ($options->isStrict()) {
                    // can't have this in strict html
                    $character = $content->getPosition();
                    throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
                }
                $node->getTag()->setAttribute($name, null);
                if ($content->char() != '>') {
                    // presumably steps back so the next iteration sees the blank before the next attribute; TODO confirm
                    $content->rewind(1);
                }
            }
        }
    }

    /**
     * Copies a quoted attribute value; the cursor must be on the opening quote
     * and is left just behind the closing quote.
     *
     * @param Content $content cursor over the html being parsed
     * @param int     $size    upper bound for the cursor position while extending the value
     * @param string  $quote   the quote character delimiting the value
     *
     * @throws ContentLengthException
     * @throws LogicalException
     */
    private function copyQuotedAttributeValue(Content $content, int $size, string $quote): string
    {
        // step over the opening quote
        $content->fastForward(1);
        $value = $content->copyUntil($quote, true);

        // keep appending for as long as copyUntilUnless() still yields text
        do {
            $moreString = $content->copyUntilUnless($quote, '=>');
            $value .= $moreString;
        } while (\strlen($moreString) > 0 && $content->getPosition() < $size);

        // step over the closing quote
        $content->fastForward(1);

        return $value;
    }
348
}
349