Passed
Push — master ( 382b98...cf0bb6 )
by Gilles
08:40 queued 01:24
created

Parser::setUpAttributes()   C

Complexity

Conditions 16
Paths 11

Size

Total Lines 66
Code Lines 51

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 45
CRAP Score 16.0026

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 16
eloc 51
c 1
b 0
f 0
nc 11
nop 5
dl 0
loc 66
ccs 45
cts 46
cp 0.9783
crap 16.0026
rs 5.5666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace PHPHtmlParser\Dom;
6
7
use PHPHtmlParser\Content;
8
use PHPHtmlParser\Contracts\Dom\ParserInterface;
9
use PHPHtmlParser\Dom\Node\AbstractNode;
10
use PHPHtmlParser\Dom\Node\HtmlNode;
11
use PHPHtmlParser\Dom\Node\TextNode;
12
use PHPHtmlParser\DTO\TagDTO;
13
use PHPHtmlParser\Enum\StringToken;
14
use PHPHtmlParser\Exceptions\ChildNotFoundException;
15
use PHPHtmlParser\Exceptions\CircularException;
16
use PHPHtmlParser\Exceptions\ContentLengthException;
17
use PHPHtmlParser\Exceptions\LogicalException;
18
use PHPHtmlParser\Exceptions\StrictException;
19
use PHPHtmlParser\Options;
20
use stringEncode\Encode;
21
22
class Parser implements ParserInterface
23
{
24
    /**
     * Builds the node tree for the html held in $content.
     *
     * Parsing starts from a synthetic 'root' node. The content is consumed
     * chunk by chunk: text found before the next tag becomes a TextNode child
     * of the node currently being filled, opening tags become child nodes
     * (and the new current node unless they are self closing), and closing
     * tags move the current node back up to the parent of the element they
     * close.
     *
     * @throws ChildNotFoundException
     * @throws CircularException
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    public function parse(Options $options, Content $content, int $size): AbstractNode
    {
        $root = new HtmlNode('root');
        $root->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());

        $activeNode = $root;
        while ($activeNode !== null) {
            // inside a <script> element (when input cleanup is off) text is
            // copied up to the next '</' rather than the next '<'
            $insideRawScript = $activeNode->tag->name() === 'script'
                && $options->isCleanupInput() !== true;
            $str = $content->copyUntil($insideRawScript ? '</' : '<');

            if ($str != '') {
                // text chunk: keep it unless it is pure whitespace and
                // whitespace-only text nodes are not requested
                if ($options->isWhitespaceTextNode() || \trim($str) != '') {
                    $textNode = new TextNode($str, $options->isRemoveDoubleSpace());
                    $textNode->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
                    $activeNode->addChild($textNode);
                }
                continue;
            }

            $tagDTO = $this->parseTag($options, $content, $size);
            if (!$tagDTO->isStatus()) {
                // no tag could be read: parsing is finished
                break;
            }

            if ($tagDTO->isClosing()) {
                // walk up the ancestors looking for the element being closed
                $candidate = $activeNode;
                while ($candidate !== null && $candidate->getTag()->name() != $tagDTO->getTag()) {
                    $candidate = $candidate->getParent();
                }
                if ($candidate !== null) {
                    // matching opening tag found: resume filling its parent
                    $activeNode = $candidate->getParent();
                }
                // otherwise the stray closing tag is ignored and the
                // current node is left untouched
                continue;
            }

            /** @var AbstractNode|null $node */
            $node = $tagDTO->getNode();
            if ($node === null) {
                continue;
            }

            $activeNode->addChild($node);
            if (!$node->getTag()->isSelfClosing()) {
                // descend: following content belongs to the new element
                $activeNode = $node;
            }
        }

        return $root;
    }
98
99
    /**
     * Attempts to detect the charset that the html was sent in.
     *
     * Resolution order, as implemented below:
     *  1. an enforced encoding from the options stops detection immediately;
     *  2. the content attribute of a meta[http-equiv=Content-Type] element is
     *     searched for "charset=..." (parameter name matched case-insensitively);
     *  3. when no such element exists, detectHTML5Charset() looks for
     *     meta[charset].
     *
     * @return bool true when a charset declaration was found and propagated
     *              to the tree, false otherwise
     *
     * @throws ChildNotFoundException
     */
    public function detectCharset(Options $options, string $defaultCharset, AbstractNode $root): bool
    {
        // start from the default charset for both source and target
        $encode = new Encode();
        $encode->from($defaultCharset);
        $encode->to($defaultCharset);

        $enforceEncoding = $options->getEnforceEncoding();
        if ($enforceEncoding !== null) {
            //  they want to enforce the given encoding
            // NOTE(review): $encode is local and is not propagated to $root on
            // this path, so these two calls have no effect visible from here.
            $encode->from($enforceEncoding);
            $encode->to($enforceEncoding);

            return false;
        }

        /** @var AbstractNode|null $meta */
        $meta = $root->find('meta[http-equiv=Content-Type]', 0);
        if ($meta == null) {
            if (!$this->detectHTML5Charset($encode, $root)) {
                // could not find any meta tag: fall back to the default
                $root->propagateEncoding($encode);

                return false;
            }

            // detectHTML5Charset() already propagated the encoding
            return true;
        }

        $content = $meta->getAttribute('content');
        if ($content === null) {
            // could not find content
            $root->propagateEncoding($encode);

            return false;
        }

        $matches = [];
        // media-type parameter names are case-insensitive, so "Charset=" and
        // "CHARSET=" must be recognised as well
        if (\preg_match('/charset=([^;]+)/i', $content, $matches)) {
            $encode->from(\trim($matches[1]));
            $root->propagateEncoding($encode);

            return true;
        }

        // no charset found
        $root->propagateEncoding($encode);

        return false;
    }
152
153
    /**
     * Attempt to parse a tag out of the content at the current position.
     *
     * An empty DTO (TagDTO::makeFromPrimitives() without arguments) is
     * returned when the current character is not '<', when '<' is the last
     * character of the content, or when no tag name follows the '<'.
     * Closing tags are handed over to makeEndTag(). For every other tag a
     * new HtmlNode is created, its attributes are read by setUpAttributes()
     * and the self closing state is resolved before the DTO is returned.
     *
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    private function parseTag(Options $options, Content $content, int $size): TagDTO
    {
        if ($content->char() != '<') {
            // we are not at the beginning of a tag
            return TagDTO::makeFromPrimitives();
        }

        try {
            // step over the '<'
            $content->fastForward(1);
        } catch (ContentLengthException $exception) {
            // we are at the end of the file
            return TagDTO::makeFromPrimitives();
        }

        switch ($content->char()) {
            case '/':
                // closing tag
                return $this->makeEndTag($content, $options);
            case '?':
                // special setting tag, rendered as '<?name ... ?>'
                $settingName = $content->fastForward(1)
                    ->copyByToken(StringToken::SLASH(), true);
                $tag = (new Tag($settingName))
                    ->setOpening('<?')
                    ->setClosing(' ?>')
                    ->selfClosing();
                break;
            default:
                $tag = \strtolower($content->copyByToken(StringToken::SLASH(), true));
                if (\trim($tag) == '') {
                    // no tag found, invalid < found
                    return TagDTO::makeFromPrimitives();
                }
        }

        $node = new HtmlNode($tag);
        $node->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
        $this->setUpAttributes($content, $size, $node, $options, $tag);

        $content->skipByToken(StringToken::BLANK());
        if ($content->char() == '/') {
            // explicit self closing tag: mark it and step over the '/'
            $node->getTag()->selfClosing();
            $content->fastForward(1);
        } elseif (\in_array($node->getTag()->name(), $options->getSelfClosing(), true)) {
            // listed as self closing in the options but written without '/'
            if ($options->isStrict()) {
                $character = $content->getPosition();
                $tagName = $node->getTag()->name();
                throw new StrictException("Tag '{$tagName}' is not self closing! (character #{$character})");
            }

            // not strict: force self closing on this tag
            $node->getTag()->selfClosing();

            // tags listed under "no slash" are flagged to omit the trailing slash
            if (\in_array($node->getTag()->name(), $options->getNoSlash(), true)) {
                $node->getTag()->noTrailingSlash();
            }
        }

        // step over the character ending the tag, if there is one left
        if ($content->canFastForward(1)) {
            $content->fastForward(1);
        }

        return TagDTO::makeFromPrimitives(true, false, $node);
    }
224
225
    /**
     * Looks for a meta[charset] element and, when it carries a value, uses
     * that value as the source charset and propagates the encoder to the tree.
     *
     * @return bool true when a charset value was found and propagated, false
     *              when there is no meta[charset] element or its charset
     *              attribute has no value (nothing is propagated then; the
     *              caller handles the fallback)
     *
     * @throws ChildNotFoundException
     */
    private function detectHTML5Charset(Encode $encode, AbstractNode $root): bool
    {
        /** @var AbstractNode|null $meta */
        $meta = $root->find('meta[charset]', 0);
        if ($meta == null) {
            return false;
        }

        $charset = $meta->getAttribute('charset');
        if ($charset === null) {
            // valueless attribute such as <meta charset>: passing null to
            // trim() would raise a TypeError under strict_types, so treat
            // it as "no charset declared"
            return false;
        }

        $encode->from(\trim($charset));
        $root->propagateEncoding($encode);

        return true;
    }
241
242
    /**
     * Reads a closing tag; the content is expected to be positioned on the
     * '/' that follows '<'.
     *
     * The tag name is copied, the rest of the tag up to and including '>' is
     * skipped, and the lower-cased name is checked against the self closing
     * list from the options. For a self closing element the DTO is created
     * with only its first argument set to true; otherwise it is created as
     * a closing DTO carrying the lower-cased tag name.
     *
     * @throws ContentLengthException
     * @throws LogicalException
     */
    private function makeEndTag(Content $content, Options $options): TagDTO
    {
        $name = $content->fastForward(1)
            ->copyByToken(StringToken::SLASH(), true);

        // move past the end of the tag
        $content->copyUntil('>');
        $content->fastForward(1);

        $name = \strtolower($name);

        // a closing tag for a self closing element does not count
        return \in_array($name, $options->getSelfClosing(), true)
            ? TagDTO::makeFromPrimitives(true)
            : TagDTO::makeFromPrimitives(true, true, null, $name);
    }
262
263
    /**
264
     * @param string|Tag $tag
265
     *
266
     * @throws ContentLengthException
267
     * @throws LogicalException
268
     * @throws StrictException
269
     */
270 282
    private function setUpAttributes(Content $content, int $size, HtmlNode $node, Options $options, $tag): void
271
    {
272
        while (
273 282
            $content->char() != '>' &&
274 282
            $content->char() != '/'
275
        ) {
276 264
            $space = $content->skipByToken(StringToken::BLANK(), true);
277 264
            if (empty($space)) {
278
                try {
279 12
                    $content->fastForward(1);
280 3
                } catch (ContentLengthException $exception) {
281
                    // reached the end of the content
282 3
                    break;
283
                }
284 12
                continue;
285
            }
286
287 264
            $name = $content->copyByToken(StringToken::EQUAL(), true);
288 264
            if ($name == '/') {
289
                break;
290
            }
291
292 264
            if (empty($name)) {
293 120
                $content->skipByToken(StringToken::BLANK());
294 120
                continue;
295
            }
296
297 261
            $content->skipByToken(StringToken::BLANK());
298 261
            if ($content->char() == '=') {
299 255
                $content->fastForward(1)
300 255
                    ->skipByToken(StringToken::BLANK());
301 255
                switch ($content->char()) {
302 255
                    case '"':
303 240
                        $content->fastForward(1);
304 240
                        $string = $content->copyUntil('"', true);
305
                        do {
306 240
                            $moreString = $content->copyUntilUnless('"', '=>');
307 240
                            $string .= $moreString;
308 240
                        } while (\strlen($moreString) > 0 && $content->getPosition() < $size);
309 240
                        $content->fastForward(1);
310 240
                        $node->getTag()->setAttribute($name, $string);
311 240
                        break;
312 21
                    case "'":
313 18
                        $content->fastForward(1);
314 18
                        $string = $content->copyUntil("'", true);
315
                        do {
316 18
                            $moreString = $content->copyUntilUnless("'", '=>');
317 18
                            $string .= $moreString;
318 18
                        } while (\strlen($moreString) > 0 && $content->getPosition() < $size);
319 18
                        $content->fastForward(1);
320 18
                        $node->getTag()->setAttribute($name, $string, false);
321 18
                        break;
322
                    default:
323 3
                        $node->getTag()->setAttribute($name, $content->copyByToken(StringToken::ATTR(), true));
324 255
                        break;
325
                }
326
            } else {
327
                // no value attribute
328 81
                if ($options->isStrict()) {
329
                    // can't have this in strict html
330 3
                    $character = $content->getPosition();
331 3
                    throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
332
                }
333 78
                $node->getTag()->setAttribute($name, null);
334 78
                if ($content->char() != '>') {
335 21
                    $content->rewind(1);
336
                }
337
            }
338
        }
339 282
    }
340
}
341