Passed
Push — master ( 668c77...c11634 )
by Gilles
02:19
created

Parser::parse()   C

Complexity

Conditions 15
Paths 25

Size

Total Lines 64
Code Lines 38

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 38
CRAP Score 15

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 15
eloc 38
c 1
b 0
f 0
nc 25
nop 3
dl 0
loc 64
ccs 38
cts 38
cp 1
crap 15
rs 5.9166

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace PHPHtmlParser\Dom;
6
7
use PHPHtmlParser\Content;
8
use PHPHtmlParser\Contracts\Dom\ParserInterface;
9
use PHPHtmlParser\Dom\Node\AbstractNode;
10
use PHPHtmlParser\Dom\Node\HtmlNode;
11
use PHPHtmlParser\Dom\Node\TextNode;
12
use PHPHtmlParser\DTO\TagDTO;
13
use PHPHtmlParser\Enum\StringToken;
14
use PHPHtmlParser\Exceptions\ChildNotFoundException;
15
use PHPHtmlParser\Exceptions\CircularException;
16
use PHPHtmlParser\Exceptions\ContentLengthException;
17
use PHPHtmlParser\Exceptions\LogicalException;
18
use PHPHtmlParser\Exceptions\StrictException;
19
use PHPHtmlParser\Options;
20
use stringEncode\Encode;
21
22
class Parser implements ParserInterface
23
{
24
    /**
     * Attempts to parse the html in content.
     *
     * Builds a tree below a synthetic 'root' node by alternately copying text
     * up to the next tag and parsing that tag. The "active" node is the element
     * new children are attached to; opening tags descend into it and closing
     * tags climb back out of it.
     *
     * @param Options $options parser options consulted while building the tree
     * @param Content $content cursor over the raw html being consumed
     * @param int     $size    forwarded unchanged to the tag parser
     *
     * @return AbstractNode the synthetic root node of the parsed tree
     *
     * @throws ChildNotFoundException
     * @throws CircularException
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    public function parse(Options $options, Content $content, int $size): AbstractNode
    {
        // add the root node
        $root = new HtmlNode('root');
        $root->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());

        $activeNode = $root;
        while ($activeNode !== null) {
            // Inside a <script> whose input is not being cleaned up, a lone '<'
            // is part of the script text, so only '</' may end the text run.
            $insideRawScript = $activeNode->getTag()->name() === 'script'
                && $options->isCleanupInput() !== true;
            $str = $content->copyUntil($insideRawScript ? '</' : '<');

            if ($str !== '') {
                // we found text; keep it unless it is ignorable whitespace
                if ($options->isWhitespaceTextNode() || \trim($str) !== '') {
                    $textNode = new TextNode($str, $options->isRemoveDoubleSpace());
                    $textNode->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
                    $activeNode->addChild($textNode);
                }
                continue;
            }

            $tagDTO = $this->parseTag($options, $content, $size);
            if (!$tagDTO->isStatus()) {
                // we are done here
                break;
            }

            // check if it was a closing tag
            if ($tagDTO->isClosing()) {
                $activeNode = $this->resolveClosingTag($activeNode, $tagDTO);
                continue;
            }

            $node = $tagDTO->getNode();
            if ($node === null) {
                // the tag produced no node, nothing to attach
                continue;
            }
            $activeNode->addChild($node);

            // only descend into tags that can hold children
            if (!$node->getTag()->isSelfClosing()) {
                $activeNode = $node;
            }
        }

        return $root;
    }

    /**
     * Finds the node that becomes active after a closing tag has been read.
     *
     * Walks from the active node towards the root looking for the nearest
     * node whose tag name equals the closing tag. When one is found, its parent
     * becomes active (null when the matched node has no parent, which ends
     * parsing). When none is found, the closing tag is ignored and the active
     * node is left unchanged.
     *
     * @return AbstractNode|null the next active node, or null to stop parsing
     */
    private function resolveClosingTag(AbstractNode $activeNode, TagDTO $tagDTO): ?AbstractNode
    {
        $closingName = $tagDTO->getTag();
        for ($candidate = $activeNode; $candidate !== null; $candidate = $candidate->getParent()) {
            if ($candidate->getTag()->name() === $closingName) {
                return $candidate->getParent();
            }
        }

        // we could not find opening tag
        return $activeNode;
    }
98
99
    /**
     * Attempts to detect the charset that the html was sent in.
     *
     * Starts from an encoder converting $defaultCharset to itself, then looks
     * for a charset declaration in the document: first a
     * `<meta http-equiv=Content-Type>` element, otherwise a `<meta charset>`
     * element. Whenever the document is inspected, the resulting encoder is
     * propagated to $root.
     *
     * @param Options      $options        supplies the optional enforced encoding
     * @param string       $defaultCharset charset used when none is declared
     * @param AbstractNode $root           root of the parsed tree to search and update
     *
     * @return bool true when a charset declared in the document was applied,
     *              false when the default (or the enforced encoding) is in effect
     *
     * @throws ChildNotFoundException
     */
    public function detectCharset(Options $options, string $defaultCharset, AbstractNode $root): bool
    {
        // set the default
        $encode = new Encode();
        $encode->from($defaultCharset);
        $encode->to($defaultCharset);

        $enforceEncoding = $options->getEnforceEncoding();
        if ($enforceEncoding !== null) {
            //  they want to enforce the given encoding
            // NOTE(review): the encoder configured here is never propagated to
            // $root before returning - confirm that this is intended.
            $encode->from($enforceEncoding);
            $encode->to($enforceEncoding);

            return false;
        }

        /** @var AbstractNode $meta */
        $meta = $root->find('meta[http-equiv=Content-Type]', 0);
        if ($meta == null) {
            if ($this->detectHTML5Charset($encode, $root)) {
                return true;
            }

            // could not find meta tag
            $root->propagateEncoding($encode);

            return false;
        }

        $content = $meta->getAttribute('content');
        $matches = [];
        // The "charset" parameter name is matched case-insensitively and may be
        // surrounded by whitespace, e.g. "text/html; Charset = UTF-8".
        if ($content !== null && \preg_match('/charset\s*=\s*([^;]+)/i', $content, $matches)) {
            $encode->from(\trim($matches[1]));
            $root->propagateEncoding($encode);

            return true;
        }

        // could not find content, or no charset found in it
        $root->propagateEncoding($encode);

        return false;
    }
152
153
    /**
     * Attempt to parse a tag out of the content.
     *
     * Expects the cursor to sit on '<'. Closing tags are delegated to
     * makeEndTag(); '<?' tags and regular tags are turned into an HtmlNode
     * whose attributes are read by setUpAttributes(). An empty TagDTO is
     * returned when the cursor is not on '<', when the content ends right
     * after '<', or when no tag name follows '<'.
     *
     * @param Options $options parser options (self-closing lists, strict mode, ...)
     * @param Content $content cursor over the raw html being consumed
     * @param int     $size    forwarded unchanged to setUpAttributes()
     *
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    private function parseTag(Options $options, Content $content, int $size): TagDTO
    {
        if ($content->char() !== '<') {
            // we are not at the beginning of a tag
            return new TagDTO();
        }

        try {
            $content->fastForward(1);
        } catch (ContentLengthException $exception) {
            // we are at the end of the file
            return new TagDTO();
        }

        $marker = $content->char();
        if ($marker === '/') {
            // this is a closing tag
            return $this->makeEndTag($content, $options);
        }

        if ($marker === '?') {
            // special setting tag
            $cursor = $content->fastForward(1);
            $settingTag = new Tag($cursor->copyByToken(StringToken::SLASH(), true));
            $tag = $settingTag
                ->setOpening('<?')
                ->setClosing(' ?>')
                ->selfClosing();
        } else {
            $tag = \strtolower($content->copyByToken(StringToken::SLASH(), true));
            if (\trim($tag) === '') {
                // no tag found, invalid < found
                return new TagDTO();
            }
        }

        $node = new HtmlNode($tag);
        $node->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
        $this->setUpAttributes($content, $size, $node, $options, $tag);

        $content->skipByToken(StringToken::BLANK());
        $nodeTag = $node->getTag();
        if ($content->char() === '/') {
            // self closing tag
            $nodeTag->selfClosing();
            $content->fastForward(1);
        } elseif (\in_array($nodeTag->name(), $options->getSelfClosing(), true)) {
            // Should be a self closing tag, check if we are strict
            if ($options->isStrict()) {
                $character = $content->getPosition();
                throw new StrictException("Tag '" . $nodeTag->name() . "' is not self closing! (character #$character)");
            }

            // We force self closing on this tag.
            $nodeTag->selfClosing();

            // Should this tag use a trailing slash?
            if (\in_array($nodeTag->name(), $options->getNoSlash(), true)) {
                $nodeTag->noTrailingSlash();
            }
        }

        // step over the character that ended the tag, when there is one
        if ($content->canFastForward(1)) {
            $content->fastForward(1);
        }

        return new TagDTO([
            'status' => true,
            'node' => $node,
        ]);
    }
228
229
    /**
     * Looks for an HTML5 style `<meta charset="...">` declaration.
     *
     * When the first matching element carries a non-blank charset value, that
     * value is set as the encoder's source charset and the encoder is
     * propagated to $root. A missing element, a charset attribute without a
     * value, or a blank value all count as "not found"; in that case neither
     * $encode nor $root is touched.
     *
     * @param Encode       $encode encoder to update with the detected charset
     * @param AbstractNode $root   root of the parsed tree to search and update
     *
     * @return bool true when a charset was found and applied, false otherwise
     *
     * @throws ChildNotFoundException
     */
    private function detectHTML5Charset(Encode $encode, AbstractNode $root): bool
    {
        /** @var AbstractNode|null $meta */
        $meta = $root->find('meta[charset]', 0);
        if ($meta == null) {
            return false;
        }

        // The attribute can be present without a value (e.g. `<meta charset>`);
        // passing that null to trim() would raise a TypeError under strict types.
        $charset = $meta->getAttribute('charset');
        if ($charset === null) {
            return false;
        }

        $charset = \trim($charset);
        if ($charset === '') {
            return false;
        }

        $encode->from($charset);
        $root->propagateEncoding($encode);

        return true;
    }
245
246
    /**
     * Consumes a closing tag and describes it as a TagDTO.
     *
     * The cursor is expected on the '/' of '</name>'. The tag name is read and
     * lower-cased, and the cursor is moved past the terminating '>'. A closing
     * tag whose name is in the configured self-closing list yields a DTO with
     * only 'status' set; any other closing tag yields a DTO flagged as closing
     * that carries the lower-cased name.
     *
     * @param Content $content cursor over the raw html being consumed
     * @param Options $options supplies the list of self-closing tag names
     *
     * @throws ContentLengthException
     * @throws LogicalException
     */
    private function makeEndTag(Content $content, Options $options): TagDTO
    {
        $name = \strtolower(
            $content->fastForward(1)->copyByToken(StringToken::SLASH(), true)
        );

        // move to end of tag
        $content->copyUntil('>');
        $content->fastForward(1);

        // check if this closing tag counts
        if (\in_array($name, $options->getSelfClosing(), true)) {
            return new TagDTO(['status' => true]);
        }

        return new TagDTO([
            'status' => true,
            'closing' => true,
            'tag' => $name,
        ]);
    }
272
273
    /**
     * Reads the attributes of the tag currently being parsed into $node.
     *
     * Consumes content until the cursor reaches '>' or '/', storing every
     * attribute it finds on the node's tag. Values may be double quoted,
     * single quoted or unquoted; attributes without a value are stored as
     * null unless strict mode is enabled.
     *
     * @param Content    $content cursor over the raw html being consumed
     * @param int        $size    upper bound for the cursor position while a
     *                            quoted value is being collected
     * @param HtmlNode   $node    node whose tag receives the attributes
     * @param Options    $options supplies the strict-mode flag
     * @param string|Tag $tag     tag being parsed, used in the strict error message
     *
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    private function setUpAttributes(Content $content, int $size, HtmlNode $node, Options $options, $tag): void
    {
        while (!\in_array($content->char(), ['>', '/'], true)) {
            $space = $content->skipByToken(StringToken::BLANK(), true);
            if (empty($space)) {
                // no separator here: step one character ahead and look again
                try {
                    $content->fastForward(1);
                } catch (ContentLengthException $exception) {
                    // reached the end of the content
                    break;
                }
                continue;
            }

            $name = $content->copyByToken(StringToken::EQUAL(), true);
            if ($name === '/') {
                break;
            }

            if (empty($name)) {
                $content->skipByToken(StringToken::BLANK());
                continue;
            }

            $content->skipByToken(StringToken::BLANK());
            if ($content->char() !== '=') {
                // no value attribute
                if ($options->isStrict()) {
                    // can't have this in strict html
                    $character = $content->getPosition();
                    throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
                }
                $node->getTag()->setAttribute($name, null);
                if ($content->char() !== '>') {
                    $content->rewind(1);
                }
                continue;
            }

            $content->fastForward(1)
                ->skipByToken(StringToken::BLANK());
            $quote = $content->char();
            if ($quote === '"') {
                $value = $this->readQuotedAttributeValue($content, $size, $quote);
                $node->getTag()->setAttribute($name, $value);
            } elseif ($quote === "'") {
                $value = $this->readQuotedAttributeValue($content, $size, $quote);
                $node->getTag()->setAttribute($name, $value, false);
            } else {
                // unquoted value
                $node->getTag()->setAttribute($name, $content->copyByToken(StringToken::ATTR(), true));
            }
        }
    }

    /**
     * Collects a quoted attribute value; the cursor must sit on the opening quote.
     *
     * After the first run up to $quote, further runs are appended for as long
     * as copyUntilUnless() keeps returning text and the cursor position stays
     * below $size. The cursor is finally moved one character ahead.
     */
    private function readQuotedAttributeValue(Content $content, int $size, string $quote): string
    {
        $content->fastForward(1);
        $value = $content->copyUntil($quote, true);
        do {
            $chunk = $content->copyUntilUnless($quote, '=>');
            $value .= $chunk;
        } while (\strlen($chunk) > 0 && $content->getPosition() < $size);
        $content->fastForward(1);

        return $value;
    }
350
}
351