Passed
Branch dev/3.0.0 (c487fc)
by Gilles
01:48
created

Parser::parse()   C

Complexity

Conditions 15
Paths 25

Size

Total Lines 64
Code Lines 38

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 38
CRAP Score 15

Importance

Changes 0
Metric Value
cc 15
eloc 38
c 0
b 0
f 0
nc 25
nop 3
dl 0
loc 64
ccs 38
cts 38
cp 1
crap 15
rs 5.9166

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace PHPHtmlParser\Dom;
6
7
use PHPHtmlParser\Content;
8
use PHPHtmlParser\Contracts\Dom\ParserInterface;
9
use PHPHtmlParser\Dom\Node\AbstractNode;
10
use PHPHtmlParser\Dom\Node\HtmlNode;
11
use PHPHtmlParser\Dom\Node\TextNode;
12
use PHPHtmlParser\DTO\TagDTO;
13
use PHPHtmlParser\Enum\StringToken;
14
use PHPHtmlParser\Exceptions\ChildNotFoundException;
15
use PHPHtmlParser\Exceptions\CircularException;
16
use PHPHtmlParser\Exceptions\ContentLengthException;
17
use PHPHtmlParser\Exceptions\LogicalException;
18
use PHPHtmlParser\Exceptions\StrictException;
19
use PHPHtmlParser\Options;
20
use stringEncode\Encode;
21
22
class Parser implements ParserInterface
23
{
24
    /**
     * Attempts to parse the html in content.
     *
     * Builds a node tree below a synthetic 'root' HtmlNode. The loop
     * alternates between copying the text that precedes the next tag and
     * parsing that tag via parseTag(). It stops when parseTag() returns a
     * TagDTO whose status is not set, or when a closing tag moves the active
     * node above the root.
     *
     * @param Options $options parser options consulted for special-chars
     *                         decoding, input cleanup and whitespace handling
     * @param Content $content cursor over the html being parsed
     * @param int     $size    forwarded unchanged to parseTag(), which uses it
     *                         as an upper bound for the content position
     *
     * @return AbstractNode the synthetic root node
     *
     * @throws ChildNotFoundException
     * @throws CircularException
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    public function parse(Options $options, Content $content, int $size): AbstractNode
    {
        // add the root node
        $root = new HtmlNode('root');
        $root->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
        $activeNode = $root;
        while ($activeNode !== null) {
            // Inside a <script> whose input was not cleaned up, only stop at
            // '</' so that a bare '<' in the script body stays part of the text.
            $insideRawScript = $activeNode->getTag()->name() === 'script'
                && $options->isCleanupInput() !== true;
            $str = $content->copyUntil($insideRawScript ? '</' : '<');

            if ($str == '') {
                $tagDTO = $this->parseTag($options, $content, $size);
                if (!$tagDTO->isStatus()) {
                    // we are done here
                    break;
                }

                // check if it was a closing tag
                if ($tagDTO->isClosing()) {
                    $activeNode = $this->resolveClosingTag($activeNode, $tagDTO);
                    continue;
                }

                /** @var AbstractNode|null $node */
                $node = $tagDTO->getNode();
                if ($node === null) {
                    // a tag was consumed but produced no node to attach
                    continue;
                }
                $activeNode->addChild($node);

                // only descend into nodes that can have children
                if (!$node->getTag()->isSelfClosing()) {
                    $activeNode = $node;
                }
            } elseif ($options->isWhitespaceTextNode() ||
                \trim($str) != ''
            ) {
                // we found text we care about
                $textNode = new TextNode($str, $options->isRemoveDoubleSpace());
                $textNode->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
                $activeNode->addChild($textNode);
            }
        }

        return $root;
    }

    /**
     * Determines the node that becomes active after a closing tag was read.
     *
     * Walks from the active node up through its ancestors looking for a node
     * whose tag name equals the closing tag's name. When one is found, its
     * parent becomes active (this may be null when the match has no parent).
     * When no ancestor matches, the closing tag is ignored and the active
     * node is returned unchanged.
     *
     * @param AbstractNode $activeNode node that was active when the closing tag was read
     * @param TagDTO       $tagDTO     the parsed closing tag
     *
     * @return AbstractNode|null the node to continue parsing under
     */
    private function resolveClosingTag(AbstractNode $activeNode, TagDTO $tagDTO)
    {
        $candidate = $activeNode;
        while ($candidate->getTag()->name() != $tagDTO->getTag()) {
            $candidate = $candidate->getParent();
            if ($candidate === null) {
                // we could not find opening tag
                return $activeNode;
            }
        }

        return $candidate->getParent();
    }
98
99
    /**
     * Attempt to parse a tag out of the content.
     *
     * Handles three forms once the cursor sits on '<': closing tags
     * (`</name>`), `<?` tags, and regular opening tags with their attributes.
     *
     * @param Options $options parser options (self closing tag list, strict
     *                         mode, no-slash list, special-chars decoding)
     * @param Content $content cursor over the html being parsed
     * @param int     $size    upper bound for the content position while
     *                         reading quoted attribute values
     *
     * @return TagDTO built without arguments when no tag could be read; built
     *                with 'status' only for the closing tag of a self closing
     *                element; with 'status', 'closing' and 'tag' for any other
     *                closing tag; with 'status' and 'node' for an opening tag
     *
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    private function parseTag(Options $options, Content $content, int $size): TagDTO
    {
        if ($content->char() != '<') {
            // we are not at the beginning of a tag
            return new TagDTO();
        }

        // check if this is a closing tag
        try {
            $content->fastForward(1);
        } catch (ContentLengthException $exception) {
            // we are at the end of the file
            return new TagDTO();
        }
        if ($content->char() == '/') {
            // end tag
            $tag = $content->fastForward(1)
                ->copyByToken(StringToken::SLASH(), true);
            // move to end of tag
            $content->copyUntil('>');
            $content->fastForward(1);

            // check if this closing tag counts
            $tag = \strtolower($tag);
            if (\in_array($tag, $options->getSelfClosing(), true)) {
                // closing tag of a self closing element: there is nothing to close
                return new TagDTO(['status' => true]);
            }

            return new TagDTO([
                'status' => true,
                'closing' => true,
                'tag' => $tag,
            ]);
        }

        if ($content->char() == '?') {
            // special setting tag
            $tag = $content->fastForward(1)
                ->copyByToken(StringToken::SLASH(), true);
            $tag = (new Tag($tag))
                ->setOpening('<?')
                ->setClosing(' ?>')
                ->selfClosing();
        } else {
            $tag = \strtolower($content->copyByToken(StringToken::SLASH(), true));
            if (\trim($tag) == '') {
                // no tag found, invalid < found
                return new TagDTO();
            }
        }
        $node = new HtmlNode($tag);
        $node->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());

        // attributes
        while (
            $content->char() != '>' &&
            $content->char() != '/'
        ) {
            $space = $content->skipByToken(StringToken::BLANK(), true);
            if (empty($space)) {
                // no blank separator here: step over one character and retry
                try {
                    $content->fastForward(1);
                } catch (ContentLengthException $exception) {
                    // reached the end of the content
                    break;
                }
                continue;
            }

            $name = $content->copyByToken(StringToken::EQUAL(), true);
            if ($name == '/') {
                break;
            }

            if (empty($name)) {
                $content->skipByToken(StringToken::BLANK());
                continue;
            }

            $content->skipByToken(StringToken::BLANK());
            if ($content->char() == '=') {
                $content->fastForward(1)
                    ->skipByToken(StringToken::BLANK());
                switch ($content->char()) {
                    case '"':
                        $value = $this->readQuotedAttributeValue($content, '"', $size);
                        $node->getTag()->setAttribute($name, $value);
                        break;
                    case "'":
                        $value = $this->readQuotedAttributeValue($content, "'", $size);
                        // third argument false marks the value as not double quoted
                        // NOTE(review): inferred from this call site only — confirm against Tag::setAttribute
                        $node->getTag()->setAttribute($name, $value, false);
                        break;
                    default:
                        // unquoted value
                        $node->getTag()->setAttribute($name, $content->copyByToken(StringToken::ATTR(), true));
                        break;
                }
            } else {
                // no value attribute
                if ($options->isStrict()) {
                    // can't have this in strict html
                    $character = $content->getPosition();
                    throw new StrictException("Tag '$tag' has an attribute '$name' with out a value! (character #$character)");
                }
                $node->getTag()->setAttribute($name, null);
                if ($content->char() != '>') {
                    // step back so the next iteration sees the character that ended the name
                    $content->rewind(1);
                }
            }
        }

        $content->skipByToken(StringToken::BLANK());
        if ($content->char() == '/') {
            // self closing tag
            $node->getTag()->selfClosing();
            $content->fastForward(1);
        } elseif (\in_array($node->getTag()->name(), $options->getSelfClosing(), true)) {
            // Should be a self closing tag, check if we are strict
            if ($options->isStrict()) {
                $character = $content->getPosition();
                throw new StrictException("Tag '".$node->getTag()->name()."' is not self closing! (character #$character)");
            }

            // We force self closing on this tag.
            $node->getTag()->selfClosing();

            // Should this tag use a trailing slash?
            if (\in_array($node->getTag()->name(), $options->getNoSlash(), true)) {
                $node->getTag()->noTrailingSlash();
            }
        }

        // step past the closing '>' unless the content ends here
        if ($content->canFastForward(1)) {
            $content->fastForward(1);
        }

        return new TagDTO([
            'status' => true,
            'node' => $node,
        ]);
    }

    /**
     * Reads a quoted attribute value; the cursor must sit on the opening quote.
     *
     * Copies up to the next quote, then keeps appending whatever
     * copyUntilUnless() returns until it yields an empty string or the
     * position reaches $size. Finally steps over the closing quote.
     *
     * @param Content $content cursor positioned on the opening quote
     * @param string  $quote   the quote character delimiting the value
     * @param int     $size    upper bound for the content position
     *
     * @return string the attribute value without its surrounding quotes
     *
     * @throws ContentLengthException
     */
    private function readQuotedAttributeValue(Content $content, string $quote, int $size): string
    {
        $content->fastForward(1);
        $value = $content->copyUntil($quote, true);
        do {
            $more = $content->copyUntilUnless($quote, '=>');
            $value .= $more;
        } while (\strlen($more) > 0 && $content->getPosition() < $size);
        $content->fastForward(1);

        return $value;
    }
261
262
    /**
     * Attempts to detect the charset that the html was sent in.
     *
     * Looks for a `meta[http-equiv=Content-Type]` node and reads the charset
     * from its `content` attribute. Without such a node it falls back to
     * detectHTML5Charset(). Whenever no charset is found, an encoder with
     * from and to both set to $defaultCharset is propagated to $root.
     *
     * @param Options      $options        consulted for an enforced encoding
     * @param string       $defaultCharset charset used when nothing is detected
     * @param AbstractNode $root           root of the parsed tree
     *
     * @return bool true when a charset was read from a meta node, false
     *              otherwise (including when an encoding is enforced)
     *
     * @throws ChildNotFoundException
     */
    public function detectCharset(Options $options, string $defaultCharset, AbstractNode $root): bool
    {
        // start with both directions set to the default charset
        $encode = new Encode();
        $encode->from($defaultCharset);
        $encode->to($defaultCharset);

        $forcedEncoding = $options->getEnforceEncoding();
        if ($forcedEncoding !== null) {
            // they want to enforce the given encoding
            // NOTE(review): $encode is configured here but never propagated
            // to $root before returning — confirm this is intentional.
            $encode->from($forcedEncoding);
            $encode->to($forcedEncoding);

            return false;
        }

        /** @var AbstractNode $metaNode */
        $metaNode = $root->find('meta[http-equiv=Content-Type]', 0);
        if ($metaNode == null) {
            if ($this->detectHTML5Charset($encode, $root)) {
                return true;
            }

            // could not find meta tag
            $root->propagateEncoding($encode);

            return false;
        }

        $contentType = $metaNode->getAttribute('content');
        $found = [];
        if ($contentType !== null && \preg_match('/charset=([^;]+)/', $contentType, $found)) {
            $encode->from(\trim($found[1]));
            $root->propagateEncoding($encode);

            return true;
        }

        // either the content attribute is missing or it carries no charset
        $root->propagateEncoding($encode);

        return false;
    }
315
316
    /**
     * Attempts to detect the charset from an HTML5 `<meta charset>` node.
     *
     * On success the charset is set as the source of $encode and $encode is
     * propagated to $root. On failure nothing is propagated; that is left to
     * the caller.
     *
     * @param Encode       $encode encoder to configure with the detected charset
     * @param AbstractNode $root   root of the parsed tree
     *
     * @return bool true when a charset value was found, false otherwise
     *
     * @throws ChildNotFoundException
     */
    private function detectHTML5Charset(Encode $encode, AbstractNode $root): bool
    {
        /** @var AbstractNode|null $meta */
        $meta = $root->find('meta[charset]', 0);
        if ($meta == null) {
            return false;
        }

        // The attribute may be present without a value (e.g. `<meta charset>`),
        // in which case there is nothing to trim; passing a non-string to
        // trim() would raise a TypeError under strict_types.
        $charset = $meta->getAttribute('charset');
        if (!\is_string($charset)) {
            return false;
        }

        $encode->from(\trim($charset));
        $root->propagateEncoding($encode);

        return true;
    }
332
}
333