Passed
Branch dev/3.0.0 (c487fc)
by Gilles
01:48
created

Parser::parseTag()   F

Complexity

Conditions 27
Paths 79

Size

Total Lines 153
Code Lines 102

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 92
CRAP Score 27.023

Importance

Changes 0
Metric Value
cc 27
eloc 102
c 0
b 0
f 0
nc 79
nop 3
dl 0
loc 153
ccs 92
cts 95
cp 0.9684
crap 27.023
rs 3.3333

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace PHPHtmlParser\Dom;
6
7
use PHPHtmlParser\Content;
8
use PHPHtmlParser\Contracts\Dom\ParserInterface;
9
use PHPHtmlParser\Dom\Node\AbstractNode;
10
use PHPHtmlParser\Dom\Node\HtmlNode;
11
use PHPHtmlParser\Dom\Node\TextNode;
12
use PHPHtmlParser\DTO\TagDTO;
13
use PHPHtmlParser\Enum\StringToken;
14
use PHPHtmlParser\Exceptions\ChildNotFoundException;
15
use PHPHtmlParser\Exceptions\CircularException;
16
use PHPHtmlParser\Exceptions\ContentLengthException;
17
use PHPHtmlParser\Exceptions\LogicalException;
18
use PHPHtmlParser\Exceptions\StrictException;
19
use PHPHtmlParser\Options;
20
use stringEncode\Encode;
21
22
class Parser implements ParserInterface
23
{
24
    /**
     * Parses the html held by $content into a node tree and returns its root.
     *
     * Text found before the next tag becomes a TextNode child of the currently
     * open node. An opening tag becomes an HtmlNode child and, unless it is
     * self closing, the new open node. A closing tag closes the nearest open
     * node (the current one or one of its ancestors) with the same tag name;
     * a closing tag without such a node is ignored.
     *
     * @param Options $options parser options consulted while building the tree
     * @param Content $content cursor over the html being parsed
     * @param int     $size    forwarded unchanged to parseTag()
     *
     * @return AbstractNode the synthetic 'root' node that owns the parsed tree
     *
     * @throws ChildNotFoundException
     * @throws CircularException
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    public function parse(Options $options, Content $content, int $size): AbstractNode
    {
        // add the root node
        $root = new HtmlNode('root');
        $root->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());

        $activeNode = $root;
        while ($activeNode !== null) {
            // Inside a <script> whose input was not cleaned up, only '</' ends
            // the text run; a lone '<' is treated as part of the script text.
            $insideRawScript = $activeNode->getTag()->name() === 'script'
                && $options->isCleanupInput() !== true;
            $str = $content->copyUntil($insideRawScript ? '</' : '<');

            if ($str != '') {
                if ($options->isWhitespaceTextNode() || \trim($str) != '') {
                    // we found text we care about
                    $textNode = new TextNode($str, $options->isRemoveDoubleSpace());
                    $textNode->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
                    $activeNode->addChild($textNode);
                }
                continue;
            }

            $tagDTO = $this->parseTag($options, $content, $size);
            if (!$tagDTO->isStatus()) {
                // we are done here
                break;
            }

            if ($tagDTO->isClosing()) {
                // Walk up from the open node looking for the element this tag
                // closes. When none is found the open node is left untouched.
                $closedNode = $activeNode;
                while ($closedNode !== null
                    && $closedNode->getTag()->name() != $tagDTO->getTag()
                ) {
                    $closedNode = $closedNode->getParent();
                }
                if ($closedNode !== null) {
                    $activeNode = $closedNode->getParent();
                }
                continue;
            }

            /** @var AbstractNode|null $node */
            $node = $tagDTO->getNode();
            if ($node === null) {
                // the tag was consumed but produced nothing to attach
                continue;
            }

            $activeNode->addChild($node);

            // only tags that are not self closing can receive children
            if (!$node->getTag()->isSelfClosing()) {
                $activeNode = $node;
            }
        }

        return $root;
    }
98
99
    /**
     * Attempts to parse a single tag starting at the current content position.
     *
     * An empty TagDTO is returned when the cursor is not on '<', when the
     * content ends right after '<', or when '<' is not followed by a tag name.
     * A closing tag yields a DTO flagged as closing that carries the
     * lower-cased tag name, unless the name is listed in
     * Options::getSelfClosing(), in which case only the status flag is set.
     * Any other tag yields a DTO carrying a new HtmlNode with its attributes.
     *
     * @param Options $options parser options (strict mode, self closing and no-slash tag lists, entity decoding)
     * @param Content $content cursor over the html; it is advanced past the tag
     * @param int     $size    position bound used while reading quoted attribute values
     *
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    private function parseTag(Options $options, Content $content, int $size): TagDTO
    {
        if ($content->char() != '<') {
            // we are not at the beginning of a tag
            return new TagDTO();
        }

        try {
            $content->fastForward(1);
        } catch (ContentLengthException $exception) {
            // we are at the end of the file
            return new TagDTO();
        }

        $marker = $content->char();
        if ($marker == '/') {
            return $this->parseClosingTag($options, $content);
        }

        if ($marker == '?') {
            // special setting tag
            $tag = $content->fastForward(1)
                ->copyByToken(StringToken::SLASH(), true);
            $tag = (new Tag($tag))
                ->setOpening('<?')
                ->setClosing(' ?>')
                ->selfClosing();
        } else {
            $tag = \strtolower($content->copyByToken(StringToken::SLASH(), true));
            if (\trim($tag) == '') {
                // no tag found, invalid < found
                return new TagDTO();
            }
        }

        $node = new HtmlNode($tag);
        $node->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());

        $this->parseAttributes($options, $content, $node, $size);
        $this->finishOpeningTag($options, $content, $node);

        return new TagDTO(['status' => true, 'node' => $node]);
    }

    /**
     * Consumes a closing tag; the cursor must be on the '/' that follows '<'.
     *
     * @throws ContentLengthException
     */
    private function parseClosingTag(Options $options, Content $content): TagDTO
    {
        $tag = $content->fastForward(1)
            ->copyByToken(StringToken::SLASH(), true);

        // move to end of tag
        $content->copyUntil('>');
        $content->fastForward(1);

        $tag = \strtolower($tag);
        if (\in_array($tag, $options->getSelfClosing(), true)) {
            // a closing tag for a self closing element does not count as closing
            return new TagDTO(['status' => true]);
        }

        return new TagDTO(['status' => true, 'closing' => true, 'tag' => $tag]);
    }

    /**
     * Reads attributes into the node's tag until '>' or '/' is reached.
     *
     * @throws ContentLengthException
     * @throws StrictException when strict mode is on and an attribute has no value
     */
    private function parseAttributes(Options $options, Content $content, HtmlNode $node, int $size): void
    {
        while (
            $content->char() != '>' &&
            $content->char() != '/'
        ) {
            $space = $content->skipByToken(StringToken::BLANK(), true);
            if (empty($space)) {
                // not on a blank: step over one character and try again
                try {
                    $content->fastForward(1);
                } catch (ContentLengthException $exception) {
                    // reached the end of the content
                    break;
                }
                continue;
            }

            $name = $content->copyByToken(StringToken::EQUAL(), true);
            if ($name == '/') {
                break;
            }

            if (empty($name)) {
                $content->skipByToken(StringToken::BLANK());
                continue;
            }

            $content->skipByToken(StringToken::BLANK());
            if ($content->char() != '=') {
                // no value attribute
                if ($options->isStrict()) {
                    // can't have this in strict html
                    $character = $content->getPosition();
                    // The name is read from the node, as the "is not self
                    // closing" message does, so no Tag object is interpolated.
                    $tagName = $node->getTag()->name();
                    throw new StrictException("Tag '$tagName' has an attribute '$name' with out a value! (character #$character)");
                }
                $node->getTag()->setAttribute($name, null);
                if ($content->char() != '>') {
                    $content->rewind(1);
                }
                continue;
            }

            $content->fastForward(1)
                ->skipByToken(StringToken::BLANK());
            switch ($content->char()) {
                case '"':
                    $node->getTag()->setAttribute($name, $this->readQuotedValue($content, '"', $size));
                    break;
                case "'":
                    // third argument false marks the value as not double quoted
                    $node->getTag()->setAttribute($name, $this->readQuotedValue($content, "'", $size), false);
                    break;
                default:
                    $node->getTag()->setAttribute($name, $content->copyByToken(StringToken::ATTR(), true));
                    break;
            }
        }
    }

    /**
     * Reads a quoted attribute value; the cursor must be on the opening quote.
     *
     * After the first run up to $quote, further runs are appended for as long
     * as copyUntilUnless() keeps returning text and the position stays below
     * $size. The cursor is then moved one character forward.
     *
     * @throws ContentLengthException
     */
    private function readQuotedValue(Content $content, string $quote, int $size): string
    {
        $content->fastForward(1);
        $value = $content->copyUntil($quote, true);
        do {
            $more = $content->copyUntilUnless($quote, '=>');
            $value .= $more;
        } while (\strlen($more) > 0 && $content->getPosition() < $size);
        $content->fastForward(1);

        return $value;
    }

    /**
     * Applies the self closing rules to the node and moves past the tag's end.
     *
     * @throws ContentLengthException
     * @throws StrictException when strict mode is on and a tag listed as self closing lacks the '/'
     */
    private function finishOpeningTag(Options $options, Content $content, HtmlNode $node): void
    {
        $content->skipByToken(StringToken::BLANK());
        if ($content->char() == '/') {
            // self closing tag
            $node->getTag()->selfClosing();
            $content->fastForward(1);
        } elseif (\in_array($node->getTag()->name(), $options->getSelfClosing(), true)) {
            // Should be a self closing tag, check if we are strict
            if ($options->isStrict()) {
                $character = $content->getPosition();
                throw new StrictException("Tag '".$node->getTag()->name()."' is not self closing! (character #$character)");
            }

            // We force self closing on this tag.
            $node->getTag()->selfClosing();

            // Should this tag use a trailing slash?
            if (\in_array($node->getTag()->name(), $options->getNoSlash(), true)) {
                $node->getTag()->noTrailingSlash();
            }
        }

        if ($content->canFastForward(1)) {
            $content->fastForward(1);
        }
    }
261
262
    /**
     * Attempts to detect the charset that the html was sent in.
     *
     * The charset is looked up in a meta[http-equiv=Content-Type] element
     * first and, when there is none, in a meta[charset] element. When nothing
     * is found, an Encode configured with $defaultCharset as both source and
     * target is propagated to $root instead.
     *
     * @param Options      $options        supplies the optional enforced encoding
     * @param string       $defaultCharset charset used as the target, and as the source when none is detected
     * @param AbstractNode $root           root of the parsed tree to search and to propagate the encoding to
     *
     * @return bool true when a charset was found in the document, false otherwise
     *
     * @throws ChildNotFoundException
     */
    public function detectCharset(Options $options, string $defaultCharset, AbstractNode $root): bool
    {
        // set the default
        $encode = new Encode();
        $encode->from($defaultCharset);
        $encode->to($defaultCharset);

        $enforceEncoding = $options->getEnforceEncoding();
        if ($enforceEncoding !== null) {
            //  they want to enforce the given encoding
            // NOTE(review): unlike every other path, this one returns without
            // propagating $encode to $root - confirm that this is intended.
            $encode->from($enforceEncoding);
            $encode->to($enforceEncoding);

            return false;
        }

        /** @var AbstractNode|null $meta */
        $meta = $root->find('meta[http-equiv=Content-Type]', 0);
        if ($meta == null) {
            if ($this->detectHTML5Charset($encode, $root)) {
                return true;
            }

            // could not find meta tag
            $root->propagateEncoding($encode);

            return false;
        }

        $content = $meta->getAttribute('content');
        $matches = [];
        // The parameter name is matched case-insensitively so that spellings
        // such as "Charset=" are recognised as well.
        $found = $content !== null
            && \preg_match('/charset=([^;]+)/i', $content, $matches) === 1;
        if ($found) {
            $encode->from(\trim($matches[1]));
        }

        // with or without a detected charset, the tree receives the encoder
        $root->propagateEncoding($encode);

        return $found;
    }
315
316
    /**
     * Attempts to detect the charset from a meta[charset] element.
     *
     * On success the charset becomes the source encoding of $encode, which is
     * then propagated to $root. Nothing is changed when no usable charset is
     * found, so the caller decides what to propagate instead.
     *
     * @param Encode       $encode encoder that receives the detected source charset
     * @param AbstractNode $root   root of the parsed tree to search
     *
     * @return bool true when a non-blank charset was found and applied
     *
     * @throws ChildNotFoundException
     */
    private function detectHTML5Charset(Encode $encode, AbstractNode $root): bool
    {
        /** @var AbstractNode|null $meta */
        $meta = $root->find('meta[charset]', 0);
        if ($meta == null) {
            return false;
        }

        // An attribute stored without a value comes back as null, and under
        // strict_types trim(null) raises a TypeError. A missing or blank value
        // is therefore treated as "no charset found".
        $charset = $meta->getAttribute('charset');
        if ($charset === null || \trim($charset) === '') {
            return false;
        }

        $encode->from(\trim($charset));
        $root->propagateEncoding($encode);

        return true;
    }
332
}
333