1
|
|
|
<?php
|
2
|
|
|
|
3
|
|
|
declare(strict_types=1);
|
4
|
|
|
|
5
|
|
|
namespace PHPHtmlParser\Dom;
|
6
|
|
|
|
7
|
|
|
use PHPHtmlParser\Content;
|
8
|
|
|
use PHPHtmlParser\Contracts\Dom\ParserInterface;
|
9
|
|
|
use PHPHtmlParser\Dom\Node\AbstractNode;
|
10
|
|
|
use PHPHtmlParser\Dom\Node\HtmlNode;
|
11
|
|
|
use PHPHtmlParser\Dom\Node\TextNode;
|
12
|
|
|
use PHPHtmlParser\DTO\TagDTO;
|
13
|
|
|
use PHPHtmlParser\Enum\StringToken;
|
14
|
|
|
use PHPHtmlParser\Exceptions\ChildNotFoundException;
|
15
|
|
|
use PHPHtmlParser\Exceptions\CircularException;
|
16
|
|
|
use PHPHtmlParser\Exceptions\ContentLengthException;
|
17
|
|
|
use PHPHtmlParser\Exceptions\LogicalException;
|
18
|
|
|
use PHPHtmlParser\Exceptions\StrictException;
|
19
|
|
|
use PHPHtmlParser\Options;
|
20
|
|
|
use stringEncode\Encode;
|
21
|
|
|
|
22
|
|
|
class Parser implements ParserInterface
{
    /**
     * Parses the html held by $content into a node tree.
     *
     * The tree is rooted at a synthetic HtmlNode named "root". Content is
     * consumed alternately as text (everything up to the next tag start) and
     * as tags (via parseTag()) until parseTag() reports that no further tag
     * could be read.
     *
     * @param Options $options parsing options (input cleanup, whitespace
     *                         handling, strictness, self-closing tags, ...)
     * @param Content $content cursor over the html being parsed
     * @param int     $size    upper bound for the content position while
     *                         quoted attribute values are read; presumably
     *                         the content length - confirm with the caller
     *
     * @return AbstractNode the synthetic root node
     *
     * @throws ChildNotFoundException
     * @throws CircularException
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException
     */
    public function parse(Options $options, Content $content, int $size): AbstractNode
    {
        // add the root node
        $root = new HtmlNode('root');
        $root->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
        $activeNode = $root;
        while ($activeNode !== null) {
            // Inside a <script> whose input was not cleaned up, only "</" ends
            // the text run, so a bare "<" in the script body is kept as text.
            if ($activeNode->getTag()->name() === 'script'
                && $options->isCleanupInput() !== true
            ) {
                $str = $content->copyUntil('</');
            } else {
                $str = $content->copyUntil('<');
            }

            if ($str == '') {
                // no text in front of the cursor, so try to read a tag
                $tagDTO = $this->parseTag($options, $content, $size);
                if (!$tagDTO->isStatus()) {
                    // we are done here
                    $activeNode = null;
                    continue;
                }

                // check if it was a closing tag
                if ($tagDTO->isClosing()) {
                    // Walk up the ancestors looking for the element this tag
                    // closes. If none matches, the closing tag is ignored and
                    // the active node is left where it was.
                    $foundOpeningTag = true;
                    $originalNode = $activeNode;
                    while ($activeNode->getTag()->name() != $tagDTO->getTag()) {
                        $activeNode = $activeNode->getParent();
                        if ($activeNode === null) {
                            // we could not find opening tag
                            $activeNode = $originalNode;
                            $foundOpeningTag = false;
                            break;
                        }
                    }
                    if ($foundOpeningTag) {
                        // the matched element is closed; continue in its parent
                        $activeNode = $activeNode->getParent();
                    }
                    continue;
                }

                /** @var AbstractNode|null $node */
                $node = $tagDTO->getNode();
                if ($node === null) {
                    // a tag was consumed but it produced no node
                    continue;
                }
                $activeNode->addChild($node);

                // only tags that are not self closing can receive children
                if (!$node->getTag()->isSelfClosing()) {
                    $activeNode = $node;
                }
            } elseif ($options->isWhitespaceTextNode() ||
                \trim($str) != ''
            ) {
                // we found text we care about
                $textNode = new TextNode($str, $options->isRemoveDoubleSpace());
                $textNode->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());
                $activeNode->addChild($textNode);
            }
        }

        return $root;
    }

    /**
     * Attempt to parse a tag out of the content.
     *
     * The returned TagDTO is built from one of the following:
     *  - no data: the cursor is not on "<", the content ended right after
     *    "<", or "<" was not followed by a tag name;
     *  - status only: a closing tag whose name is listed in the self closing
     *    tags option (the tag is consumed and produces nothing);
     *  - status, closing and tag: any other closing tag;
     *  - status and node: an opening, self closing or "<?" tag together with
     *    its attributes.
     *
     * @param Options $options parsing options
     * @param Content $content cursor over the html being parsed
     * @param int     $size    upper bound for the content position while
     *                         quoted attribute values are read
     *
     * @throws ContentLengthException
     * @throws LogicalException
     * @throws StrictException when strict mode is enabled and an attribute has
     *                         no value, or a tag listed as self closing is not
     *                         written as self closing
     */
    private function parseTag(Options $options, Content $content, int $size): TagDTO
    {
        $return = [];
        if ($content->char() != '<') {
            // we are not at the beginning of a tag
            return new TagDTO();
        }

        // check if this is a closing tag
        try {
            $content->fastForward(1);
        } catch (ContentLengthException $exception) {
            // we are at the end of the file
            return new TagDTO();
        }

        if ($content->char() == '/') {
            // end tag
            $tag = $content->fastForward(1)
                ->copyByToken(StringToken::SLASH(), true);
            // move to end of tag
            $content->copyUntil('>');
            $content->fastForward(1);

            // check if this closing tag counts
            $tag = \strtolower($tag);
            $return['status'] = true;
            if (!\in_array($tag, $options->getSelfClosing(), true)) {
                $return['closing'] = true;
                $return['tag'] = $tag;
            }

            return new TagDTO($return);
        } elseif ($content->char() == '?') {
            // special setting tag
            $tag = $content->fastForward(1)
                ->copyByToken(StringToken::SLASH(), true);
            $tag = (new Tag($tag))
                ->setOpening('<?')
                ->setClosing(' ?>')
                ->selfClosing();
        } else {
            $tag = \strtolower($content->copyByToken(StringToken::SLASH(), true));
            if (\trim($tag) == '') {
                // no tag found, invalid < found
                return new TagDTO();
            }
        }
        $node = new HtmlNode($tag);
        $node->setHtmlSpecialCharsDecode($options->isHtmlSpecialCharsDecode());

        // attributes
        while (
            $content->char() != '>' &&
            $content->char() != '/'
        ) {
            $space = $content->skipByToken(StringToken::BLANK(), true);
            if (empty($space)) {
                // nothing blank was skipped; step one character forward so
                // the loop cannot stall on the current character
                try {
                    $content->fastForward(1);
                } catch (ContentLengthException $exception) {
                    // reached the end of the content
                    break;
                }
                continue;
            }

            $name = $content->copyByToken(StringToken::EQUAL(), true);
            if ($name == '/') {
                break;
            }

            if (empty($name)) {
                $content->skipByToken(StringToken::BLANK());
                continue;
            }

            $content->skipByToken(StringToken::BLANK());
            if ($content->char() == '=') {
                $content->fastForward(1)
                    ->skipByToken(StringToken::BLANK());
                switch ($content->char()) {
                    case '"':
                        $value = $this->copyQuotedValue($content, '"', $size);
                        $node->getTag()->setAttribute($name, $value);
                        break;
                    case "'":
                        $value = $this->copyQuotedValue($content, "'", $size);
                        // third argument false: the value was not double quoted
                        $node->getTag()->setAttribute($name, $value, false);
                        break;
                    default:
                        // unquoted value
                        $node->getTag()->setAttribute($name, $content->copyByToken(StringToken::ATTR(), true));
                        break;
                }
            } else {
                // no value attribute
                if ($options->isStrict()) {
                    // can't have this in strict html
                    $character = $content->getPosition();
                    // $tag may hold a Tag object (the "<?" branch above), so
                    // the name is read from the node instead.
                    $tagName = $node->getTag()->name();
                    throw new StrictException("Tag '$tagName' has an attribute '$name' with out a value! (character #$character)");
                }
                $node->getTag()->setAttribute($name, null);
                if ($content->char() != '>') {
                    $content->rewind(1);
                }
            }
        }

        $content->skipByToken(StringToken::BLANK());
        if ($content->char() == '/') {
            // self closing tag
            $node->getTag()->selfClosing();
            $content->fastForward(1);
        } elseif (\in_array($node->getTag()->name(), $options->getSelfClosing(), true)) {
            // Should be a self closing tag, check if we are strict
            if ($options->isStrict()) {
                $character = $content->getPosition();
                throw new StrictException("Tag '".$node->getTag()->name()."' is not self closing! (character #$character)");
            }

            // We force self closing on this tag.
            $node->getTag()->selfClosing();

            // Should this tag use a trailing slash?
            if (\in_array($node->getTag()->name(), $options->getNoSlash(), true)) {
                $node->getTag()->noTrailingSlash();
            }
        }

        // step past the current character (expected to be the closing ">")
        // unless the content ends here
        if ($content->canFastForward(1)) {
            $content->fastForward(1);
        }

        $return['status'] = true;
        $return['node'] = $node;

        return new TagDTO($return);
    }

    /**
     * Reads a quoted attribute value; the cursor must be on the opening quote.
     *
     * After the first copy up to $quote, further pieces are appended with
     * copyUntilUnless($quote, '=>') until a piece comes back empty or the
     * position reaches $size. The cursor is then moved one character forward,
     * past the closing quote.
     *
     * @param Content $content cursor over the html being parsed
     * @param string  $quote   the quote character delimiting the value
     * @param int     $size    upper bound for the content position
     *
     * @return string the attribute value without the surrounding quotes
     *
     * @throws ContentLengthException
     * @throws LogicalException
     */
    private function copyQuotedValue(Content $content, string $quote, int $size): string
    {
        // step over the opening quote
        $content->fastForward(1);
        $value = $content->copyUntil($quote, true);
        do {
            $moreString = $content->copyUntilUnless($quote, '=>');
            $value .= $moreString;
        } while (\strlen($moreString) > 0 && $content->getPosition() < $size);
        // step over the closing quote
        $content->fastForward(1);

        return $value;
    }

    /**
     * Attempts to detect the charset that the html was sent in.
     *
     * Looks for a `meta[http-equiv=Content-Type]` element first and falls
     * back to `meta[charset]`. Except when an encoding is enforced through
     * the options, the resulting Encode object (source charset detected or
     * default, target charset always $defaultCharset) is propagated through
     * $root.
     *
     * @param Options      $options        parsing options
     * @param string       $defaultCharset charset used as the conversion
     *                                     target and as the source fallback
     * @param AbstractNode $root           root of the parsed tree
     *
     * @return bool true when a charset declaration was found and applied,
     *              false otherwise
     *
     * @throws ChildNotFoundException
     */
    public function detectCharset(Options $options, string $defaultCharset, AbstractNode $root): bool
    {
        // set the default
        $encode = new Encode();
        $encode->from($defaultCharset);
        $encode->to($defaultCharset);

        $enforceEncoding = $options->getEnforceEncoding();
        if ($enforceEncoding !== null) {
            // they want to enforce the given encoding
            // NOTE(review): $encode is configured here but never propagated
            // to $root before returning - confirm that this is intended.
            $encode->from($enforceEncoding);
            $encode->to($enforceEncoding);

            return false;
        }

        /** @var AbstractNode|null $meta */
        $meta = $root->find('meta[http-equiv=Content-Type]', 0);
        if ($meta == null) {
            if (!$this->detectHTML5Charset($encode, $root)) {
                // could not find meta tag
                $root->propagateEncoding($encode);

                return false;
            }

            return true;
        }

        $content = $meta->getAttribute('content');
        if (\is_null($content)) {
            // could not find content
            $root->propagateEncoding($encode);

            return false;
        }

        $matches = [];
        if (\preg_match('/charset=([^;]+)/', $content, $matches)) {
            $encode->from(\trim($matches[1]));
            $root->propagateEncoding($encode);

            return true;
        }

        // no charset found
        $root->propagateEncoding($encode);

        return false;
    }

    /**
     * Attempts to detect the charset from a `meta[charset]` element.
     *
     * When the element exists and its charset attribute has a value, that
     * value becomes the source charset of $encode and the encoding is
     * propagated through $root. Otherwise $encode and $root are left
     * untouched.
     *
     * @param Encode       $encode encoding configuration to update
     * @param AbstractNode $root   root of the parsed tree
     *
     * @return bool true when a charset was found and applied
     *
     * @throws ChildNotFoundException
     */
    private function detectHTML5Charset(Encode $encode, AbstractNode $root): bool
    {
        /** @var AbstractNode|null $meta */
        $meta = $root->find('meta[charset]', 0);
        if ($meta == null) {
            return false;
        }

        $charset = $meta->getAttribute('charset');
        if ($charset === null) {
            // A valueless attribute (<meta charset>) is stored as null;
            // passing it to trim() would raise a TypeError under strict_types.
            return false;
        }

        $encode->from(\trim($charset));
        $root->propagateEncoding($encode);

        return true;
    }
}
|
333
|
|
|
|