1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace voku\helper; |
6
|
|
|
|
7
|
|
|
/** |
8
|
|
|
* @property-read string $outerText |
9
|
|
|
* <p>Get dom node's outer html (alias for "outerHtml").</p> |
10
|
|
|
* @property-read string $outerHtml |
11
|
|
|
* <p>Get dom node's outer html.</p> |
12
|
|
|
* @property-read string $innerText |
13
|
|
|
* <p>Get dom node's inner html (alias for "innerHtml").</p> |
14
|
|
|
* @property-read string $innerHtml |
15
|
|
|
* <p>Get dom node's inner html.</p> |
16
|
|
|
* @property-read string $plaintext |
17
|
|
|
* <p>Get dom node's plain text.</p> |
18
|
|
|
* |
19
|
|
|
* @method string outerText() |
20
|
|
|
* <p>Get dom node's outer html (alias for "outerHtml()").</p> |
21
|
|
|
* @method string outerHtml() |
22
|
|
|
* <p>Get dom node's outer html.</p> |
23
|
|
|
* @method string innerText() |
24
|
|
|
* <p>Get dom node's inner html (alias for "innerHtml()").</p> |
25
|
|
|
* @method HtmlDomParser load(string $html) |
26
|
|
|
* <p>Load HTML from string.</p> |
27
|
|
|
* @method HtmlDomParser load_file(string $html) |
28
|
|
|
* <p>Load HTML from file.</p> |
29
|
|
|
* @method static HtmlDomParser file_get_html($filePath, $libXMLExtraOptions = null) |
30
|
|
|
* <p>Load HTML from file.</p> |
31
|
|
|
* @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null) |
32
|
|
|
* <p>Load HTML from string.</p> |
33
|
|
|
*/ |
34
|
|
|
class HtmlDomParser extends AbstractDomParser |
35
|
|
|
{ |
36
|
|
|
/** |
37
|
|
|
* @var callable|null |
38
|
|
|
* |
39
|
|
|
* @phpstan-var null|callable(string $cssSelectorString, string $xPathString, \DOMXPath, \voku\helper\HtmlDomParser): string |
40
|
|
|
*/ |
41
|
|
|
private $callbackXPathBeforeQuery; |
42
|
|
|
|
43
|
|
|
/** |
44
|
|
|
* @var callable|null |
45
|
|
|
* |
46
|
|
|
* @phpstan-var null|callable(string $htmlString, \voku\helper\HtmlDomParser): string |
47
|
|
|
*/ |
48
|
|
|
private $callbackBeforeCreateDom; |
49
|
|
|
|
50
|
|
|
/** |
51
|
|
|
* @var string[] |
52
|
|
|
*/ |
53
|
|
|
protected static $functionAliases = [ |
54
|
|
|
'outertext' => 'html', |
55
|
|
|
'outerhtml' => 'html', |
56
|
|
|
'innertext' => 'innerHtml', |
57
|
|
|
'innerhtml' => 'innerHtml', |
58
|
|
|
'load' => 'loadHtml', |
59
|
|
|
'load_file' => 'loadHtmlFile', |
60
|
|
|
]; |
61
|
|
|
|
62
|
|
|
/** |
63
|
|
|
* @var string[] |
64
|
|
|
*/ |
65
|
|
|
protected $templateLogicSyntaxInSpecialScriptTags = [ |
66
|
|
|
'+', |
67
|
|
|
'<%', |
68
|
|
|
'{%', |
69
|
|
|
'{{', |
70
|
|
|
]; |
71
|
|
|
|
72
|
|
|
/** |
73
|
|
|
* The special script tags are configured as a plain array of type strings. |
74
|
|
|
* |
75
|
|
|
* ```php |
76
|
|
|
* protected $specialScriptTags = [ |
77
|
|
|
* 'text/html', |
78
|
|
|
* 'text/x-custom-template', |
79
|
|
|
* 'text/x-handlebars-template' |
80
|
|
|
* ] |
81
|
|
|
* ``` |
82
|
|
|
* |
83
|
|
|
* @var string[] |
84
|
|
|
*/ |
85
|
|
|
protected $specialScriptTags = [ |
86
|
|
|
'text/html', |
87
|
|
|
'text/x-custom-template', |
88
|
|
|
'text/x-handlebars-template', |
89
|
|
|
]; |
90
|
|
|
|
91
|
|
|
/** |
92
|
|
|
* @var string[] |
93
|
|
|
*/ |
94
|
|
|
protected $selfClosingTags = [ |
95
|
|
|
'area', |
96
|
|
|
'base', |
97
|
|
|
'br', |
98
|
|
|
'col', |
99
|
|
|
'command', |
100
|
|
|
'embed', |
101
|
|
|
'hr', |
102
|
|
|
'img', |
103
|
|
|
'input', |
104
|
|
|
'keygen', |
105
|
|
|
'link', |
106
|
|
|
'meta', |
107
|
|
|
'param', |
108
|
|
|
'source', |
109
|
|
|
'track', |
110
|
|
|
'wbr', |
111
|
|
|
]; |
112
|
|
|
|
113
|
|
|
/** |
114
|
|
|
* @var bool |
115
|
|
|
*/ |
116
|
|
|
protected $isDOMDocumentCreatedWithoutHtml = false; |
117
|
|
|
|
118
|
|
|
/** |
119
|
|
|
* @var bool |
120
|
|
|
*/ |
121
|
|
|
protected $isDOMDocumentCreatedWithoutWrapper = false; |
122
|
|
|
|
123
|
|
|
/** |
124
|
|
|
* @var bool |
125
|
|
|
*/ |
126
|
|
|
protected $isDOMDocumentCreatedWithCommentWrapper = false; |
127
|
|
|
|
128
|
|
|
/** |
129
|
|
|
* @var bool |
130
|
|
|
*/ |
131
|
|
|
protected $isDOMDocumentCreatedWithoutHeadWrapper = false; |
132
|
|
|
|
133
|
|
|
/** |
134
|
|
|
* @var bool |
135
|
|
|
*/ |
136
|
|
|
protected $isDOMDocumentCreatedWithoutPTagWrapper = false; |
137
|
|
|
|
138
|
|
|
/** |
139
|
|
|
* @var bool |
140
|
|
|
*/ |
141
|
|
|
protected $isDOMDocumentCreatedWithoutHtmlWrapper = false; |
142
|
|
|
|
143
|
|
|
/** |
144
|
|
|
* @var bool |
145
|
|
|
*/ |
146
|
|
|
protected $isDOMDocumentCreatedWithoutBodyWrapper = false; |
147
|
|
|
|
148
|
|
|
/** |
149
|
|
|
* @var bool |
150
|
|
|
*/ |
151
|
|
|
protected $isDOMDocumentCreatedWithFakeEndScript = false; |
152
|
|
|
|
153
|
|
|
/** |
154
|
|
|
* @var bool |
155
|
|
|
*/ |
156
|
|
|
protected $keepBrokenHtml = false; |
157
|
|
|
|
158
|
|
|
/**
 * Build a parser around a fresh \DOMDocument (version "1.0", encoding taken from getEncoding())
 * with "preserveWhiteSpace" and "formatOutput" switched on, then optionally seed it.
 *
 * - SimpleHtmlDomInterface: its underlying node (getNode()) is used.
 * - \DOMNode: the node is deep-imported into the new document and appended.
 * - anything else that is not null: handed to loadHtml().
 *
 * @param \DOMNode|SimpleHtmlDomInterface|string|null $element HTML code or SimpleHtmlDomInterface, \DOMNode
 */
public function __construct($element = null)
{
    $freshDocument = new \DOMDocument('1.0', $this->getEncoding());
    $freshDocument->preserveWhiteSpace = true;
    $freshDocument->formatOutput = true;
    $this->document = $freshDocument;

    // unwrap the library's own element type down to the raw DOM node
    $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

    if ($source === null) {
        return;
    }

    if (!$source instanceof \DOMNode) {
        /** @noinspection UnusedFunctionResultInspection */
        $this->loadHtml($source);

        return;
    }

    // importNode() may fail, so only append a real node
    $importedNode = $this->document->importNode($source, true);
    if ($importedNode instanceof \DOMNode) {
        /** @noinspection UnusedFunctionResultInspection */
        $this->document->appendChild($importedNode);
    }
}
189
|
|
|
|
190
|
|
|
/**
 * Forward calls to legacy / alias method names (see self::$functionAliases) to the real method.
 *
 * The lookup is case-insensitive because the requested name is lower-cased first.
 *
 * @param string $name
 * @param array  $arguments
 *
 * @throws \BadMethodCallException if the lower-cased name is not a known alias
 *
 * @return bool|mixed
 */
public function __call($name, $arguments)
{
    $alias = \strtolower($name);

    if (!isset(self::$functionAliases[$alias])) {
        throw new \BadMethodCallException('Method does not exist: ' . $alias);
    }

    $target = [$this, self::$functionAliases[$alias]];

    return \call_user_func_array($target, $arguments);
}
206
|
|
|
|
207
|
|
|
/**
 * Static factory aliases: "str_get_html" (parse a string) and "file_get_html" (parse a file).
 *
 * The first argument defaults to an empty string, the second (extra libxml options) to null.
 * The method name is compared case-sensitively.
 *
 * @param string $name
 * @param array  $arguments
 *
 * @throws \BadMethodCallException
 * @throws \RuntimeException
 *
 * @return HtmlDomParser
 */
public static function __callStatic($name, $arguments)
{
    $source = $arguments[0] ?? '';
    $libXmlOptions = $arguments[1] ?? null;

    $fromString = $name === 'str_get_html';
    $fromFile = $name === 'file_get_html';

    if (!$fromString && !$fromFile) {
        throw new \BadMethodCallException('Method does not exist');
    }

    $parser = new static();

    return $fromString
        ? $parser->loadHtml($source, $libXmlOptions)
        : $parser->loadHtmlFile($source, $libXmlOptions);
}
236
|
|
|
|
237
|
|
|
/** @noinspection MagicMethodsValidityInspection */

/**
 * Magic read access for the virtual properties (case-insensitive):
 * "outerhtml"/"outertext", "innerhtml"/"innertext" and "text"/"plaintext".
 *
 * @param string $name
 *
 * @return string|null null for any other property name
 */
public function __get($name)
{
    $property = \strtolower($name);

    if ($property === 'outerhtml' || $property === 'outertext') {
        return $this->html();
    }

    if ($property === 'innerhtml' || $property === 'innertext') {
        return $this->innerHtml();
    }

    if ($property === 'text' || $property === 'plaintext') {
        return $this->text();
    }

    return null;
}
262
|
|
|
|
263
|
|
|
/**
 * String conversion yields the same result as html() called with its defaults.
 *
 * @return string
 */
public function __toString()
{
    $outerHtml = $this->html();

    return $outerHtml;
}
270
|
|
|
|
271
|
|
|
/**
 * No-op kept only for API compatibility; there is nothing to release.
 *
 * @return bool always true
 *
 * @deprecated
 */
public function clear(): bool
{
    return true;
}
282
|
|
|
|
283
|
|
|
/**
 * Create DOMDocument from HTML.
 *
 * Steps, in order: optional user callback on the raw input, clean-up around the doctype,
 * optional "keep broken html" masking, recording of what the input lacked (the
 * "isDOMDocumentCreatedWith..." flags read later by fixHtmlOutput()), script-tag helpers,
 * normalisation of bare void tags, and finally parsing: first as XML via SimpleXML and,
 * if that does not yield a document, via \DOMDocument::loadHTML().
 *
 * The previous libxml error / entity-loader settings are restored before returning.
 *
 * @param string   $html
 * @param int|null $libXMLExtraOptions extra LIBXML_* flags, OR-ed into the default options
 *
 * @return \DOMDocument the document that is also stored in $this->document
 */
protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
{
    // give a registered callback the chance to rewrite the raw input first
    if ($this->callbackBeforeCreateDom) {
        $html = \call_user_func($this->callbackBeforeCreateDom, $html, $this);
    }

    // Remove content before <!DOCTYPE.*> because otherwise the DOMDocument can not handle the input.
    $isDOMDocumentCreatedWithDoctype = false;
    if (\stripos($html, '<!DOCTYPE') !== false) {
        $isDOMDocumentCreatedWithDoctype = true;
        if (
            \preg_match('/(^.*?)<!(?:DOCTYPE)(?: [^>]*)?>/sui', $html, $matches_before_doctype)
            &&
            \trim($matches_before_doctype[1])
        ) {
            // NOTE(review): str_replace() removes every occurrence of the captured prefix,
            // not only the one in front of the doctype.
            $html = \str_replace($matches_before_doctype[1], '', $html);
        }
    }

    if ($this->keepBrokenHtml) {
        $html = $this->keepBrokenHtml(\trim($html));
    }

    // The flags below are only ever switched on in this method (never reset here),
    // and all checks are case-sensitive substring tests.

    // no "<" at all => plain text; otherwise text in front of the first tag => "without wrapper"
    if (\strpos($html, '<') === false) {
        $this->isDOMDocumentCreatedWithoutHtml = true;
    } elseif (\strpos(\ltrim($html), '<') !== 0) {
        $this->isDOMDocumentCreatedWithoutWrapper = true;
    }

    // input starts with an html comment
    if (\strpos(\ltrim($html), '<!--') === 0) {
        $this->isDOMDocumentCreatedWithCommentWrapper = true;
    }

    /** @noinspection HtmlRequiredLangAttribute */
    if (
        \strpos($html, '<html ') === false
        &&
        \strpos($html, '<html>') === false
    ) {
        $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    if (
        \strpos($html, '<body ') === false
        &&
        \strpos($html, '<body>') === false
    ) {
        $this->isDOMDocumentCreatedWithoutBodyWrapper = true;
    }

    /** @noinspection HtmlRequiredTitleElement */
    if (
        \strpos($html, '<head ') === false
        &&
        \strpos($html, '<head>') === false
    ) {
        $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
    }

    if (
        \strpos($html, '<p ') === false
        &&
        \strpos($html, '<p>') === false
    ) {
        $this->isDOMDocumentCreatedWithoutPTagWrapper = true;
    }

    // only the escaped form "<\/script>" is present, no real closing script tag
    if (
        \strpos($html, '</script>') === false
        &&
        \strpos($html, '<\/script>') !== false
    ) {
        $this->isDOMDocumentCreatedWithFakeEndScript = true;
    }

    // Move anything that follows "</html>" in front of it.
    // The "U" modifier inverts greediness, so "(.*?)" captures everything after the first "</html>".
    if (\stripos($html, '</html>') !== false) {
        /** @noinspection NestedPositiveIfStatementsInspection */
        if (
            \preg_match('/<\/html>(.*?)/suiU', $html, $matches_after_html)
            &&
            \trim($matches_after_html[1])
        ) {
            $html = \str_replace($matches_after_html[0], $matches_after_html[1] . '</html>', $html);
        }
    }

    if (\strpos($html, '<script') !== false) {
        // NOTE(review): both helpers are called without using a return value, so they
        // presumably take $html by reference - confirm against their signatures.
        $this->html5FallbackForScriptTags($html);

        // runs once per configured special tag that occurs in the input
        foreach ($this->specialScriptTags as $tag) {
            if (\strpos($html, $tag) !== false) {
                $this->keepSpecialScriptTags($html);
            }
        }
    }

    // rewrite bare void tags ("<br>") into their self-closed form ("<br/>");
    // only the exact attribute-less "<tag>" spelling is matched
    $html = \str_replace(
        \array_map(static function ($e) {
            return '<' . $e . '>';
        }, $this->selfClosingTags),
        \array_map(static function ($e) {
            return '<' . $e . '/>';
        }, $this->selfClosingTags),
        $html
    );

    // set error level
    // (the previous settings are remembered and restored at the end of this method)
    $internalErrors = \libxml_use_internal_errors(true);
    if (\PHP_VERSION_ID < 80000) {
        $disableEntityLoader = \libxml_disable_entity_loader(true);
    }
    \libxml_clear_errors();

    $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

    // optional flags are only added when the constant exists in the running build
    if (\defined('LIBXML_BIGLINES')) {
        $optionsXml |= \LIBXML_BIGLINES;
    }

    if (\defined('LIBXML_COMPACT')) {
        $optionsXml |= \LIBXML_COMPACT;
    }

    if (\defined('LIBXML_HTML_NODEFDTD')) {
        $optionsXml |= \LIBXML_HTML_NODEFDTD;
    }

    if ($libXMLExtraOptions !== null) {
        $optionsXml |= $libXMLExtraOptions;
    }

    // wrap the input into the helper element (stripped again in fixHtmlOutput())
    if (
        $this->isDOMDocumentCreatedWithoutWrapper
        ||
        $this->isDOMDocumentCreatedWithCommentWrapper
        ||
        (
            !$isDOMDocumentCreatedWithDoctype
            &&
            $this->keepBrokenHtml
        )
    ) {
        $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
    }

    $html = self::replaceToPreserveHtmlEntities($html);

    // 1st attempt: parse as XML; accepted only if libxml reported no error at all
    $documentFound = false;
    $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
    if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
        $domElementTmp = \dom_import_simplexml($sxe);
        if (
            $domElementTmp
            &&
            $domElementTmp->ownerDocument
        ) {
            $documentFound = true;
            $this->document = $domElementTmp->ownerDocument;
        }
    }

    // 2nd attempt: the HTML parser
    if ($documentFound === false) {

        // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
        $xmlHackUsed = false;
        // NOTE(review): stripos() takes (haystack, needle); here the literal '<?xml' is the
        // haystack, so the condition holds for practically every input and the declaration
        // is prepended almost always. Confirm the intent before "fixing" the order, since
        // that would change how inputs that already start with "<?xml" are parsed.
        /** @noinspection StringFragmentMisplacedInspection */
        if (\stripos('<?xml', $html) !== 0) {
            $xmlHackUsed = true;
            $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
        }

        if ($html !== '') {
            $this->document->loadHTML($html, $optionsXml);
        }

        // remove the "xml-encoding" hack
        // (only the first processing-instruction child of the document is removed)
        if ($xmlHackUsed) {
            foreach ($this->document->childNodes as $child) {
                if ($child->nodeType === \XML_PI_NODE) {
                    /** @noinspection UnusedFunctionResultInspection */
                    $this->document->removeChild($child);

                    break;
                }
            }
        }
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    \libxml_clear_errors();
    \libxml_use_internal_errors($internalErrors);
    if (\PHP_VERSION_ID < 80000 && isset($disableEntityLoader)) {
        \libxml_disable_entity_loader($disableEntityLoader);
    }

    return $this->document;
}
491
|
|
|
|
492
|
|
|
/**
 * Find list of nodes with a CSS selector.
 *
 * The selector is converted to XPath (SelectorConverter::toXPath()); a registered
 * "callbackXPathBeforeQuery" may replace the resulting XPath string before it is run.
 *
 * - $idx === null: all matches, or a SimpleHtmlDomNodeBlank when nothing matched.
 * - $idx >= 0: the match at that position; $idx < 0 counts from the end.
 *   A SimpleHtmlDomBlank is returned when that position does not exist.
 *
 * @param string   $selector
 * @param int|null $idx
 *
 * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
 */
public function find(string $selector, $idx = null)
{
    $query = SelectorConverter::toXPath($selector);
    $xPathEngine = new \DOMXPath($this->document);

    if ($this->callbackXPathBeforeQuery) {
        $query = \call_user_func($this->callbackXPathBeforeQuery, $selector, $query, $xPathEngine, $this);
    }

    $queryResult = $xPathEngine->query($query);

    // a falsy query result is treated like "no matches"
    $matches = new SimpleHtmlDomNode();
    foreach ($queryResult ?: [] as $domNode) {
        $matches[] = new SimpleHtmlDom($domNode);
    }

    if ($idx === null) {
        return \count($matches) === 0 ? new SimpleHtmlDomNodeBlank() : $matches;
    }

    // negative positions are relative to the end of the list
    $position = $idx < 0 ? \count($matches) + $idx : $idx;

    return $matches[$position] ?? new SimpleHtmlDomBlank();
}
537
|
|
|
|
538
|
|
|
/**
 * Find all nodes matching a CSS selector (find() with $idx = null).
 *
 * @param string $selector
 *
 * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
 */
public function findMulti(string $selector): SimpleHtmlDomNodeInterface
{
    $allMatches = $this->find($selector, null);

    return $allMatches;
}
549
|
|
|
|
550
|
|
|
/**
 * Find all nodes matching a CSS selector; false replaces the blank node list
 * (SimpleHtmlDomNodeBlank) that find() returns when nothing matched.
 *
 * @param string $selector
 *
 * @return false|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
 */
public function findMultiOrFalse(string $selector)
{
    $allMatches = $this->find($selector, null);

    return $allMatches instanceof SimpleHtmlDomNodeBlank ? false : $allMatches;
}
567
|
|
|
|
568
|
|
|
/**
 * Find the first node matching a CSS selector (find() with $idx = 0).
 *
 * @param string $selector
 *
 * @return SimpleHtmlDomInterface
 */
public function findOne(string $selector): SimpleHtmlDomInterface
{
    $firstMatch = $this->find($selector, 0);

    return $firstMatch;
}
579
|
|
|
|
580
|
|
|
/**
 * Find the first node matching a CSS selector; false replaces the blank element
 * (SimpleHtmlDomBlank) that find() returns when nothing matched.
 *
 * @param string $selector
 *
 * @return false|SimpleHtmlDomInterface
 */
public function findOneOrFalse(string $selector)
{
    $firstMatch = $this->find($selector, 0);

    return $firstMatch instanceof SimpleHtmlDomBlank ? false : $firstMatch;
}
597
|
|
|
|
598
|
|
|
/**
 * Post-process serialized markup: strip the tags that were not part of the original
 * input (as recorded by the "isDOMDocumentCreatedWith..." flags), remove closing tags of
 * void elements and the internal helper wrappers, then decode / restore html entities.
 *
 * @param string $content
 * @param bool   $multiDecodeNewHtmlEntity
 *
 * @return string
 */
public function fixHtmlOutput(
    string $content,
    bool $multiDecodeNewHtmlEntity = false
): string {
    // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
    //          so we try to remove it here again ...

    // removes every occurrence of the given fragments
    $strip = static function (string $markup, array $fragments): string {
        return \str_replace($fragments, '', $markup);
    };

    /** @noinspection HtmlRequiredLangAttribute */
    if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
        $content = $strip($content, ['<html>', '</html>']);
    }

    /** @noinspection HtmlRequiredTitleElement */
    if ($this->getIsDOMDocumentCreatedWithoutHeadWrapper()) {
        $content = $strip($content, ['<head>', '</head>']);
    }

    if ($this->getIsDOMDocumentCreatedWithoutBodyWrapper()) {
        $content = $strip($content, ['<body>', '</body>']);
    }

    if ($this->getIsDOMDocumentCreatedWithFakeEndScript()) {
        $content = $strip($content, ['</script>']);
    }

    if ($this->getIsDOMDocumentCreatedWithoutWrapper()) {
        // leading "<p>" only, but every "</p>"
        $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
    }

    if ($this->getIsDOMDocumentCreatedWithoutPTagWrapper()) {
        $content = $strip($content, ['<p>', '</p>']);
    }

    if ($this->getIsDOMDocumentCreatedWithoutHtml()) {
        $content = $strip(
            $content,
            ['<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">']
        );
    }

    // https://bugs.php.net/bug.php?id=73175
    $voidClosingTags = [];
    foreach ($this->selfClosingTags as $tagName) {
        $voidClosingTags[] = '</' . $tagName . '>';
    }
    $content = \str_replace($voidClosingTags, '', $content);

    // internal helper wrappers are dropped, doubled head tags are collapsed
    /** @noinspection HtmlRequiredTitleElement */
    $helperReplacements = [
        '<simpleHtmlDomHtml>'  => '',
        '</simpleHtmlDomHtml>' => '',
        '<simpleHtmlDomP>'     => '',
        '</simpleHtmlDomP>'    => '',
        '<head><head>'         => '<head>',
        '</head></head>'       => '</head>',
    ];
    $content = \trim(
        \str_replace(
            \array_keys($helperReplacements),
            \array_values($helperReplacements),
            $content
        )
    );

    $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

    return self::putReplacedBackToPreserveHtmlEntities($decoded);
}
714
|
|
|
|
715
|
|
|
/**
 * Return elements by ".class".
 *
 * The class name is prefixed with "." and passed to findMulti() as a CSS selector.
 *
 * @param string $class
 *
 * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
 */
public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
{
    // plain concatenation: "${var}" interpolation is deprecated as of PHP 8.2
    return $this->findMulti('.' . $class);
}
726
|
|
|
|
727
|
|
|
/**
 * Return element by #id.
 *
 * The id is prefixed with "#" and passed to findOne() as a CSS selector.
 *
 * @param string $id
 *
 * @return SimpleHtmlDomInterface
 */
public function getElementById(string $id): SimpleHtmlDomInterface
{
    // plain concatenation: "${var}" interpolation is deprecated as of PHP 8.2
    return $this->findOne('#' . $id);
}
738
|
|
|
|
739
|
|
|
/**
 * Return the first element with the given tag name, looked up directly on the
 * underlying \DOMDocument (no CSS selector involved).
 *
 * @param string $name
 *
 * @return SimpleHtmlDomInterface a SimpleHtmlDomBlank when no such element exists
 */
public function getElementByTagName(string $name): SimpleHtmlDomInterface
{
    $firstNode = $this->document->getElementsByTagName($name)->item(0);

    return $firstNode === null
        ? new SimpleHtmlDomBlank()
        : new SimpleHtmlDom($firstNode);
}
756
|
|
|
|
757
|
|
|
/**
 * Returns elements by "#id".
 *
 * The id is prefixed with "#" and passed to find() together with $idx.
 *
 * @param string   $id
 * @param int|null $idx
 *
 * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
 */
public function getElementsById(string $id, $idx = null)
{
    // plain concatenation: "${var}" interpolation is deprecated as of PHP 8.2
    return $this->find('#' . $id, $idx);
}
769
|
|
|
|
770
|
|
|
/**
 * Returns elements by tag name, looked up directly on the underlying \DOMDocument.
 *
 * - $idx === null: all matches, or a SimpleHtmlDomNodeBlank when nothing matched.
 * - $idx >= 0: the match at that position; $idx < 0 counts from the end.
 *   A SimpleHtmlDomBlank is returned when that position does not exist.
 *
 * @param string   $name
 * @param int|null $idx
 *
 * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
 */
public function getElementsByTagName(string $name, $idx = null)
{
    $nodesList = $this->document->getElementsByTagName($name);

    $elements = new SimpleHtmlDomNode();

    foreach ($nodesList as $node) {
        $elements[] = new SimpleHtmlDom($node);
    }

    // return all elements
    if ($idx === null) {
        if (\count($elements) === 0) {
            return new SimpleHtmlDomNodeBlank();
        }

        return $elements;
    }

    // handle negative values
    if ($idx < 0) {
        $idx = \count($elements) + $idx;
    }

    // return one element
    // (a missing single element is a blank *element*, as in find() and getElementByTagName(),
    //  not a blank node list)
    return $elements[$idx] ?? new SimpleHtmlDomBlank();
}
805
|
|
|
|
806
|
|
|
/**
 * Get dom node's outer html.
 *
 * A registered static callback (static::$callback) is invoked first with an array
 * containing this parser. If the input had no html wrapper only the document element
 * is serialized, otherwise the whole document. The result goes through fixHtmlOutput().
 *
 * @param bool $multiDecodeNewHtmlEntity
 *
 * @return string empty string if serialization failed
 */
public function html(bool $multiDecodeNewHtmlEntity = false): string
{
    if (static::$callback !== null) {
        \call_user_func(static::$callback, [$this]);
    }

    $serialized = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $serialized === false
        ? ''
        : $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
}
831
|
|
|
|
832
|
|
|
/**
 * Load HTML from string.
 *
 * Clears the static self::$domBrokenReplaceHelper and replaces the current document
 * with the one built by createDOMDocument().
 *
 * @param string   $html
 * @param int|null $libXMLExtraOptions
 *
 * @return HtmlDomParser this instance, for chaining
 */
public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface
{
    // start from a clean replace-helper state
    self::$domBrokenReplaceHelper = [];

    $parsedDocument = $this->createDOMDocument($html, $libXMLExtraOptions);
    $this->document = $parsedDocument;

    return $this;
}
849
|
|
|
|
850
|
|
|
/**
 * Load HTML from file.
 *
 * Paths starting with "http://" or "https://" skip the file_exists() check. The content
 * is read via \voku\helper\UTF8::file_get_contents() when that class is available,
 * otherwise via \file_get_contents(), and then passed to loadHtml().
 *
 * @param string   $filePath
 * @param int|null $libXMLExtraOptions
 *
 * @throws \RuntimeException if the file does not exist or could not be read
 *
 * @return HtmlDomParser
 */
public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface
{
    // reset
    self::$domBrokenReplaceHelper = [];

    if (
        !\preg_match("/^https?:\/\//i", $filePath)
        &&
        !\file_exists($filePath)
    ) {
        // "{$var}" instead of "${var}": the latter is deprecated as of PHP 8.2
        throw new \RuntimeException("File {$filePath} not found");
    }

    try {
        if (\class_exists('\voku\helper\UTF8')) {
            $html = \voku\helper\UTF8::file_get_contents($filePath);
        } else {
            $html = \file_get_contents($filePath);
        }
    } catch (\Exception $e) {
        // keep the original failure reachable via getPrevious()
        throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
    }

    if ($html === false) {
        throw new \RuntimeException("Could not load file {$filePath}");
    }

    return $this->loadHtml($html, $libXMLExtraOptions);
}
889
|
|
|
|
890
|
|
|
/**
 * Get the HTML as XML or plain XML if needed.
 *
 * @param bool $multiDecodeNewHtmlEntity
 * @param bool $htmlToXml       true: run the result through fixHtmlOutput();
 *                              false: only decode / restore html entities
 * @param bool $removeXmlHeader strip the "<?xml ... ?>" declaration and leading whitespace
 * @param int  $options         options passed to \DOMDocument::saveXML()
 *
 * @return string empty string if serialization failed
 */
public function xml(
    bool $multiDecodeNewHtmlEntity = false,
    bool $htmlToXml = true,
    bool $removeXmlHeader = true,
    int $options = \LIBXML_NOEMPTYTAG
): string {
    $serialized = $this->document->saveXML(null, $options);
    if ($serialized === false) {
        return '';
    }

    if ($removeXmlHeader) {
        $withoutHeader = (string) \preg_replace('/<\?xml.*\?>/', '', $serialized);
        $serialized = \ltrim($withoutHeader);
    }

    if ($htmlToXml) {
        return $this->fixHtmlOutput($serialized, $multiDecodeNewHtmlEntity);
    }

    $decoded = $this->decodeHtmlEntity($serialized, $multiDecodeNewHtmlEntity);

    return self::putReplacedBackToPreserveHtmlEntities($decoded);
}
925
|
|
|
|
926
|
|
|
/**
 * Calling the parser like a function is a shortcut for find().
 *
 * @param string   $selector
 * @param int|null $idx
 *
 * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface<SimpleHtmlDomInterface>
 */
public function __invoke($selector, $idx = null)
{
    $result = $this->find($selector, $idx);

    return $result;
}
936
|
|
|
|
937
|
|
|
/**
 * Whether createDOMDocument() saw input containing neither "<head " nor "<head>".
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
{
    return $this->isDOMDocumentCreatedWithoutHeadWrapper;
}
944
|
|
|
|
945
|
|
|
/**
 * Whether createDOMDocument() saw input containing neither "<p " nor "<p>".
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutPTagWrapper(): bool
{
    return $this->isDOMDocumentCreatedWithoutPTagWrapper;
}
952
|
|
|
|
953
|
|
|
/**
 * Whether createDOMDocument() saw input without any "<" character (plain text).
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutHtml(): bool
{
    return $this->isDOMDocumentCreatedWithoutHtml;
}
960
|
|
|
|
961
|
|
|
/**
 * Whether createDOMDocument() saw input containing neither "<body " nor "<body>".
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutBodyWrapper(): bool
{
    return $this->isDOMDocumentCreatedWithoutBodyWrapper;
}
968
|
|
|
|
969
|
|
|
/**
 * Whether createDOMDocument() saw input containing neither "<html " nor "<html>".
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
{
    return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
}
976
|
|
|
|
977
|
|
|
/**
 * Report the current value of the "created without wrapper" flag.
 *
 * The value is read straight from $this->isDOMDocumentCreatedWithoutWrapper;
 * the flag is assigned elsewhere in the class (presumably while the DOM
 * document is built - confirm at the assignment site).
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutWrapper(): bool
{
    return $this->isDOMDocumentCreatedWithoutWrapper;
}
984
|
|
|
|
985
|
|
|
/**
 * Report the current value of the "created with fake end script" flag.
 *
 * The value is read straight from $this->isDOMDocumentCreatedWithFakeEndScript;
 * the flag is assigned elsewhere in the class (presumably while the DOM
 * document is built - confirm at the assignment site).
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithFakeEndScript(): bool
{
    return $this->isDOMDocumentCreatedWithFakeEndScript;
}
992
|
|
|
|
993
|
|
|
/**
 * Replace fragments of broken markup with tokens so they can be restored later.
 *
 * The work happens in three steps:
 *
 * 1. Elements matched as an opening tag plus its same-named closing tag get
 *    their angle brackets swapped for temporary "°..._simple_html_dom__voku_°"
 *    markers. This is repeated until the markup stops changing.
 * 2. Every fragment matched by the "broken" pattern (closing-tag-like runs that
 *    still carry real brackets) is recorded in self::$domBrokenReplaceHelper
 *    and replaced by a token made of self::$domHtmlBrokenHtmlHelper followed by
 *    the CRC32 of the fragment. This is repeated until the markup stops changing.
 * 3. All remaining temporary markers are turned back into real brackets.
 *
 * preg_replace_callback() returns null when PCRE fails (for example when the
 * backtrack limit is hit). In that case the affected step stops and the markup
 * produced so far is kept, instead of being collapsed to an empty string.
 *
 * @param string $html
 *
 * @return string
 */
protected function keepBrokenHtml(string $html): string
{
    // Step 1: mask the brackets of correctly paired elements.
    do {
        $original = $html;

        $masked = \preg_replace_callback(
            '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
            static function ($matches) {
                return $matches['start'] .
                       '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                       $matches['value'] .
                       '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                       $matches['end'];
            },
            $html
        );

        if ($masked === null) {
            // PCRE error: $html still holds the last good markup, so stop here
            // rather than replacing the whole document with ''.
            break;
        }

        $html = $masked;
    } while ($original !== $html);

    // Step 2: swap every remaining broken fragment for a restorable token.
    do {
        $original = $html;

        $tokenized = \preg_replace_callback(
            '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
            static function ($matches) {
                // The fragment may span elements masked in step 1; store it with
                // real brackets so the original markup is what gets restored.
                $matches['broken'] = \str_replace(
                    ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'],
                    ['</', '<', '>'],
                    $matches['broken']
                );

                self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['broken']);

                return $matches['start'] . $matchesHash . $matches['end'];
            },
            $html
        );

        if ($tokenized === null) {
            // PCRE error: keep the markup produced so far.
            break;
        }

        $html = $tokenized;
    } while ($original !== $html);

    // Step 3: unmask whatever step 1 masked and step 2 did not consume.
    return \str_replace(
        ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'],
        ['</', '<', '>'],
        $html
    );
}
1043
|
|
|
|
1044
|
|
|
/**
 * Protect the content of "special" script tags inside the given markup.
 *
 * A script tag is treated as special when its type attribute matches one of
 * the entries of $this->specialScriptTags, e.g.
 * [<script id="elements-image-1" type="text/html">...</script>].
 *
 * For each such tag:
 * - if its inner content contains one of the strings in
 *   $this->templateLogicSyntaxInSpecialScriptTags, the inner content is stored
 *   in self::$domBrokenReplaceHelper and replaced by a token made of
 *   self::$domHtmlBrokenHtmlHelper followed by the CRC32 of the content;
 * - otherwise the "script" tag name of the opening and closing tag is replaced
 *   by self::$domHtmlSpecialScriptHelper.
 *
 * The markup is modified in place. preg_replace_callback() returns null when
 * PCRE fails; in that case $html is left untouched instead of being
 * overwritten with an empty string.
 *
 * @param string $html
 *
 * @return void
 */
protected function keepSpecialScriptTags(string &$html)
{
    // Build the alternation of allowed type values, escaped for use in the pattern.
    $quotedTypes = [];
    foreach ($this->specialScriptTags as $value) {
        $quotedTypes[] = \preg_quote($value, '/');
    }
    $tags = \implode('|', $quotedTypes);

    $result = \preg_replace_callback(
        '/(?<start>((?:<script) [^>]*type=(?:["\'])?(?:' . $tags . ')+(?:[^>]*)>))(?<innerContent>.*)(?<end><\/script>)/isU',
        function ($matches) {

            // Check for logic in special script tags, like [<% _.each(tierPrices, function(item, key) { %>],
            // because often this looks like non valid html in the template itself.
            foreach ($this->templateLogicSyntaxInSpecialScriptTags as $logicSyntaxInSpecialScriptTag) {
                if (\strpos($matches['innerContent'], $logicSyntaxInSpecialScriptTag) !== false) {
                    // remove the html5 fallback
                    $matches['innerContent'] = \str_replace('<\/', '</', $matches['innerContent']);

                    self::$domBrokenReplaceHelper['orig'][] = $matches['innerContent'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = self::$domHtmlBrokenHtmlHelper . \crc32($matches['innerContent']);

                    return $matches['start'] . $matchesHash . $matches['end'];
                }
            }

            // remove the html5 fallback
            $matches[0] = \str_replace('<\/', '</', $matches[0]);

            // Swap the leading "<script" for the helper tag name ...
            $specialNonScript = '<' . self::$domHtmlSpecialScriptHelper . \substr($matches[0], \strlen('<script'));

            // ... and the trailing "</script>" for the matching helper end tag.
            return \substr($specialNonScript, 0, -\strlen('</script>')) . '</' . self::$domHtmlSpecialScriptHelper . '>';
        },
        $html
    );

    if ($result !== null) {
        $html = $result;
    }
}
1086
|
|
|
|
1087
|
|
|
/**
 * Store the given switch in $this->keepBrokenHtml.
 *
 * @param bool $keepBrokenHtml
 *
 * @return HtmlDomParser this instance, so calls can be chained
 */
public function useKeepBrokenHtml(bool $keepBrokenHtml): DomParserInterface
{
    $this->keepBrokenHtml = $keepBrokenHtml;

    return $this;
}
1098
|
|
|
|
1099
|
|
|
/**
 * Replace the list stored in $this->templateLogicSyntaxInSpecialScriptTags.
 *
 * Every element is checked first; the stored list is only replaced when all
 * elements are strings.
 *
 * @param string[] $templateLogicSyntaxInSpecialScriptTags
 *
 * @throws \InvalidArgumentException if any element is not a string
 *
 * @return HtmlDomParser this instance, so calls can be chained
 */
public function overwriteTemplateLogicSyntaxInSpecialScriptTags(array $templateLogicSyntaxInSpecialScriptTags): DomParserInterface
{
    foreach ($templateLogicSyntaxInSpecialScriptTags as $syntax) {
        if (\is_string($syntax)) {
            continue;
        }

        throw new \InvalidArgumentException('setTemplateLogicSyntaxInSpecialScriptTags only allows string[]');
    }

    $this->templateLogicSyntaxInSpecialScriptTags = $templateLogicSyntaxInSpecialScriptTags;

    return $this;
}
1116
|
|
|
|
1117
|
|
|
/**
 * Replace the list stored in $this->specialScriptTags.
 *
 * Every element is checked first; the stored list is only replaced when all
 * elements are strings.
 *
 * @param string[] $specialScriptTags
 *
 * @throws \InvalidArgumentException if any element is not a string
 *
 * @return HtmlDomParser this instance, so calls can be chained
 */
public function overwriteSpecialScriptTags(array $specialScriptTags): DomParserInterface
{
    foreach ($specialScriptTags as $candidate) {
        if (\is_string($candidate)) {
            continue;
        }

        throw new \InvalidArgumentException('SpecialScriptTags only allows string[]');
    }

    $this->specialScriptTags = $specialScriptTags;

    return $this;
}
1134
|
|
|
|
1135
|
|
|
/**
 * Store the given callable in $this->callbackXPathBeforeQuery.
 *
 * NOTE(review): judging by the name, the callable is presumably invoked before
 * an XPath query runs - confirm at the call site.
 *
 * @param callable $callbackXPathBeforeQuery
 *
 * @phpstan-param callable(string $cssSelectorString, string $xPathString,\DOMXPath,\voku\helper\HtmlDomParser): string $callbackXPathBeforeQuery
 *
 * @return void
 */
public function setCallbackXPathBeforeQuery(callable $callbackXPathBeforeQuery)
{
    $this->callbackXPathBeforeQuery = $callbackXPathBeforeQuery;
}
1144
|
|
|
|
1145
|
|
|
/**
 * Store the given callable in $this->callbackBeforeCreateDom.
 *
 * NOTE(review): judging by the name, the callable is presumably invoked before
 * the DOM is created from the HTML string - confirm at the call site.
 *
 * @param callable $callbackBeforeCreateDom
 *
 * @phpstan-param callable(string $htmlString, \voku\helper\HtmlDomParser): string $callbackBeforeCreateDom
 *
 * @return void
 */
public function setCallbackBeforeCreateDom(callable $callbackBeforeCreateDom)
{
    $this->callbackBeforeCreateDom = $callbackBeforeCreateDom;
}
1154
|
|
|
} |
1155
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.