1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace voku\helper; |
6
|
|
|
|
7
|
|
|
/** |
8
|
|
|
* @property-read string $outerText |
9
|
|
|
* <p>Get dom node's outer html (alias for "outerHtml").</p> |
10
|
|
|
* @property-read string $outerHtml |
11
|
|
|
* <p>Get dom node's outer html.</p> |
12
|
|
|
* @property-read string $innerText |
13
|
|
|
* <p>Get dom node's inner html (alias for "innerHtml").</p> |
14
|
|
|
* @property-read string $innerHtml |
15
|
|
|
* <p>Get dom node's inner html.</p> |
16
|
|
|
* @property-read string $plaintext |
17
|
|
|
* <p>Get dom node's plain text.</p> |
18
|
|
|
* |
19
|
|
|
* @method string outerText() |
20
|
|
|
* <p>Get dom node's outer html (alias for "outerHtml()").</p> |
21
|
|
|
* @method string outerHtml() |
22
|
|
|
* <p>Get dom node's outer html.</p> |
23
|
|
|
* @method string innerText() |
24
|
|
|
* <p>Get dom node's inner html (alias for "innerHtml()").</p> |
25
|
|
|
* @method HtmlDomParser load(string $html) |
26
|
|
|
* <p>Load HTML from string.</p> |
27
|
|
|
* @method HtmlDomParser load_file(string $html) |
28
|
|
|
* <p>Load HTML from file.</p> |
29
|
|
|
* @method static HtmlDomParser file_get_html($html, $libXMLExtraOptions = null) |
30
|
|
|
* <p>Load HTML from file.</p> |
31
|
|
|
* @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null) |
32
|
|
|
* <p>Load HTML from string.</p> |
33
|
|
|
*/ |
34
|
|
|
class HtmlDomParser |
35
|
|
|
{ |
36
|
|
|
/** |
37
|
|
|
* @var string[] |
38
|
|
|
*/ |
39
|
|
|
protected static $functionAliases = [ |
40
|
|
|
'outertext' => 'html', |
41
|
|
|
'outerhtml' => 'html', |
42
|
|
|
'innertext' => 'innerHtml', |
43
|
|
|
'innerhtml' => 'innerHtml', |
44
|
|
|
'load' => 'loadHtml', |
45
|
|
|
'load_file' => 'loadHtmlFile', |
46
|
|
|
]; |
47
|
|
|
|
48
|
|
|
/** |
49
|
|
|
* @var string[][] |
50
|
|
|
*/ |
51
|
|
|
protected static $domLinkReplaceHelper = [ |
52
|
|
|
'orig' => ['[', ']', '{', '}'], |
53
|
|
|
'tmp' => [ |
54
|
|
|
'____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____', |
55
|
|
|
'____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____', |
56
|
|
|
'____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____', |
57
|
|
|
'____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____', |
58
|
|
|
], |
59
|
|
|
]; |
60
|
|
|
|
61
|
|
|
/** |
62
|
|
|
* @var string[][] |
63
|
|
|
*/ |
64
|
|
|
protected static $domReplaceHelper = [ |
65
|
|
|
'orig' => ['&', '|', '+', '%', '@', '<html ⚡'], |
66
|
|
|
'tmp' => [ |
67
|
|
|
'____SIMPLE_HTML_DOM__VOKU__AMP____', |
68
|
|
|
'____SIMPLE_HTML_DOM__VOKU__PIPE____', |
69
|
|
|
'____SIMPLE_HTML_DOM__VOKU__PLUS____', |
70
|
|
|
'____SIMPLE_HTML_DOM__VOKU__PERCENT____', |
71
|
|
|
'____SIMPLE_HTML_DOM__VOKU__AT____', |
72
|
|
|
'<html ____SIMPLE_HTML_DOM__VOKU__GOOGLE_AMP____="true"', |
73
|
|
|
], |
74
|
|
|
]; |
75
|
|
|
|
76
|
|
|
/** |
77
|
|
|
* @var string |
78
|
|
|
*/ |
79
|
|
|
protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____'; |
80
|
|
|
|
81
|
|
|
/** |
82
|
|
|
* @var string |
83
|
|
|
*/ |
84
|
|
|
protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_sctipt____'; |
85
|
|
|
|
86
|
|
|
/** |
87
|
|
|
* @var array |
88
|
|
|
*/ |
89
|
|
|
protected static $domBrokenReplaceHelper = []; |
90
|
|
|
|
91
|
|
|
/** |
92
|
|
|
* @var callable |
93
|
|
|
*/ |
94
|
|
|
protected static $callback; |
95
|
|
|
|
96
|
|
|
/** |
97
|
|
|
* @var string[] |
98
|
|
|
*/ |
99
|
|
|
protected $namespaces = []; |
100
|
|
|
|
101
|
|
|
/** |
102
|
|
|
* @var \DOMDocument |
103
|
|
|
*/ |
104
|
|
|
protected $document; |
105
|
|
|
|
106
|
|
|
/** |
107
|
|
|
* @var string |
108
|
|
|
*/ |
109
|
|
|
protected $encoding = 'UTF-8'; |
110
|
|
|
|
111
|
|
|
/** |
112
|
|
|
* @var bool |
113
|
|
|
*/ |
114
|
|
|
protected $isDOMDocumentCreatedWithoutHtml = false; |
115
|
|
|
|
116
|
|
|
/** |
117
|
|
|
* @var bool |
118
|
|
|
*/ |
119
|
|
|
protected $isDOMDocumentCreatedWithoutWrapper = false; |
120
|
|
|
|
121
|
|
|
/** |
122
|
|
|
* @var bool |
123
|
|
|
*/ |
124
|
|
|
protected $isDOMDocumentCreatedWithoutHeadWrapper = false; |
125
|
|
|
|
126
|
|
|
/** |
127
|
|
|
* @var bool |
128
|
|
|
*/ |
129
|
|
|
protected $isDOMDocumentCreatedWithoutHtmlWrapper = false; |
130
|
|
|
|
131
|
|
|
/** |
132
|
|
|
* @var bool |
133
|
|
|
*/ |
134
|
|
|
protected $isDOMDocumentCreatedWithFakeEndScript = false; |
135
|
|
|
|
136
|
|
|
/** |
137
|
|
|
* @var bool |
138
|
|
|
*/ |
139
|
|
|
protected $keepBrokenHtml; |
140
|
|
|
|
141
|
|
|
/**
 * Create a parser, optionally seeded with markup or with an existing node.
 *
 * @param \DOMNode|SimpleHtmlDomInterface|string|null $element HTML code or SimpleHtmlDomInterface, \DOMNode
 */
public function __construct($element = null)
{
    $this->document = new \DOMDocument('1.0', $this->getEncoding());

    // reset the (static) list of broken-html placeholders
    self::$domBrokenReplaceHelper = [];

    // DOMDocument settings
    $this->document->preserveWhiteSpace = true;
    $this->document->formatOutput = true;

    // a wrapped element is reduced to the DOM node it carries
    $source = $element instanceof SimpleHtmlDomInterface ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
        // copy the node (with its subtree) into our own document
        $imported = $this->document->importNode($source, true);

        if ($imported instanceof \DOMNode) {
            /** @noinspection UnusedFunctionResultInspection */
            $this->document->appendChild($imported);
        }
    } elseif ($source !== null) {
        /** @noinspection UnusedFunctionResultInspection */
        $this->loadHtml($source);
    }
}
177
|
|
|
|
178
|
|
|
/**
 * Forward calls made under a legacy method name (see self::$functionAliases)
 * to the method that implements it.
 *
 * @param string $name      called method name, matched case-insensitively
 * @param array  $arguments arguments handed on to the aliased method
 *
 * @throws \BadMethodCallException if no alias is registered for the name
 *
 * @return bool|mixed
 */
public function __call($name, $arguments)
{
    $alias = \strtolower($name);

    if (!isset(self::$functionAliases[$alias])) {
        throw new \BadMethodCallException('Method does not exist: ' . $alias);
    }

    $target = [$this, self::$functionAliases[$alias]];

    return \call_user_func_array($target, $arguments);
}
194
|
|
|
|
195
|
|
|
/**
 * Static factory aliases: "str_get_html" (parse a string) and "file_get_html" (parse a file).
 *
 * The name is matched case-insensitively, like in __call() and like PHP's own method names.
 *
 * @param string $name      called static method name
 * @param array  $arguments [0] => html string or file path (default ''), [1] => extra libxml options (default null)
 *
 * @throws \RuntimeException       passed through from loadHtmlFile()
 * @throws \BadMethodCallException if the name is not one of the two aliases
 *
 * @return HtmlDomParser
 */
public static function __callStatic($name, $arguments)
{
    $arguments0 = $arguments[0] ?? '';
    $arguments1 = $arguments[1] ?? null;

    switch (\strtolower($name)) {
        case 'str_get_html':
            return (new static())->loadHtml($arguments0, $arguments1);

        case 'file_get_html':
            return (new static())->loadHtmlFile($arguments0, $arguments1);
    }

    throw new \BadMethodCallException('Method does not exist');
}
224
|
|
|
|
225
|
|
|
/**
 * Give the cloned parser its own clone of the DOM document,
 * so both parsers stop pointing at the same object.
 */
public function __clone()
{
    $documentCopy = clone $this->document;

    $this->document = $documentCopy;
}
229
|
|
|
|
230
|
|
|
/** @noinspection MagicMethodsValidityInspection */

/**
 * Magic read access to the document's markup / text.
 *
 * Recognised names (case-insensitive): outerhtml, outertext, innerhtml, innertext, text, plaintext.
 *
 * @param string $name
 *
 * @return string|null null for every other name
 */
public function __get($name)
{
    $property = \strtolower($name);

    if ($property === 'outerhtml' || $property === 'outertext') {
        return $this->html();
    }

    if ($property === 'innerhtml' || $property === 'innertext') {
        return $this->innerHtml();
    }

    if ($property === 'text' || $property === 'plaintext') {
        return $this->text();
    }

    return null;
}
255
|
|
|
|
256
|
|
|
/**
 * Calling the parser like a function is a shortcut for find().
 *
 * @param string   $selector
 * @param int|null $idx
 *
 * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
 */
public function __invoke($selector, $idx = null)
{
    $result = $this->find($selector, $idx);

    return $result;
}
266
|
|
|
|
267
|
|
|
/**
 * String conversion yields the same markup as html().
 *
 * @return string
 */
public function __toString()
{
    $markup = $this->html();

    return $markup;
}
274
|
|
|
|
275
|
|
|
/**
 * No-op that exists only for api-compatibility-reasons.
 *
 * @return bool always true
 *
 * @deprecated
 */
public function clear(): bool
{
    $result = true;

    return $result;
}
286
|
|
|
|
287
|
|
|
/**
 * Create DOMDocument from HTML.
 *
 * Records which structural parts the input lacked (no tags at all, leading text,
 * no "<html", no "<head>", only escaped "<\/script>" end tags) in the
 * isDOMDocumentCreated* flags, protects special characters via
 * replaceToPreserveHtmlEntities(), and then parses the markup: first as XML via
 * simplexml, and - if that fails or reports errors - via DOMDocument::loadHTML().
 *
 * @param string   $html
 * @param int|null $libXMLExtraOptions additional LIBXML_* flags, OR-ed into the defaults
 *
 * @return \DOMDocument
 */
protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
{
    if ($this->keepBrokenHtml) {
        $html = $this->keepBrokenHtml(\trim($html));
    }

    if (\strpos($html, '<') === false) {
        $this->isDOMDocumentCreatedWithoutHtml = true;
    } elseif (\strpos(\ltrim($html), '<') !== 0) {
        $this->isDOMDocumentCreatedWithoutWrapper = true;
    }

    if (\strpos($html, '<html') === false) {
        $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    /** @noinspection HtmlRequiredTitleElement */
    if (\strpos($html, '<head>') === false) {
        $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
    }

    if (
        \strpos($html, '</script>') === false
        &&
        \strpos($html, '<\/script>') !== false
    ) {
        $this->isDOMDocumentCreatedWithFakeEndScript = true;
    }

    if (\strpos($html, '<script') !== false) {
        $this->html5FallbackForScriptTags($html);

        if (
            \strpos($html, 'type="text/html"') !== false
            ||
            \strpos($html, 'type=\'text/html\'') !== false
            ||
            \strpos($html, 'type=text/html') !== false
        ) {
            $this->keepSpecialScriptTags($html);
        }
    }

    // set error level
    $internalErrors = \libxml_use_internal_errors(true);
    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is only touched on older versions
    $disableEntityLoader = null;
    if (\PHP_VERSION_ID < 80000) {
        $disableEntityLoader = \libxml_disable_entity_loader(true);
    }
    \libxml_clear_errors();

    $optionsXml = \LIBXML_DTDLOAD | \LIBXML_DTDATTR | \LIBXML_NONET;

    if (\defined('LIBXML_BIGLINES')) {
        $optionsXml |= \LIBXML_BIGLINES;
    }

    if (\defined('LIBXML_COMPACT')) {
        $optionsXml |= \LIBXML_COMPACT;
    }

    if (\defined('LIBXML_HTML_NODEFDTD')) {
        $optionsXml |= \LIBXML_HTML_NODEFDTD;
    }

    if ($libXMLExtraOptions !== null) {
        $optionsXml |= $libXMLExtraOptions;
    }

    if (
        $this->isDOMDocumentCreatedWithoutWrapper
        ||
        $this->keepBrokenHtml
    ) {
        // put everything into one helper element; fixHtmlOutput() removes it from the output again
        $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
    }

    $html = self::replaceToPreserveHtmlEntities($html);

    // first try: strict XML parsing, accepted only if libxml reported no error at all
    $documentFound = false;
    $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
    if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
        $domElementTmp = \dom_import_simplexml($sxe);
        if ($domElementTmp) {
            $documentFound = true;
            $this->document = $domElementTmp->ownerDocument;
        }
    }

    if ($documentFound === false) {

        // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
        // (the haystack is $html: the declaration is only added when the markup
        // does not already start with an xml declaration of its own)
        $xmlHackUsed = false;
        if (\stripos($html, '<?xml') !== 0) {
            $xmlHackUsed = true;
            $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
        }

        $this->document->loadHTML($html, $optionsXml);

        // remove the "xml-encoding" hack
        if ($xmlHackUsed) {
            foreach ($this->document->childNodes as $child) {
                if ($child->nodeType === \XML_PI_NODE) {
                    /** @noinspection UnusedFunctionResultInspection */
                    $this->document->removeChild($child);

                    break;
                }
            }
        }
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    \libxml_clear_errors();
    \libxml_use_internal_errors($internalErrors);
    if ($disableEntityLoader !== null) {
        \libxml_disable_entity_loader($disableEntityLoader);
    }

    return $this->document;
}
415
|
|
|
|
416
|
|
|
/**
 * Decode html entities and url-encoded sequences in the given content.
 *
 * When the class \voku\helper\UTF8 is available, its rawurldecode() does the work;
 * otherwise html_entity_decode() + rawurldecode() are applied - once, or repeatedly
 * until the content stops changing if multi-decoding was requested.
 *
 * @param string $content
 * @param bool   $multiDecodeNewHtmlEntity true to decode repeatedly
 *
 * @return string
 */
protected function decodeHtmlEntity(string $content, bool $multiDecodeNewHtmlEntity): string
{
    if (\class_exists('\voku\helper\UTF8')) {
        /** @noinspection PhpUndefinedClassInspection */
        return UTF8::rawurldecode($content, $multiDecodeNewHtmlEntity);
    }

    do {
        $previous = $content;

        $entitiesDecoded = \html_entity_decode($content, \ENT_QUOTES | \ENT_HTML5);
        $content = \rawurldecode($entitiesDecoded);
    } while ($multiDecodeNewHtmlEntity && $previous !== $content);

    return $content;
}
457
|
|
|
|
458
|
|
|
/**
 * Find list of nodes with a CSS selector.
 *
 * The selector is converted to XPath and evaluated against the document.
 *
 * @param string   $selector
 * @param int|null $idx null => all matches; otherwise the position of the one wanted match,
 *                      where negative values count from the end of the result list
 *
 * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
 *                      all matches (SimpleHtmlDomNodeBlank if there are none), or the match at $idx
 *                      (SimpleHtmlDomBlank if there is none at that position)
 */
public function find(string $selector, $idx = null)
{
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new \DOMXPath($this->document);

    // register the namespaces - this has to happen before the query is evaluated,
    // otherwise the registered prefixes are unknown while it runs
    foreach ($this->namespaces as $namespace => $url) {
        $xPath->registerNamespace($namespace, $url);
    }

    $nodesList = $xPath->query($xPathQuery);
    $elements = new SimpleHtmlDomNode();

    // DOMXPath::query() gives false for an expression it cannot evaluate: treat that as "no match"
    if ($nodesList !== false) {
        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }
    }

    // return all elements
    if ($idx === null) {
        if (\count($elements) === 0) {
            return new SimpleHtmlDomNodeBlank();
        }

        return $elements;
    }

    // handle negative values
    if ($idx < 0) {
        $idx = \count($elements) + $idx;
    }

    // return one element
    return $elements[$idx] ?? new SimpleHtmlDomBlank();
}
500
|
|
|
|
501
|
|
|
/**
 * Find all nodes matching a CSS selector (find() without an index).
 *
 * @param string $selector
 *
 * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
 */
public function findMulti(string $selector): SimpleHtmlDomNodeInterface
{
    $matches = $this->find($selector, null);

    return $matches;
}
512
|
|
|
|
513
|
|
|
/**
 * Find the first node matching a CSS selector (find() with index 0).
 *
 * @param string $selector
 *
 * @return SimpleHtmlDomInterface
 */
public function findOne(string $selector): SimpleHtmlDomInterface
{
    $firstMatch = $this->find($selector, 0);

    return $firstMatch;
}
524
|
|
|
|
525
|
|
|
/**
 * Clean up markup serialized by DOMDocument.
 *
 * Removes the structural tags that the original input did not contain (according to
 * the isDOMDocumentCreated* flags), drops helper tags / duplicated tags, decodes
 * entities and finally swaps the internal placeholders back to the original characters.
 *
 * @param string $content
 * @param bool   $multiDecodeNewHtmlEntity
 *
 * @return string
 */
public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
{
    // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
    //          so we try to remove it here again ...

    // collect the tags to drop; str_replace() works through them in exactly this order
    $tagsToDrop = [];

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper) {
        /** @noinspection HtmlRequiredLangAttribute */
        \array_push($tagsToDrop, '<body>', '</body>', '<html>', '</html>');
    }

    if ($this->isDOMDocumentCreatedWithoutHeadWrapper) {
        /** @noinspection HtmlRequiredTitleElement */
        \array_push($tagsToDrop, '<head>', '</head>');
    }

    if ($this->isDOMDocumentCreatedWithFakeEndScript) {
        $tagsToDrop[] = '</script>';
    }

    $content = \str_replace($tagsToDrop, '', $content);

    if ($this->isDOMDocumentCreatedWithoutWrapper) {
        // a "<p>" at the very start, then every "</p>"
        $content = (string) \preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml) {
        $addedByParser = [
            '<p>',
            '</p>',
            '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
        ];
        $content = \str_replace($addedByParser, '', $content);
    }

    /** @noinspection CheckTagEmptyBody */
    /** @noinspection HtmlExtraClosingTag */
    /** @noinspection HtmlRequiredTitleElement */
    $search = ['<simpleHtmlDomP>', '</simpleHtmlDomP>', '<head><head>', '</head></head>', '<br></br>'];
    $replace = ['', '', '<head>', '</head>', '<br>'];
    $content = \trim(\str_replace($search, $replace, $content));

    $decoded = $this->decodeHtmlEntity($content, $multiDecodeNewHtmlEntity);

    return self::putReplacedBackToPreserveHtmlEntities($decoded);
}
614
|
|
|
|
615
|
|
|
/**
 * Get the DOMDocument this parser works on.
 *
 * @return \DOMDocument
 */
public function getDocument(): \DOMDocument
{
    $document = $this->document;

    return $document;
}
622
|
|
|
|
623
|
|
|
/**
 * Return elements by .class.
 *
 * Runs findMulti() with the selector "." followed by the given class name.
 *
 * @param string $class class name, without the leading dot
 *
 * @return SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
 */
public function getElementByClass(string $class): SimpleHtmlDomNodeInterface
{
    // plain concatenation: the "${var}" interpolation form is deprecated as of PHP 8.2
    return $this->findMulti('.' . $class);
}
634
|
|
|
|
635
|
|
|
/**
 * Return element by #id.
 *
 * Runs findOne() with the selector "#" followed by the given id.
 *
 * @param string $id id, without the leading hash
 *
 * @return SimpleHtmlDomInterface
 */
public function getElementById(string $id): SimpleHtmlDomInterface
{
    // plain concatenation: the "${var}" interpolation form is deprecated as of PHP 8.2
    return $this->findOne('#' . $id);
}
646
|
|
|
|
647
|
|
|
/**
 * Return element by tag name.
 *
 * @param string $name
 *
 * @return SimpleHtmlDomInterface the first element with that tag name,
 *                                or a SimpleHtmlDomBlank if the document has none
 */
public function getElementByTagName(string $name): SimpleHtmlDomInterface
{
    $firstNode = $this->document->getElementsByTagName($name)->item(0);

    return $firstNode === null
        ? new SimpleHtmlDomBlank()
        : new SimpleHtmlDom($firstNode);
}
664
|
|
|
|
665
|
|
|
/**
 * Returns elements by #id.
 *
 * Runs find() with the selector "#" followed by the given id.
 *
 * @param string   $id  id, without the leading hash
 * @param int|null $idx passed through to find()
 *
 * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
 */
public function getElementsById(string $id, $idx = null)
{
    // plain concatenation: the "${var}" interpolation form is deprecated as of PHP 8.2
    return $this->find('#' . $id, $idx);
}
677
|
|
|
|
678
|
|
|
/**
 * Returns elements by tag name.
 *
 * @param string   $name
 * @param int|null $idx null => all matches; otherwise the position of the one wanted match,
 *                      where negative values count from the end of the result list
 *
 * @return SimpleHtmlDomInterface|SimpleHtmlDomInterface[]|SimpleHtmlDomNodeInterface
 *                      all matches (SimpleHtmlDomNodeBlank if there are none), or the match at $idx
 *                      (SimpleHtmlDomBlank if there is none at that position)
 */
public function getElementsByTagName(string $name, $idx = null)
{
    $nodesList = $this->document->getElementsByTagName($name);

    $elements = new SimpleHtmlDomNode();

    foreach ($nodesList as $node) {
        $elements[] = new SimpleHtmlDom($node);
    }

    // return all elements
    if ($idx === null) {
        if (\count($elements) === 0) {
            return new SimpleHtmlDomNodeBlank();
        }

        return $elements;
    }

    // handle negative values
    if ($idx < 0) {
        $idx = \count($elements) + $idx;
    }

    // return one element: a missing *element* is answered with the element blank
    // (as find() does), not with the blank used for empty lists
    return $elements[$idx] ?? new SimpleHtmlDomBlank();
}
713
|
|
|
|
714
|
|
|
/**
 * Get the encoding to use.
 *
 * @return string value of the $encoding property
 */
protected function getEncoding(): string
{
    $encoding = $this->encoding;

    return $encoding;
}
723
|
|
|
|
724
|
|
|
/**
 * Whether the parsed markup contained no "<head>" tag (flag set in createDOMDocument()).
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
{
    $flag = $this->isDOMDocumentCreatedWithoutHeadWrapper;

    return $flag;
}
731
|
|
|
|
732
|
|
|
/**
 * Whether the parsed input contained no "<" at all (flag set in createDOMDocument()).
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutHtml(): bool
{
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
}
739
|
|
|
|
740
|
|
|
/**
 * Whether the parsed markup contained no "<html" tag (flag set in createDOMDocument()).
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
{
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
}
747
|
|
|
|
748
|
|
|
/**
 * Whether the parsed markup contained tags but did not start with one
 * (flag set in createDOMDocument()).
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutWrapper(): bool
{
    $flag = $this->isDOMDocumentCreatedWithoutWrapper;

    return $flag;
}
755
|
|
|
|
756
|
|
|
/**
 * Get the list of registered namespaces as an array.
 *
 * @return array
 *               An array in form ['prefix' => 'namespace-uri']
 */
public function getNamespaces(): array
{
    $registered = $this->namespaces;

    return $registered;
}
766
|
|
|
|
767
|
|
|
/**
 * Get dom node's outer html.
 *
 * If a static callback is set, it is invoked first with [$this] as its argument.
 *
 * @param bool $multiDecodeNewHtmlEntity
 *
 * @return string the cleaned-up markup, or '' if the document could not be serialized
 */
public function html(bool $multiDecodeNewHtmlEntity = false): string
{
    if ($this::$callback !== null) {
        \call_user_func($this::$callback, [$this]);
    }

    // input without "<html": serialize the root element only
    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $content === false
        ? ''
        : $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
}
792
|
|
|
|
793
|
|
|
/**
 * workaround for bug: https://bugs.php.net/bug.php?id=74628
 *
 * Rewrites every "</" inside the content of a <script> element to "<\/".
 * The markup is changed in place; if the regular expression fails, it is left untouched.
 *
 * @param string $html markup to rewrite (by reference)
 */
protected function html5FallbackForScriptTags(string &$html)
{
    // regEx for e.g.: [<script id="elements-image-2">...<script>]
    /** @noinspection HtmlDeprecatedTag */
    $regExSpecialScript = '/<(script)(?<attr>[^>]*)>(?<content>.*)<\/\1>/isU';
    $escaped = \preg_replace_callback(
        $regExSpecialScript,
        static function ($scripts) {
            return '<script' . $scripts['attr'] . '>' . \str_replace('</', '<\/', $scripts['content']) . '</script>';
        },
        $html
    );

    // preg_replace_callback() returns null on a PCRE error: keep the caller's
    // markup in that case instead of overwriting the string with null
    if ($escaped !== null) {
        $html = $escaped;
    }
}
811
|
|
|
|
812
|
|
|
/**
 * Get dom node's inner html.
 *
 * Concatenates the serialized child nodes of the document element (if there is one)
 * and runs the result through fixHtmlOutput().
 *
 * @param bool $multiDecodeNewHtmlEntity
 *
 * @return string
 */
public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
{
    $root = $this->document->documentElement;
    $markup = '';

    if ($root) {
        foreach ($root->childNodes as $child) {
            $markup .= $this->document->saveHTML($child);
        }
    }

    return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
}
832
|
|
|
|
833
|
|
|
/**
 * Hide "broken" end tags from the parser.
 *
 * Pass 1 masks the brackets of every element that has a matching start and end tag.
 * Pass 2 replaces what is still matched as a broken end tag by a hash placeholder and
 * records the pair in self::$domBrokenReplaceHelper ('orig' / 'tmp').
 * Finally the masked brackets are turned back into real ones.
 *
 * @param string $html
 *
 * @return string
 */
protected function keepBrokenHtml(string $html): string
{
    $masks = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
    $brackets = ['</', '<', '>'];

    // pass 1: repeat until no properly paired element is left unmasked
    do {
        $before = $html;

        $html = (string) \preg_replace_callback(
            '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
            static function ($matches) {
                $openTag = '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°';
                $closeTag = '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°';

                return $matches['start'] . $openTag . $matches['value'] . $closeTag . $matches['end'];
            },
            $html
        );
    } while ($before !== $html);

    // pass 2: repeat until no broken end tag is left
    do {
        $before = $html;

        $html = (string) \preg_replace_callback(
            '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
            static function ($matches) use ($masks, $brackets) {
                $broken = \str_replace($masks, $brackets, $matches['broken']);
                $placeholder = '____simple_html_dom__voku__broken_html____' . \crc32($broken);

                self::$domBrokenReplaceHelper['orig'][] = $broken;
                self::$domBrokenReplaceHelper['tmp'][] = $placeholder;

                return $matches['start'] . $placeholder . $matches['end'];
            },
            $html
        );
    } while ($before !== $html);

    return \str_replace($masks, $brackets, $html);
}
883
|
|
|
|
884
|
|
|
/**
 * Rename <script type="text/html"> elements to the special helper tag
 * (self::$domHtmlSpecialScriptHelper) and undo the "<\/" escaping inside them.
 * The markup is changed in place.
 *
 * @param string $html markup to rewrite (by reference)
 */
protected function keepSpecialScriptTags(string &$html)
{
    $found = [];
    // regEx for e.g.: [<script id="elements-image-1" type="text/html">...</script>]
    $regExSpecialScript = '/<(script) [^>]*type=(["\']){0,1}text\/html\2{0,1}([^>]*)>.*<\/\1>/isU';
    \preg_match_all($regExSpecialScript, $html, $found);

    $openLength = \strlen('<script');
    $closeLength = \strlen('</script>');

    foreach ($found[0] ?? [] as $scriptElement) {
        // swap the start tag name ...
        $renamed = '<' . self::$domHtmlSpecialScriptHelper . \substr($scriptElement, $openLength);
        // ... and the end tag
        $renamed = \substr($renamed, 0, -$closeLength) . '</' . self::$domHtmlSpecialScriptHelper . '>';
        // remove the html5 fallback
        $renamed = \str_replace('<\/', '</', $renamed);

        $html = \str_replace($scriptElement, $renamed, $html);
    }
}
905
|
|
|
|
906
|
|
|
/**
 * Load HTML from string.
 *
 * @param string   $html
 * @param int|null $libXMLExtraOptions
 *
 * @return HtmlDomParser this parser, now holding the parsed document
 */
public function loadHtml(string $html, $libXMLExtraOptions = null): self
{
    $parsed = $this->createDOMDocument($html, $libXMLExtraOptions);
    $this->document = $parsed;

    return $this;
}
920
|
|
|
|
921
|
|
|
/**
 * Load HTML from file.
 *
 * Paths starting with http:// or https:// are not checked for existence;
 * every other path has to exist.
 *
 * @param string   $filePath
 * @param int|null $libXMLExtraOptions
 *
 * @throws \RuntimeException if the file does not exist or its content could not be read
 *
 * @return HtmlDomParser
 */
public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
{
    if (
        !\preg_match("/^https?:\/\//i", $filePath)
        &&
        !\file_exists($filePath)
    ) {
        throw new \RuntimeException("File {$filePath} not found");
    }

    try {
        if (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            $html = UTF8::file_get_contents($filePath);
        } else {
            $html = \file_get_contents($filePath);
        }
    } catch (\Exception $e) {
        // keep the cause attached so the real reason is not lost
        throw new \RuntimeException("Could not load file {$filePath}", 0, $e);
    }

    if ($html === false) {
        throw new \RuntimeException("Could not load file {$filePath}");
    }

    return $this->loadHtml($html, $libXMLExtraOptions);
}
958
|
|
|
|
959
|
|
|
/**
 * Swap the internal placeholders back to the original characters / tags.
 *
 * Recorded broken-html placeholders (self::$domBrokenReplaceHelper) are restored first,
 * then the fixed placeholder table, which is built once and cached in a static variable.
 *
 * @param string $html
 *
 * @return string
 */
public static function putReplacedBackToPreserveHtmlEntities(string $html): string
{
    static $DOM_REPLACE__HELPER_CACHE = null;

    if ($DOM_REPLACE__HELPER_CACHE === null) {
        // both lists must keep the same order: str_replace() pairs them by position
        $placeholders = \array_merge(self::$domLinkReplaceHelper['tmp'], self::$domReplaceHelper['tmp']);
        $originals = \array_merge(self::$domLinkReplaceHelper['orig'], self::$domReplaceHelper['orig']);

        // the wrapper helper element is removed completely
        $placeholders['html_wrapper__start'] = '<' . self::$domHtmlWrapperHelper . '>';
        $originals['html_wrapper__start'] = '';
        $placeholders['html_wrapper__end'] = '</' . self::$domHtmlWrapperHelper . '>';
        $originals['html_wrapper__end'] = '';

        // the special script helper element becomes a <script> element again
        $placeholders['html_special_script__start'] = '<' . self::$domHtmlSpecialScriptHelper;
        $originals['html_special_script__start'] = '<script';
        $placeholders['html_special_script__end'] = '</' . self::$domHtmlSpecialScriptHelper . '>';
        $originals['html_special_script__end'] = '</script>';

        $DOM_REPLACE__HELPER_CACHE = [
            'tmp'  => $placeholders,
            'orig' => $originals,
        ];
    }

    $hasBrokenHtml = isset(self::$domBrokenReplaceHelper['tmp'])
                     && \count(self::$domBrokenReplaceHelper['tmp']) > 0;

    if ($hasBrokenHtml) {
        $html = \str_replace(self::$domBrokenReplaceHelper['tmp'], self::$domBrokenReplaceHelper['orig'], $html);
    }

    return \str_replace($DOM_REPLACE__HELPER_CACHE['tmp'], $DOM_REPLACE__HELPER_CACHE['orig'], $html);
}
1001
|
|
|
|
1002
|
|
|
/**
 * Remember a namespace prefix for later use in xpath queries.
 *
 * The mapping is stored in $this->namespaces; an already registered
 * prefix is overwritten.
 *
 * @param string $prefix
 *                       <p>Namespace prefix to register.</p>
 * @param string $url
 *                       <p>Canonical URL for this namespace prefix.</p>
 */
protected function registerNamespace($prefix, $url)
{
    $this->namespaces[$prefix] = $url;
}
1014
|
|
|
|
1015
|
|
|
/**
 * Replace special strings with temporary placeholders before the HTML is parsed.
 *
 * All entries of $domReplaceHelper['orig'] are replaced by their 'tmp'
 * counterparts. In addition, every http(s) URL found in the input gets the
 * characters from $domLinkReplaceHelper['orig'] replaced by the matching
 * 'tmp' placeholders. putReplacedBackToPreserveHtmlEntities() reverses this.
 *
 * @param string $html <p>The raw HTML.</p>
 *
 * @return string <p>The HTML with placeholders inserted.</p>
 */
public static function replaceToPreserveHtmlEntities(string $html): string
{
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    // Cheap pre-check before running the regex. It has to be case-insensitive,
    // because the regex below uses the "i" modifier and also matches e.g. "HTTP://".
    if (\stripos($html, 'http') === false) {
        return \str_replace($search, $replace, $html);
    }

    // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
    $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
    $matches = [];
    \preg_match_all($regExUrl, $html, $matches);

    $linksOld = $matches[1] ?? [];
    if (empty($linksOld)) {
        return \str_replace($search, $replace, $html);
    }

    // protect the special characters inside of each link
    $linksNew = [];
    foreach ($linksOld as $linkKey => $linkOld) {
        $linksNew[$linkKey] = \str_replace(
            self::$domLinkReplaceHelper['orig'],
            self::$domLinkReplaceHelper['tmp'],
            $linkOld
        );
    }

    // links first, then the generic replacements (same order as the search list)
    return \str_replace(
        \array_merge($linksOld, $search),
        \array_merge($linksNew, $replace),
        $html
    );
}
1055
|
|
|
|
1056
|
|
|
/**
 * Return the inner html of the document and optionally write it to a file.
 *
 * @param string $filepath <p>Target file; nothing is written if this is an empty string.</p>
 *
 * @return string <p>The result of innerHtml().</p>
 */
public function save(string $filepath = ''): string
{
    $markup = $this->innerHtml();

    if ($filepath === '') {
        return $markup;
    }

    // exclusive lock while writing
    \file_put_contents($filepath, $markup, \LOCK_EX);

    return $markup;
}
1072
|
|
|
|
1073
|
|
|
/**
 * Store a callback in the static $callback property.
 *
 * The property is resolved via late static binding ("static::"), so the
 * value is shared by all instances of the called class.
 *
 * @param callable $functionName <p>The callback to store.</p>
 */
public function set_callback($functionName)
{
    static::$callback = $functionName;
}
1080
|
|
|
|
1081
|
|
|
/**
 * Get the text content of the document, passed through fixHtmlOutput().
 *
 * @param bool $multiDecodeNewHtmlEntity <p>Forwarded unchanged to fixHtmlOutput().</p>
 *
 * @return string
 */
public function text(bool $multiDecodeNewHtmlEntity = false): string
{
    $rawText = $this->document->textContent;

    return $this->fixHtmlOutput($rawText, $multiDecodeNewHtmlEntity);
}
1092
|
|
|
|
1093
|
|
|
/**
 * Set the "keepBrokenHtml" flag of this instance.
 *
 * @param bool $keepBrokenHtml <p>The new value of the flag.</p>
 *
 * @return HtmlDomParser <p>The current instance, so calls can be chained.</p>
 */
public function useKeepBrokenHtml(bool $keepBrokenHtml): self
{
    $this->keepBrokenHtml = $keepBrokenHtml;

    return $this;
}
1104
|
|
|
|
1105
|
|
|
/**
 * Serialize the document as XML.
 *
 * @param bool $multiDecodeNewHtmlEntity <p>Forwarded to fixHtmlOutput() / decodeHtmlEntity().</p>
 * @param bool $htmlToXml                <p>
 *                                       true: the XML is post-processed by fixHtmlOutput();
 *                                       false: html entities are decoded and the temporary
 *                                       placeholders are put back instead.
 *                                       </p>
 * @param bool $removeXmlHeader          <p>Strip the "&lt;?xml ... ?&gt;" declaration and leading whitespace.</p>
 * @param int  $options                  <p>libxml options for DOMDocument::saveXML().</p>
 *
 * @return string
 */
public function xml(
    bool $multiDecodeNewHtmlEntity = false,
    bool $htmlToXml = true,
    bool $removeXmlHeader = true,
    int $options = \LIBXML_NOEMPTYTAG
): string {
    $xml = $this->document->saveXML(null, $options);

    if ($removeXmlHeader) {
        $withoutHeader = \preg_replace('/<\?xml.*\?>/', '', $xml);
        $xml = \ltrim((string) $withoutHeader);
    }

    if ($htmlToXml) {
        return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
    }

    // plain XML: decode the entities, then restore the preserved strings
    $decodedXml = $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity);

    return self::putReplacedBackToPreserveHtmlEntities($decodedXml);
}
1137
|
|
|
} |
1138
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.