1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace voku\helper; |
6
|
|
|
|
7
|
|
|
abstract class AbstractDomParser implements DomParserInterface
{
    /**
     * Tag name of the temporary wrapper element. Its opening and closing tags
     * are removed again by putReplacedBackToPreserveHtmlEntities().
     *
     * @var string
     */
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';

    /**
     * Placeholder name for broken html.
     *
     * NOTE(review): not referenced inside this class; presumably used by the
     * concrete parsers - verify there.
     *
     * @var string
     */
    protected static $domHtmlBrokenHtmlHelper = '____simple_html_dom__voku__broken_html____';

    /**
     * Temporary tag name standing in for "script";
     * putReplacedBackToPreserveHtmlEntities() maps it back to a real script tag.
     *
     * @var string
     */
    protected static $domHtmlSpecialScriptHelper = '____simple_html_dom__voku__html_special_script____';

    /**
     * Additional placeholder table. When it contains a non-empty 'tmp' list,
     * putReplacedBackToPreserveHtmlEntities() replaces every 'tmp' entry with
     * the matching 'orig' entry. It is filled outside of this class.
     *
     * @var array
     */
    protected static $domBrokenReplaceHelper = [];

    /**
     * Characters that are masked inside of detected links ('orig') and the
     * placeholders they are masked with ('tmp'); both lists are index-aligned.
     *
     * @var string[][]
     */
    protected static $domLinkReplaceHelper = [
        'orig' => ['[', ']', '{', '}'],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
        ],
    ];

    /**
     * Strings that are masked in the whole input ('orig') and the placeholders
     * they are masked with ('tmp'); both lists are index-aligned.
     *
     * @var string[][]
     */
    protected static $domReplaceHelper = [
        'orig' => ['&', '|', '+', '%', '@', '<html ⚡'],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
            '____SIMPLE_HTML_DOM__VOKU__AT____',
            '<html ____SIMPLE_HTML_DOM__VOKU__GOOGLE_AMP____="true"',
        ],
    ];

    /**
     * Callback stored by set_callback(); it is not invoked inside this class.
     *
     * @var callable|null
     */
    protected static $callback;

    /**
     * Map of lower-cased alias name => real method name, consulted by __call().
     *
     * @var string[]
     */
    protected static $functionAliases = [];

    /**
     * @var \DOMDocument
     */
    protected $document;

    /**
     * @var string
     */
    protected $encoding = 'UTF-8';

    /**
     * Forward a call to an unknown instance method to its registered alias.
     *
     * The lookup is case-insensitive (the name is lower-cased first) and uses
     * late static binding, so an alias table that is redeclared by a concrete
     * parser is honored as well.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException if no alias is registered for the name
     *
     * @return bool|mixed
     */
    public function __call($name, $arguments)
    {
        $name = \strtolower($name);

        if (isset(static::$functionAliases[$name])) {
            return \call_user_func_array([$this, static::$functionAliases[$name]], $arguments);
        }

        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }

    /**
     * @param string $name
     * @param array  $arguments
     *
     * @throws \BadMethodCallException
     * @throws \RuntimeException
     *
     * @return static
     */
    abstract public static function __callStatic($name, $arguments);

    /**
     * Give the clone its own copy of the wrapped document, so that changes to
     * the clone do not touch the original document.
     */
    public function __clone()
    {
        $this->document = clone $this->document;
    }

    /** @noinspection MagicMethodsValidityInspection */

    /**
     * @param string $name
     *
     * @return string|null
     */
    abstract public function __get($name);

    /**
     * @return string
     */
    abstract public function __toString();

    /**
     * does nothing (only for api-compatibility-reasons)
     *
     * @return bool always true
     *
     * @deprecated
     */
    public function clear(): bool
    {
        return true;
    }

    /**
     * Create DOMDocument from HTML.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return \DOMDocument
     */
    abstract protected function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument;

    /**
     * Decode html-entities and url-encoding in the given content.
     *
     * If the class "\voku\helper\UTF8" is available the work is delegated to
     * UTF8::rawurldecode(), otherwise html_entity_decode() + rawurldecode()
     * are used directly.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity if true, decode repeatedly until
     *                                         the content no longer changes
     *
     * @return string
     */
    protected function decodeHtmlEntity(string $content, bool $multiDecodeNewHtmlEntity): string
    {
        if (\class_exists('\voku\helper\UTF8')) {
            /** @noinspection PhpUndefinedClassInspection */
            return UTF8::rawurldecode($content, $multiDecodeNewHtmlEntity);
        }

        // Native fallback: a single pass, or a fixed-point loop when multi-decoding is requested.
        do {
            $before = $content;

            $content = \rawurldecode(
                \html_entity_decode(
                    $before,
                    \ENT_QUOTES | \ENT_HTML5
                )
            );
        } while ($multiDecodeNewHtmlEntity && $before !== $content);

        return $content;
    }

    /**
     * Find list of nodes with a CSS selector.
     *
     * @param string   $selector
     * @param int|null $idx
     */
    abstract public function find(string $selector, $idx = null);

    /**
     * Find nodes with a CSS selector.
     *
     * @param string $selector
     */
    abstract public function findMulti(string $selector);

    /**
     * Find nodes with a CSS selector or false, if no element is found.
     *
     * @param string $selector
     */
    abstract public function findMultiOrFalse(string $selector);

    /**
     * Find one node with a CSS selector.
     *
     * @param string $selector
     */
    abstract public function findOne(string $selector);

    /**
     * Find one node with a CSS selector or false, if no element is found.
     *
     * @param string $selector
     */
    abstract public function findOneOrFalse(string $selector);

    /**
     * Get the wrapped DOMDocument.
     *
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        return $this->document;
    }

    /**
     * Get dom node's outer html.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    abstract public function html(bool $multiDecodeNewHtmlEntity = false): string;

    /**
     * Get dom node's inner html.
     *
     * Every child node of the document element is serialized with saveHTML()
     * and the concatenated result is passed through fixHtmlOutput(), which is
     * declared outside of this class.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $markup = '';

        $root = $this->document->documentElement;
        if ($root) {
            foreach ($root->childNodes as $childNode) {
                $markup .= $this->document->saveHTML($childNode);
            }
        }

        return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }

    /**
     * Get dom node's inner xml.
     *
     * Same as innerHtml(), but every child node of the document element is
     * serialized with saveXML() instead of saveHTML().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function innerXml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $markup = '';

        $root = $this->document->documentElement;
        if ($root) {
            foreach ($root->childNodes as $childNode) {
                $markup .= $this->document->saveXML($childNode);
            }
        }

        return $this->fixHtmlOutput($markup, $multiDecodeNewHtmlEntity);
    }

    /**
     * Load HTML from string.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return DomParserInterface
     */
    abstract public function loadHtml(string $html, $libXMLExtraOptions = null): DomParserInterface;

    /**
     * Load HTML from file.
     *
     * @param string   $filePath
     * @param int|null $libXMLExtraOptions
     *
     * @throws \RuntimeException
     *
     * @return DomParserInterface
     */
    abstract public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): DomParserInterface;

    /**
     * Save the html-dom as string.
     *
     * The result of html() is always returned; it is additionally written to
     * the given file (with an exclusive lock) if a non-empty path is passed.
     *
     * @param string $filepath
     *
     * @return string
     */
    public function save(string $filepath = ''): string
    {
        $string = $this->html();

        if ($filepath !== '') {
            // NOTE(review): the result of file_put_contents() is not checked,
            // so a failed write is not reported to the caller.
            \file_put_contents($filepath, $string, \LOCK_EX);
        }

        return $string;
    }

    /**
     * Store a callback in the static callback slot (late static binding).
     *
     * @param callable $functionName
     */
    public function set_callback($functionName)
    {
        static::$callback = $functionName;
    }

    /**
     * Get dom node's plain text.
     *
     * Returns the text content of the document, passed through
     * fixHtmlOutput().
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        return $this->fixHtmlOutput($this->document->textContent, $multiDecodeNewHtmlEntity);
    }

    /**
     * Get the HTML as XML or plain XML if needed.
     *
     * @param bool $multiDecodeNewHtmlEntity
     * @param bool $htmlToXml                if true the xml is passed through fixHtmlOutput(),
     *                                       otherwise it is only entity-decoded and the
     *                                       placeholders are put back
     * @param bool $removeXmlHeader          strip the xml declaration and trim leading whitespace
     * @param int  $options                  libxml options passed to saveXML()
     *
     * @return string the xml, or an empty string if the document could not be serialized
     */
    public function xml(
        bool $multiDecodeNewHtmlEntity = false,
        bool $htmlToXml = true,
        bool $removeXmlHeader = true,
        int $options = \LIBXML_NOEMPTYTAG
    ): string {
        $xml = $this->document->saveXML(null, $options);
        if ($xml === false) {
            return '';
        }

        if ($removeXmlHeader) {
            $xml = \ltrim((string) \preg_replace('/<\?xml.*\?>/', '', $xml));
        }

        if ($htmlToXml) {
            return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
        }

        return self::putReplacedBackToPreserveHtmlEntities(
            $this->decodeHtmlEntity($xml, $multiDecodeNewHtmlEntity)
        );
    }

    /**
     * Get the encoding to use.
     *
     * @return string
     */
    protected function getEncoding(): string
    {
        return $this->encoding;
    }

    /**
     * workaround for bug: https://bugs.php.net/bug.php?id=74628
     *
     * Every "</" inside the content of a script element is rewritten to "<\/".
     * Script elements without content are left untouched. The string is
     * modified in place; if the regular expression fails, it stays unchanged.
     *
     * @param string $html passed by reference
     */
    protected function html5FallbackForScriptTags(string &$html)
    {
        // regEx for e.g.: [<script id="elements-image-2">...<script>]
        /** @noinspection HtmlDeprecatedTag */
        $regExSpecialScript = '/<(script)(?<attr>[^>]*)>(?<content>.*)<\/\1>/isU';

        $htmlTmp = \preg_replace_callback(
            $regExSpecialScript,
            static function ($scripts) {
                if (empty($scripts['content'])) {
                    return $scripts[0];
                }

                return '<script' . $scripts['attr'] . '>' . \str_replace('</', '<\/', $scripts['content']) . '</script>';
            },
            $html
        );

        // preg_replace_callback() returns null on error: keep the input in that case.
        if ($htmlTmp !== null) {
            $html = $htmlTmp;
        }
    }

    /**
     * Put the original strings back in place of the temporary placeholders.
     *
     * This reverses the masking of replaceToPreserveHtmlEntities(), removes
     * the temporary html-wrapper tags and restores the script tags. All
     * replacements are case-insensitive.
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        static $DOM_REPLACE__HELPER_CACHE = null;

        if ($DOM_REPLACE__HELPER_CACHE === null) {
            // str_ireplace() pairs search and replacement by position, so both
            // lists must be built in exactly the same order.
            $placeholders = \array_merge(
                self::$domLinkReplaceHelper['tmp'],
                self::$domReplaceHelper['tmp']
            );
            $originals = \array_merge(
                self::$domLinkReplaceHelper['orig'],
                self::$domReplaceHelper['orig']
            );

            $placeholders['html_wrapper__start'] = '<' . self::$domHtmlWrapperHelper . '>';
            $originals['html_wrapper__start'] = '';

            $placeholders['html_wrapper__end'] = '</' . self::$domHtmlWrapperHelper . '>';
            $originals['html_wrapper__end'] = '';

            $placeholders['html_special_script__start'] = '<' . self::$domHtmlSpecialScriptHelper;
            $originals['html_special_script__start'] = '<script';

            $placeholders['html_special_script__end'] = '</' . self::$domHtmlSpecialScriptHelper . '>';
            $originals['html_special_script__end'] = '</script>';

            $DOM_REPLACE__HELPER_CACHE = [
                'tmp'  => $placeholders,
                'orig' => $originals,
            ];
        }

        if (
            isset(self::$domBrokenReplaceHelper['tmp'])
            &&
            \count(self::$domBrokenReplaceHelper['tmp']) > 0
        ) {
            $html = \str_ireplace(self::$domBrokenReplaceHelper['tmp'], self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_ireplace($DOM_REPLACE__HELPER_CACHE['tmp'], $DOM_REPLACE__HELPER_CACHE['orig'], $html);
    }

    /**
     * Mask special strings with temporary placeholders.
     *
     * The strings from the "orig" list of $domReplaceHelper are replaced in
     * the whole input. In addition, brackets and braces inside of detected
     * http(s) links are replaced via $domLinkReplaceHelper. Use
     * putReplacedBackToPreserveHtmlEntities() to undo the masking.
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        $search = self::$domReplaceHelper['orig'];
        $replace = self::$domReplaceHelper['tmp'];

        if (\strpos($html, 'http') !== false) {
            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';

            $matches = [];
            \preg_match_all($regExUrl, $html, $matches);

            if (!empty($matches[1])) {
                $linksOld = (array) $matches[1];

                $linksNew = [];
                foreach ($linksOld as $linkKey => $linkOld) {
                    $linksNew[$linkKey] = \str_replace(
                        self::$domLinkReplaceHelper['orig'],
                        self::$domLinkReplaceHelper['tmp'],
                        $linkOld
                    );
                }

                // The links are replaced first, then the generic strings are
                // masked in the whole input (str_replace works sequentially).
                $search = \array_merge($linksOld, $search);
                $replace = \array_merge($linksNew, $replace);
            }
        }

        return \str_replace($search, $replace, $html);
    }
}
499
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.