1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace voku\helper; |
6
|
|
|
|
7
|
|
|
/**
 * Class HtmlDomParser
 *
 * Thin wrapper around \DOMDocument that loads (possibly fragmentary) HTML,
 * exposes CSS-selector based lookups and serializes the document back to a
 * string while undoing the wrappers DOMDocument adds on its own.
 *
 * @package voku\helper
 *
 * @property-read string outerText <p>Get dom node's outer html (alias for "outerHtml").</p>
 * @property-read string outerHtml <p>Get dom node's outer html.</p>
 * @property-read string innerText <p>Get dom node's inner html (alias for "innerHtml").</p>
 * @property-read string innerHtml <p>Get dom node's inner html.</p>
 * @property-read string plaintext <p>Get dom node's plain text.</p>
 * @property-read string text      <p>Get dom node's plain text (alias for "plaintext").</p>
 *
 * @method string outerText() <p>Get dom node's outer html (alias for "outerHtml()").</p>
 * @method string outerHtml() <p>Get dom node's outer html.</p>
 * @method string innerText() <p>Get dom node's inner html (alias for "innerHtml()").</p>
 *
 * @method HtmlDomParser load() load($html) <p>Load HTML from string.</p>
 * @method HtmlDomParser load_file() load_file($html) <p>Load HTML from file.</p>
 *
 * @method static HtmlDomParser file_get_html() file_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from
 *         file.</p>
 * @method static HtmlDomParser str_get_html() str_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from
 *         string.</p>
 */
class HtmlDomParser
{
    /**
     * Lower-cased legacy method names (resolved in __call) mapped to the real method.
     *
     * @var array
     */
    protected static $functionAliases = [
        'outertext' => 'html',
        'outerhtml' => 'html',
        'innertext' => 'innerHtml',
        'innerhtml' => 'innerHtml',
        'load'      => 'loadHtml',
        'load_file' => 'loadHtmlFile',
    ];

    /**
     * Characters that are swapped for placeholders inside detected URLs.
     *
     * @var string[][]
     */
    protected static $domLinkReplaceHelper = [
        'orig' => ['[', ']', '{', '}',],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
            '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
        ],
    ];

    /**
     * Characters that are swapped for placeholders in the whole input.
     *
     * @var array
     */
    protected static $domReplaceHelper = [
        'orig' => ['&', '|', '+', '%', '@'],
        'tmp'  => [
            '____SIMPLE_HTML_DOM__VOKU__AMP____',
            '____SIMPLE_HTML_DOM__VOKU__PIPE____',
            '____SIMPLE_HTML_DOM__VOKU__PLUS____',
            '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
            '____SIMPLE_HTML_DOM__VOKU__AT____',
        ],
    ];

    /**
     * Tag name of the artificial element wrapped around input that has no root element.
     *
     * @var string
     */
    protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';

    /**
     * Placeholders ("tmp") and original fragments ("orig") collected by keepBrokenHtml().
     *
     * @var array
     */
    protected static $domBrokenReplaceHelper = [];

    /**
     * @var Callable
     */
    protected static $callback;

    /**
     * @var \DOMDocument
     */
    protected $document;

    /**
     * @var string
     */
    protected $encoding = 'UTF-8';

    /**
     * True when the loaded input contained no "<" at all.
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtml = false;

    /**
     * True when the loaded input contained markup but did not start with "<".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutWrapper = false;

    /**
     * True when the loaded input contained no "<head>".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHeadWrapper = false;

    /**
     * True when the loaded input contained no "<html".
     *
     * @var bool
     */
    protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;

    /**
     * @var bool
     */
    protected $keepBrokenHtml;

    /**
     * Constructor
     *
     * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode
     *
     * @throws \InvalidArgumentException
     */
    public function __construct($element = null)
    {
        $this->document = new \DOMDocument('1.0', $this->getEncoding());

        // reset
        self::$domBrokenReplaceHelper = [];

        // DOMDocument settings
        $this->document->preserveWhiteSpace = true;
        $this->document->formatOutput = true;

        if ($element instanceof SimpleHtmlDom) {
            $element = $element->getNode();
        }

        if ($element instanceof \DOMNode) {
            // a node from another document has to be imported before it can be appended
            $domNode = $this->document->importNode($element, true);

            if ($domNode instanceof \DOMNode) {
                $this->document->appendChild($domNode);
            }

            return;
        }

        if ($element !== null) {
            $this->loadHtml($element);
        }
    }

    /**
     * Dispatch legacy instance-method names (see $functionAliases) to their implementation.
     *
     * @param string $name
     * @param array  $arguments
     *
     * @return bool|mixed
     *
     * @throws \BadMethodCallException if the name is not a known alias
     */
    public function __call($name, $arguments)
    {
        $name = \strtolower($name);

        if (isset(self::$functionAliases[$name])) {
            return \call_user_func_array([$this, self::$functionAliases[$name]], $arguments);
        }

        throw new \BadMethodCallException('Method does not exist: ' . $name);
    }

    /**
     * Dispatch the static helpers "str_get_html" and "file_get_html".
     *
     * @param string $name
     * @param array  $arguments [0] => html string or file path, [1] => extra libxml options
     *
     * @return HtmlDomParser
     *
     * @throws \BadMethodCallException
     * @throws \RuntimeException
     * @throws \InvalidArgumentException
     */
    public static function __callStatic($name, $arguments)
    {
        $arguments0 = '';
        if (isset($arguments[0])) {
            $arguments0 = $arguments[0];
        }

        $arguments1 = null;
        if (isset($arguments[1])) {
            $arguments1 = $arguments[1];
        }

        if ($name === 'str_get_html') {
            $parser = new self();

            return $parser->loadHtml($arguments0, $arguments1);
        }

        if ($name === 'file_get_html') {
            $parser = new self();

            return $parser->loadHtmlFile($arguments0, $arguments1);
        }

        throw new \BadMethodCallException('Method does not exist');
    }

    /** @noinspection MagicMethodsValidityInspection */
    /**
     * Resolve the read-only pseudo properties (matched case-insensitively).
     *
     * @param string $name
     *
     * @return string|null <p>null is returned for an unknown property name.</p>
     */
    public function __get($name)
    {
        $name = \strtolower($name);

        switch ($name) {
            case 'outerhtml':
            case 'outertext':
                return $this->html();
            case 'innerhtml':
            case 'innertext':
                return $this->innerHtml();
            case 'text':
            case 'plaintext':
                return $this->text();
        }

        return null;
    }

    /**
     * @param string $selector
     * @param int    $idx
     *
     * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
     */
    public function __invoke($selector, $idx = null)
    {
        return $this->find($selector, $idx);
    }

    /**
     * @return string
     */
    public function __toString()
    {
        return $this->html();
    }

    /**
     * does nothing (only for api-compatibility-reasons)
     *
     * @deprecated
     *
     * @return bool
     */
    public function clear(): bool
    {
        return true;
    }

    /**
     * Replace special characters by placeholders, see putReplacedBackToPreserveHtmlEntities()
     * for the reverse operation.
     *
     * Inside detected http(s) URLs the characters from $domLinkReplaceHelper are
     * replaced, everywhere in the string the characters from $domReplaceHelper.
     *
     * @param string $html
     *
     * @return string
     */
    public static function replaceToPreserveHtmlEntities(string $html): string
    {
        // init
        $linksNew = [];
        $linksOld = [];

        if (\strpos($html, 'http') !== false) {

            // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
            $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
            \preg_match_all($regExUrl, $html, $linksOld);

            if (!empty($linksOld[1])) {
                $linksOld = $linksOld[1];
                foreach ((array)$linksOld as $linkKey => $linkOld) {
                    $linksNew[$linkKey] = \str_replace(
                        self::$domLinkReplaceHelper['orig'],
                        self::$domLinkReplaceHelper['tmp'],
                        $linkOld
                    );
                }
            }
        }

        $linksNewCount = \count($linksNew);
        if ($linksNewCount > 0 && \count($linksOld) === $linksNewCount) {
            // links first, so the url-only replacements happen before the global ones
            $search = \array_merge($linksOld, self::$domReplaceHelper['orig']);
            $replace = \array_merge($linksNew, self::$domReplaceHelper['tmp']);
        } else {
            $search = self::$domReplaceHelper['orig'];
            $replace = self::$domReplaceHelper['tmp'];
        }

        return \str_replace($search, $replace, $html);
    }

    /**
     * Put the original characters / fragments back in place of the placeholders
     * and drop the artificial html-wrapper element.
     *
     * @param string $html
     *
     * @return string
     */
    public static function putReplacedBackToPreserveHtmlEntities(string $html): string
    {
        // the placeholder tables never change, so they are merged only once per request
        static $DOM_REPLACE__HELPER_CACHE = null;

        if ($DOM_REPLACE__HELPER_CACHE === null) {
            $DOM_REPLACE__HELPER_CACHE['tmp'] = \array_merge(
                self::$domLinkReplaceHelper['tmp'],
                self::$domReplaceHelper['tmp']
            );
            $DOM_REPLACE__HELPER_CACHE['orig'] = \array_merge(
                self::$domLinkReplaceHelper['orig'],
                self::$domReplaceHelper['orig']
            );

            $DOM_REPLACE__HELPER_CACHE['tmp']['html_wrapper__start'] = '<' . self::$domHtmlWrapperHelper . '>';
            $DOM_REPLACE__HELPER_CACHE['tmp']['html_wrapper__end'] = '</' . self::$domHtmlWrapperHelper . '>';

            $DOM_REPLACE__HELPER_CACHE['orig']['html_wrapper__start'] = '';
            $DOM_REPLACE__HELPER_CACHE['orig']['html_wrapper__end'] = '';
        }

        if (
            isset(self::$domBrokenReplaceHelper['tmp'])
            &&
            \count(self::$domBrokenReplaceHelper['tmp']) > 0
        ) {
            $html = \str_replace(self::$domBrokenReplaceHelper['tmp'], self::$domBrokenReplaceHelper['orig'], $html);
        }

        return \str_replace($DOM_REPLACE__HELPER_CACHE['tmp'], $DOM_REPLACE__HELPER_CACHE['orig'], $html);
    }

    /**
     * Create DOMDocument from HTML.
     *
     * The input is first parsed as XML via SimpleXML; if that fails or reports
     * errors, DOMDocument::loadHTML() is used instead.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return \DOMDocument
     */
    private function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
    {
        // The flags describe the input of *this* load; without a reset a parser that is
        // re-used for a second document would keep the flags of the first one.
        $this->isDOMDocumentCreatedWithoutHtml = false;
        $this->isDOMDocumentCreatedWithoutWrapper = false;
        $this->isDOMDocumentCreatedWithoutHtmlWrapper = false;
        $this->isDOMDocumentCreatedWithoutHeadWrapper = false;

        if ($this->keepBrokenHtml === true) {
            $html = $this->keepBrokenHtml(trim($html));
        }

        if (\strpos($html, '<') === false) {
            $this->isDOMDocumentCreatedWithoutHtml = true;
        } elseif (\strpos(\ltrim($html), '<') !== 0) {
            $this->isDOMDocumentCreatedWithoutWrapper = true;
        }

        if (\strpos($html, '<html') === false) {
            $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
        }

        if (\strpos($html, '<head>') === false) {
            $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
        }

        // set error level
        $internalErrors = \libxml_use_internal_errors(true);
        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is only
        // called (and later restored) on older PHP versions.
        $disableEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntityLoader = \libxml_disable_entity_loader(true);
        }
        \libxml_clear_errors();

        $optionsXml = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;

        if (\defined('LIBXML_BIGLINES')) {
            $optionsXml |= LIBXML_BIGLINES;
        }

        if (\defined('LIBXML_COMPACT')) {
            $optionsXml |= LIBXML_COMPACT;
        }

        if (\defined('LIBXML_HTML_NODEFDTD')) {
            $optionsXml |= LIBXML_HTML_NODEFDTD;
        }

        if ($libXMLExtraOptions !== null) {
            $optionsXml |= $libXMLExtraOptions;
        }

        if (
            $this->isDOMDocumentCreatedWithoutWrapper === true
            ||
            $this->keepBrokenHtml === true
        ) {
            // give the fragment a single root element, it is removed again in
            // putReplacedBackToPreserveHtmlEntities()
            $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
        if ($sxe !== false && \count(\libxml_get_errors()) === 0) {

            $this->document = \dom_import_simplexml($sxe)->ownerDocument;

        } else {

            // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
            //
            // Only prepend the declaration when the markup does not already start with one
            // (haystack first, needle second).
            $xmlHackUsed = false;
            if (\strpos($html, '<?xml') !== 0) {
                $xmlHackUsed = true;
                $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
            }

            $this->document->loadHTML($html, $optionsXml);

            // remove the "xml-encoding" hack
            if ($xmlHackUsed === true) {
                foreach ($this->document->childNodes as $child) {
                    if ($child->nodeType === XML_PI_NODE) {
                        $this->document->removeChild($child);
                        break;
                    }
                }
            }
        }

        // set encoding
        $this->document->encoding = $this->getEncoding();

        // restore lib-xml settings
        \libxml_clear_errors();
        \libxml_use_internal_errors($internalErrors);
        if ($disableEntityLoader !== null) {
            \libxml_disable_entity_loader($disableEntityLoader);
        }

        return $this->document;
    }

    /**
     * Replace unbalanced closing tags by placeholders, so that they are still
     * part of the output after the document was parsed.
     *
     * Pass 1 masks every balanced "<tag ...>...</tag>" pair, pass 2 moves the
     * closing tags that are still unmasked into $domBrokenReplaceHelper, and
     * finally the masks of pass 1 are turned back into real tags.
     *
     * @param string $html
     *
     * @return string
     */
    protected function keepBrokenHtml(string $html): string
    {
        // pass 1: repeat until no balanced pair is left unmasked
        do {
            $original = $html;

            $html = (string)preg_replace_callback(
                '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
                function ($matches) {
                    return $matches['start'] .
                           '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                           $matches['value'] .
                           '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                           $matches['end'];
                },
                $html
            );

        } while ($original !== $html);

        // pass 2: repeat until no broken fragment is left
        do {
            $original = $html;

            $html = (string)preg_replace_callback(
                '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
                function ($matches) {

                    $matches['broken'] = str_replace(
                        ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'],
                        ['</', '<', '>'],
                        $matches['broken']
                    );

                    self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
                    self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = '____simple_html_dom__voku__broken_html____' . crc32($matches['broken']);

                    return $matches['start'] . $matchesHash . $matches['end'];
                },
                $html
            );

        } while ($original !== $html);

        $html = str_replace(
            ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'],
            ['</', '<', '>'],
            $html
        );

        return $html;
    }

    /**
     * Return element by #id.
     *
     * @param string $id
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
     */
    public function getElementById(string $id)
    {
        return $this->find("#$id", 0);
    }

    /**
     * Return element by tag name.
     *
     * @param string $name
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
     */
    public function getElementByTagName(string $name)
    {
        $node = $this->document->getElementsByTagName($name)->item(0);

        if ($node === null) {
            return new SimpleHtmlDomNodeBlank();
        }

        return new SimpleHtmlDom($node);
    }

    /**
     * Returns elements by #id.
     *
     * @param string   $id
     * @param null|int $idx
     *
     * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
     */
    public function getElementsById(string $id, $idx = null)
    {
        return $this->find("#$id", $idx);
    }

    /**
     * Returns elements by tag name.
     *
     * @param string   $name
     * @param null|int $idx <p>null: all elements, negative values count from the end.</p>
     *
     * @return SimpleHtmlDomNode|SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeBlank
     */
    public function getElementsByTagName(string $name, $idx = null)
    {
        return $this->pickElementsFromNodeList(
            $this->document->getElementsByTagName($name),
            $idx
        );
    }

    /**
     * Find one node with a CSS selector.
     *
     * @param string $selector
     *
     * @return SimpleHtmlDom|SimpleHtmlDomNodeInterface
     */
    public function findOne(string $selector)
    {
        return $this->find($selector, 0);
    }

    /**
     * Find list of nodes with a CSS selector.
     *
     * @param string $selector
     * @param int    $idx <p>null: all elements, negative values count from the end.</p>
     *
     * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
     */
    public function find(string $selector, $idx = null)
    {
        $xPathQuery = SelectorConverter::toXPath($selector);

        $xPath = new \DOMXPath($this->document);

        return $this->pickElementsFromNodeList($xPath->query($xPathQuery), $idx);
    }

    /**
     * Wrap every node of a node list into a SimpleHtmlDom and return either the
     * whole collection, the element at the requested index or a blank element.
     *
     * Shared by find() and getElementsByTagName().
     *
     * @param \DOMNodeList|mixed $nodesList
     * @param null|int           $idx
     *
     * @return SimpleHtmlDomNode|SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeBlank
     */
    private function pickElementsFromNodeList($nodesList, $idx)
    {
        $elements = new SimpleHtmlDomNode();

        foreach ($nodesList as $node) {
            $elements[] = new SimpleHtmlDom($node);
        }

        // return all elements
        if (null === $idx) {
            return $elements;
        }

        // handle negative values
        if ($idx < 0) {
            $idx = \count($elements) + $idx;
        }

        // return one element
        if (isset($elements[$idx])) {
            return $elements[$idx];
        }

        // return a blank-element
        return new SimpleHtmlDomNodeBlank();
    }

    /**
     * Clean up serialized markup: strip the wrappers that were not part of the
     * loaded input, decode entities / url-encoding and put the placeholders back.
     *
     * @param string $content
     * @param bool   $multiDecodeNewHtmlEntity <p>Decode repeatedly until the content no longer changes.</p>
     *
     * @return string
     */
    public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
    {
        // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
        //          so we try to remove it here again ...

        if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
            $content = \str_replace(
                [
                    "\n",
                    "\r\n",
                    "\r",
                    '<body>',
                    '</body>',
                    '<html>',
                    '</html>',
                ],
                '',
                $content
            );
        }

        if ($this->isDOMDocumentCreatedWithoutHeadWrapper === true) {
            $content = \str_replace(
                [
                    '<head>',
                    '</head>',
                ],
                '',
                $content
            );
        }

        if ($this->isDOMDocumentCreatedWithoutWrapper === true) {
            $content = (string)\preg_replace('/^<p>/', '', $content);
            $content = (string)\preg_replace('/<\/p>/', '', $content);
        }

        if ($this->isDOMDocumentCreatedWithoutHtml === true) {
            $content = \str_replace(
                [
                    '<p>',
                    '</p>',
                    '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
                ],
                '',
                $content
            );
        }

        /** @noinspection CheckTagEmptyBody */
        /** @noinspection HtmlExtraClosingTag */
        $content = \trim(
            \str_replace(
                [
                    '<simpleHtmlDomP>',
                    '</simpleHtmlDomP>',
                    '<head><head>',
                    '</head></head>',
                    '<br></br>',
                ],
                [
                    '',
                    '',
                    '<head>',
                    '</head>',
                    '<br>',
                ],
                $content
            )
        );

        if ($multiDecodeNewHtmlEntity === true) {
            if (\class_exists('\voku\helper\UTF8')) {

                /** @noinspection PhpUndefinedClassInspection */
                $content = UTF8::rawurldecode($content);

            } else {

                do {
                    $content_compare = $content;

                    $content = \rawurldecode(
                        \html_entity_decode(
                            $content,
                            ENT_QUOTES | ENT_HTML5
                        )
                    );

                } while ($content_compare !== $content);

            }

        } else {

            $content = \rawurldecode(
                \html_entity_decode(
                    $content,
                    ENT_QUOTES | ENT_HTML5
                )
            );
        }

        $content = self::putReplacedBackToPreserveHtmlEntities($content);

        return $content;
    }

    /**
     * @return \DOMDocument
     */
    public function getDocument(): \DOMDocument
    {
        return $this->document;
    }

    /**
     * Get the encoding to use.
     *
     * @return string
     */
    private function getEncoding(): string
    {
        return $this->encoding;
    }

    /**
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtml(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtml;
    }

    /**
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
    }

    /**
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutHeadWrapper;
    }

    /**
     * @return bool
     */
    public function getIsDOMDocumentCreatedWithoutWrapper(): bool
    {
        return $this->isDOMDocumentCreatedWithoutWrapper;
    }

    /**
     * Get dom node's outer html.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function html(bool $multiDecodeNewHtmlEntity = false): string
    {
        if ($this::$callback !== null) {
            \call_user_func($this::$callback, [$this]);
        }

        if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
            $content = $this->document->saveHTML($this->document->documentElement);
        } else {
            $content = $this->document->saveHTML();
        }

        return $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
    }

    /**
     * @param bool $keepBrokenHtml
     *
     * @return HtmlDomParser
     */
    public function useKeepBrokenHtml(bool $keepBrokenHtml): self
    {
        $this->keepBrokenHtml = $keepBrokenHtml;

        return $this;
    }

    /**
     * Get the HTML as XML.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function xml(bool $multiDecodeNewHtmlEntity = false): string
    {
        $xml = $this->document->saveXML(null, \LIBXML_NOEMPTYTAG);

        // remove the XML-header
        $xml = \ltrim((string)\preg_replace('/<\?xml.*\?>/', '', $xml));

        return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
    }

    /**
     * Get dom node's inner html.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
    {
        // init
        $text = '';

        // an empty document has no root element, so there is nothing to serialize
        $root = $this->document->documentElement;
        if ($root !== null) {
            foreach ($root->childNodes as $node) {
                $text .= $this->document->saveHTML($node);
            }
        }

        return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
    }

    /**
     * Load HTML from string.
     *
     * @param string   $html
     * @param int|null $libXMLExtraOptions
     *
     * @return HtmlDomParser
     *
     * @throws \InvalidArgumentException if argument is not string
     */
    public function loadHtml(string $html, $libXMLExtraOptions = null): self
    {
        $this->document = $this->createDOMDocument($html, $libXMLExtraOptions);

        return $this;
    }

    /**
     * Load HTML from file.
     *
     * @param string   $filePath <p>Local path or http(s) url.</p>
     * @param int|null $libXMLExtraOptions
     *
     * @return HtmlDomParser
     *
     * @throws \RuntimeException if the file does not exist or can not be read
     * @throws \InvalidArgumentException
     */
    public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
    {
        // the existence check only makes sense for local paths
        if (
            !\preg_match("/^https?:\/\//i", $filePath)
            &&
            !\file_exists($filePath)
        ) {
            throw new \RuntimeException("File $filePath not found");
        }

        try {
            if (\class_exists('\voku\helper\UTF8')) {
                /** @noinspection PhpUndefinedClassInspection */
                $html = UTF8::file_get_contents($filePath);
            } else {
                $html = \file_get_contents($filePath);
            }
        } catch (\Exception $e) {
            // keep the original exception attached for debugging
            throw new \RuntimeException("Could not load file $filePath", 0, $e);
        }

        if ($html === false) {
            throw new \RuntimeException("Could not load file $filePath");
        }

        $this->loadHtml($html, $libXMLExtraOptions);

        return $this;
    }

    /**
     * Save the html-dom as string.
     *
     * @param string $filepath <p>If not empty, the inner html is also written to this file.</p>
     *
     * @return string
     */
    public function save(string $filepath = ''): string
    {
        $string = $this->innerHtml();
        if ($filepath !== '') {
            \file_put_contents($filepath, $string, LOCK_EX);
        }

        return $string;
    }

    /**
     * Register a callback that html() invokes (with an array holding this parser)
     * before the document is serialized. The callback is stored statically.
     *
     * @param callable $functionName
     */
    public function set_callback($functionName)
    {
        $this::$callback = $functionName;
    }

    /**
     * Get dom node's plain text.
     *
     * @param bool $multiDecodeNewHtmlEntity
     *
     * @return string
     */
    public function text(bool $multiDecodeNewHtmlEntity = false): string
    {
        return $this->fixHtmlOutput($this->document->textContent, $multiDecodeNewHtmlEntity);
    }

    /**
     * Give the clone its own copy of the DOMDocument.
     */
    public function __clone()
    {
        $this->document = clone $this->document;
    }
}
963
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.