1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace simplehtmldom; |
4
|
|
|
|
5
|
|
|
/** |
6
|
|
|
* Website: http://sourceforge.net/projects/simplehtmldom/ |
7
|
|
|
* Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/). |
8
|
|
|
* |
9
|
|
|
* Licensed under The MIT License |
10
|
|
|
* See the LICENSE file in the project root for more information. |
11
|
|
|
* |
12
|
|
|
* Authors: |
13
|
|
|
* S.C. Chen |
14
|
|
|
* John Schlick |
15
|
|
|
* Rus Carroll |
16
|
|
|
* logmanoriginal |
17
|
|
|
* |
18
|
|
|
* Contributors: |
19
|
|
|
* Yousuke Kumakura |
20
|
|
|
* Vadim Voituk |
21
|
|
|
* Antcs |
22
|
|
|
* |
23
|
|
|
* Version Rev. 2.0-RC2 (415) |
24
|
|
|
*/ |
25
|
|
|
include_once __DIR__ . '/constants.php'; |
26
|
|
|
include_once __DIR__ . '/Debug.php'; |
27
|
|
|
|
28
|
|
|
/** |
29
|
|
|
* HTMLNode class |
30
|
|
|
* @property string $innertext |
31
|
|
|
* @property string|null $title |
32
|
|
|
* @property string|null $alt |
33
|
|
|
* @property string|null $src |
34
|
|
|
* @property string|null $href |
35
|
|
|
* @property string|null $async |
36
|
|
|
* @property string|null $defer |
37
|
|
|
*/ |
38
|
|
|
class HtmlNode |
39
|
|
|
{ |
40
|
|
|
const HDOM_TYPE_ELEMENT = 1; |
41
|
|
|
const HDOM_TYPE_COMMENT = 2; |
42
|
|
|
const HDOM_TYPE_TEXT = 3; |
43
|
|
|
const HDOM_TYPE_ROOT = 5; |
44
|
|
|
const HDOM_TYPE_UNKNOWN = 6; |
45
|
|
|
const HDOM_TYPE_CDATA = 7; |
46
|
|
|
|
47
|
|
|
const HDOM_QUOTE_DOUBLE = 0; |
48
|
|
|
const HDOM_QUOTE_SINGLE = 1; |
49
|
|
|
const HDOM_QUOTE_NO = 3; |
50
|
|
|
|
51
|
|
|
const HDOM_INFO_BEGIN = 0; |
52
|
|
|
const HDOM_INFO_END = 1; |
53
|
|
|
const HDOM_INFO_QUOTE = 2; |
54
|
|
|
const HDOM_INFO_SPACE = 3; |
55
|
|
|
const HDOM_INFO_TEXT = 4; |
56
|
|
|
const HDOM_INFO_INNER = 5; |
57
|
|
|
const HDOM_INFO_OUTER = 6; |
58
|
|
|
const HDOM_INFO_ENDSPACE = 7; |
59
|
|
|
|
60
|
|
|
public $nodetype = self::HDOM_TYPE_TEXT; |
61
|
|
|
public $tag = 'text'; |
62
|
|
|
public $attr = []; |
63
|
|
|
public $children = []; |
64
|
|
|
public $nodes = []; |
65
|
|
|
public $parent = null; |
66
|
|
|
public $_ = []; |
67
|
|
|
private $dom = null; |
68
|
|
|
|
69
|
|
|
/**
 * Forwards calls to deprecated lower_case method names to their camelCase
 * replacements.
 *
 * A deprecation message is written through Debug::log() before the call is
 * forwarded. Any other method name raises an E_USER_ERROR.
 *
 * @param string $func Name of the method that was called.
 * @param array  $args Arguments that were passed to the method.
 * @return mixed Result of the forwarded call; null for an unknown method if
 *               an error handler lets execution continue.
 */
public function __call($func, $args)
{
    // Allow users to call methods with lower_case syntax
    $aliases = [
        'children' => 'childNodes',
        'first_child' => 'firstChild',
        'has_child' => 'hasChildNodes',
        'last_child' => 'lastChild',
        'next_sibling' => 'nextSibling',
        'prev_sibling' => 'previousSibling',
    ];

    if (!isset($aliases[$func])) {
        trigger_error(
            'Call to undefined method ' . __CLASS__ . '::' . $func . '()',
            E_USER_ERROR
        );

        // Only reached when a custom error handler resumes execution. There
        // is no method to forward to, so stop here instead of continuing
        // with an undefined replacement name.
        return null;
    }

    $actual_function = $aliases[$func];

    // phpcs:ignore Generic.Files.LineLength
    Debug::log(__CLASS__ . '->' . $func . '() has been deprecated and will be removed in the next major version of simplehtmldom. Use ' . __CLASS__ . '->' . $actual_function . '() instead.');

    return call_user_func_array([$this, $actual_function], $args);
}
103
|
|
|
|
104
|
|
|
/**
 * Creates a node and, when a document is given, registers the node with it.
 *
 * @param object|null $dom Document the node belongs to. The node is appended
 *                         to its `nodes` list. Passing null creates a node
 *                         that is not linked to any document.
 */
public function __construct($dom)
{
    // A constructor has no meaningful return value, so nothing is returned
    // for the detached (null) case either.
    if (null !== $dom) {
        $this->dom = $dom;
        $dom->nodes[] = $this;
    }
}
113
|
|
|
|
114
|
|
|
/**
 * Provides a compact, human-readable view of the node for debug output.
 *
 * @return array Node type (symbolic name plus numeric value), tag,
 *               attributes and nodes. Empty attribute and node lists are
 *               reported as the string 'none'.
 */
public function __debugInfo()
{
    // Translate node type to human-readable form
    switch ($this->nodetype) {
        case self::HDOM_TYPE_ELEMENT:
            $type_name = 'HDOM_TYPE_ELEMENT';
            break;
        case self::HDOM_TYPE_COMMENT:
            $type_name = 'HDOM_TYPE_COMMENT';
            break;
        case self::HDOM_TYPE_TEXT:
            $type_name = 'HDOM_TYPE_TEXT';
            break;
        case self::HDOM_TYPE_ROOT:
            $type_name = 'HDOM_TYPE_ROOT';
            break;
        case self::HDOM_TYPE_CDATA:
            $type_name = 'HDOM_TYPE_CDATA';
            break;
        default:
            // Covers HDOM_TYPE_UNKNOWN as well as any unexpected value
            $type_name = 'HDOM_TYPE_UNKNOWN';
    }

    $info = [];
    $info['nodetype'] = $type_name . ' (' . $this->nodetype . ')';
    $info['tag'] = $this->tag;
    $info['attributes'] = !empty($this->attr) ? $this->attr : 'none';
    $info['nodes'] = !empty($this->nodes) ? $this->nodes : 'none';

    return $info;
}
145
|
|
|
|
146
|
|
|
/**
 * Renders the node when it is used in a string context.
 *
 * @return mixed Whatever outertext() returns for this node.
 */
public function __toString()
{
    $markup = $this->outertext();

    return $markup;
}
150
|
|
|
|
151
|
|
|
/**
 * Removes the references this node holds to its document and to its parent.
 */
public function clear()
{
    // Break link to origin (the document)
    unset($this->dom);

    // Break link to branch (the parent node)
    unset($this->parent);
}
156
|
|
|
|
157
|
|
|
/**
 * Prints the tag of this node and, recursively, of every entry in its
 * `nodes` list, one node per line.
 *
 * @param bool $show_attr Whether the attributes are printed after the tag.
 * @param int  $depth     Nesting level; one tab is printed per level.
 * @codeCoverageIgnore
 */
public function dump($show_attr = true, $depth = 0)
{
    $line = str_repeat("\t", $depth) . $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        $line .= '(';
        foreach ($this->attr as $name => $value) {
            $line .= "[$name]=>\"$value\", ";
        }
        $line .= ')';
    }

    echo $line . "\n";

    // Nothing below this node
    if (!$this->nodes) {
        return;
    }

    foreach ($this->nodes as $child) {
        $child->dump($show_attr, $depth + 1);
    }
}
178
|
|
|
|
179
|
|
|
/**
 * Builds a one-line debug summary of this node: tag, attributes, the
 * contents of the internal `$_` info array, inner info and the number of
 * children and nodes.
 *
 * @param bool $echo When true the summary is printed and nothing is
 *                   returned; when false the summary is returned.
 * @return string|null The summary when $echo is false, otherwise null.
 * @codeCoverageIgnore
 */
public function dump_node($echo = true)
{
    $string = $this->tag;

    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= "[$k]=>\"$v\", ";
        }
        $string .= ')';
    }

    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                // One level of nested info is expanded inline
                $string .= "[$k]=>(";
                foreach ($v as $k2 => $v2) {
                    $string .= "[$k2]=>\"$v2\", ";
                }
                $string .= ')';
            } else {
                $string .= "[$k]=>\"$v\", ";
            }
        }
        $string .= ')';
    }

    // NOTE(review): "text" is not a declared property in the visible part of
    // the class; presumably resolved through magic accessors - confirm.
    if (isset($this->text)) {
        $string .= " text: ({$this->text})";
    }

    // Report the inner info of this node. The check used to test a variable
    // that is never defined in this method, so the value was never printed.
    $string .= ' HDOM_INNER_INFO: ';
    if (isset($this->_[self::HDOM_INFO_INNER])) {
        $string .= "'" . $this->_[self::HDOM_INFO_INNER] . "'";
    } else {
        $string .= ' NULL ';
    }

    $string .= ' children: ' . count($this->children);
    $string .= ' nodes: ' . count($this->nodes);
    $string .= "\n";

    if ($echo) {
        echo $string;

        return;
    }

    return $string;
}
236
|
|
|
|
237
|
|
|
/**
 * Returns the parent node, optionally assigning a new parent first.
 *
 * When $parent is given, this node is appended to the `nodes` and
 * `children` lists of the new parent.
 *
 * @param object|null $parent New parent node, or null to only read.
 * @return object|null The current parent of this node.
 */
public function parent($parent = null)
{
    // Read-only access
    if (null === $parent) {
        return $this->parent;
    }

    // I am SURE that this doesn't work properly.
    // It fails to unset the current node from its current parent's nodes or
    // children list first.
    $this->parent = $parent;
    $this->parent->nodes[] = $this;
    $this->parent->children[] = $this;

    return $this->parent;
}
250
|
|
|
|
251
|
|
|
/**
 * Walks up the parent chain and returns the closest ancestor with the given
 * tag.
 *
 * @param string $tag Tag name to look for (compared with ===).
 * @return object|null The matching ancestor, or null when there is none.
 */
public function find_ancestor_tag($tag)
{
    for ($candidate = $this->parent; null !== $candidate; $candidate = $candidate->parent) {
        if ($candidate->tag === $tag) {
            return $candidate;
        }
    }

    // Reached the top of the tree without a match
    return null;
}
269
|
|
|
|
270
|
|
|
/**
 * Returns the inner markup of this node.
 *
 * Starts with the stored inner info, or the stored text info when there is
 * no inner info, appends the outer text of every entry in `nodes` and
 * passes the result through convert_text().
 *
 * @return mixed Result of convert_text() for the assembled markup.
 */
public function innertext()
{
    $html = '';

    // Inner info wins over text info; the first one present is used
    foreach ([self::HDOM_INFO_INNER, self::HDOM_INFO_TEXT] as $slot) {
        if (isset($this->_[$slot])) {
            $html = $this->_[$slot];
            break;
        }
    }

    foreach ($this->nodes as $child) {
        $html .= $child->outertext();
    }

    return $this->convert_text($html);
}
286
|
|
|
|
287
|
|
|
/**
 * Returns the markup of this node including its own tags.
 *
 * The root node renders as its inner text. For any other node the document
 * callback (if set) is invoked first. Stored outer info or text info is then
 * returned directly; otherwise the markup is assembled from the start tag,
 * inner info, nodes and end tag.
 *
 * @return mixed Result of convert_text() (or innertext() for the root).
 */
public function outertext()
{
    if ('root' === $this->tag) {
        return $this->innertext();
    }

    // todo: What is the use of this callback? Remove?
    if ($this->dom && null !== $this->dom->callback) {
        call_user_func_array($this->dom->callback, [$this]);
    }

    // Stored outer info, then stored text info, short-circuit the assembly
    foreach ([self::HDOM_INFO_OUTER, self::HDOM_INFO_TEXT] as $slot) {
        if (isset($this->_[$slot])) {
            return $this->convert_text($this->_[$slot]);
        }
    }

    // Start tag (only for nodes that have one)
    $html = isset($this->_[self::HDOM_INFO_BEGIN]) ? $this->makeup() : '';

    // todo: <br> should either never have self::HDOM_INFO_INNER or always
    if (isset($this->_[self::HDOM_INFO_INNER]) && 'br' !== $this->tag) {
        $html .= $this->_[self::HDOM_INFO_INNER];
    }

    if ($this->nodes) {
        foreach ($this->nodes as $child) {
            $html .= $child->outertext();
        }
    }

    // End tag is emitted only when a non-zero end info is recorded
    if (isset($this->_[self::HDOM_INFO_END]) && 0 != $this->_[self::HDOM_INFO_END]) {
        $html .= '</' . $this->tag . '>';
    }

    return $this->convert_text($html);
}
331
|
|
|
|
332
|
|
|
/**
 * Tells whether the tag of the given node is in the list of block level
 * elements. The tag is compared case-insensitively.
 *
 * @see https://www.w3resource.com/html/HTML-block-level-and-inline-elements.php
 *
 * @param object $node Node whose `tag` is checked.
 * @return bool True for a block level element.
 */
protected function is_block_element($node)
{
    // todo: When we have the utility class this should be moved there
    static $block_tags = [
        'p' => true,
        'h1' => true, 'h2' => true, 'h3' => true,
        'h4' => true, 'h5' => true, 'h6' => true,
        'ol' => true, 'ul' => true,
        'pre' => true,
        'address' => true,
        'blockquote' => true,
        'dl' => true,
        'div' => true,
        'fieldset' => true,
        'form' => true,
        'hr' => true,
        'noscript' => true,
        'table' => true,
    ];

    return isset($block_tags[strtolower($node->tag)]);
}
356
|
|
|
|
357
|
|
|
/**
 * Tells whether the tag of the given node is in the list of inline level
 * elements. The tag is compared case-insensitively.
 *
 * @see https://www.w3resource.com/html/HTML-block-level-and-inline-elements.php
 *
 * @param object $node Node whose `tag` is checked.
 * @return bool True for an inline level element.
 */
protected function is_inline_element($node)
{
    // todo: When we have the utility class this should be moved there
    static $inline_tags = [
        'b' => true, 'big' => true, 'i' => true, 'small' => true, 'tt' => true,
        'abbr' => true, 'acronym' => true, 'cite' => true, 'code' => true,
        'dfn' => true, 'em' => true, 'kbd' => true, 'strong' => true,
        'samp' => true, 'var' => true,
        'a' => true, 'bdo' => true, 'br' => true, 'img' => true, 'map' => true,
        'object' => true, 'q' => true, 'script' => true, 'span' => true,
        'sub' => true, 'sup' => true,
        'button' => true, 'input' => true, 'label' => true, 'select' => true,
        'textarea' => true,
    ];

    return isset($inline_tags[strtolower($node->tag)]);
}
372
|
|
|
|
373
|
|
|
/**
 * Returns the plain text of this node and of the nodes below it.
 *
 * Script, style, comment and unknown nodes contribute no text of their own.
 * Text of block level children is separated from preceding text by a blank
 * line, <br> is replaced by the configured line break text, and inline
 * children that contain only whitespace are skipped.
 *
 * @param bool $trim When true, whitespace at the start and end is removed;
 *                   when false it is reduced to a single space.
 * @return string|null The extracted text (null only if preg_replace() fails).
 */
public function text($trim = true)
{
    $ret = '';

    $tag = strtolower($this->tag);

    if ('script' === $tag || 'style' === $tag) {
        $ret = '';
    } elseif (self::HDOM_TYPE_COMMENT === $this->nodetype) {
        $ret = '';
    } elseif (self::HDOM_TYPE_CDATA === $this->nodetype) {
        // Guarded so a CDATA node without inner info yields no text instead
        // of reading an undefined index
        if (isset($this->_[self::HDOM_INFO_INNER])) {
            $ret = $this->_[self::HDOM_INFO_INNER];
        }
    } elseif (self::HDOM_TYPE_UNKNOWN === $this->nodetype) {
        $ret = '';
    } elseif (isset($this->_[self::HDOM_INFO_INNER])) {
        $ret = $this->_[self::HDOM_INFO_INNER];
    } elseif (self::HDOM_TYPE_TEXT === $this->nodetype) {
        if (isset($this->_[self::HDOM_INFO_TEXT])) {
            $ret = $this->_[self::HDOM_INFO_TEXT];
        }
    }

    if (is_null($this->nodes)) {
        return '';
    }

    foreach ($this->nodes as $n) {
        if ($this->is_block_element($n)) {
            $block = ltrim($this->convert_text($n->text(false)));

            // Compare against '' explicitly: empty() would also discard a
            // block whose whole text is "0"
            if ('' === $block) {
                continue;
            }

            $ret = rtrim($ret) . "\n\n" . $block;
        } elseif ($this->is_inline_element($n)) {
            // todo: <br> introduces code smell because no space but \n
            if ('br' === strtolower($n->tag)) {
                $ret .= $this->dom->default_br_text ?: DEFAULT_BR_TEXT;
            } else {
                // Render the child once and reuse the result; rendering it
                // twice doubles the work at every nesting level
                $inline = $this->convert_text($n->text(false));

                // Skip inline elements that contain only whitespace ("0" is
                // real content and must be kept)
                if ('' === ltrim($inline)) {
                    continue;
                }

                $ret .= $inline;
            }
        } else {
            $ret .= $this->convert_text($n->text(false));
        }
    }

    // Reduce whitespace at start/end to a single (or none) space.
    // With the "u" modifier \x{A0} is the no-break space; a separate \xC2
    // would match the letter U+00C2 and strip it from the text.
    $ret = preg_replace('/[ \t\n\r\0\x0B\x{A0}]+$/u', $trim ? '' : ' ', $ret);
    $ret = preg_replace('/^[ \t\n\r\0\x0B\x{A0}]+/u', $trim ? '' : ' ', $ret);

    return $ret;
}
430
|
|
|
|
431
|
|
|
/**
 * Returns the inner text with CDATA section markers removed.
 *
 * The opening marker is removed case-insensitively, the closing marker
 * case-sensitively.
 *
 * @return string Inner text without "<![CDATA[" and "]]>".
 */
public function xmltext()
{
    $without_opening = str_ireplace('<![CDATA[', '', $this->innertext());

    return str_replace(']]>', '', $without_opening);
}
439
|
|
|
|
440
|
|
|
/**
 * Builds the start tag of this node, including its attributes.
 *
 * Nodes that carry text info return that text unchanged. Attributes set to
 * null or false are omitted, attributes set to true are written without a
 * value. Recorded quote style and spacing are reused where available;
 * quoted values are passed through htmlentities().
 *
 * @return string The start tag, or the stored text info.
 */
public function makeup()
{
    // text, comment, unknown
    if (isset($this->_[self::HDOM_INFO_TEXT])) {
        return $this->_[self::HDOM_INFO_TEXT];
    }

    $html = '<' . $this->tag;

    foreach ($this->attr as $name => $value) {
        // skip removed attribute
        if (null === $value || false === $value) {
            continue;
        }

        // Recorded spacing: [0] before the name, [1] before '=', [2] after '='
        $has_spacing = isset($this->_[self::HDOM_INFO_SPACE][$name]);

        $html .= $has_spacing ? $this->_[self::HDOM_INFO_SPACE][$name][0] : ' ';

        //no value attr: nowrap, checked selected...
        if (true === $value) {
            $html .= $name;
            continue;
        }

        $quote_type = isset($this->_[self::HDOM_INFO_QUOTE][$name])
            ? $this->_[self::HDOM_INFO_QUOTE][$name]
            : self::HDOM_QUOTE_DOUBLE;

        switch ($quote_type) {
            case self::HDOM_QUOTE_SINGLE:
                $quote = '\'';
                $value = htmlentities($value, ENT_QUOTES, $this->dom->target_charset);
                break;
            case self::HDOM_QUOTE_NO:
                // Unquoted values are written as they are
                $quote = '';
                break;
            case self::HDOM_QUOTE_DOUBLE:
            default:
                $quote = '"';
                $value = htmlentities($value, ENT_COMPAT, $this->dom->target_charset);
        }

        $before_equals = $has_spacing ? $this->_[self::HDOM_INFO_SPACE][$name][1] : '';
        $after_equals = $has_spacing ? $this->_[self::HDOM_INFO_SPACE][$name][2] : '';

        $html .= $name . $before_equals . '=' . $after_equals . $quote . $value . $quote;
    }

    // Whitespace recorded in front of the closing '>'
    if (isset($this->_[self::HDOM_INFO_ENDSPACE])) {
        $html .= $this->_[self::HDOM_INFO_ENDSPACE];
    }

    return $html . '>';
}
501
|
|
|
|
502
|
|
|
/**
 * Element selector
 *
 * Finds the nodes below this node that match the selector. Every selector
 * in the parsed list is evaluated level by level with seek(); the matches
 * of all selectors are merged and ordered by their key in the document's
 * node list.
 *
 * @param string   $selector  Selector string.
 * @param int|null $idx       Null to get all matches; otherwise the position
 *                            of the match to return (negative values count
 *                            from the end).
 * @param boolean  $lowercase Passed on to seek().
 * @return HtmlNode[]|HtmlNode|null List of matches when $idx is null (empty
 *                            for an empty selector or a node without start
 *                            tag); otherwise the selected match or null.
 */
public function find($selector, $idx = null, $lowercase = false)
{
    $selectors = $this->parse_selector($selector);
    $count = count($selectors);

    if (0 === $count) {
        return [];
    }

    $found_keys = [];

    // find each selector
    for ($c = 0; $c < $count; ++$c) {
        // The change on the below line was documented on the sourceforge
        // code tracker id 2788009
        // used to be: if (($levle=count($selectors[0]))===0) return array();
        $levels = count($selectors[$c]);

        if (0 === $levels) {
            Debug::log_once('Empty selector (' . $selector . ') matches nothing.');

            return [];
        }

        if (!isset($this->_[self::HDOM_INFO_BEGIN])) {
            Debug::log_once('Invalid operation. The current node has no start tag.');

            return [];
        }

        // The search starts at this node
        $head = [$this->_[self::HDOM_INFO_BEGIN] => 1];
        $cmd = ' '; // Combinator

        // handle descendant selectors, no recursive!
        for ($l = 0; $l < $levels; ++$l) {
            $ret = [];

            foreach ($head as $key => $unused) {
                $start = (-1 === $key) ? $this->dom->root : $this->dom->nodes[$key];
                //PaperG - Pass this optional parameter on to the seek function.
                $start->seek($selectors[$c][$l], $ret, $cmd, $lowercase);
            }

            // Matches of this level are the starting points of the next one
            $head = $ret;
            $cmd = $selectors[$c][$l][6]; // Next Combinator
        }

        foreach ($head as $key => $unused) {
            $found_keys[$key] = 1;
        }
    }

    // sort keys
    ksort($found_keys);

    $found = [];
    foreach ($found_keys as $key => $unused) {
        $found[] = $this->dom->nodes[$key];
    }

    // return nth-element or array
    if (is_null($idx)) {
        return $found;
    }

    if ($idx < 0) {
        $idx += count($found);
    }

    return isset($found[$idx]) ? $found[$idx] : null;
}
576
|
|
|
|
577
|
|
|
/**
 * Same as find(), except that a falsy result (for example an empty list of
 * matches) is returned as null.
 *
 * @param string   $selector  Selector string.
 * @param int|null $idx       See find().
 * @param boolean  $lowercase See find().
 * @return mixed The result of find(), or null when that result is falsy.
 */
public function expect($selector, $idx = null, $lowercase = false)
{
    $result = $this->find($selector, $idx, $lowercase);

    return $result ? $result : null;
}
581
|
|
|
|
582
|
|
|
/**
 * Collects the nodes that match one parsed selector, relative to this node.
 *
 * The combinator decides which nodes are candidates: descendants (' '),
 * children ('>'), the next sibling ('+') or siblings from this node onward
 * ('~'). Each candidate is then checked against the tag, id, classes and
 * attributes of the selector.
 *
 * @param array   $selector   Parsed selector: pseudo selector, tag, pseudo
 *                            element, id, classes, attributes, combinator.
 * @param array   $ret        Result set, passed by reference. Matches are
 *                            added as keys (the node's HDOM_INFO_BEGIN
 *                            value) with the value 1.
 * @param string  $parent_cmd Combinator that links this node to the
 *                            candidates: ' ', '>', '+' or '~'.
 * @param boolean $lowercase  When true, class names of the node and the
 *                            attribute pattern and value are lowercased
 *                            before they are compared.
 * @return void
 */
protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
{
    list($ps_selector, $tag, $ps_element, $id, $class, $attributes, $cmb) = $selector;
    $nodes = [];

    if (' ' === $parent_cmd) { // Descendant Combinator
        // Find parent closing tag if the current element doesn't have a closing
        // tag (i.e. void element)
        $end = (!empty($this->_[self::HDOM_INFO_END])) ? $this->_[self::HDOM_INFO_END] : 0;
        if (0 == $end && $this->parent) {
            $parent = $this->parent;
            while (null !== $parent && !isset($parent->_[self::HDOM_INFO_END])) {
                --$end;
                $parent = $parent->parent;
            }

            // The loop also ends when no ancestor has a closing tag; there
            // is nothing to add in that case
            if (null !== $parent) {
                $end += $parent->_[self::HDOM_INFO_END];
            }
        }

        if (0 === $end) {
            $end = count($this->dom->nodes);
        }

        // Get list of target nodes
        $nodes_start = $this->_[self::HDOM_INFO_BEGIN] + 1;

        // remove() makes $this->dom->nodes non-contiguous; use what is left.
        $nodes = array_intersect_key(
            $this->dom->nodes,
            array_flip(range($nodes_start, $end))
        );
    } elseif ('>' === $parent_cmd) { // Child Combinator
        $nodes = $this->children;
    } elseif (
        '+' === $parent_cmd
        && $this->parent
        // Strict: nodes are compared by identity, not by comparing their
        // (cyclic) object graphs value by value
        && in_array($this, $this->parent->children, true)
    ) { // Next-Sibling Combinator
        $index = array_search($this, $this->parent->children, true) + 1;
        if ($index < count($this->parent->children)) {
            $nodes[] = $this->parent->children[$index];
        }
    } elseif (
        '~' === $parent_cmd
        && $this->parent
        && in_array($this, $this->parent->children, true)
    ) { // Subsequent Sibling Combinator
        $index = array_search($this, $this->parent->children, true);
        // NOTE(review): the slice starts at this node itself, so the node
        // can match its own '~' selector - confirm whether $index + 1 is
        // intended.
        $nodes = array_slice($this->parent->children, $index);
    }

    // Go through each element starting at this element until the end tag
    // Note: If this element is a void tag, any previous void element is
    // skipped.
    foreach ($nodes as $node) {
        $pass = true;

        // Skip root nodes
        if (!$node->parent) {
            continue;
        }

        // Handle 'text' selector
        if ($pass && 'text' === $tag) {
            if ('text' === $node->tag) {
                $ret[array_search($node, $this->dom->nodes, true)] = 1;
            }

            if (isset($node->_[self::HDOM_INFO_INNER])) {
                $ret[$node->_[self::HDOM_INFO_BEGIN]] = 1;
            }

            continue;
        }

        // Handle 'cdata' selector
        if ($pass && 'cdata' === $tag) {
            if ('cdata' === $node->tag) {
                $ret[$node->_[self::HDOM_INFO_BEGIN]] = 1;
            }

            continue;
        }

        // Handle 'comment'
        if ($pass && 'comment' === $tag && 'comment' === $node->tag) {
            $ret[$node->_[self::HDOM_INFO_BEGIN]] = 1;
            continue;
        }

        // Skip if node isn't a child node (i.e. text nodes)
        if ($pass && !in_array($node, $node->parent->children, true)) {
            continue;
        }

        // Skip if tag doesn't match
        if ($pass && '' !== $tag && $tag !== $node->tag && '*' !== $tag) {
            $pass = false;
        }

        // Skip if ID doesn't exist
        if ($pass && '' !== $id && !isset($node->attr['id'])) {
            $pass = false;
        }

        // Check if ID matches
        if ($pass && '' !== $id && isset($node->attr['id'])) {
            // Note: Only consider the first ID (as browsers do)
            $node_id = explode(' ', trim($node->attr['id']))[0];

            if ($id !== $node_id) {
                $pass = false;
            }
        }

        // Check if all class(es) exist
        if ($pass && '' !== $class && is_array($class) && !empty($class)) {
            if (isset($node->attr['class'])) {
                // Apply the same rules for the pattern and attribute value
                // Attribute values must not contain control characters other than space
                // https://www.w3.org/TR/html/dom.html#text-content
                // https://www.w3.org/TR/html/syntax.html#attribute-values
                // https://www.w3.org/TR/xml/#AVNormalize
                $node_classes = preg_replace("/[\r\n\t\s]+/u", ' ', $node->attr['class']);
                $node_classes = trim($node_classes);
                $node_classes = explode(' ', $node_classes);

                if ($lowercase) {
                    $node_classes = array_map('strtolower', $node_classes);
                }

                foreach ($class as $c) {
                    // Strict: class names are plain strings; a loose
                    // comparison treats numeric-looking names such as
                    // "1e1" and "10" as equal
                    if (!in_array($c, $node_classes, true)) {
                        $pass = false;
                        break;
                    }
                }
            } else {
                $pass = false;
            }
        }

        // Check attributes
        if (
            $pass
            && '' !== $attributes
            && is_array($attributes)
            && !empty($attributes)
        ) {
            foreach ($attributes as $a) {
                list(
                    $att_name,
                    $att_expr,
                    $att_val,
                    $att_inv,
                    $att_case_sensitivity
                ) = $a;

                // Handle indexing attributes (i.e. "[2]")
                /*
                 * Note: This is not supported by the CSS Standard but adds
                 * the ability to select items compatible to XPath (i.e.
                 * the 3rd element within its parent).
                 *
                 * Note: This doesn't conflict with the CSS Standard which
                 * doesn't work on numeric attributes anyway.
                 */
                if (
                    is_numeric($att_name)
                    && '' === $att_expr
                    && '' === $att_val
                ) {
                    $count = 0;

                    // Find index of current element in parent
                    foreach ($node->parent->children as $c) {
                        if ($c->tag === $node->tag) {
                            ++$count;
                        }
                        if ($c === $node) {
                            break;
                        }
                    }

                    // If this is the correct node, continue with next
                    // attribute
                    if ($count === (int) $att_name) {
                        continue;
                    }
                }

                // Check attribute availability
                if ($att_inv) { // Attribute should NOT be set
                    if (isset($node->attr[$att_name])) {
                        $pass = false;
                        break;
                    }
                } else { // Attribute should be set
                    // todo: "plaintext" is not a valid CSS selector!
                    if (
                        'plaintext' !== $att_name
                        && !isset($node->attr[$att_name])
                    ) {
                        $pass = false;
                        break;
                    }
                }

                // Continue with next attribute if expression isn't defined
                if ('' === $att_expr) {
                    continue;
                }

                // If they have told us that this is a "plaintext"
                // search then we want the plaintext of the node - right?
                // todo "plaintext" is not a valid CSS selector!
                if ('plaintext' === $att_name) {
                    $nodeKeyValue = $node->text();
                } else {
                    $nodeKeyValue = $node->attr[$att_name];
                }

                // If lowercase is set, do a case insensitive test of
                // the value of the selector.
                if ($lowercase) {
                    $check = $this->match(
                        $att_expr,
                        strtolower($att_val),
                        strtolower($nodeKeyValue),
                        $att_case_sensitivity
                    );
                } else {
                    $check = $this->match(
                        $att_expr,
                        $att_val,
                        $nodeKeyValue,
                        $att_case_sensitivity
                    );
                }

                $check = 'not' === $ps_element ? !$check : $check;

                if (!$check) {
                    $pass = false;
                    break;
                }
            }
        }

        // Found a match. Add to list
        $pass = 'not' === $ps_selector ? !$pass : $pass;
        if ($pass) {
            $ret[$node->_[self::HDOM_INFO_BEGIN]] = 1;
        }
    }
}
843
|
|
|
|
844
|
|
|
/**
 * Tests an attribute value against a pattern with an attribute selector
 * operator.
 *
 * Pattern and value are normalized first: runs of whitespace become a
 * single space and both are trimmed. With case sensitivity 'i' both are
 * lowercased as well.
 *
 * @param string $exp              Operator: '=', '!=', '^=', '$=', '*=',
 *                                 '|=' or '~='.
 * @param string $pattern          Value given in the selector.
 * @param string $value            Value of the attribute on the node.
 * @param string $case_sensitivity 'i' for a case-insensitive comparison.
 * @return bool|int Truthy when the value matches. The regex based operators
 *                  return the result of preg_match(). Unknown operators are
 *                  logged and return false.
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
    if ('i' === $case_sensitivity) {
        $pattern = strtolower($pattern);
        $value = strtolower($value);
    }

    // Apply the same rules for the pattern and attribute value
    // Attribute values must not contain control characters other than space
    // https://www.w3.org/TR/html/dom.html#text-content
    // https://www.w3.org/TR/html/syntax.html#attribute-values
    // https://www.w3.org/TR/xml/#AVNormalize
    $pattern = preg_replace("/[\r\n\t\s]+/u", ' ', $pattern);
    $pattern = trim($pattern);

    $value = preg_replace("/[\r\n\t\s]+/u", ' ', $value);
    $value = trim($value);

    switch ($exp) {
        case '=':
            return $value === $pattern;
        case '!=':
            return $value !== $pattern;
        case '^=':
            return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
        case '$=':
            return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
        case '*=':
            return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
        case '|=':
            /*
             * [att|=val]
             *
             * Represents an element with the att attribute, its value
             * either being exactly "val" or beginning with "val"
             * immediately followed by "-" (U+002D).
             */
            // A plain prefix test would also accept e.g. "english" for "en"
            return $value === $pattern || 0 === strpos($value, $pattern . '-');
        case '~=':
            /*
             * [att~=val]
             *
             * Represents an element with the att attribute whose value is a
             * whitespace-separated list of words, one of which is exactly
             * "val". If "val" contains whitespace, it will never represent
             * anything (since the words are separated by spaces). Also if
             * "val" is the empty string, it will never represent anything.
             */
            // Without the explicit check an empty pattern would match an
            // empty value, because explode() yields one empty word
            return '' !== $pattern
                && in_array($pattern, explode(' ', trim($value)), true);
    }

    Debug::log('Unhandled attribute selector: ' . $exp . '!');

    return false;
}
899
|
|
|
|
900
|
|
|
protected function parse_selector($selector_string) |
901
|
|
|
{ |
902
|
|
|
/** |
903
|
|
|
* Pattern of CSS selectors, modified from mootools (https://mootools.net/). |
904
|
|
|
* |
905
|
|
|
* Paperg: Add the colon to the attribute, so that it properly finds |
906
|
|
|
* <tag attr:ibute="something" > like google does. |
907
|
|
|
* |
908
|
|
|
* Note: if you try to look at this attribute, you MUST use getAttribute |
909
|
|
|
* since $dom->x:y will fail the php syntax check. |
910
|
|
|
* |
911
|
|
|
* Notice the \[ starting the attribute? and the @? following? This |
912
|
|
|
* implies that an attribute can begin with an @ sign that is not |
913
|
|
|
* captured. This implies that an html attribute specifier may start |
914
|
|
|
* with an @ sign that is NOT captured by the expression. Farther study |
915
|
|
|
* is required to determine of this should be documented or removed. |
916
|
|
|
* |
917
|
|
|
* Matches selectors in this order: |
918
|
|
|
* |
919
|
|
|
* [0] - full match |
920
|
|
|
* |
921
|
|
|
* [1] - pseudo selector |
922
|
|
|
* (?:\:(\w+)\()? |
923
|
|
|
* Matches the pseudo selector (optional) |
924
|
|
|
* |
925
|
|
|
* [2] - tag name |
926
|
|
|
* ([\w:\*-]*) |
927
|
|
|
* Matches the tag name consisting of zero or more words, colons, |
928
|
|
|
* asterisks and hyphens. |
929
|
|
|
* |
930
|
|
|
* [3] - pseudo selector |
931
|
|
|
* (?:\:(\w+)\()? |
932
|
|
|
* Matches the pseudo selector (optional) |
933
|
|
|
* |
934
|
|
|
* [4] - id name |
935
|
|
|
* (?:\#([\w-]+)) |
936
|
|
|
* Optionally matches a id name, consisting of an "#" followed by |
937
|
|
|
* the id name (one or more words and hyphens). |
938
|
|
|
* |
939
|
|
|
* [5] - class names (including dots) |
940
|
|
|
* (?:\.([\w\.-]+))? |
941
|
|
|
* Optionally matches a list of classs, consisting of an "." |
942
|
|
|
* followed by the class name (one or more words and hyphens) |
943
|
|
|
* where multiple classes can be chained (i.e. ".foo.bar.baz") |
944
|
|
|
* |
945
|
|
|
* [6] - attributes |
946
|
|
|
* ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)? |
947
|
|
|
* Optionally matches the attributes list |
948
|
|
|
* |
949
|
|
|
* [7] - separator |
950
|
|
|
* ([\/, >+~]+) |
951
|
|
|
* Matches the selector list separator |
952
|
|
|
*/ |
953
|
|
|
// phpcs:ignore Generic.Files.LineLength |
954
|
|
|
$pattern = "/(?:\:(\w+)\()?([\w:\*-]*)(?:\:(\w+)\()?(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?(?:\))?(?:\))?([\/, >+~]+)/is"; |
955
|
|
|
|
956
|
|
|
preg_match_all( |
957
|
|
|
$pattern, |
958
|
|
|
trim($selector_string) . ' ', // Add final ' ' as pseudo separator |
959
|
|
|
$matches, |
960
|
|
|
PREG_SET_ORDER |
961
|
|
|
); |
962
|
|
|
|
963
|
|
|
$selectors = []; |
964
|
|
|
$result = []; |
965
|
|
|
|
966
|
|
|
foreach ($matches as $m) { |
967
|
|
|
$m[0] = trim($m[0]); |
968
|
|
|
|
969
|
|
|
// Skip NoOps |
970
|
|
|
if ('' === $m[0] || '/' === $m[0] || '//' === $m[0]) { |
971
|
|
|
continue; |
972
|
|
|
} |
973
|
|
|
|
974
|
|
|
array_shift($m); |
975
|
|
|
|
976
|
|
|
// Convert to lowercase |
977
|
|
|
if ($this->dom->lowercase) { |
978
|
|
|
$m[1] = strtolower($m[1]); |
979
|
|
|
} |
980
|
|
|
|
981
|
|
|
// Extract classes |
982
|
|
|
if ('' !== $m[4]) { |
983
|
|
|
$m[4] = explode('.', $m[4]); |
984
|
|
|
} |
985
|
|
|
|
986
|
|
|
/* Extract attributes (pattern based on the pattern above!) |
987
|
|
|
|
988
|
|
|
* [0] - full match |
989
|
|
|
* [1] - attribute name |
990
|
|
|
* [2] - attribute expression |
991
|
|
|
* [3] - attribute value |
992
|
|
|
* [4] - case sensitivity |
993
|
|
|
* |
994
|
|
|
* Note: Attributes can be negated with a "!" prefix to their name |
995
|
|
|
*/ |
996
|
|
|
if ('' !== $m[5]) { |
997
|
|
|
preg_match_all( |
998
|
|
|
"/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is", |
999
|
|
|
trim($m[5]), |
1000
|
|
|
$attributes, |
1001
|
|
|
PREG_SET_ORDER |
1002
|
|
|
); |
1003
|
|
|
|
1004
|
|
|
// Replace element by array |
1005
|
|
|
$m[5] = []; |
1006
|
|
|
|
1007
|
|
|
foreach ($attributes as $att) { |
1008
|
|
|
// Skip empty matches |
1009
|
|
|
if ('' === trim($att[0])) { |
1010
|
|
|
continue; |
1011
|
|
|
} |
1012
|
|
|
|
1013
|
|
|
$inverted = (isset($att[1][0]) && '!' === $att[1][0]); |
1014
|
|
|
$m[5][] = [ |
1015
|
|
|
$inverted ? substr($att[1], 1) : $att[1], // Name |
1016
|
|
|
(isset($att[2])) ? $att[2] : '', // Expression |
1017
|
|
|
(isset($att[3])) ? $att[3] : '', // Value |
1018
|
|
|
$inverted, // Inverted Flag |
1019
|
|
|
(isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity |
1020
|
|
|
]; |
1021
|
|
|
} |
1022
|
|
|
} |
1023
|
|
|
|
1024
|
|
|
// Sanitize Separator |
1025
|
|
|
if ('' !== $m[6] && '' === trim($m[6])) { // Descendant Separator |
1026
|
|
|
$m[6] = ' '; |
1027
|
|
|
} else { // Other Separator |
1028
|
|
|
$m[6] = trim($m[6]); |
1029
|
|
|
} |
1030
|
|
|
|
1031
|
|
|
// Clear Separator if it's a Selector List |
1032
|
|
|
if ($is_list = (',' === $m[6])) { |
1033
|
|
|
$m[6] = ''; |
1034
|
|
|
} |
1035
|
|
|
|
1036
|
|
|
$result[] = $m; |
1037
|
|
|
|
1038
|
|
|
if ($is_list) { // Selector List |
1039
|
|
|
$selectors[] = $result; |
1040
|
|
|
$result = []; |
1041
|
|
|
} |
1042
|
|
|
} |
1043
|
|
|
|
1044
|
|
|
if (count($result) > 0) { |
1045
|
|
|
$selectors[] = $result; |
1046
|
|
|
} |
1047
|
|
|
|
1048
|
|
|
return $selectors; |
1049
|
|
|
} |
1050
|
|
|
|
1051
|
|
|
/**
 * Magic getter for attributes and the virtual text properties.
 *
 * A stored attribute takes precedence; otherwise the names "outertext",
 * "innertext", "plaintext" and "xmltext" are mapped to the matching
 * accessor method.
 *
 * @param string $name Property name (PHP passes magic property names as strings).
 * @return mixed The attribute value run through convert_text(), the result
 *               of the mapped accessor, or false when nothing matches.
 */
public function __get($name)
{
    if (isset($this->attr[$name])) {
        return $this->convert_text($this->attr[$name]);
    }

    // Virtual property name => accessor method that produces its value.
    $accessors = [
        'outertext' => 'outertext',
        'innertext' => 'innertext',
        'plaintext' => 'text',
        'xmltext' => 'xmltext',
    ];

    if (isset($accessors[$name])) {
        $method = $accessors[$name];

        return $this->$method();
    }

    return false;
}
1070
|
|
|
|
1071
|
|
|
/**
 * Magic setter for attributes and the virtual text properties.
 *
 * "outertext" and "innertext" are written to the node's info array
 * ($this->_); every other name is stored as an attribute.
 *
 * @param string $name  Property name (PHP passes magic property names as strings).
 * @param mixed  $value Value to assign.
 * @return void
 */
public function __set($name, $value)
{
    if ('outertext' === $name) {
        $this->_[self::HDOM_INFO_OUTER] = $value;

        return;
    }

    if ('innertext' === $name) {
        // An existing text entry is blanked before the inner override is stored.
        if (isset($this->_[self::HDOM_INFO_TEXT])) {
            $this->_[self::HDOM_INFO_TEXT] = '';
        }

        $this->_[self::HDOM_INFO_INNER] = $value;

        return;
    }

    $this->attr[$name] = $value;
}
1087
|
|
|
|
1088
|
|
|
/**
 * Magic isset handler.
 *
 * The virtual properties "outertext", "innertext" and "plaintext" always
 * report as set; any other name is checked against the attribute list.
 *
 * @param string $name Property name (PHP passes magic property names as strings).
 * @return bool
 */
public function __isset($name)
{
    $alwaysAvailable = ['outertext', 'innertext', 'plaintext'];

    if (in_array($name, $alwaysAvailable, true)) {
        return true;
    }

    return isset($this->attr[$name]);
}
1101
|
|
|
|
1102
|
|
|
/**
 * Magic unset handler: removes the attribute named $name if it is set.
 *
 * @param string $name Attribute name.
 * @return void
 */
public function __unset($name)
{
    // isset() is false for null values, so such entries are left in place.
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
1108
|
|
|
|
1109
|
|
|
/**
 * Converts text from the document's source charset to its target charset.
 *
 * No conversion happens when the node has no DOM, when either charset is
 * empty, when both charsets are equal, or when the target is UTF-8 and the
 * text already passes is_utf8() (the detected source charset is then
 * assumed wrong and a debug message is logged once).
 *
 * If iconv() fails it returns false; in that case the unconverted input is
 * kept instead of handing false back to the caller as "text".
 *
 * For a UTF-8 target, a byte order mark at the start or end is stripped.
 *
 * @param string $text Text to convert.
 * @return string Converted text, or the input when no conversion applies or
 *                the conversion fails.
 */
public function convert_text($text)
{
    $sourceCharset = '';
    $targetCharset = '';

    if ($this->dom) {
        // Normalised to upper case once; all comparisons below rely on that.
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }

    $converted_text = $text;

    $needsConversion = !empty($sourceCharset)
        && !empty($targetCharset)
        && $sourceCharset !== $targetCharset;

    if ($needsConversion) {
        if ('UTF-8' === $targetCharset && self::is_utf8($text)) {
            Debug::log_once('The source charset was incorrectly detected as ' . $sourceCharset . ' but should have been UTF-8');
        } else {
            $iconvResult = iconv($sourceCharset, $targetCharset, $text);

            // iconv() signals failure with false; keep the original text then.
            if (false !== $iconvResult) {
                $converted_text = $iconvResult;
            }
        }
    }

    // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
    if ('UTF-8' === $targetCharset) {
        if ("\xef\xbb\xbf" === substr($converted_text, 0, 3)) {
            $converted_text = substr($converted_text, 3);
        }

        if ("\xef\xbb\xbf" === substr($converted_text, -3)) {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
1145
|
|
|
|
1146
|
|
|
/**
 * Structural check whether a byte string is UTF-8 encoded.
 *
 * Every byte >= 0x80 must be a lead byte followed by the number of
 * continuation bytes (0x80-0xBF) that the lead byte announces. A
 * continuation byte in lead position - including 0x80 itself - makes the
 * string invalid, as does a sequence that runs past the end of the string.
 *
 * Note: lead bytes announcing 5- and 6-byte sequences (0xF8-0xFD) are
 * still accepted, and overlong encodings are not detected.
 *
 * @param string $str Byte string to inspect.
 * @return bool True when the string is structurally valid (the empty string is).
 */
public static function is_utf8($str)
{
    $len = strlen($str);

    for ($i = 0; $i < $len; ++$i) {
        $c = ord($str[$i]);

        // 0x00-0x7F is plain ASCII. The bound must be 127: 0x80 (128) is a
        // continuation byte and is not valid on its own.
        if ($c <= 127) {
            continue;
        }

        // Total length of the sequence announced by the lead byte.
        if ($c >= 254) {
            return false;
        } elseif ($c >= 252) {
            $bits = 6;
        } elseif ($c >= 248) {
            $bits = 5;
        } elseif ($c >= 240) {
            $bits = 4;
        } elseif ($c >= 224) {
            $bits = 3;
        } elseif ($c >= 192) {
            $bits = 2;
        } else {
            // 0x80-0xBF: continuation byte without a lead byte.
            return false;
        }

        // The sequence would extend beyond the end of the string.
        if (($i + $bits) > $len) {
            return false;
        }

        // Consume the continuation bytes; each must be in 0x80-0xBF.
        while ($bits > 1) {
            ++$i;
            $b = ord($str[$i]);

            if ($b < 128 || $b > 191) {
                return false;
            }

            --$bits;
        }
    }

    return true;
}
1186
|
|
|
|
1187
|
|
|
/**
 * Determines the display size of an image node.
 *
 * The "width" / "height" attributes are used when present. A dimension
 * that is still unknown afterwards is taken from the inline "style"
 * attribute, but only when it is declared as a whole number of pixels
 * (e.g. "width: 120px"). Surrounding whitespace in the declaration value
 * is ignored and "0px" is accepted.
 *
 * Classes, ids and external style sheets are not consulted.
 *
 * @return array|false False when the node is not an "img" tag; otherwise
 *                     ['height' => ..., 'width' => ...], where a dimension
 *                     that could not be determined is -1.
 */
public function get_display_size()
{
    if ('img' !== $this->tag) {
        return false;
    }

    // Key order (height, width) is part of the returned shape.
    $size = [
        'height' => -1,
        'width' => -1,
    ];

    // See if there is a height or width attribute in the tag itself.
    foreach (['width', 'height'] as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style'])) {
        $declarations = [];

        // Thanks to user gnarf from stackoverflow for this regular expression.
        preg_match_all(
            '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
            $this->attr['style'],
            $matches,
            PREG_SET_ORDER
        );

        foreach ($matches as $match) {
            // [^;]+ is greedy and swallows whitespace before the ";", which
            // would otherwise break the "px" suffix check below.
            $declarations[$match[1]] = trim($match[2]);
        }

        foreach (['width', 'height'] as $dimension) {
            // Loose comparison kept on purpose: attribute values are strings.
            if (!isset($declarations[$dimension]) || -1 != $size[$dimension]) {
                continue;
            }

            $value = $declarations[$dimension];

            // Only pixel values are understood.
            if ('px' !== strtolower(substr($value, -2))) {
                continue;
            }

            $pixels = substr($value, 0, -2);

            // filter_var() returns int 0 for "0", so compare against false
            // explicitly instead of relying on truthiness.
            if (false !== filter_var($pixels, FILTER_VALIDATE_INT)) {
                $size[$dimension] = $pixels;
            }
        }
    }

    return $size;
}
1268
|
|
|
|
1269
|
|
|
/**
 * Returns the node's outer text and optionally writes it to a file.
 *
 * @param string $filepath Target file; nothing is written when this is ''.
 * @return mixed The value returned by outertext().
 */
public function save($filepath = '')
{
    $markup = $this->outertext();

    if ('' === $filepath) {
        return $markup;
    }

    // LOCK_EX: take an exclusive lock while writing.
    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
}
1279
|
|
|
|
1280
|
|
|
/**
 * Adds one or more class names to the node's "class" attribute.
 *
 * Names that are already present are skipped. Empty names - which
 * explode() produces for '' or for consecutive spaces - are ignored so
 * they neither create an empty class attribute nor add stray spaces.
 *
 * @param string|array $class Space separated class names, or a list of
 *                            class names. Other types are ignored.
 * @return void
 */
public function addClass($class)
{
    if (is_string($class)) {
        $class = explode(' ', $class);
    }

    if (!is_array($class)) {
        return;
    }

    foreach ($class as $c) {
        if ('' === $c) {
            continue;
        }

        if (!isset($this->class)) {
            // First class on this node: create the attribute.
            $this->class = $c;

            continue;
        }

        if (!$this->hasClass($c)) {
            $this->class .= ' ' . $c;
        }
    }
}
1300
|
|
|
|
1301
|
|
|
/**
 * Checks whether the node's "class" attribute contains a class name.
 *
 * The attribute is split on single spaces and compared strictly.
 *
 * @param string $class Class name to look for; non-strings yield false.
 * @return bool
 */
public function hasClass($class)
{
    if (!is_string($class) || !isset($this->class)) {
        return false;
    }

    $assigned = explode(' ', $this->class);

    return in_array($class, $assigned, true);
}
1311
|
|
|
|
1312
|
|
|
/**
 * Removes class names from the node's "class" attribute.
 *
 * Without an argument the whole attribute is removed. The attribute is
 * also removed when no class name is left afterwards.
 *
 * @param string|array|null $class Space separated class names, a list of
 *                                 class names, or null to remove all.
 *                                 Other types are ignored.
 * @return void
 */
public function removeClass($class = null)
{
    // Nothing to do for nodes without a class attribute.
    if (!isset($this->class)) {
        return;
    }

    if (null === $class) {
        $this->removeAttribute('class');

        return;
    }

    if (is_string($class)) {
        $class = explode(' ', $class);
    }

    if (!is_array($class)) {
        return;
    }

    $remaining = array_diff(explode(' ', $this->class), $class);

    if (0 === count($remaining)) {
        $this->removeAttribute('class');

        return;
    }

    $this->class = implode(' ', $remaining);
}
1337
|
|
|
|
1338
|
|
|
/**
 * Returns the node's raw attribute array (name => value).
 *
 * @return array
 */
public function getAllAttributes()
{
    $attributes = $this->attr;

    return $attributes;
}
1342
|
|
|
|
1343
|
|
|
/**
 * Reads the property called $name from this node.
 *
 * Names that are not declared properties are resolved by PHP through
 * __get(). Useful for names that are not valid PHP identifiers.
 *
 * @param string $name Property / attribute name.
 * @return mixed
 */
public function getAttribute($name)
{
    return $this->{$name};
}
1347
|
|
|
|
1348
|
|
|
/**
 * Assigns $value to the property called $name on this node.
 *
 * Names that are not declared properties are handled by PHP through
 * __set().
 *
 * @param string $name  Property / attribute name.
 * @param mixed  $value Value to assign.
 * @return void
 */
public function setAttribute($name, $value)
{
    $this->{$name} = $value;
}
1352
|
|
|
|
1353
|
|
|
/**
 * Tells whether the property called $name is set on this node.
 *
 * Names that are not declared properties are checked by PHP through
 * __isset().
 *
 * @param string $name Property / attribute name.
 * @return bool
 */
public function hasAttribute($name)
{
    return isset($this->{$name});
}
1357
|
|
|
|
1358
|
|
|
/**
 * Unsets the property called $name on this node.
 *
 * Names that are not declared properties are handled by PHP through
 * __unset().
 *
 * @param string $name Property / attribute name.
 * @return void
 */
public function removeAttribute($name)
{
    unset($this->{$name});
}
1362
|
|
|
|
1363
|
|
|
/**
 * Removes this node from its parent; does nothing without a parent.
 *
 * @return void
 */
public function remove()
{
    if (!$this->parent) {
        return;
    }

    $this->parent->removeChild($this);
}
1369
|
|
|
|
1370
|
|
|
/**
 * Removes $node, including everything below it, from this node.
 *
 * Order of work: the node's children are removed recursively, the node's
 * remaining entries in its own node list are dropped from the node and
 * from the DOM's node list, then the node itself is dropped from this
 * node's lists and from the DOM's node list, and finally cleared.
 *
 * @param HtmlNode $node Node to remove.
 * @return void
 */
public function removeChild($node)
{
    // Recurse first so that descendants are detached bottom-up.
    foreach ($node->children as $descendant) {
        $node->removeChild($descendant);
    }

    // $node->children is left un-indexed: the node is about to go away.

    foreach ($node->nodes as $entity) {
        $indexInNode = array_search($entity, $node->nodes, true);
        $indexInDom = array_search($entity, $node->dom->nodes, true);

        if (false !== $indexInNode) {
            unset($node->nodes[$indexInNode]);
        }

        if (false !== $indexInDom) {
            unset($node->dom->nodes[$indexInDom]);
        }
    }

    // $node->nodes is left un-indexed for the same reason.

    // Look the node up in all three lists before any of them is modified.
    $indexInNodes = array_search($node, $this->nodes, true);
    $indexInChildren = array_search($node, $this->children, true);
    $indexInDomNodes = array_search($node, $this->dom->nodes, true);

    if (false !== $indexInNodes) {
        unset($this->nodes[$indexInNodes]);
    }

    $this->nodes = array_values($this->nodes);

    if (false !== $indexInChildren) {
        unset($this->children[$indexInChildren]);
    }

    $this->children = array_values($this->children);

    if (false !== $indexInDomNodes) {
        unset($this->dom->nodes[$indexInDomNodes]);
    }

    // dom->nodes must keep its keys: nodes point to other nodes in that
    // array explicitly, so it is never re-indexed here.

    $node->clear();
}
1418
|
|
|
|
1419
|
|
|
/**
 * Finds the first element with the given id.
 *
 * @param string $id Element id (without the leading "#").
 * @return mixed Result of find() for the selector "#<id>" at index 0.
 */
public function getElementById($id)
{
    return $this->find('#' . $id, 0);
}
1423
|
|
|
|
1424
|
|
|
/**
 * Finds elements with the given id.
 *
 * @param string   $id  Element id (without the leading "#").
 * @param int|null $idx Index passed through to find(); null by default.
 * @return mixed Result of find() for the selector "#<id>".
 */
public function getElementsById($id, $idx = null)
{
    return $this->find('#' . $id, $idx);
}
1428
|
|
|
|
1429
|
|
|
/**
 * Finds the first element matching the given tag name.
 *
 * @param string $name Tag name, handed to find() as the selector.
 * @return mixed Result of find() at index 0.
 */
public function getElementByTagName($name)
{
    $firstMatch = $this->find($name, 0);

    return $firstMatch;
}
1433
|
|
|
|
1434
|
|
|
/**
 * Finds elements matching the given tag name.
 *
 * @param string   $name Tag name, handed to find() as the selector.
 * @param int|null $idx  Index passed through to find(); null by default.
 * @return mixed Result of find().
 */
public function getElementsByTagName($name, $idx = null)
{
    $matches = $this->find($name, $idx);

    return $matches;
}
1438
|
|
|
|
1439
|
|
|
/**
 * DOM-style alias for parent().
 *
 * @return mixed Whatever parent() returns.
 */
public function parentNode()
{
    $parentNode = $this->parent();

    return $parentNode;
}
1443
|
|
|
|
1444
|
|
|
/**
 * Returns all child elements, or a single one by index.
 *
 * @param int $idx Child index; -1 (default) returns the whole list.
 * @return mixed The children array, the child at $idx, or null when no
 *               child exists at that index.
 */
public function childNodes($idx = -1)
{
    if (-1 === $idx) {
        return $this->children;
    }

    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
1456
|
|
|
|
1457
|
|
|
/**
 * Returns the first child element.
 *
 * @return mixed The child at index 0, or null when there are no children.
 */
public function firstChild()
{
    $hasChildren = count($this->children) > 0;

    return $hasChildren ? $this->children[0] : null;
}
1465
|
|
|
|
1466
|
|
|
/**
 * Returns the last child element.
 *
 * Note: uses end(), which moves the internal pointer of the children array.
 *
 * @return mixed The last child, or null when there are no children.
 */
public function lastChild()
{
    if (0 === count($this->children)) {
        return null;
    }

    return end($this->children);
}
1474
|
|
|
|
1475
|
|
|
/**
 * Returns the element that follows this node among its parent's children.
 *
 * @return mixed The next sibling, or null when this node has no parent, is
 *               not listed in the parent's children, or is the last child.
 */
public function nextSibling()
{
    if (null === $this->parent) {
        return null;
    }

    $siblings = $this->parent->children;
    $position = array_search($this, $siblings, true);

    if (false === $position) {
        return null;
    }

    $nextPosition = $position + 1;

    return isset($siblings[$nextPosition]) ? $siblings[$nextPosition] : null;
}
1489
|
|
|
|
1490
|
|
|
/**
 * Returns the element that precedes this node among its parent's children.
 *
 * @return mixed The previous sibling, or null when this node has no parent,
 *               is not listed in the parent's children, or is the first child.
 */
public function previousSibling()
{
    if (null === $this->parent) {
        return null;
    }

    $siblings = $this->parent->children;
    $position = array_search($this, $siblings, true);

    // Not found, or already at the front of the list.
    if (false === $position || $position <= 0) {
        return null;
    }

    return $siblings[$position - 1];
}
1504
|
|
|
|
1505
|
|
|
/**
 * Tells whether this node has at least one child element.
 *
 * @return bool
 */
public function hasChildNodes()
{
    return empty($this->children) ? false : true;
}
1509
|
|
|
|
1510
|
|
|
/**
 * Returns the node's tag name.
 *
 * @return string Value of the $tag property.
 */
public function nodeName()
{
    $tagName = $this->tag;

    return $tagName;
}
1514
|
|
|
|
1515
|
|
|
/**
 * Appends $node as the last child of this node.
 *
 * When this node belongs to a DOM, the appended node and all of its
 * descendants are registered in the DOM's node list and receive begin/end
 * info entries; the root's end entry is then updated.
 *
 * @param HtmlNode $node Node to append.
 * @return $this
 */
public function appendChild($node)
{
    $node->parent = $this;
    $this->nodes[] = $node;
    $this->children[] = $node;

    // Without a DOM there is nothing to register.
    if (!$this->dom) {
        return $this;
    }

    // Attach the new node to the DOM together with all of its descendants.
    $pending = [$node];

    while (count($pending) > 0) {
        $current = array_pop($pending);
        $pending = array_merge($pending, $current->children);

        $this->dom->nodes[] = $current;
        $current->dom = $this->dom;

        // NOTE(review): count() - 1 equals the key just appended only while
        // dom->nodes has no gaps; removeChild() unsets entries without
        // re-indexing - confirm this is the intended index.
        $position = count($this->dom->nodes) - 1;
        $current->_[self::HDOM_INFO_BEGIN] = $position;
        $current->_[self::HDOM_INFO_END] = $position;
    }

    $this->dom->root->_[self::HDOM_INFO_END] = count($this->dom->nodes) - 1;

    return $this;
}
1539
|
|
|
} |
1540
|
|
|
|