1 | <?php |
||
2 | |||
3 | declare(strict_types=1); |
||
4 | |||
5 | namespace PHPHtmlParser\Selector; |
||
6 | |||
7 | use PHPHtmlParser\Dom\AbstractNode; |
||
8 | use PHPHtmlParser\Dom\Collection; |
||
9 | use PHPHtmlParser\Dom\InnerNode; |
||
10 | use PHPHtmlParser\Dom\LeafNode; |
||
11 | use PHPHtmlParser\Exceptions\ChildNotFoundException; |
||
12 | |||
/**
 * Class Selector
 *
 * Holds the rule groups produced by a ParserInterface from a selector string
 * and evaluates them against a node tree, collecting every matching node.
 *
 * Each entry of the parsed selector list is itself a list of rule arrays.
 * The rule keys read by this class are 'tag', 'key', 'value', 'operator',
 * 'noKey' and 'alterNext'.
 *
 * @package PHPHtmlParser
 */
class Selector
{
    /**
     * Parsed selectors, exactly as returned by the parser in __construct.
     *
     * @var array
     */
    protected $selectors = [];

    /**
     * When true, a non-matching child is searched immediately (depth first)
     * instead of being queued until all of its siblings were inspected.
     *
     * @var bool
     */
    private $depthFirst = false;

    /**
     * Constructs with the selector string.
     *
     * @param string          $selector Raw selector string.
     * @param ParserInterface $parser   Parser turning the string into rule groups.
     */
    public function __construct(string $selector, ParserInterface $parser)
    {
        $this->selectors = $parser->parseSelectorString($selector);
    }

    /**
     * Returns the selectors that were produced in __construct.
     *
     * @return array
     */
    public function getSelectors()
    {
        return $this->selectors;
    }

    /**
     * Switches between depth-first and sibling-first traversal in seek().
     *
     * @param bool $status True to search failed children immediately.
     * @return void
     */
    public function setDepthFirstFind(bool $status): void
    {
        $this->depthFirst = $status;
    }

    /**
     * Attempts to find the selectors starting from the given node object.
     *
     * Every selector is evaluated independently from $node; the nodes left
     * after its last rule are appended to the returned collection.
     *
     * @param AbstractNode $node Node whose descendants are searched.
     * @return Collection
     * @throws ChildNotFoundException
     */
    public function find(AbstractNode $node): Collection
    {
        $results = new Collection;
        foreach ($this->selectors as $selector) {
            if (count($selector) == 0) {
                continue;
            }

            $nodes = [$node];
            $options = [];
            foreach ($selector as $rule) {
                if ($rule['alterNext']) {
                    // this rule only modifies how the following rule is applied
                    $options[] = $this->alterNext($rule);
                    continue;
                }
                $nodes = $this->seek($nodes, $rule, $options);
                // options apply to a single rule only
                $options = [];
            }

            // this is the final set of nodes
            foreach ($nodes as $result) {
                $results[] = $result;
            }
        }

        return $results;
    }

    /**
     * Attempts to find all children that match the rule given.
     *
     * A numeric rule key is treated as a 1-based index into $nodes itself
     * (counting only nodes whose tag matches). Otherwise the children of
     * every node are tested; children that fail but have children of their
     * own are searched recursively unless 'checkGrandChildren' is disabled.
     *
     * @param array $nodes   Nodes to start from.
     * @param array $rule    Single parsed rule.
     * @param array $options List of option arrays as collected by find().
     * @return array Matching nodes.
     * @throws ChildNotFoundException
     */
    protected function seek(array $nodes, array $rule, array $options): array
    {
        // XPath index
        if (array_key_exists('tag', $rule) && array_key_exists('key', $rule)
            && is_numeric($rule['key'])
        ) {
            $count = 0;
            /** @var AbstractNode $node */
            foreach ($nodes as $node) {
                if ($rule['tag'] == '*'
                    || $rule['tag'] == $node->getTag()->name()
                ) {
                    ++$count;
                    // loose comparison on purpose: the key is a numeric string
                    if ($count == $rule['key']) {
                        // found the node we wanted
                        return [$node];
                    }
                }
            }

            return [];
        }

        $options = $this->flattenOptions($options);

        // both conditions are constant for the whole traversal
        $checkGrandChildren = !isset($options['checkGrandChildren'])
            || $options['checkGrandChildren'];
        $grabAll = ($rule['tag'] ?? null) == '*' && is_null($rule['key'] ?? null);

        $return = [];
        /** @var InnerNode $node */
        foreach ($nodes as $node) {
            // leaves and childless nodes have nothing to offer
            if ($node instanceof LeafNode || !$node->hasChildren()) {
                continue;
            }

            $children = [];
            $child = $node->firstChild();
            while (!is_null($child)) {
                if ($grabAll || $this->matchesRule($rule, $child)) {
                    // wild card, or it passed all checks
                    $return[] = $child;
                } elseif ($child instanceof InnerNode && $child->hasChildren()) {
                    if (!$this->depthFirst) {
                        // we still want to check its children, after its siblings
                        $children[] = $child;
                    } elseif ($checkGrandChildren) {
                        // The options are flat at this point; wrap them so the
                        // nested call flattens them back to the same values.
                        foreach ($this->seek([$child], $rule, [$options]) as $match) {
                            $return[] = $match;
                        }
                    }
                }

                $child = $this->getNextChild($node, $child);
            }

            if ($checkGrandChildren && count($children) > 0) {
                // we have children that failed but are not leaves.
                foreach ($this->seek($children, $rule, [$options]) as $match) {
                    $return[] = $match;
                }
            }
        }

        return $return;
    }

    /**
     * Runs the tag, key and comparison checks of a rule against one node.
     *
     * The key check only runs when the rule has a key; the comparison only
     * runs when the rule also has a value other than '*'.
     *
     * @param array        $rule
     * @param AbstractNode $child
     * @return bool
     */
    private function matchesRule(array $rule, AbstractNode $child): bool
    {
        if (!$this->checkTag($rule, $child)) {
            return false;
        }
        if (is_null($rule['key'])) {
            return true;
        }
        if (!$this->checkKey($rule, $child)) {
            return false;
        }
        if (is_null($rule['value']) || $rule['value'] == '*') {
            return true;
        }

        return $this->checkComparison($rule, $child);
    }

    /**
     * Attempts to match the given arguments with the given operator.
     *
     * Both operands are lower-cased first, so every operator compares case
     * insensitively. For '*=' a pattern starting with '/' is used as a
     * complete regular expression; any other pattern is wrapped in '/.../i'
     * without escaping. Unknown operators never match.
     *
     * @param string $operator One of '=', '!=', '^=', '$=', '*='.
     * @param string $pattern  Value taken from the rule.
     * @param string $value    Value taken from the node.
     * @return bool
     */
    protected function match(
        string $operator,
        string $pattern,
        string $value
    ): bool {
        $value = strtolower($value);
        $pattern = strtolower($pattern);
        switch ($operator) {
            case '=':
                return $value === $pattern;
            case '!=':
                return $value !== $pattern;
            case '^=':
                return preg_match('/^' . preg_quote($pattern, '/') . '/', $value) === 1;
            case '$=':
                return preg_match('/' . preg_quote($pattern, '/') . '$/', $value) === 1;
            case '*=':
                // guard the offset access: an empty pattern has no first character
                if ($pattern !== '' && $pattern[0] === '/') {
                    return preg_match($pattern, $value) === 1;
                }

                return preg_match('/' . $pattern . '/i', $value) === 1;
        }

        return false;
    }

    /**
     * Attempts to figure out what the alteration will be for the next
     * element.
     *
     * Only the '>' tag is recognised; it disables the search below direct
     * children for the following rule.
     *
     * @param array $rule
     * @return array
     */
    protected function alterNext(array $rule): array
    {
        $options = [];
        if ($rule['tag'] == '>') {
            $options['checkGrandChildren'] = false;
        }

        return $options;
    }

    /**
     * Flattens the option array.
     *
     * Nested option arrays are merged into one map, later entries winning.
     * Entries that are not arrays are kept under their own key, so passing
     * an already flattened map returns it unchanged.
     *
     * @param array $optionsArray
     * @return array
     */
    protected function flattenOptions(array $optionsArray)
    {
        $options = [];
        foreach ($optionsArray as $outerKey => $optionArray) {
            if (!is_array($optionArray)) {
                $options[$outerKey] = $optionArray;
                continue;
            }
            foreach ($optionArray as $key => $option) {
                $options[$key] = $option;
            }
        }

        return $options;
    }

    /**
     * Returns the next child or null if no more children.
     *
     * @param AbstractNode $node         Parent being iterated.
     * @param AbstractNode $currentChild Child whose successor is wanted.
     * @return AbstractNode|null
     */
    protected function getNextChild(
        AbstractNode $node,
        AbstractNode $currentChild
    ) {
        if (!$node instanceof InnerNode) {
            return null;
        }

        try {
            // get next child
            return $node->nextChild($currentChild->id());
        } catch (ChildNotFoundException $e) {
            // no more children
            return null;
        }
    }

    /**
     * Checks tag condition from rules against node.
     *
     * An empty tag and the wildcard '*' accept every node.
     *
     * @param array        $rule
     * @param AbstractNode $node
     * @return bool
     */
    protected function checkTag(array $rule, AbstractNode $node): bool
    {
        if (empty($rule['tag']) || $rule['tag'] == '*') {
            return true;
        }

        return $rule['tag'] == $node->getTag()->name();
    }

    /**
     * Checks key condition from rules against node.
     *
     * With 'noKey' set, every listed attribute must read as null on the
     * node; otherwise every listed attribute except 'plaintext' must exist.
     *
     * @param array        $rule
     * @param AbstractNode $node
     * @return bool
     */
    protected function checkKey(array $rule, AbstractNode $node): bool
    {
        // a scalar key is handled as a one-element list
        $keys = is_array($rule['key']) ? $rule['key'] : [$rule['key']];
        foreach ($keys as $key) {
            if ($rule['noKey']) {
                if (!is_null($node->getAttribute($key))) {
                    return false;
                }
            } elseif ($key != 'plaintext' && !$node->hasAttribute($key)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Checks comparison condition from rules against node.
     *
     * The key 'plaintext' compares against the node text, any other scalar
     * key against that attribute. For a list of keys every attribute must
     * satisfy the value stored at the same index.
     *
     * @param array        $rule
     * @param AbstractNode $node
     * @return bool
     */
    public function checkComparison(array $rule, AbstractNode $node): bool
    {
        if ($rule['key'] == 'plaintext') {
            // plaintext search
            return $this->checkNodeValue($node->text(), $rule, $node);
        }

        if (!is_array($rule['key'])) {
            // normal search
            return $this->checkNodeValue(
                $node->getAttribute($rule['key']),
                $rule,
                $node
            );
        }

        $result = true;
        foreach ($rule['key'] as $index => $key) {
            $nodeValue = $node->getAttribute($key);
            $result = $result
                && $this->checkNodeValue($nodeValue, $rule, $node, $index);
        }

        return $result;
    }

    /**
     * Compares one node value against the rule value(s).
     *
     * A null node value or a non-string operator never matches; match() is
     * only called with string arguments because this file runs with strict
     * types. For the 'class' key with a list of values, every value must
     * match at least one whitespace-separated class of the node.
     *
     * @param string|null  $nodeValue Text or attribute value read from the node.
     * @param array        $rule
     * @param AbstractNode $node
     * @param int|null     $index     Position in $rule['value'] when the key is a list.
     * @return bool
     */
    private function checkNodeValue(
        ?string $nodeValue,
        array $rule,
        AbstractNode $node,
        ?int $index = null
    ) : bool {
        $operator = $rule['operator'] ?? null;
        if (!is_string($operator)) {
            return false;
        }

        $ruleValue = $rule['value'] ?? null;
        $check = false;
        if (is_string($ruleValue) && !is_null($nodeValue)) {
            $check = $this->match($operator, $ruleValue, $nodeValue);
        }

        $key = $rule['key'];
        if (!$check && $key === 'class') {
            // handle multiple classes
            if (is_array($ruleValue)) {
                $nodeClasses = preg_split(
                    '/\s+/',
                    $node->getAttribute('class') ?? '',
                    -1,
                    PREG_SPLIT_NO_EMPTY
                ) ?: [];
                foreach ($ruleValue as $value) {
                    // start fresh for every required class so an earlier hit
                    // cannot leak into this one
                    $check = false;
                    foreach ($nodeClasses as $class) {
                        if ($this->match($operator, $value, $class)) {
                            $check = true;
                            break;
                        }
                    }
                    if (!$check) {
                        break;
                    }
                }
            }
        } elseif (!$check
            && is_array($key)
            && !is_null($nodeValue)
            && !is_null($index)
            && is_array($ruleValue)
            && isset($ruleValue[$index])
            && is_string($ruleValue[$index])
        ) {
            $check = $this->match($operator, $ruleValue[$index], $nodeValue);
        }

        return $check;
    }
}
||
426 |