Completed
Push — master ( 8b759a...268bdc )
by Gilles
02:53
created

Selector::find()   A

Complexity

Conditions 6
Paths 8

Size

Total Lines 27
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 6.0087

Importance

Changes 0
Metric Value
cc 6
eloc 15
nc 8
nop 1
dl 0
loc 27
ccs 15
cts 16
cp 0.9375
crap 6.0087
rs 9.2222
c 0
b 0
f 0
1
<?php
2
namespace PHPHtmlParser;
3
4
use PHPHtmlParser\Dom\AbstractNode;
5
use PHPHtmlParser\Dom\Collection;
6
use PHPHtmlParser\Dom\InnerNode;
7
use PHPHtmlParser\Dom\LeafNode;
8
use PHPHtmlParser\Exceptions\ChildNotFoundException;
9
use Countable;
10
11
/**
 * Class Selector
 *
 * Parses a CSS-like selector string into a list of rule sets and resolves
 * those rules against a tree of DOM nodes.
 *
 * @package PHPHtmlParser
 */
class Selector
{

    /**
     * Pattern of CSS selectors, modified from 'mootools'
     *
     * Capture groups: 1 = tag, 2 = id, 3 = class, 4 = attribute key,
     * 5 = attribute operator, 6 = attribute value, 7 = separator that
     * terminates the rule (whitespace, '/' or ',').
     *
     * @var string
     */
    protected $pattern = "/([\w\-:\*>]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";

    /**
     * Parsed selectors. One entry per comma separated selector; each entry
     * is a list of rule arrays with the keys 'tag', 'key', 'value',
     * 'operator', 'noKey' and 'alterNext'.
     *
     * @var array
     */
    protected $selectors = [];

    /**
     * Constructs with the selector string
     *
     * @param string $selector
     */
    public function __construct($selector)
    {
        $this->parseSelectorString($selector);
    }

    /**
     * Returns the selectors that were found in __construct
     *
     * @return array
     */
    public function getSelectors()
    {
        return $this->selectors;
    }

    /**
     * Attempts to find the selectors starting from the given
     * node object.
     *
     * Every comma separated selector is resolved independently, starting
     * again from $node, and the matches of all selectors are appended to a
     * single collection in selector order.
     *
     * @param AbstractNode $node
     * @return Collection
     */
    public function find(AbstractNode $node): Collection
    {
        $results = new Collection;
        foreach ($this->selectors as $selector) {
            if (count($selector) === 0) {
                continue;
            }

            $nodes   = [$node];
            $options = [];
            foreach ($selector as $rule) {
                if ($rule['alterNext']) {
                    // this rule matches nothing itself, it only changes how
                    // the following rule is applied
                    $options[] = $this->alterNext($rule);
                    continue;
                }
                $nodes = $this->seek($nodes, $rule, $options);
                // options only ever apply to the rule directly after them
                $options = [];
            }

            // this is the final set of nodes
            foreach ($nodes as $result) {
                $results[] = $result;
            }
        }

        return $results;
    }

    /**
     * Parses the selector string
     *
     * Splits the string into rules with $this->pattern and appends one rule
     * list per comma separated selector to $this->selectors.
     *
     * @param string $selector
     */
    protected function parseSelectorString(string $selector): void
    {
        $matches = [];
        // the pattern requires a trailing separator, so one is appended
        preg_match_all($this->pattern, trim($selector).' ', $matches, PREG_SET_ORDER);

        $result = [];
        foreach ($matches as $match) {
            // default values
            $tag       = strtolower(trim($match[1]));
            $operator  = '=';
            $key       = null;
            $value     = null;
            $noKey     = false;
            // '>' alters the behavior of the next rule instead of matching
            $alterNext = ($tag === '>');

            // Values are compared against '' rather than tested with empty()
            // because empty('0') is true and would silently drop a literal
            // "0" id, class or attribute value.

            // check for id selector
            if (($match[2] ?? '') !== '') {
                $key   = 'id';
                $value = $match[2];
            }

            // check for class selector
            if (($match[3] ?? '') !== '') {
                $key   = 'class';
                $value = $match[3];
            }

            // and final attribute selector
            if ( ! empty($match[4])) {
                $key = strtolower($match[4]);
            }
            if ( ! empty($match[5])) {
                $operator = $match[5];
            }
            if (($match[6] ?? '') !== '') {
                $value = $match[6];
            }

            // check for elements that do not have a specified attribute
            if (isset($key[0]) && $key[0] === '!') {
                $key   = substr($key, 1);
                $noKey = true;
            }

            $result[] = [
                'tag'       => $tag,
                'key'       => $key,
                'value'     => $value,
                'operator'  => $operator,
                'noKey'     => $noKey,
                'alterNext' => $alterNext,
            ];

            // a comma closes the current selector and starts a new one
            if (trim($match[7]) === ',') {
                $this->selectors[] = $result;
                $result            = [];
            }
        }

        // save last results
        if (count($result) > 0) {
            $this->selectors[] = $result;
        }
    }

    /**
     * Attempts to find all children that match the rule
     * given.
     *
     * A numeric rule key is treated as a 1-based index into $nodes itself.
     * Otherwise the children of every node are tested against the rule's
     * tag, key and value; children that fail but have children of their own
     * are searched recursively unless the 'checkGrandChildren' option is
     * false.
     *
     * @param array $nodes
     * @param array $rule
     * @param array $options
     * @return array
     * @recursive
     */
    protected function seek(array $nodes, array $rule, array $options): array
    {
        // XPath index
        if (array_key_exists('tag', $rule) &&
            array_key_exists('key', $rule) &&
            is_numeric($rule['key'])
        ) {
            $count = 0;
            /** @var AbstractNode $node */
            foreach ($nodes as $node) {
                if ($rule['tag'] == '*' ||
                    $rule['tag'] == $node->getTag()->name()
                ) {
                    ++$count;
                    // loose comparison on purpose: the key is a numeric string
                    if ($count == $rule['key']) {
                        // found the node we wanted
                        return [$node];
                    }
                }
            }

            return [];
        }

        $options = $this->flattenOptions($options);

        $return = [];
        /** @var InnerNode $node */
        foreach ($nodes as $node) {
            // check if we are a leaf
            if ($node instanceof LeafNode ||
                ! $node->hasChildren()
            ) {
                continue;
            }

            $children = [];
            $child    = $node->firstChild();
            while ( ! is_null($child)) {
                // wild card, grab all
                if ($rule['tag'] == '*' && is_null($rule['key'])) {
                    $return[] = $child;
                    $child    = $this->nextChildOrNull($node, $child);
                    continue;
                }

                $pass = true;
                // check tag
                if ( ! empty($rule['tag']) && $rule['tag'] != $child->getTag()->name() &&
                    $rule['tag'] != '*'
                ) {
                    // child failed tag check
                    $pass = false;
                }

                // check key
                if ($pass && ! is_null($rule['key'])) {
                    if ($rule['noKey']) {
                        // the rule asks for the attribute to be absent
                        if ( ! is_null($child->getAttribute($rule['key']))) {
                            $pass = false;
                        }
                    } else {
                        // 'plaintext' is a pseudo key that is matched
                        // against the node text further down
                        if ($rule['key'] != 'plaintext' && !$child->hasAttribute($rule['key'])) {
                            $pass = false;
                        }
                    }
                }

                // compare values
                if ($pass && ! is_null($rule['key']) &&
                    ! is_null($rule['value']) && $rule['value'] !== '*'
                ) {
                    if ($rule['key'] == 'plaintext') {
                        // plaintext search
                        $nodeValue = $child->text();
                    } else {
                        // normal search
                        $nodeValue = $child->getAttribute($rule['key']);
                    }

                    // match() only accepts strings. NOTE(review): the cast
                    // assumes getAttribute() can return a non-string (e.g.
                    // null for a valueless attribute) - confirm against the
                    // node implementation.
                    $check = $this->match($rule['operator'], $rule['value'], (string) $nodeValue);

                    // handle multiple classes
                    if ( ! $check && $rule['key'] == 'class') {
                        $childClasses = explode(' ', (string) $child->getAttribute('class'));
                        foreach ($childClasses as $class) {
                            if ( ! empty($class)) {
                                $check = $this->match($rule['operator'], $rule['value'], $class);
                            }
                            if ($check) {
                                break;
                            }
                        }
                    }

                    if ( ! $check) {
                        $pass = false;
                    }
                }

                if ($pass) {
                    // it passed all checks
                    $return[] = $child;
                } else {
                    // this child failed to be matched
                    if ($child instanceof InnerNode &&
                        $child->hasChildren()
                    ) {
                        // we still want to check its children
                        $children[] = $child;
                    }
                }

                // get next child
                $child = $this->nextChildOrNull($node, $child);
            }

            if (( ! isset($options['checkGrandChildren']) ||
                    $options['checkGrandChildren'])
                && count($children) > 0
            ) {
                // we have children that failed but are not leaves.
                $matches = $this->seek($children, $rule, $options);
                foreach ($matches as $match) {
                    $return[] = $match;
                }
            }
        }

        return $return;
    }

    /**
     * Returns the child that follows $child inside $parent, or null when
     * the parent reports (via ChildNotFoundException) that there is none.
     *
     * @param InnerNode    $parent
     * @param AbstractNode $child
     * @return AbstractNode|null
     */
    private function nextChildOrNull($parent, $child)
    {
        try {
            return $parent->nextChild($child->id());
        } catch (ChildNotFoundException $e) {
            // no more children
            return null;
        }
    }

    /**
     * Attempts to match the given arguments with the given operator.
     *
     * Both operands are lower-cased first, so every comparison is case
     * insensitive. Supported operators: '=' (equal), '!=' (not equal),
     * '^=' (starts with), '$=' (ends with) and '*=' (regular expression;
     * a pattern starting with '/' is used as a complete regex, anything
     * else is wrapped in '/.../i'). Unknown operators never match.
     *
     * @param string $operator
     * @param string $pattern
     * @param string $value
     * @return bool
     */
    protected function match(string $operator, string $pattern, string $value): bool
    {
        $value   = strtolower($value);
        $pattern = strtolower($pattern);
        switch ($operator) {
            case '=':
                return $value === $pattern;
            case '!=':
                return $value !== $pattern;
            // preg_match() returns 1, 0 or false; compare against 1 so that
            // the declared bool return type is honoured without relying on
            // implicit coercion.
            case '^=':
                return preg_match('/^'.preg_quote($pattern, '/').'/', $value) === 1;
            case '$=':
                return preg_match('/'.preg_quote($pattern, '/').'$/', $value) === 1;
            case '*=':
                // guard the offset access against an empty pattern
                if ($pattern !== '' && $pattern[0] === '/') {
                    return preg_match($pattern, $value) === 1;
                }

                return preg_match("/".$pattern."/i", $value) === 1;
        }

        return false;
    }

    /**
     * Attempts to figure out what the alteration will be for
     * the next element.
     *
     * Currently only '>' is understood; it disables the recursive search
     * through grand children for the next rule.
     *
     * @param array $rule
     * @return array
     */
    protected function alterNext(array $rule): array
    {
        $options = [];
        if ($rule['tag'] === '>') {
            $options['checkGrandChildren'] = false;
        }

        return $options;
    }

    /**
     * Flattens the option array.
     *
     * Merges a list of option arrays into a single key => value array;
     * when a key occurs more than once the last occurrence wins.
     *
     * @param array $optionsArray
     * @return array
     */
    protected function flattenOptions(array $optionsArray)
    {
        $options = [];
        foreach ($optionsArray as $optionArray) {
            foreach ($optionArray as $key => $option) {
                $options[$key] = $option;
            }
        }

        return $options;
    }
}
378