selector::remove()   A
last analyzed

Complexity

Conditions 6
Paths 5

Size

Total Lines 23
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
eloc 12
nc 5
nop 3
dl 0
loc 23
rs 9.2222
c 0
b 0
f 0
1
<?php
2
// +----------------------------------------------------------------------
3
// | PHPSpider [ A PHP Framework For Crawler ]
4
// +----------------------------------------------------------------------
5
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
6
// +----------------------------------------------------------------------
7
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
8
// +----------------------------------------------------------------------
9
// | Author: Seatle Yang <[email protected]>
10
// +----------------------------------------------------------------------
11
12
//----------------------------------
13
// PHPSpider选择器类文件
14
//----------------------------------
15
16
namespace phpspider\core;
17
18
use phpspider\library\phpquery;
19
use DOMDocument;
20
use DOMXpath;
21
use Exception;
22
23
/**
 * Static helper that extracts (select) or strips (remove) fragments of an
 * HTML string using XPath expressions, regular expressions or a subset of
 * CSS selectors (CSS is translated to XPath first).
 *
 * Failure convention used by the engines: on a selector error the message is
 * stored in self::$error and null is returned; "nothing matched" is also null.
 */
class selector
{
    /**
     * Version number.
     * @var string
     */
    const VERSION = '1.0.2';

    /** @var DOMDocument|null Document holding the most recently parsed HTML (reused between calls). */
    public static $dom = null;

    /** @var string md5 of the HTML currently loaded into self::$dom, '' when nothing is loaded. */
    public static $dom_auth = '';

    /** @var DOMXpath|null XPath evaluator bound to self::$dom. */
    public static $xpath = null;

    /** @var string|null Message describing the most recent selector failure. */
    public static $error = null;

    /**
     * Extract content from HTML.
     *
     * @param string $html          HTML source to search
     * @param string $selector      XPath expression, regex pattern or CSS selector
     * @param string $selector_type 'xpath' (default), 'regex' or 'css' (case-insensitive)
     * @return string|array|null|false false when $html or $selector is empty;
     *         null when nothing matched, the selector is invalid or the type is
     *         unknown (see self::$error); a string for a single result, an
     *         array for several
     */
    public static function select($html, $selector, $selector_type = 'xpath')
    {
        if (empty($html) || empty($selector))
        {
            return false;
        }

        $selector_type = strtolower($selector_type);
        switch ($selector_type)
        {
            case 'xpath':
                return self::_xpath_select($html, $selector);
            case 'regex':
                return self::_regex_select($html, $selector);
            case 'css':
                return self::_css_select($html, $selector);
        }

        self::$error = "unsupported selector type(\"{$selector_type}\")";
        return null;
    }

    /**
     * Remove every fragment matched by the selector from the HTML string.
     *
     * Removal is textual: the matched fragments are deleted with str_replace(),
     * so an XPath/CSS match is only removed when its serialised form occurs
     * verbatim in $html.
     *
     * @param string $html          HTML source
     * @param string $selector      XPath expression, regex pattern or CSS selector
     * @param string $selector_type 'xpath' (default), 'regex' or 'css' (case-insensitive)
     * @return string|false false when $html or $selector is empty, otherwise
     *         the HTML with the matches removed (unchanged when nothing matched)
     */
    public static function remove($html, $selector, $selector_type = 'xpath')
    {
        if (empty($html) || empty($selector))
        {
            return false;
        }

        $selector_type = strtolower($selector_type);
        if ($selector_type == 'xpath')
        {
            $remove_html = self::_xpath_select($html, $selector, true);
        }
        elseif ($selector_type == 'regex')
        {
            $remove_html = self::_regex_select($html, $selector, true);
        }
        elseif ($selector_type == 'css')
        {
            $remove_html = self::_css_select($html, $selector, true);
        }
        else
        {
            self::$error = "unsupported selector type(\"{$selector_type}\")";
            return $html;
        }

        // The engines return null (no match / error), a string or an array.
        // Flatten to a list of non-empty strings so str_replace() never
        // receives null or a nested array.
        $fragments = array();
        $candidates = is_array($remove_html) ? $remove_html : array($remove_html);
        array_walk_recursive($candidates, function ($fragment) use (&$fragments) {
            if (is_string($fragment) && $fragment !== '')
            {
                $fragments[] = $fragment;
            }
        });

        if (empty($fragments))
        {
            return $html;
        }
        return str_replace($fragments, '', $html);
    }

    /**
     * XPath selector.
     *
     * The parsed document is cached in self::$dom and keyed by the md5 of the
     * HTML, so consecutive queries against the same HTML parse it only once.
     *
     * Per matched node the result is:
     *  - $remove = true: the serialised node (outer markup);
     *  - <img> element: its src attribute;
     *  - attribute / text / CDATA node: the node value;
     *  - anything else: the serialised node with its own opening and closing
     *    tag stripped, so nested markup is kept for a second extraction pass.
     *
     * @param string $html
     * @param string $selector XPath expression
     * @param bool   $remove   true to return whole serialised nodes
     * @return string|array|null null when the HTML cannot be parsed, the
     *         expression is invalid (self::$error is set) or nothing matched;
     *         a string for one match, an array of strings for several
     * @author seatle <[email protected]>
     * @created time :2016-10-26 12:53
     */
    private static function _xpath_select($html, $selector, $remove = false)
    {
        if (!is_object(self::$dom))
        {
            self::$dom = new DOMDocument();
        }

        // Re-parse only when the HTML differs from the cached document
        $html_auth = md5($html);
        if (self::$dom_auth != $html_auth || !is_object(self::$xpath))
        {
            // Collect libxml's parse warnings instead of silencing them with @
            $previous = libxml_use_internal_errors(true);
            $loaded = self::$dom->loadHTML('<?xml encoding="UTF-8">'.$html);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);

            if ($loaded === false)
            {
                // Drop the cache so a later call does not query a stale document
                self::$dom_auth = '';
                self::$xpath = null;
                self::$error = "the html could not be parsed";
                return null;
            }

            // Only mark the document as cached once it really has been loaded
            self::$dom_auth = $html_auth;
            self::$xpath = new DOMXpath(self::$dom);
        }

        // query() emits a warning and returns false for a malformed
        // expression; the warning is muted because the failure is handled here
        $elements = @self::$xpath->query($selector);
        if ($elements === false)
        {
            self::$error = "the selector in the xpath(\"{$selector}\") syntax errors";
            // Return null rather than false: callers distinguish "no value"
            // with isset()/is_null(), and false would pass an isset() check
            return null;
        }

        $value_node_types = array(XML_ATTRIBUTE_NODE, XML_TEXT_NODE, XML_CDATA_SECTION_NODE);
        $result = array();
        foreach ($elements as $element)
        {
            if ($remove)
            {
                // For removal take the whole block of markup
                $content = self::$dom->saveXML($element);
            }
            elseif ($element->nodeType == XML_ELEMENT_NODE && $element->nodeName == 'img')
            {
                // For an img tag return the src value directly
                $content = $element->getAttribute('src');
            }
            elseif (in_array($element->nodeType, $value_node_types, true))
            {
                $content = $element->nodeValue;
            }
            else
            {
                // Keep the markup inside the node so children can be extracted
                // in a second pass; only the node's own tags are stripped
                $content = self::$dom->saveXML($element);
                // Node names such as "#comment" contain the regex delimiter,
                // so the name must be quoted before it is used in a pattern
                $tag = preg_quote($element->nodeName, '#');
                $stripped = preg_replace(array("#^<{$tag}.*>#isU", "#</{$tag}>$#isU"), array('', ''), $content);
                if ($stripped !== null)
                {
                    $content = $stripped;
                }
            }
            $result[] = $content;
        }

        if (empty($result))
        {
            return null;
        }
        // A single match is returned as a string, several as an array
        return count($result) > 1 ? $result : $result[0];
    }

    /**
     * CSS selector: translates the selector to XPath and delegates to
     * _xpath_select().
     *
     * @param string $html
     * @param string $selector CSS selector (subset supported by css_to_xpath())
     * @param bool   $remove   true to return whole serialised nodes
     * @return string|array|null same contract as _xpath_select(); null with
     *         self::$error set when the CSS selector cannot be translated
     * @author seatle <[email protected]>
     * @created time :2016-10-26 12:53
     */
    private static function _css_select($html, $selector, $remove = false)
    {
        try
        {
            $xpath_selector = self::css_to_xpath($selector);
        }
        catch (Exception $e)
        {
            // Report a bad selector like the other engines do: error + null
            self::$error = "the selector in the css(\"{$selector}\") syntax errors: ".$e->getMessage();
            return null;
        }
        return self::_xpath_select($html, $xpath_selector, $remove);
    }

    /**
     * Regular expression selector.
     *
     * Result shape when not removing:
     *  - pattern without a capture group: null;
     *  - one capture group: its captures (string for one match, array for several);
     *  - several groups: one entry per group, each a string (one match) or an
     *    array (several matches).
     * When removing, the whole matched texts are returned regardless of the
     * number of groups.
     *
     * @param string $html
     * @param string $selector PCRE pattern including delimiters
     * @param bool   $remove   true to return the whole matched texts
     * @return string|array|null null when the pattern is invalid
     *         (self::$error is set) or nothing matched
     * @author seatle <[email protected]>
     * @created time :2016-10-26 12:53
     */
    private static function _regex_select($html, $selector, $remove = false)
    {
        // An invalid pattern emits a warning and returns false; the warning is
        // muted because the failure is reported through self::$error
        $matched = @preg_match_all($selector, $html, $out);
        if ($matched === false)
        {
            self::$error = "the selector in the regex(\"{$selector}\") syntax errors";
            return null;
        }
        // Nothing matched. count($out) cannot be used for this: $out always
        // contains one (possibly empty) list per group.
        if ($matched === 0)
        {
            return null;
        }

        if ($remove)
        {
            // For removal take everything that was matched
            $result = $out[0];
        }
        else
        {
            // Numbered capture groups only: named groups appear under both
            // their name and their index, index 0 is the whole match
            $groups = array();
            foreach ($out as $key => $captures)
            {
                if (is_int($key) && $key > 0)
                {
                    $groups[] = $captures;
                }
            }

            $result = array();
            if (count($groups) == 1)
            {
                $result = $groups[0];
            }
            else
            {
                foreach ($groups as $captures)
                {
                    // A group with a single capture is reduced to that string
                    $result[] = count($captures) > 1 ? $captures : $captures[0];
                }
            }
        }

        if (empty($result))
        {
            return null;
        }
        return count($result) > 1 ? $result : $result[0];
    }

    /**
     * Not implemented: the body is empty, so the method always returns null.
     *
     * @param string $html
     * @param string $selector
     * @return null
     */
    public static function find_all($html, $selector)
    {
    }

    /**
     * Translate a CSS selector into an XPath expression.
     *
     * Supported: tag names, "*", #id, .class (substring match on the class
     * attribute, one predicate per class), [attr], [attr=v], [attr^=v],
     * [attr*=v], [attr$=v], the child (">") and descendant (" ") combinators
     * and comma-separated groups (joined with the XPath union operator).
     * Sibling combinators ("~", "+") and pseudo classes are parsed but
     * ignored.
     *
     * @param string $selectors CSS selector
     * @return string XPath expression ('' when the selector has no usable token)
     * @throws Exception when the selector contains a token that cannot be translated
     */
    public static function css_to_xpath($selectors)
    {
        $xquery = '';
        // true when the previous token was a combinator, i.e. a predicate
        // needs an explicit node test ("*") in front of it
        $after_delimiter = false;
        foreach (self::parse_selector($selectors) as $s)
        {
            $is_delimiter = false;
            $first = $s[0];

            // TAG
            if ($s == '*' || preg_match('@^[\w|\||-]+$@', $s))
            {
                $xquery .= $s;
            }
            // GROUP SEPARATOR
            else if ($s === ',')
            {
                $xquery .= '|';
            }
            // DIRECT DESCENDANTS
            else if ($s === '>')
            {
                $xquery .= '/';
                $is_delimiter = true;
            }
            // ALL DESCENDANTS
            else if ($s === ' ')
            {
                $xquery .= '//';
                $is_delimiter = true;
            }
            // ID, CLASSES, ATTRIBUTES
            else if ($first == '#' || $first == '.' || $first == '[')
            {
                if ($after_delimiter)
                {
                    $xquery .= '*';
                }

                if ($first == '#')
                {
                    // IDs are matched exactly
                    $xquery .= '[@id='.self::xpath_literal(substr($s, 1)).']';
                }
                else if ($first == '.')
                {
                    // Classes are matched by substring; ".a.b" needs both
                    foreach (explode('.', substr($s, 1)) as $class_name)
                    {
                        if ($class_name !== '')
                        {
                            $xquery .= '[contains(@class,'.self::xpath_literal($class_name).')]';
                        }
                    }
                }
                else
                {
                    $xquery .= self::attribute_to_xpath($s);
                }
            }
            // "~" general sibling, "+" adjacent sibling, ":" pseudo class:
            // recognised by the parser but not translated
            else if ($first == '~' || $first == '+' || $first == ':')
            {
            }
            // ERRORS
            else
            {
                throw new Exception("Unrecognized token '$s'");
            }

            $after_delimiter = $is_delimiter;
        }
        return $xquery;
    }

    /**
     * Translate one bracketed CSS attribute token into an XPath predicate.
     *
     * @param string $token attribute token including the brackets, e.g. "[href^='http']"
     * @return string XPath predicate including the brackets
     */
    private static function attribute_to_xpath($token)
    {
        // strip side brackets
        $attr = trim($token, '][');

        // attr without specified value
        if (strpos($attr, '=') === false)
        {
            $attr = trim($attr);
            return "[@{$attr}]";
        }

        // Limit of 2 so that a value containing "=" is not truncated
        list($attr, $value) = explode('=', $attr, 2);
        $attr = trim($attr);
        $literal = self::xpath_literal(trim(trim($value), "'\""));

        if (!self::is_regexp($attr))
        {
            return "[@{$attr}={$literal}]";
        }

        // cut the operator character off the attribute name
        $operator = substr($attr, -1);
        $attr = trim(substr($attr, 0, -1));
        if ($operator == '^')
        {
            return "[starts-with(@{$attr},{$literal})]";
        }
        if ($operator == '*')
        {
            return "[contains(@{$attr},{$literal})]";
        }
        // "$": XPath 1.0 has no ends-with(), compare the tail of the value
        return "[substring(@{$attr}, string-length(@{$attr}) - string-length({$literal}) + 1) = {$literal}]";
    }

    /**
     * Build an XPath 1.0 string literal for an arbitrary value. XPath 1.0 has
     * no escape sequences, so a value containing both quote characters is
     * assembled with concat().
     *
     * @param string $value
     * @return string
     */
    private static function xpath_literal($value)
    {
        if (strpos($value, "'") === false)
        {
            return "'".$value."'";
        }
        if (strpos($value, '"') === false)
        {
            return '"'.$value.'"';
        }
        return "concat('".implode("', \"'\", '", explode("'", $value))."')";
    }

    /**
     * Split a CSS selector into tokens for css_to_xpath().
     *
     * Tokens are: tag names, "#id", ".class[.class...]", "[attr...]",
     * ":pseudo[(args)]", "~..." / "+..." sibling parts, the combinators ">"
     * and " ", and "," between selector groups. Every group starts with " "
     * (search anywhere) unless it starts with ">". Characters that fit no
     * token are skipped.
     *
     * @access private
     * @param string $query CSS selector
     * @return array list of string tokens, empty for an empty selector
     */
    public static function parse_selector($query)
    {
        // Remove whitespace around > + ~ and collapse the rest to one space
        $query = trim(preg_replace('@\s+@', ' ', preg_replace('@\s*(>|\\+|~)\s*@', '\\1', $query)));
        $queries = array();
        if (!$query)
        {
            return $queries;
        }

        $special_chars = array('>', ' ');
        $class_chars = array('.', '-');
        $pseudo_chars = array('-');
        $tag_chars = array('*', '|', '-');

        // split multibyte string into single characters
        // http://code.google.com/p/phpquery/issues/detail?id=76
        $strlen = mb_strlen($query);
        $chars = array();
        for ($i = 0; $i < $strlen; $i++)
        {
            $chars[] = mb_substr($query, $i, 1);
        }
        $query = $chars;

        $i = 0;
        while ($i < $strlen)
        {
            $c = $query[$i];
            $tmp = '';
            // TAG
            if (self::is_char($c) || in_array($c, $tag_chars, true))
            {
                while (isset($query[$i]) && (self::is_char($query[$i]) || in_array($query[$i], $tag_chars, true)))
                {
                    $tmp .= $query[$i];
                    $i++;
                }
                $queries[] = $tmp;
            }
            // IDs
            else if ($c == '#')
            {
                $i++;
                while (isset($query[$i]) && (self::is_char($query[$i]) || $query[$i] == '-'))
                {
                    $tmp .= $query[$i];
                    $i++;
                }
                $queries[] = '#'.$tmp;
            }
            // SPECIAL CHARS
            else if (in_array($c, $special_chars, true))
            {
                $queries[] = $c;
                $i++;
            }
            // COMMA: starts a new selector group
            else if ($c == ',')
            {
                $i++;
                while (isset($query[$i]) && $query[$i] == ' ')
                {
                    $i++;
                }
                // Drop a dangling " " (whitespace before the comma) or an
                // empty group left by a previous comma
                while (!empty($queries) && in_array($queries[count($queries) - 1], array(' ', ','), true))
                {
                    array_pop($queries);
                }
                // Only separate when there is a group on both sides
                if (!empty($queries) && isset($query[$i]))
                {
                    $queries[] = ',';
                    // Same implicit prefixes the first group receives below
                    if ($query[$i] != '>')
                    {
                        $queries[] = ' ';
                    }
                    if ($query[$i] == ':')
                    {
                        $queries[] = '*';
                    }
                }
            }
            // CLASSES
            else if ($c == '.')
            {
                while (isset($query[$i]) && (self::is_char($query[$i]) || in_array($query[$i], $class_chars, true)))
                {
                    $tmp .= $query[$i];
                    $i++;
                }
                $queries[] = $tmp;
            }
            // ~ General sibling and + adjacent sibling selectors
            else if ($c == '~' || $c == '+')
            {
                // Spaces are only accepted directly after the combinator
                $space_allowed = true;
                $tmp .= $query[$i++];
                while (isset($query[$i])
                    && (self::is_char($query[$i])
                    || in_array($query[$i], $class_chars, true)
                    || $query[$i] == '*'
                    || ($space_allowed && $query[$i] == ' ')
                ))
                {
                    if ($query[$i] != ' ')
                    {
                        $space_allowed = false;
                    }
                    $tmp .= $query[$i];
                    $i++;
                }
                $queries[] = $tmp;
            }
            // ATTRS: everything up to the matching closing bracket
            else if ($c == '[')
            {
                $stack = 1;
                $tmp .= $c;
                while (isset($query[++$i]))
                {
                    $tmp .= $query[$i];
                    if ($query[$i] == '[')
                    {
                        $stack++;
                    }
                    else if ($query[$i] == ']')
                    {
                        $stack--;
                        if (!$stack)
                        {
                            break;
                        }
                    }
                }
                $queries[] = $tmp;
                $i++;
            }
            // PSEUDO CLASSES
            else if ($c == ':')
            {
                $tmp .= $query[$i++];
                while (isset($query[$i]) && (self::is_char($query[$i]) || in_array($query[$i], $pseudo_chars, true)))
                {
                    $tmp .= $query[$i];
                    $i++;
                }
                // with arguments: everything up to the matching parenthesis
                if (isset($query[$i]) && $query[$i] == '(')
                {
                    $tmp .= $query[$i];
                    $stack = 1;
                    while (isset($query[++$i]))
                    {
                        $tmp .= $query[$i];
                        if ($query[$i] == '(')
                        {
                            $stack++;
                        }
                        else if ($query[$i] == ')')
                        {
                            $stack--;
                            if (!$stack)
                            {
                                break;
                            }
                        }
                    }
                    $i++;
                }
                $queries[] = $tmp;
            }
            // Anything else is skipped
            else
            {
                $i++;
            }
        }

        if (isset($queries[0]))
        {
            // A leading pseudo class applies to any element
            if (isset($queries[0][0]) && $queries[0][0] == ':')
            {
                array_unshift($queries, '*');
            }
            // Search the whole document unless the selector starts with ">"
            if ($queries[0] != '>')
            {
                array_unshift($queries, ' ');
            }
        }

        return $queries;
    }

    /**
     * Tell whether a character is a word character (PCRE \w, no Unicode mode).
     *
     * @param string $char
     * @return int|false result of preg_match(): 1 when it matches, 0 otherwise
     */
    public static function is_char($char)
    {
        return preg_match('@\w@', $char);
    }

    /**
     * Tell whether an attribute name carries a fuzzy-match operator, i.e.
     * ends with one of:
     * ^ prefix match
     * * substring match
     * $ suffix match
     *
     * @access private
     * @param string $pattern attribute name as written before the "="
     * @return bool
     */
    protected static function is_regexp($pattern)
    {
        $pattern = (string) $pattern;
        if ($pattern === '')
        {
            return false;
        }
        // The operators are single-byte, so the last byte is sufficient
        return in_array(substr($pattern, -1), array('^', '*', '$'), true);
    }
}
589