phpspider::collect_page()   F
last analyzed

Complexity

Conditions 21
Paths 1444

Size

Total Lines 140
Code Lines 64

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 21
eloc 64
nc 1444
nop 0
dl 0
loc 140
rs 0
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
// +----------------------------------------------------------------------
3
// | PHPSpider [ A PHP Framework For Crawler ]
4
// +----------------------------------------------------------------------
5
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
6
// +----------------------------------------------------------------------
7
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
8
// +----------------------------------------------------------------------
9
// | Author: Seatle Yang <[email protected]>
10
// +----------------------------------------------------------------------
11
12
//----------------------------------
13
// PHPSpider核心类文件
14
//----------------------------------
15
16
namespace phpspider\core;
17
18
require_once __DIR__ . '/constants.php';
19
20
use phpspider\core\requests;
21
use phpspider\core\selector;
22
use phpspider\core\queue;
23
use phpspider\core\db;
24
use phpspider\core\util;
25
use phpspider\core\log;
26
use Exception;
27
28
//require CORE.'/log.php';
29
//require CORE.'/requests.php';
30
//require CORE.'/selector.php';
31
//require CORE.'/util.php';
32
//require CORE.'/db.php';
33
//require CORE.'/cache.php';
34
//require CORE."/worker.php"; 
35
//require CORE."/phpspider.php"; 
36
37
// Prepare the data directory tree at startup: the root first, then each sub-directory.
// (Original comment: "generate the data directory on startup"; util::path_exists is defined elsewhere.)
foreach (array('', '/lock', '/log', '/cache', '/status') as $data_subdir)
{
    util::path_exists(PATH_DATA.$data_subdir);
}
unset($data_subdir);
43
44
class phpspider
45
{
46
    /**
47
     * 版本号
48
     * @var string
49
     */
50
    const VERSION = '3.0.4';
51
52
    /**
53
     * 爬虫爬取每个网页的时间间隔,0表示不延时, 单位: 毫秒
54
     */
55
    const INTERVAL = 0;
56
57
    /**
58
     * 爬虫爬取每个网页的超时时间, 单位: 秒 
59
     */
60
    const TIMEOUT = 5;
61
62
    /**
63
     * 爬取失败次数, 不想失败重新爬取则设置为0 
64
     */
65
    const MAX_TRY = 0;
66
67
    /**
68
     * 爬虫爬取网页所使用的浏览器类型: pc、ios、android
69
     * 默认类型是PC
70
     */
71
    const AGENT_PC = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36";
72
    const AGENT_IOS = "Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_3 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13G34 Safari/601.1";
73
    const AGENT_ANDROID = "Mozilla/5.0 (Linux; U; Android 6.0.1;zh_cn; Le X820 Build/FEXCNFN5801507014S) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/49.0.0.0 Mobile Safari/537.36 EUI Browser/5.8.015S";
74
75
    /**
76
     * pid文件的路径及名称
77
     * @var string
78
     */
79
    //public static $pid_file = '';
80
81
    /**
82
     * 日志目录, 默认在data根目录下
83
     * @var mixed
84
     */
85
    //public static $log_file = '';
86
87
    /**
88
     * 主任务进程ID 
89
     */
90
    //public static $master_pid = 0;
91
92
    /**
93
     * 所有任务进程ID 
94
     */
95
    //public static $taskpids = array();
96
97
    /**
98
     * Daemonize.
99
     *
100
     * @var bool
101
     */
102
    public static $daemonize = false;
103
104
    /**
105
     * 当前进程是否终止 
106
     */
107
    public static $terminate = false;
108
109
    /**
110
     * 是否分布式 
111
     */
112
    public static $multiserver = false;
113
114
    /**
115
     * 当前服务器ID 
116
     */
117
    public static $serverid = 1;
118
119
    /**
120
     * 主任务进程 
121
     */
122
    public static $taskmaster = true;
123
124
    /**
125
     * 当前任务ID 
126
     */
127
    public static $taskid = 1;
128
129
    /**
130
     * 当前任务进程ID 
131
     */
132
    public static $taskpid = 1;
133
134
    /**
135
     * 并发任务数
136
     */
137
    public static $tasknum = 1;
138
139
    /**
140
     * 生成 
141
     */
142
    public static $fork_task_complete = false;
143
144
    /**
145
     * 是否使用Redis 
146
     */
147
    public static $use_redis = false;
148
149
    /**
150
     * 是否保存爬虫运行状态 
151
     */
152
    public static $save_running_state = false;
153
154
    /**
155
     * 配置 
156
     */
157
    public static $configs = array();
158
159
    /**
160
     * 要抓取的URL队列 
161
     md5(url) => array(
162
         'url'         => '',      // 要爬取的URL
163
         'url_type'    => '',      // 要爬取的URL类型,scan_page、list_page、content_page
164
         'method'      => 'get',   // 默认为"GET"请求, 也支持"POST"请求
165
         'headers'     => array(), // 此url的Headers, 可以为空
166
         'params'      => array(), // 发送请求时需添加的参数, 可以为空
167
         'context_data'=> '',      // 此url附加的数据, 可以为空
168
         'proxies'     => false,   // 是否使用代理
169
         'try_num'     => 0        // 抓取次数
170
         'max_try'     => 0        // 允许抓取失败次数
171
     ) 
172
     */
173
    public static $collect_queue = array();
174
175
    /**
176
     * 要抓取的URL数组
177
     * md5($url) => time()
178
     */
179
    public static $collect_urls = array();
180
181
    /**
182
     * 要抓取的URL数量
183
     */
184
    public static $collect_urls_num = 0;
185
186
    /**
187
     * 已经抓取的URL数量
188
     */
189
    public static $collected_urls_num = 0;
190
191
    /**
192
     * 当前进程采集成功数 
193
     */
194
    public static $collect_succ = 0;
195
196
    /**
197
     * 当前进程采集失败数 
198
     */
199
    public static $collect_fail = 0;
200
201
    /**
202
     * 提取到的字段数 
203
     */
204
    public static $fields_num = 0;
205
206
    /**
207
     * 采集深度 
208
     */
209
    public static $depth_num = 0;
210
211
    /**
212
     * 爬虫开始时间 
213
     */
214
    public static $time_start = 0;
215
216
    /**
217
     * 任务状态 
218
     */
219
    public static $task_status = array();
220
221
    // 导出类型配置
222
    public static $export_type = '';
223
    public static $export_file = '';
224
    public static $export_conf = '';
225
    public static $export_table = '';
226
227
    // 数据库配置
228
    public static $db_config = array();
229
    // 队列配置
230
    public static $queue_config = array();
231
232
    // 运行面板参数长度
233
    public static $server_length = 10;
234
    public static $tasknum_length = 8;
235
    public static $taskid_length = 8;
236
    public static $pid_length = 8;
237
    public static $mem_length = 8;
238
    public static $urls_length = 15;
239
    public static $speed_length = 6;
240
241
    /**
242
     * 爬虫初始化时调用, 用来指定一些爬取前的操作 
243
     * 
244
     * @var mixed
245
     * @access public
246
     */
247
    public $on_start = null;
248
249
    /**
250
     * 网页状态码回调 
251
     * 
252
     * @var mixed
253
     * @access public
254
     */
255
    public $on_status_code = null;
256
257
    /**
258
     * 判断当前网页是否被反爬虫, 需要开发者实现 
259
     * 
260
     * @var mixed
261
     * @access public
262
     */
263
    public $is_anti_spider = null;
264
265
    /**
266
     * 在一个网页下载完成之后调用, 主要用来对下载的网页进行处理 
267
     * 
268
     * @var mixed
269
     * @access public
270
     */
271
    public $on_download_page = null;
272
273
    /**
274
     * 在一个attached_url对应的网页下载完成之后调用. 主要用来对下载的网页进行处理 
275
     * 
276
     * @var mixed
277
     * @access public
278
     */
279
    public $on_download_attached_page = null;
280
281
    /**
282
     * 当前页面抽取到URL 
283
     * 
284
     * @var mixed
285
     * @access public
286
     */
287
    public $on_fetch_url = null;
288
289
    /**
290
     * URL属于入口页 
291
     * 在爬取到入口url的内容之后, 添加新的url到待爬队列之前调用 
292
     * 主要用来发现新的待爬url, 并且能给新发现的url附加数据
293
     * 
294
     * @var mixed
295
     * @access public
296
     */
297
    public $on_scan_page = null;
298
299
    /**
300
     * URL属于列表页
301
     * 在爬取到列表页url的内容之后, 添加新的url到待爬队列之前调用 
302
     * 主要用来发现新的待爬url, 并且能给新发现的url附加数据
303
     * 
304
     * @var mixed
305
     * @access public
306
     */
307
    public $on_list_page = null;
308
309
    /**
310
     * URL属于内容页 
311
     * 在爬取到内容页url的内容之后, 添加新的url到待爬队列之前调用 
312
     * 主要用来发现新的待爬url, 并且能给新发现的url附加数据
313
     * 
314
     * @var mixed
315
     * @access public
316
     */
317
    public $on_content_page = null;
318
319
    /**
320
     * 在抽取到field内容之后调用, 对其中包含的img标签进行回调处理 
321
     * 
322
     * @var mixed
323
     * @access public
324
     */
325
    public $on_handle_img = null;
326
327
    /**
328
     * 当一个field的内容被抽取到后进行的回调, 在此回调中可以对网页中抽取的内容作进一步处理 
329
     * 
330
     * @var mixed
331
     * @access public
332
     */
333
    public $on_extract_field = null;
334
335
    /**
336
     * 在一个网页的所有field抽取完成之后, 可能需要对field进一步处理, 以发布到自己的网站 
337
     * 
338
     * @var mixed
339
     * @access public
340
     */
341
    public $on_extract_page = null;
342
343
    /**
344
     * 如果抓取的页面是一个附件文件, 比如图片、视频、二进制文件、apk、ipad、exe 
345
     * 就不去分析他的内容提取field了, 提取field只针对HTML
346
     * 
347
     * @var mixed
348
     * @access public
349
     */
350
    public $on_attachment_file = null;
351
352
    /**
     * Build the spider from a configuration array.
     *
     * Configures the logger, verifies that the entry script still carries its
     * two marker comments, fills in defaults for options that are not set and
     * copies the relevant options into the static class state.
     *
     * @param array $configs Spider options; every key is optional.
     */
    public function __construct($configs = array())
    {
        // Enable ticks (original note: works around Ctrl+C not stopping the process on PHP 7).
        declare(ticks = 1);

        // Turn log output on first so that validation errors are visible.
        log::$log_show = true;
        if (isset($configs['log_file']))
        {
            log::$log_file = $configs['log_file'];
        }
        else
        {
            log::$log_file = PATH_DATA.'/phpspider.log';
        }
        if (isset($configs['log_type']))
        {
            log::$log_type = $configs['log_type'];
        }
        else
        {
            log::$log_type = false;
        }

        // Easter egg: the first included file must contain both marker comments.
        $loaded_files = get_included_files();
        $entry_source = file_get_contents($loaded_files[0]);
        $markers_present = preg_match("#/\* Do NOT delete this comment \*/#", $entry_source)
            && preg_match("#/\* 不要删除这段注释 \*/#", $entry_source);
        if (!$markers_present)
        {
            log::error("Unknown error...");
            exit;
        }

        // Defaults applied to every option that is missing (or null), in this order.
        $defaults = array(
            'name'        => 'phpspider',
            'proxies'     => '',
            'user_agent'  => self::AGENT_PC,
            'user_agents' => null,
            'client_ip'   => null,
            'client_ips'  => null,
            'interval'    => self::INTERVAL,
            'timeout'     => self::TIMEOUT,
            'max_try'     => self::MAX_TRY,
            'max_depth'   => 0,
            'max_fields'  => 0,
            'export'      => array(),
        );
        foreach ($defaults as $option => $fallback)
        {
            if (!isset($configs[$option]))
            {
                $configs[$option] = $fallback;
            }
        }

        // Export settings (original note lists the types: csv, sql, db).
        $export = $configs['export'];
        self::$export_type  = isset($export['type'])  ? $export['type']  : '';
        self::$export_file  = isset($export['file'])  ? $export['file']  : '';
        self::$export_table = isset($export['table']) ? $export['table'] : '';

        // Database and queue settings.
        self::$db_config    = isset($configs['db_config'])    ? $configs['db_config']    : array();
        self::$queue_config = isset($configs['queue_config']) ? $configs['queue_config'] : array();

        // A concurrent task count is only honoured when it is above 1 and we are not on Windows.
        if (isset($configs['tasknum']) && $configs['tasknum'] > 1 && !util::is_win())
        {
            self::$tasknum = $configs['tasknum'];
        }

        // Whether the running state should be kept.
        if (isset($configs['save_running_state']))
        {
            self::$save_running_state = $configs['save_running_state'];
        }

        // Whether the spider runs across several servers.
        if (isset($configs['multiserver']))
        {
            self::$multiserver = $configs['multiserver'];
        }

        // ID of the current server.
        if (isset($configs['serverid']))
        {
            self::$serverid = $configs['serverid'];
        }

        // Separate projects from each other by suffixing the redis prefix with a hash of the spider name.
        if (isset($GLOBALS['config']['redis']['prefix']))
        {
            $GLOBALS['config']['redis']['prefix'] .= '-'.md5($configs['name']);
        }

        self::$configs = $configs;
    }
424
425
    /**
     * Queue an entry URL.
     *
     * The link starts out typed as scan_page and is re-typed as list_page or
     * content_page when the URL matches the corresponding configured patterns
     * (list patterns are checked first).
     *
     * @param mixed $url            URL to queue
     * @param array $options        Extra link fields; 'url' and 'url_type' are overwritten
     * @param bool  $allowed_repeat Passed through unchanged to queue_lpush()
     * @return mixed The result of queue_lpush()
     */
    public function add_scan_url($url, $options = array(), $allowed_repeat = true)
    {
        $link = $options;
        $link['url'] = $url;
        $link['url_type'] = 'scan_page';
        $link = $this->link_uncompress($link);

        // Refine the page type; when neither pattern set matches the type is left as it is.
        if ($this->is_list_page($url))
        {
            $link['url_type'] = 'list_page';
        }
        elseif ($this->is_content_page($url))
        {
            $link['url_type'] = 'content_page';
        }

        // Delivery status of the push.
        $status = $this->queue_lpush($link, $allowed_repeat);
        if (!$status)
        {
            return $status;
        }

        switch ($link['url_type'])
        {
        case 'scan_page':
            log::debug("Find scan page: {$url}");
            break;
        case 'list_page':
            log::debug("Find list page: {$url}");
            break;
        case 'content_page':
            log::debug("Find content page: {$url}");
            break;
        }

        return $status;
    }
468
469
    /**
     * Add a URL to the crawl queue.
     *
     * Per the original note this is normally called from the on_scan_page and
     * on_list_page callbacks, and when two processes call it with the same URL
     * at the same time the URL can enter the queue twice.
     *
     * Only URLs matching the configured list or content patterns are queued.
     * List patterns take precedence, so a URL matching both is pushed exactly
     * once (as list_page) and the returned status reflects that single push.
     *
     * @param mixed $url     URL to queue
     * @param array $options Extra link fields; 'url' and 'depth' are overwritten
     * @param int   $depth   Value stored in the link's 'depth' field
     * @return mixed false when the URL matches no pattern, otherwise the result of queue_lpush()
     * @author seatle <[email protected]>
     * @created time :2016-09-18 10:17
     */
    public function add_url($url, $options = array(), $depth = 0)
    {
        $link = $options;
        $link['url'] = $url;
        $link['depth'] = $depth;
        $link = $this->link_uncompress($link);

        // Classify the URL. The original used two independent ifs, which pushed a URL
        // matching both pattern sets twice and let the second result overwrite the first.
        if ($this->is_list_page($url))
        {
            $link['url_type'] = 'list_page';
        }
        elseif ($this->is_content_page($url))
        {
            $link['url_type'] = 'content_page';
        }
        else
        {
            // Neither a list page nor a content page: nothing is queued.
            return false;
        }

        // Delivery status of the push.
        $status = $this->queue_lpush($link);

        if ($status)
        {
            if ($link['url_type'] == 'list_page')
            {
                log::debug("Find list page: {$url}");
            }
            else
            {
                log::debug("Find content page: {$url}");
            }
        }

        return $status;
    }
519
520
    /**
     * Tell whether a URL is an entry (scan) page, i.e. whether its host is one
     * of the configured domains.
     *
     * @param mixed $url URL to test
     * @return bool true when the URL has a host listed in self::$configs['domains']
     * @author seatle <[email protected]>
     * @created time :2016-10-12 19:06
     */
    public function is_scan_page($url)
    {
        // Without a usable domain list nothing can match. This also keeps in_array()
        // from being handed a missing key (a warning on PHP 7, a TypeError on PHP 8).
        if (empty(self::$configs['domains']) || !is_array(self::$configs['domains']))
        {
            return false;
        }

        // parse_url() yields null or false when the URL has no host or is malformed.
        $host = parse_url($url, PHP_URL_HOST);
        if (empty($host))
        {
            return false;
        }

        return in_array($host, self::$configs['domains']);
    }
537
538
    /**
     * Tell whether a URL is a list page.
     *
     * The URL is tested case-insensitively against every pattern in
     * self::$configs['list_url_regexes']; the first match wins.
     *
     * @param mixed $url URL to test
     * @return bool true when at least one configured list pattern matches
     * @author seatle <[email protected]>
     * @created time :2016-10-12 19:06
     */
    public function is_list_page($url)
    {
        if (empty(self::$configs['list_url_regexes']))
        {
            return false;
        }

        foreach (self::$configs['list_url_regexes'] as $pattern)
        {
            // Each pattern is wrapped in '#' delimiters with the 'i' modifier.
            if (preg_match("#{$pattern}#i", $url))
            {
                return true;
            }
        }

        return false;
    }
562
563
    /**
     * Tell whether a URL is a content page.
     *
     * The URL is tested case-insensitively against every pattern in
     * self::$configs['content_url_regexes']; the first match wins.
     *
     * @param mixed $url URL to test
     * @return bool true when at least one configured content pattern matches
     * @author seatle <[email protected]>
     * @created time :2016-10-12 19:06
     */
    public function is_content_page($url)
    {
        if (empty(self::$configs['content_url_regexes']))
        {
            return false;
        }

        foreach (self::$configs['content_url_regexes'] as $pattern)
        {
            // Each pattern is wrapped in '#' delimiters with the 'i' modifier.
            if (preg_match("#{$pattern}#i", $url))
            {
                return true;
            }
        }

        return false;
    }
587
588
    /**
     * Parse the command line.
     * php yourfile.php start | stop | status | kill
     *
     * 'start' (the default) optionally enables daemon mode with -d and returns.
     * 'stop' sends SIGINT to the matching processes and exits.
     * 'kill' sends SIGKILL to the matching processes and returns.
     * 'status' exits without output; any other command prints the usage and exits.
     *
     * @return void
     */
    public function parse_command()
    {
        // Arguments the script was started with.
        global $argv;
        $start_file = $argv[0];

        // Shell-quoted copy for the exec() pipelines below; the raw value is only used
        // in messages. The original interpolated it unescaped, so a script path with
        // spaces or shell metacharacters broke (or altered) the commands.
        $start_file_arg = escapeshellarg($start_file);

        // Command.
        $command = isset($argv[1]) ? trim($argv[1]) : 'start';

        // Sub-command; only -d is supported at the moment.
        $command2 = isset($argv[2]) ? $argv[2] : '';

        // Prints the PID column of every `ps aux` line that mentions the start file.
        $list_pids = 'ps aux | grep '.$start_file_arg.' | grep -v grep | awk \'{print $2}\'';

        switch ($command)
        {
            // Start phpspider.
        case 'start':
            if ($command2 === '-d')
            {
                self::$daemonize = true;
            }
            break;
        case 'stop':
            exec($list_pids, $info);
            // NOTE(review): presumably the listing includes this very `stop` invocation,
            // so one line or fewer means no spider is running; confirm.
            if (count($info) <= 1)
            {
                echo "PHPSpider[$start_file] not run\n";
            }
            else
            {
                echo "PHPSpider[$start_file] stop success";
                exec($list_pids.' |xargs kill -SIGINT', $info);
            }
            exit;
        case 'kill':
            exec($list_pids.' |xargs kill -SIGKILL');
            break;
            // Show phpspider status (currently just exits).
        case 'status':
            exit(0);
            // Unknown command.
        default :
            exit("Usage: php yourfile.php {start|stop|status|kill}\n");
        }
    }
641
642
    /**
     * Signal handler.
     *
     * SIGINT logs a warning and sets the terminate flag; SIGUSR2 prints a
     * status line. Every other signal is ignored.
     *
     * @param int $signal Signal number
     */
    public function signal_handler($signal)
    {
        if ($signal == SIGINT)
        {
            // Stop.
            log::warn("Program stopping...");
            self::$terminate = true;
        }
        elseif ($signal == SIGUSR2)
        {
            // Show status.
            echo "show status\n";
        }
    }
661
662
    /**
     * Install the signal handlers.
     *
     * Does nothing when pcntl_signal() is not available.
     *
     * @return void
     */
    public function install_signal()
    {
        if (!function_exists('pcntl_signal'))
        {
            return;
        }

        // signal_handler() is an instance method, so the callback is bound to $this.
        // The original array(__CLASS__, 'signal_handler') called it statically, which is
        // deprecated on PHP 7 and an Error on PHP 8.
        $handler = array($this, 'signal_handler');

        // stop
        pcntl_signal(SIGINT, $handler, false);
        // status
        pcntl_signal(SIGUSR2, $handler, false);
        // ignore
        pcntl_signal(SIGPIPE, SIG_IGN, false);
    }
679
680
    /**
     * Run in daemon mode.
     *
     * Does nothing unless self::$daemonize is set. Otherwise forks, exits the
     * parent, starts a new session, then forks again and exits that parent too.
     *
     * @throws Exception when a fork or posix_setsid() fails
     */
    protected static function daemonize()
    {
        if (self::$daemonize)
        {
            // Original note: redis must be closed before forking.
            queue::clear_link();

            umask(0);

            $first_fork = pcntl_fork();
            if ($first_fork === -1)
            {
                throw new Exception('fork fail');
            }
            if ($first_fork > 0)
            {
                // Parent of the first fork.
                exit(0);
            }

            if (posix_setsid() === -1)
            {
                throw new Exception("setsid fail");
            }

            // Fork again avoid SVR4 system regain the control of terminal.
            $second_fork = pcntl_fork();
            if ($second_fork === -1)
            {
                throw new Exception("fork fail");
            }
            if ($second_fork !== 0)
            {
                // Parent of the second fork.
                exit(0);
            }
        }
    }
720
721
    /**
     * Shut the current process down once termination has been requested.
     *
     * Does nothing when self::$terminate is not set. Otherwise the status entry
     * of the current task is deleted and the process exits. The master task
     * additionally waits until no task in 2..self::$tasknum reports a status
     * any more, removes this server from the server list and logs the run
     * summary before exiting.
     *
     * @return false|void false when no termination is pending; otherwise the
     *                    call never returns because the process exits.
     * @author seatle <[email protected]> 
     * @created time :2016-11-16 11:06
     */
    public function check_terminate()
    {
        if (!self::$terminate) 
        {
            return false;
        }

        // Delete the status entry of the current task.
        $this->del_task_status(self::$serverid, self::$taskid);

        if (self::$taskmaster) 
        {
            // Wait until every child task has exited.
            while (true)
            {
                $all_stop = true;
                // Task ids start at 2; id 1 belongs to the master itself.
                for ($i = 2; $i <= self::$tasknum; $i++) 
                {
                    if ($this->get_task_status(self::$serverid, $i))
                    {
                        // A single live task is enough to keep waiting, so
                        // there is no need to query the remaining ones.
                        $all_stop = false;
                        break;
                    }
                }

                if ($all_stop) 
                {
                    break;
                }

                log::warn("Task stop waiting...");
                sleep(1);
            }

            $this->del_server_list(self::$serverid);

            // Force log output so the final summary is always shown.
            log::$log_show = true;

            $spider_time_run = util::time2second(intval(microtime(true) - self::$time_start));
            log::note("Spider finished in {$spider_time_run}");

            $get_collected_url_num = $this->get_collected_url_num();
            log::note("Total pages: {$get_collected_url_num} \n");
        }
        exit();
    }
777
778
    /**
     * Run the spider: validate the runtime and the configuration, seed the
     * queue with the configured scan URLs, then enter the collect loop.
     *
     * Every failed pre-flight check is reported through log::error() and
     * terminates the process.
     */
    public function start()
    {
        $this->parse_command();

        // Per-run state: start time, task id, process id and result counters.
        self::$time_start   = time();
        self::$taskid       = 1;
        self::$taskpid      = function_exists('posix_getpid') ? posix_getpid() : 1;
        self::$collect_succ = 0;
        self::$collect_fail = 0;

        //--------------------------------------------------------------------------------
        // Pre-flight checks
        //--------------------------------------------------------------------------------

        // PHP version
        if (version_compare(PHP_VERSION, '5.3.0', 'lt')) 
        {
            log::error('PHP 5.3+ is required, currently installed version is: ' . phpversion());
            exit;
        }

        // cURL extension
        if (!function_exists('curl_init'))
        {
            log::error("The curl extension was not found");
            exit;
        }

        // Both multitasking and daemon mode rely on the pcntl extension.
        $pcntl_missing = !function_exists('pcntl_fork');

        if ($pcntl_missing && self::$tasknum > 1) 
        {
            log::error("Multitasking needs pcntl, the pcntl extension was not found");
            exit;
        }

        if ($pcntl_missing && self::$daemonize) 
        {
            log::error("Daemonize needs pcntl, the pcntl extension was not found");
            exit;
        }

        // Multi-server mode, saved running state and multitasking all need Redis.
        $needs_redis = self::$multiserver || self::$save_running_state || self::$tasknum > 1;
        if ($needs_redis) 
        {
            self::$use_redis = true;

            queue::set_connect('default', self::$queue_config);
            if (!queue::init()) 
            {
                // Report the first feature (in this order) that required Redis.
                if (self::$multiserver) 
                {
                    $reason = "Multiserver needs Redis support, ";
                }
                elseif (self::$tasknum > 1) 
                {
                    $reason = "Multitasking needs Redis support, ";
                }
                else 
                {
                    $reason = "Spider kept running state needs Redis support, ";
                }

                log::error($reason.queue::$error);
                exit;
            }
        }

        // Export settings
        $this->check_export();

        // Cache settings
        $this->check_cache();

        // scan_urls must not be empty
        if (empty(self::$configs['scan_urls'])) 
        {
            log::error("No scan url to start");
            exit;
        }

        // Only the entry URLs from the configuration are validated here;
        // URLs added later through add_scan_url are not checked.
        foreach (self::$configs['scan_urls'] as $scan_url) 
        {
            if (!$this->is_scan_page($scan_url))
            {
                log::error("Domain of scan_urls (\"{$scan_url}\") does not match the domains of the domain name");
                exit;
            }
        }

        $on_windows          = util::is_win();
        $configured_log_show = isset(self::$configs['log_show']) ? self::$configs['log_show'] : false;

        if ($on_windows) 
        {
            // The status panel cannot be shown on Windows, so force log output.
            self::$configs['name'] = iconv("UTF-8", "GB2312//IGNORE", self::$configs['name']);
            log::$log_show = true;
        }
        else 
        {
            log::$log_show = $configured_log_show;
        }

        // A daemon shows the start banner too, so enable log output for now.
        if (self::$daemonize) 
        {
            log::$log_show = true;
        }

        if (log::$log_show)
        {
            global $argv;
            $start_file = $argv[0]; 

            // ANSI colour codes are only emitted outside Windows.
            $banner = array(
                $on_windows ? "" : "\033[33m",
                "\n[ ".self::$configs['name']." Spider ] is started...\n\n",
                "  * PHPSpider Version: ".self::VERSION."\n",
                "  * Documentation: https://doc.phpspider.org\n",
                "  * Task Number: ".self::$tasknum."\n\n",
                "Input \"php $start_file stop\" to quit. Start success.\n",
                $on_windows ? "" : "\033[0m",
            );
            log::note(implode("", $banner));
        }

        // In daemon mode, restore the configured log visibility after the banner.
        if (self::$daemonize) 
        {
            log::$log_show = $configured_log_show;
        }

        // Original note: multi-task and multi-server runs both need their Redis
        // state cleared; a multi-server node only clears its own.
        $this->init_redis();

        //--------------------------------------------------------------------------------
        // Seed the queue and start the tasks
        //--------------------------------------------------------------------------------

        // Push the entry URLs onto the queue; false means duplicates are not allowed.
        foreach (self::$configs['scan_urls'] as $scan_url) 
        {
            $this->add_scan_url($scan_url, null, false);
        }

        // Invoked at this point so the callback can still add entry pages.
        if ($this->on_start) 
        {
            call_user_func($this->on_start, $this);
        }

        if (self::$daemonize) 
        {
            $this->daemonize();
        }
        elseif (!log::$log_show) 
        {
            // Clear the screen once, then draw the panel a first time; it is
            // redrawn after every collected page.
            $this->clear_echo();
            $this->display_ui();
        }

        // Install signal handlers.
        $this->install_signal();

        // Start collecting.
        $this->do_collect_page();

        // Remove this server from the server list.
        $this->del_server_list(self::$serverid);
    }
957
958
    /**
     * Fork one child task and run the collect loop inside it.
     *
     * The parent returns right after a successful fork. The child resets its
     * per-task state, runs do_collect_page() and then exits with status 0.
     * When the fork fails, the error is logged and the calling process exits.
     *
     * @param int $taskid id assigned to the child task
     * @return void
     */
    public function fork_one_task($taskid)
    {
        $pid = pcntl_fork();

        // Parent process: child pids are not tracked at the moment.
        if ($pid > 0)
        {
            return;
        }

        // Neither parent nor child: the fork itself failed.
        if (0 !== $pid)
        {
            log::error("Fork children task({$taskid}) fail...");
            exit;
        }

        // Child process from here on.
        log::warn("Fork children task({$taskid}) successful...");

        // Initialise the child's own task state.
        self::$time_start   = microtime(true);
        self::$taskid       = $taskid;
        self::$taskmaster   = false;
        self::$taskpid      = posix_getpid();
        self::$collect_succ = 0;
        self::$collect_fail = 0;

        $this->do_collect_page();

        // Status 0 marks a normal exit.
        exit(0);
    }
997
998
    /**
     * Collect loop shared by the master task and the child tasks.
     *
     * Runs for as long as queue_lsize() reports a non-empty queue. The master
     * forks the child tasks once the queue holds more than twice the task
     * number and collects a page on every pass; a child only collects while
     * the queue is above that same threshold and otherwise sleeps one second.
     * Each pass ends with a termination check.
     */
    public function do_collect_page() 
    {
        queue::set_connect('default', self::$queue_config);
        queue::init(); 

        while ($pending = $this->queue_lsize())
        { 
            // Threshold used both for forking children and for letting them work.
            $queue_is_busy = $pending > self::$tasknum*2;

            if (self::$taskmaster) 
            {
                // Multitask mode: spawn the children once the master has
                // gathered more than twice as many URLs as there are tasks.
                if (self::$tasknum > 1 && !self::$fork_task_complete && $queue_is_busy) 
                {
                    self::$fork_task_complete = true;

                    // Drop the Redis connection before forking; otherwise the
                    // processes would compete for the same Redis fd.
                    queue::clear_link();

                    // Child task ids start at 2; id 1 is used by the master.
                    for ($child_id = 2; $child_id <= self::$tasknum; $child_id++) 
                    {
                        $this->fork_one_task($child_id);
                    }
                }

                // Fetch a page and persist the task status.
                $this->collect_page();
                $this->set_task_status();

                // Refresh the panel after every collected page.
                if (!log::$log_show && !self::$daemonize) 
                {
                    $this->display_ui();
                }
            }
            elseif ($queue_is_busy) 
            {
                // Child task with enough queued pages: fetch one and persist the status.
                $this->collect_page();
                $this->set_task_status();
            }
            else 
            {
                // Child task with too few queued pages: wait.
                log::warn("Task(".self::$taskid.") waiting...");
                sleep(1);
            }

            // Check whether a shutdown signal has been received.
            $this->check_terminate();
        } 
    }
1058
1059
    /**
     * Pop one link from the queue, download it and process the page.
     *
     * Processing runs the is_anti_spider, on_download_page and per-page-type
     * callbacks when they are set, extracts further URLs while the depth limit
     * allows it, extracts fields for content pages, and finally sleeps for the
     * configured interval.
     *
     * @return false|void false when the queue yields no usable link, when the
     *                    download returns nothing, or when the is_anti_spider
     *                    callback returns a truthy value; nothing otherwise.
     * @author seatle <[email protected]> 
     * @created time :2016-09-18 10:17
     */
    public function collect_page() 
    {
        $get_collect_url_num = $this->get_collect_url_num();
        log::info("Find pages: {$get_collect_url_num} ");

        $queue_lsize = $this->queue_lsize();
        log::info("Waiting for collect pages: {$queue_lsize} ");

        $get_collected_url_num = $this->get_collected_url_num();
        log::info("Collected pages: {$get_collected_url_num} ");

        // Print the task id when running several tasks.
        if (self::$tasknum > 1) 
        {
            log::info("Current task id: ".self::$taskid);
        }

        // First in, first out.
        $link = $this->queue_rpop();
        $link = $this->link_uncompress($link);

        // Nothing usable was popped (for example the queue was emptied between
        // the size check and the pop): do not count or request an empty URL.
        if (empty($link['url'])) 
        {
            return false;
        }
        $url = $link['url'];

        // Mark the URL as collected.
        $this->incr_collected_url_num($url);

        // Start time of processing this page.
        $page_time_start = microtime(true);

        requests::$input_encoding = null;
        $html = $this->request_url($url, $link);

        if (!$html) 
        {
            return false;
        }

        // Description of the page currently being processed.
        $page = array(
            'url'     => $url,
            'raw'     => $html,
            'request' => array(
                'url'          => $url,
                'method'       => $link['method'],
                'headers'      => $link['headers'],
                'params'       => $link['params'],
                'context_data' => $link['context_data'],
                'try_num'      => $link['try_num'],
                'max_try'      => $link['max_try'],
                'depth'        => $link['depth'],
                'taskid'       => self::$taskid,
            ),
        );
        unset($html);

        //--------------------------------------------------------------------------------
        // Callbacks
        //--------------------------------------------------------------------------------

        // Developer-supplied check whether the page was blocked by anti-spider
        // measures; a truthy result aborts processing of this page.
        if ($this->is_anti_spider) 
        {
            $is_anti_spider = call_user_func($this->is_anti_spider, $url, $page['raw'], $this);
            if ($is_anti_spider) 
            {
                return false;
            }
        }

        // Called after a page has been downloaded, mainly to post-process it
        // (for example to add HTML tags to the body).
        if ($this->on_download_page) 
        {
            $return = call_user_func($this->on_download_page, $page, $this);
            // Keep the original page when the callback forgot to return one.
            if (isset($return)) $page = $return;
        }

        // Pick the callback that matches the page type, if any.
        $page_callback = null;
        if ($link['url_type'] == 'scan_page') 
        {
            $page_callback = $this->on_scan_page;
        }
        elseif ($link['url_type'] == 'list_page') 
        {
            $page_callback = $this->on_list_page;
        }
        elseif ($link['url_type'] == 'content_page') 
        {
            $page_callback = $this->on_content_page;
        }

        // A callback returning false means: do not discover further URLs on this page.
        $is_find_url = true;
        if ($page_callback) 
        {
            $return = call_user_func($page_callback, $page, $page['raw'], $this);
            if (isset($return)) $is_find_url = $return;
        }

        // Extract next-level URLs unless the maximum depth has been reached
        // (a max_depth of 0 means no limit).
        if ($is_find_url
            && (self::$configs['max_depth'] == 0 || $link['depth'] < self::$configs['max_depth'])) 
        {
            $this->get_urls($page['raw'], $url, $link['depth'] + 1);
        }

        // Content pages: extract the configured fields from the HTML.
        // Original note: list pages could provide data too
        // (source_type: urlcontext), which is not implemented.
        if ($link['url_type'] == 'content_page') 
        {
            $this->get_html_fields($page['raw'], $url, $page);
        }

        // Update the cached depth if the current depth is greater.
        $this->incr_depth_num($link['depth']);

        // Time spent on this page.
        $time_run = round(microtime(true) - $page_time_start, 3);
        log::debug("Success process page {$url} in {$time_run} s");

        $spider_time_run = util::time2second(intval(microtime(true) - self::$time_start));
        log::info("Spider running in {$spider_time_run}");

        // Pause between pages, in milliseconds. Defaults to 100 ms; the
        // original note warns that crawling too fast may be treated as a DDoS.
        if (!isset(self::$configs['interval'])) 
        {
            self::$configs['interval'] = 100;
        }
        usleep(self::$configs['interval'] * 1000);
    }
1208
1209
    /**
1210
     * 下载网页, 得到网页内容
1211
     * 
1212
     * @param mixed $url
1213
     * @param mixed $link
1214
     * @return void
1215
     * @author seatle <[email protected]> 
1216
     * @created time :2016-09-18 10:17
1217
     */
1218
    public function request_url($url, $link = array())
1219
    {
1220
        $time_start = microtime(true);
1221
1222
        //$url = "http://www.qiushibaike.com/article/117568316";
1223
1224
        // 设置了编码就不要让requests去判断了
1225
        if (isset(self::$configs['input_encoding'])) 
1226
        {
1227
            requests::$input_encoding = self::$configs['input_encoding'];
1228
        }
1229
        // 得到的编码如果不是utf-8的要转成utf-8, 因为xpath只支持utf-8
1230
        requests::$output_encoding = 'utf-8';
1231
        requests::set_timeout(self::$configs['timeout']);
1232
        requests::set_useragent(self::$configs['user_agent']);
1233
        if (self::$configs['user_agents']) 
1234
        {
1235
            requests::set_useragents(self::$configs['user_agents']);
0 ignored issues
show
Bug introduced by
The method set_useragents() does not exist on phpspider\core\requests. Did you maybe mean set_useragent()? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

1235
            requests::/** @scrutinizer ignore-call */ 
1236
                      set_useragents(self::$configs['user_agents']);

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
1236
        }
1237
        if (self::$configs['client_ip']) 
1238
        {
1239
            requests::set_client_ip(self::$configs['client_ip']);
1240
        }
1241
        if (self::$configs['client_ips']) 
1242
        {
1243
            requests::set_client_ips(self::$configs['client_ips']);
0 ignored issues
show
Bug introduced by
The method set_client_ips() does not exist on phpspider\core\requests. Did you maybe mean set_client_ip()? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

1243
            requests::/** @scrutinizer ignore-call */ 
1244
                      set_client_ips(self::$configs['client_ips']);

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
1244
        }
1245
1246
        // 是否设置了代理
1247
        if (!empty($link['proxies'])) 
1248
        {
1249
            requests::set_proxies($link['proxies']);
0 ignored issues
show
Bug introduced by
The method set_proxies() does not exist on phpspider\core\requests. Did you maybe mean set_proxy()? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

1249
            requests::/** @scrutinizer ignore-call */ 
1250
                      set_proxies($link['proxies']);

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
1250
            // 自动切换IP
1251
            requests::set_header('Proxy-Switch-Ip', 'yes');
1252
        }
1253
1254
        // 如何设置了 HTTP Headers
1255
        if (!empty($link['headers'])) 
1256
        {
1257
            foreach ($link['headers'] as $k=>$v) 
1258
            {
1259
                requests::set_header($k, $v);
1260
            }
1261
        }
1262
1263
        $method = empty($link['method']) ? 'get' : strtolower($link['method']);
1264
        $params = empty($link['params']) ? array() : $link['params'];
1265
        $html = requests::$method($url, $params);
1266
        // 此url附加的数据不为空, 比如内容页需要列表页一些数据, 拼接到后面去
1267
        if ($html && !empty($link['context_data'])) 
1268
        {
1269
            $html .= $link['context_data'];
1270
        }
1271
1272
        $http_code = requests::$status_code;
1273
1274
        if ($this->on_status_code) 
1275
        {
1276
            $return = call_user_func($this->on_status_code, $http_code, $url, $html, $this);
1277
            if (isset($return)) 
1278
            {
1279
                $html = $return;
1280
            }
1281
            if (!$html) 
1282
            {
1283
                return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
1284
            }
1285
        }
1286
1287
        if ($http_code != 200)
1288
        {
1289
            // 如果是301、302跳转, 抓取跳转后的网页内容
1290
            if ($http_code == 301 || $http_code == 302) 
1291
            {
1292
                $info = requests::$info;
1293
                if (isset($info['redirect_url'])) 
1294
                {
1295
                    $url = $info['redirect_url'];
1296
                    requests::$input_encoding = null;
1297
                    $html = $this->request_url($url, $link);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $html is correct as $this->request_url($url, $link) targeting phpspider\core\phpspider::request_url() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1298
                    if ($html && !empty($link['context_data'])) 
0 ignored issues
show
introduced by
$html is of type void, thus it is always evaluated as false.
Loading history...
1299
                    {
1300
                        $html .= $link['context_data'];
1301
                    }
1302
                }
1303
                else 
1304
                {
1305
                    return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
1306
                }
1307
            }
1308
            else 
1309
            {
1310
                if ($http_code == 407) 
1311
                {
1312
                    // 扔到队列头部去, 继续采集
1313
                    $this->queue_rpush($link);
1314
                    log::error("Failed to download page {$url}");
1315
                    self::$collect_fail++;
1316
                }
1317
                elseif (in_array($http_code, array('0','502','503','429'))) 
1318
                {
1319
                    // 采集次数加一
1320
                    $link['try_num']++;
1321
                    // 抓取次数 小于 允许抓取失败次数
1322
                    if ( $link['try_num'] <= $link['max_try'] ) 
1323
                    {
1324
                        // 扔到队列头部去, 继续采集
1325
                        $this->queue_rpush($link);
1326
                    }
1327
                    log::error("Failed to download page {$url}, retry({$link['try_num']})");
1328
                }
1329
                else 
1330
                {
1331
                    log::error("Failed to download page {$url}");
1332
                    self::$collect_fail++;
1333
                }
1334
                log::error("HTTP CODE: {$http_code}");
1335
                return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
1336
            }
1337
        }
1338
1339
        // 爬取页面耗时时间
1340
        $time_run = round(microtime(true) - $time_start, 3);
1341
        log::debug("Success download page {$url} in {$time_run} s");
1342
        self::$collect_succ++;
1343
1344
        return $html;
1345
    }
1346
1347
    /**
     * Collect the links of a downloaded page and queue the ones worth crawling.
     *
     * Every a/@href value found in $html is cleaned up, resolved against
     * $collect_url through fill_url() and, unless the on_fetch_url callback
     * vetoes it, handed to add_url() with $collect_url as its Referer header.
     *
     * @param mixed $html         HTML of the downloaded page
     * @param mixed $collect_url  URL the page was fetched from; base for relative links
     * @param int   $depth        forwarded unchanged to add_url()
     * @return false|null  false when the page yields no usable link, null once the links were processed
     */
    public function get_urls($html, $collect_url, $depth = 0)
    {
        $hrefs = selector::select($html, '//a/@href');
        if (empty($hrefs))
        {
            return false;
        }

        // The selector result is not always an array (the original note mentions
        // the single-link case); normalise it so the loops below are safe.
        if (!is_array($hrefs))
        {
            $hrefs = array($hrefs);
        }

        //--------------------------------------------------------------------------------
        // Clean up and resolve every link
        //--------------------------------------------------------------------------------
        $urls = array();
        foreach ($hrefs as $href)
        {
            // Strip stray quotes and decode &amp; before resolving
            $href = trim(str_replace(array("\"", "'", '&amp;'), array("", '', '&'), $href));
            // Blank links are dropped here instead of leaking into the queue;
            // compare against '' so that a link of "0" is still resolved.
            if ($href === '')
            {
                continue;
            }

            $full_url = $this->fill_url($href, $collect_url);
            if ($full_url)
            {
                $urls[] = $full_url;
            }
        }

        // De-duplicate after resolution: different spellings of the same
        // target collapse into one absolute URL.
        $urls = array_unique($urls);
        if (empty($urls))
        {
            return false;
        }

        //--------------------------------------------------------------------------------
        // Queue the resolved links
        //--------------------------------------------------------------------------------
        foreach ($urls as $url)
        {
            if ($this->on_fetch_url)
            {
                // The callback may rewrite the URL (non-null return value) or
                // veto it by returning a falsy value.
                $return = call_user_func($this->on_fetch_url, $url, $this);
                if (isset($return))
                {
                    $url = $return;
                }
                if (!$url)
                {
                    continue;
                }
            }

            // The current page acts as Referer for the links found on it
            $options = array(
                'headers' => array(
                    'Referer' => $collect_url,
                )
            );
            $this->add_url($url, $options, $depth);
        }
    }
1444
1445
    /**
     * Resolve a link found on a page into an absolute http(s) URL.
     *
     * Handles absolute (http://, https://), protocol-relative (//host/x),
     * root-relative (/x), dot-relative (./x, ../x), query-only (?a=b) and
     * plain relative (x.html) links. The fragment (#...) is dropped. The base
     * directory is the path of $collect_url with its last segment removed when
     * that segment contains a dot (i.e. looks like a file name).
     *
     * @param mixed $url          link exactly as it appears in the page
     * @param mixed $collect_url  URL of the page the link was found on (base URL)
     * @return string|false  the absolute URL, or false when the link is not
     *                       crawlable: empty, javascript:/fragment-only, an
     *                       unrendered "<%=" tag, a non-http(s) scheme, an
     *                       unusable base URL, "../" climbing above the host,
     *                       or a host missing from self::$configs['domains']
     */
    public function fill_url($url, $collect_url)
    {
        $url = trim($url);
        $collect_url = trim($collect_url);

        // Nothing to resolve: empty links, javascript: pseudo links, pure
        // fragments and values that still start with a quote
        if ($url === '' || preg_match("@^(javascript:|#|'|\")@i", $url))
        {
            return false;
        }
        // Server-side template tag that was never rendered
        if (substr($url, 0, 3) == '<%=')
        {
            return false;
        }
        // Filter mailto:, tel:, sms:, weixin: and friends. This has to look at
        // the link itself; checking only the base URL lets them through and
        // turns them into bogus relative paths.
        if (preg_match('@^([a-z][a-z0-9+.\-]*):@i', $url, $match)
            && !in_array(strtolower($match[1]), array('http', 'https'), true))
        {
            return false;
        }

        // The base URL must itself be a complete http(s) URL
        $base = @parse_url($collect_url);
        if (empty($base['scheme']) || empty($base['host']))
        {
            return false;
        }
        $scheme = strtolower($base['scheme']);
        if (!in_array($scheme, array('http', 'https'), true))
        {
            return false;
        }

        // Keep an explicit port, otherwise relative links on host:8080 would
        // silently point at the default port
        $domain = $base['host'];
        if (!empty($base['port']))
        {
            $domain .= ':'.$base['port'];
        }
        $path = empty($base['path']) ? '' : $base['path'];

        // Base directory: strip the LAST path segment when it looks like a
        // file name, then a single trailing slash. Anchoring on the last
        // segment keeps dotted directories such as /v1.2/docs/ intact.
        $base_url_path = $domain.preg_replace('@/[^/]*\.[^/]*$@', '/', $path);
        $base_url_path = preg_replace('@/$@', '', $base_url_path);

        // Drop the fragment; position 0 was already rejected above
        $pos = strpos($url, '#');
        if ($pos !== false)
        {
            $url = substr($url, 0, $pos);
        }

        // Set the query string aside so that path handling (dot segments,
        // slash collapsing) never rewrites parameter values such as ?u=http://x
        $query = '';
        $pos = strpos($url, '?');
        if ($pos !== false)
        {
            $query = substr($url, $pos);
            $url = (string) substr($url, 0, $pos);
        }

        if ($url === '')
        {
            // Query-only link (?page=2): same document, different query
            $url = $domain.($path === '' ? '/' : $path);
        }
        elseif (substr($url, 0, 2) == '//')
        {
            // Protocol-relative link, e.g. //www.jd.com/111.html: remove only
            // the leading slashes
            $url = (string) substr($url, 2);
            if ($url === '')
            {
                return false;
            }
        }
        elseif ($url[0] == '/')
        {
            // Root-relative link, e.g. /1234.html
            $url = $domain.$url;
        }
        elseif ($url[0] == '.')
        {
            // Dot-relative link, e.g. ./1234.html or ../1234.html
            if (!isset($url[2]))
            {
                return false;
            }

            // Count the ".." segments, drop the "." ones, keep the rest
            $path_step = 0;
            $kept = array();
            foreach (explode('/', $url) as $segment)
            {
                if ($segment === '..')
                {
                    $path_step++;
                }
                elseif ($segment !== '.')
                {
                    $kept[] = $segment;
                }
            }

            // The first element of $base_parts is the host, which must survive
            $base_parts = explode('/', $base_url_path);
            $remaining = count($base_parts) - $path_step;
            if ($remaining <= 0)
            {
                return false;
            }
            $url = implode('/', array_slice($base_parts, 0, $remaining)).'/'.implode('/', $kept);
        }
        elseif (preg_match('#^(https?)://#i', $url, $match))
        {
            // Absolute link: it brings its own scheme
            $scheme = strtolower($match[1]);
            $url = substr($url, strlen($match[0]));
        }
        else
        {
            // Plain relative link, e.g. 1234.html
            $url = $base_url_path.'/'.$url;
        }

        // Collapse runs of slashes in host+path, then reassemble
        $url = preg_replace('@/{2,}@', '/', $url);
        $url = $scheme.'://'.$url.$query;

        // Only keep links whose host is one of the configured crawl domains
        $parsed = @parse_url($url);
        if (!empty($parsed['host']) && !in_array($parsed['host'], self::$configs['domains']))
        {
            return false;
        }

        return $url;
    }
1583
1584
    /**
     * Compress a link record by removing every optional entry that carries
     * no information.
     *
     * An entry is removed when it is empty; "method" is also removed when it
     * is "get" in any letter case. The "url" entry is never touched.
     *
     * @param mixed $link  link record (array keyed by url, url_type, method, ...)
     * @return mixed  the same record without the removed entries
     */
    public function link_compress($link)
    {
        // A "get" method is treated like a missing one
        if (!empty($link['method']) && strtolower($link['method']) == 'get')
        {
            unset($link['method']);
        }

        $optional_keys = array(
            'url_type',
            'method',
            'headers',
            'params',
            'context_data',
            'proxies',
            'try_num',
            'max_try',
            'depth',
        );
        foreach ($optional_keys as $key)
        {
            if (empty($link[$key]))
            {
                unset($link[$key]);
            }
        }

        return $link;
    }
1641
1642
    /**
     * Expand a (possibly compressed) link record into its full form.
     *
     * The result always holds the ten entries url, url_type, method, headers,
     * params, context_data, proxies, try_num, max_try and depth, in that
     * order. Entries missing from $link receive a default; for "proxies" and
     * "max_try" the default is read from self::$configs, and only when needed.
     *
     * @param mixed $link  link record, possibly lacking optional entries
     * @return array  the complete link record
     */
    public function link_uncompress($link)
    {
        // null marks the entries whose default lives in self::$configs under the same key
        $defaults = array(
            'url'          => '',
            'url_type'     => '',
            'method'       => 'get',
            'headers'      => array(),
            'params'       => array(),
            'context_data' => '',
            'proxies'      => null,
            'try_num'      => 0,
            'max_try'      => null,
            'depth'        => 0,
        );

        $full = array();
        foreach ($defaults as $key => $default)
        {
            if (isset($link[$key]))
            {
                $full[$key] = $link[$key];
            }
            elseif ($key === 'proxies' || $key === 'max_try')
            {
                $full[$key] = self::$configs[$key];
            }
            else
            {
                $full[$key] = $default;
            }
        }

        return $full;
    }
1667
1668
    /**
     * Extract the configured fields from a page, log them and export them.
     *
     * Steps: run get_fields() with self::$configs['fields']; let the
     * on_extract_page callback replace the result (its return value is only
     * accepted when it is an array); bump the field counter and terminate the
     * process once self::$configs['max_fields'] (when non-zero) is exceeded;
     * log the record as JSON; finally write it out according to
     * self::$configs['export'] (csv file, sql file or direct db insert).
     *
     * @param mixed $html  HTML the fields are extracted from
     * @param mixed $url   passed through to get_fields()
     * @param mixed $page  passed through to get_fields() and on_extract_page
     * @return void
     */
    public function get_html_fields($html, $url, $page)
    {
        $fields = $this->get_fields(self::$configs['fields'], $html, $url, $page);
        if (empty($fields))
        {
            return;
        }

        if ($this->on_extract_page)
        {
            $result = call_user_func($this->on_extract_page, $page, $fields);
            if (isset($result) && is_array($result))
            {
                $fields = $result;
            }
            elseif (!isset($result))
            {
                log::warn("on_extract_page return value can't be empty");
            }
            else
            {
                log::warn("on_extract_page return value must be an array");
            }
        }

        if (!isset($fields) || !is_array($fields))
        {
            return;
        }

        // Stop the whole process once the configured number of records is exceeded
        $fields_num = $this->incr_fields_num();
        $max_fields = self::$configs['max_fields'];
        if ($max_fields != 0 && $fields_num > $max_fields)
        {
            exit(0);
        }

        // JSON_UNESCAPED_UNICODE needs PHP 5.4; older versions decode the \uXXXX escapes by hand
        if (!version_compare(PHP_VERSION, '5.4.0', '<'))
        {
            $fields_str = json_encode($fields, JSON_UNESCAPED_UNICODE);
        }
        else
        {
            $fields_str = preg_replace_callback("#\\\u([0-9a-f]{4})#i", function ($matchs) {
                return iconv('UCS-2BE', 'UTF-8', pack('H4', $matchs[1]));
            }, json_encode($fields));
        }

        if (util::is_win())
        {
            $fields_str = mb_convert_encoding($fields_str, 'gb2312', 'utf-8');
        }
        log::info("Result[{$fields_num}]: ".$fields_str);

        // Nothing more to do unless an export target is configured
        if (empty(self::$configs['export']))
        {
            return;
        }

        self::$export_type = isset(self::$configs['export']['type']) ? self::$configs['export']['type'] : '';
        switch (self::$export_type)
        {
            case 'csv':
                util::put_file(self::$export_file, util::format_csv($fields)."\n", FILE_APPEND);
                break;

            case 'sql':
                $sql = db::insert(self::$export_table, $fields, true);
                util::put_file(self::$export_file, $sql.";\n", FILE_APPEND);
                break;

            case 'db':
                db::insert(self::$export_table, $fields);
                break;
        }
    }
1746
1747
    /**
1748
     * 根据配置提取HTML代码块中的字段
1749
     * 
1750
     * @param mixed $confs
1751
     * @param mixed $html
1752
     * @param mixed $page
1753
     * @return void
1754
     * @author seatle <[email protected]> 
1755
     * @created time :2016-09-23 17:13
1756
     */
1757
    public function get_fields($confs, $html, $url, $page) 
1758
    {
1759
        $fields = array();
1760
        foreach ($confs as $conf) 
1761
        {
1762
            // 当前field抽取到的内容是否是有多项
1763
            $repeated = isset($conf['repeated']) && $conf['repeated'] ? true : false;
1764
            // 当前field抽取到的内容是否必须有值
1765
            $required = isset($conf['required']) && $conf['required'] ? true : false;
1766
1767
            if (empty($conf['name'])) 
1768
            {
1769
                log::error("The field name is null, please check your \"fields\" and add the name of the field\n");
1770
                exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
1771
            }
1772
1773
            $values = array();
1774
            // 如果定义抽取规则
1775
            if (!empty($conf['selector'])) 
1776
            {
1777
                // 如果这个field是上一个field的附带连接
1778
                if (isset($conf['source_type']) && $conf['source_type']=='attached_url') 
1779
                {
1780
                    // 取出上个field的内容作为连接, 内容分页是不进队列直接下载网页的
1781
                    if (!empty($fields[$conf['attached_url']])) 
1782
                    {
1783
                        $collect_url = $this->fill_url($fields[$conf['attached_url']], $url);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $collect_url is correct as $this->fill_url($fields[...'attached_url']], $url) targeting phpspider\core\phpspider::fill_url() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1784
                        //log::debug("Find attached content page: {$collect_url}");
1785
                        $link['url'] = $collect_url;
1786
                        $link = $this->link_uncompress($link);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $link is correct as $this->link_uncompress($link) targeting phpspider\core\phpspider::link_uncompress() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1787
                        requests::$input_encoding = null;
1788
                        $html = $this->request_url($collect_url, $link);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $html is correct as $this->request_url($collect_url, $link) targeting phpspider\core\phpspider::request_url() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1789
                        // 在一个attached_url对应的网页下载完成之后调用. 主要用来对下载的网页进行处理.
1790
                        if ($this->on_download_attached_page) 
1791
                        {
1792
                            $return = call_user_func($this->on_download_attached_page, $html, $this);
1793
                            if (isset($return)) 
1794
                            {
1795
                                $html = $return;
1796
                            }
1797
                        }
1798
1799
                        // 请求获取完分页数据后把连接删除了 
1800
                        unset($fields[$conf['attached_url']]);
1801
                    }
1802
                }
1803
1804
                // 没有设置抽取规则的类型 或者 设置为 xpath
1805
                if (!isset($conf['selector_type']) || $conf['selector_type']=='xpath') 
1806
                {
1807
                    $values = $this->get_fields_xpath($html, $conf['selector'], $conf['name']);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $values is correct as $this->get_fields_xpath(...ector'], $conf['name']) targeting phpspider\core\phpspider::get_fields_xpath() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1808
                }
1809
                elseif ($conf['selector_type']=='css') 
1810
                {
1811
                    $values = $this->get_fields_css($html, $conf['selector'], $conf['name']);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $values is correct as $this->get_fields_css($h...ector'], $conf['name']) targeting phpspider\core\phpspider::get_fields_css() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1812
                }
1813
                elseif ($conf['selector_type']=='regex') 
1814
                {
1815
                    $values = $this->get_fields_regex($html, $conf['selector'], $conf['name']);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $values is correct as $this->get_fields_regex(...ector'], $conf['name']) targeting phpspider\core\phpspider::get_fields_regex() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1816
                }
1817
1818
                // field不为空而且存在子配置
1819
                if (!empty($values) && !empty($conf['children'])) 
1820
                {
1821
                    $child_values = array();
1822
                    // 父项抽取到的html作为子项的提取内容
1823
                    foreach ($values as $child_html) 
1824
                    {
1825
                        // 递归调用本方法, 所以多少子项目都支持
1826
                        $child_value = $this->get_fields($conf['children'], $child_html, $url, $page);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $child_value is correct as $this->get_fields($conf[...hild_html, $url, $page) targeting phpspider\core\phpspider::get_fields() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1827
                        if (!empty($child_value)) 
1828
                        {
1829
                            $child_values[] = $child_value;
1830
                        }
1831
                    }
1832
                    // 有子项就存子项的数组, 没有就存HTML代码块
1833
                    if (!empty($child_values)) 
1834
                    {
1835
                        $values = $child_values;
1836
                    }
1837
                }
1838
            }
1839
1840
            if (empty($values)) 
1841
            {
1842
                // 如果值为空而且值设置为必须项, 跳出foreach循环
1843
                if ($required) 
1844
                {
1845
                    // 清空整个 fields
1846
                    $fields = array();
1847
                    break;
1848
                }
1849
                // 避免内容分页时attached_url拼接时候string + array了
1850
                $fields[$conf['name']] = '';
1851
                //$fields[$conf['name']] = array();
1852
            }
1853
            else 
1854
            {
1855
                if (is_array($values)) 
1856
                {
1857
                    if ($repeated) 
1858
                    {
1859
                        $fields[$conf['name']] = $values;
1860
                    }
1861
                    else 
1862
                    {
1863
                        $fields[$conf['name']] = $values[0];
1864
                    }
1865
                }
1866
                else 
1867
                {
1868
                    $fields[$conf['name']] = $values;
1869
                }
1870
                // 不重复抽取则只取第一个元素
1871
                //$fields[$conf['name']] = $repeated ? $values : $values[0];
1872
            }
1873
        }
1874
1875
        if (!empty($fields)) 
1876
        {
1877
            foreach ($fields as $fieldname => $data) 
1878
            {
1879
                $pattern = "/<img.*src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU";
1880
                /*$pattern = "/<img.*?src=[\'|\"](.*?(?:[\.gif|\.jpg|\.jpeg|\.png]))[\'|\"].*?[\/]?>/i"; */
1881
                // 在抽取到field内容之后调用, 对其中包含的img标签进行回调处理
1882
                if ($this->on_handle_img && preg_match($pattern, $data)) 
1883
                {
1884
                    $return = call_user_func($this->on_handle_img, $fieldname, $data);
1885
                    if (!isset($return))
1886
                    {
1887
                        log::warn("on_handle_img return value can't be empty\n");
1888
                    }
1889
                    else 
1890
                    {
1891
                        // 有数据才会执行 on_handle_img 方法, 所以这里不要被替换没了
1892
                        $data = $return;
1893
                    }
1894
                }
1895
1896
                // 当一个field的内容被抽取到后进行的回调, 在此回调中可以对网页中抽取的内容作进一步处理
1897
                if ($this->on_extract_field) 
1898
                {
1899
                    $return = call_user_func($this->on_extract_field, $fieldname, $data, $page);
1900
                    if (!isset($return))
1901
                    {
1902
                        log::warn("on_extract_field return value can't be empty\n");
1903
                    }
1904
                    else 
1905
                    {
1906
                        // 有数据才会执行 on_extract_field 方法, 所以这里不要被替换没了
1907
                        $fields[$fieldname] = $return;
1908
                    }
1909
                }
1910
            }
1911
        }
1912
1913
        return $fields;
1914
    }
1915
1916
    /**
1917
     * 验证导出
1918
     * 
1919
     * @return void
1920
     * @author seatle <[email protected]> 
1921
     * @created time :2016-10-02 23:37
1922
     */
1923
    public function check_export()
1924
    {
1925
        // 如果设置了导出选项
1926
        if (!empty(self::$configs['export'])) 
1927
        {
1928
            if (self::$export_type == 'csv') 
1929
            {
1930
                if (empty(self::$export_file)) 
1931
                {
1932
                    log::error("Export data into CSV files need to Set the file path.");
1933
                    exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
1934
                }
1935
            }
1936
            elseif (self::$export_type == 'sql') 
1937
            {
1938
                if (empty(self::$export_file)) 
1939
                {
1940
                    log::error("Export data into SQL files need to Set the file path.");
1941
                    exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
1942
                }
1943
            }
1944
            elseif (self::$export_type == 'db') 
1945
            {
1946
                if (!function_exists('mysqli_connect'))
1947
                {
1948
                    log::error("Export data to a database need Mysql support, Error: Unable to load mysqli extension.");
1949
                    exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
1950
                }
1951
1952
                if (empty(self::$db_config)) 
1953
                {
1954
                    log::error("Export data to a database need Mysql support, Error: You not set a config array for connect.");
1955
                    exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
1956
                }
1957
1958
                $config = self::$db_config;
1959
                @mysqli_connect($config['host'], $config['user'], $config['pass'], $config['name'], $config['port']);
0 ignored issues
show
Security Best Practice introduced by
It seems like you do not handle an error condition for mysqli_connect(). This can introduce security issues, and is generally not recommended. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-unhandled  annotation

1959
                /** @scrutinizer ignore-unhandled */ @mysqli_connect($config['host'], $config['user'], $config['pass'], $config['name'], $config['port']);

If you suppress an error, we recommend checking for the error condition explicitly:

// For example instead of
@mkdir($dir);

// Better use
if (@mkdir($dir) === false) {
    throw new \RuntimeException('The directory '.$dir.' could not be created.');
}
Loading history...
1960
                if(mysqli_connect_errno())
1961
                {
1962
                    log::error("Export data to a database need Mysql support, Error: ".mysqli_connect_error());
1963
                    exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
1964
                }
1965
1966
                db::set_connect('default', $config);
1967
                db::_init();
1968
1969
                if (!db::table_exists(self::$export_table))
1970
                {
1971
                    log::error("Table ".self::$export_table." does not exist");
1972
                    exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
1973
                }
1974
            }
1975
        }
1976
    }
1977
1978
    public function check_cache()
1979
    {
1980
        if (!self::$use_redis || self::$save_running_state)
1981
        {
1982
            return false;
1983
        }
1984
1985
        //if (queue::exists("collect_queue")) 
1986
        $keys = queue::keys("*"); 
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $keys is correct as phpspider\core\queue::keys('*') targeting phpspider\core\queue::keys() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1987
        $count = count($keys);
0 ignored issues
show
Bug introduced by
$keys of type void is incompatible with the type Countable|array expected by parameter $var of count(). ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

1987
        $count = count(/** @scrutinizer ignore-type */ $keys);
Loading history...
1988
        if ($count != 0) 
1989
        {
1990
            // After this operation, 4,318 kB of additional disk space will be used.
1991
            // Do you want to continue? [Y/n] 
1992
            //$msg = "发现Redis中有采集数据, 是否继续执行, 不继续则清空Redis数据重新采集\n";
1993
            $msg = "Found that the data of Redis, no continue will empty Redis data start again\n";
1994
            $msg .= "Do you want to continue? [Y/n]";
1995
            fwrite(STDOUT, $msg);
1996
            $arg = strtolower(trim(fgets(STDIN)));
1997
            $arg = empty($arg) || !in_array($arg, array('y','n')) ? 'y' : $arg;
1998
            if ($arg == 'n') 
1999
            {
2000
                foreach ($keys as $key) 
0 ignored issues
show
Bug introduced by
The expression $keys of type void is not traversable.
Loading history...
2001
                {
2002
                    $key = str_replace($GLOBALS['config']['redis']['prefix'].":", "", $key);
2003
                    queue::del($key);
2004
                }
2005
            }
2006
        }
2007
    }
2008
2009
    public function init_redis()
2010
    {
2011
        if (!self::$use_redis)
2012
        {
2013
            return false;
2014
        }
2015
2016
        // 添加当前服务器到服务器列表
2017
        $this->add_server_list(self::$serverid, self::$tasknum);
2018
2019
        // 删除当前服务器的任务状态
2020
        // 对于被强制退出的进程有用
2021
        for ($i = 1; $i <= self::$tasknum; $i++) 
2022
        {
2023
            $this->del_task_status(self::$serverid, $i);
2024
        }
2025
    }
2026
2027
    /**
2028
     * 设置任务状态, 主进程和子进程每成功采集一个页面后调用
2029
     * 
2030
     * @return void
2031
     * @author seatle <[email protected]> 
2032
     * @created time :2016-10-30 23:56
2033
     */
2034
    public function set_task_status()
2035
    {
2036
        // 每采集成功一个页面, 生成当前进程状态到文件, 供主进程使用
2037
        $mem = round(memory_get_usage(true)/(1024*1024),2);
2038
        $use_time = microtime(true) - self::$time_start; 
2039
        $speed = round((self::$collect_succ + self::$collect_fail) / $use_time, 2);
2040
        $status = array(
2041
            'id' => self::$taskid,
2042
            'pid' => self::$taskpid,
2043
            'mem' => $mem,
2044
            'collect_succ' => self::$collect_succ,
2045
            'collect_fail' => self::$collect_fail,
2046
            'speed' => $speed,
2047
        );
2048
        $task_status = json_encode($status);
2049
2050
        if (self::$use_redis)
2051
        {
2052
            $key = "server-".self::$serverid."-task_status-".self::$taskid;
2053
            queue::set($key, $task_status); 
2054
        }
2055
        else 
2056
        {
2057
            self::$task_status = array($task_status);
2058
        }
2059
    }
2060
2061
    /**
2062
     * 删除任务状态
2063
     * 
2064
     * @return void
2065
     * @author seatle <[email protected]> 
2066
     * @created time :2016-11-16 11:06
2067
     */
2068
    public function del_task_status($serverid, $taskid)
2069
    {
2070
        if (!self::$use_redis)
2071
        {
2072
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
2073
        }
2074
        $key = "server-{$serverid}-task_status-{$taskid}";
2075
        queue::del($key); 
2076
    }
2077
2078
    /**
2079
     * 获得任务状态, 主进程才会调用
2080
     * 
2081
     * @return void
2082
     * @author seatle <[email protected]> 
2083
     * @created time :2016-10-30 23:56
2084
     */
2085
    public function get_task_status($serverid, $taskid)
2086
    {
2087
        if (!self::$use_redis)
2088
        {
2089
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
2090
        }
2091
2092
        $key = "server-{$serverid}-task_status-{$taskid}";
2093
        $task_status = queue::get($key);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $task_status is correct as phpspider\core\queue::get($key) targeting phpspider\core\queue::get() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
2094
        return $task_status;
2095
    }
2096
2097
    /**
2098
     * 获得任务状态, 主进程才会调用
2099
     * 
2100
     * @return void
2101
     * @author seatle <[email protected]> 
2102
     * @created time :2016-10-30 23:56
2103
     */
2104
    public function get_task_status_list($serverid = 1, $tasknum)
2105
    {
2106
        $task_status = array();
2107
        if (self::$use_redis)
2108
        {
2109
            for ($i = 1; $i <= $tasknum; $i++) 
2110
            {
2111
                $key = "server-{$serverid}-task_status-".$i;
2112
                $task_status[] = queue::get($key);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $task_status[] is correct as phpspider\core\queue::get($key) targeting phpspider\core\queue::get() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
2113
            }
2114
        }
2115
        else 
2116
        {
2117
            $task_status = self::$task_status;
2118
        }
2119
        return $task_status;
0 ignored issues
show
Bug Best Practice introduced by
The expression return $task_status returns the type array|array<mixed,void> which is incompatible with the documented return type void.
Loading history...
2120
    }
2121
2122
    /**
2123
     * 添加当前服务器信息到服务器列表
2124
     * 
2125
     * @return void
2126
     * @author seatle <[email protected]> 
2127
     * @created time :2016-11-16 11:06
2128
     */
2129
    public function add_server_list($serverid, $tasknum)
2130
    {
2131
        if (!self::$use_redis) 
2132
        {
2133
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
2134
        }
2135
2136
        // 更新服务器列表
2137
        $server_list_json = queue::get("server_list");
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $server_list_json is correct as phpspider\core\queue::get('server_list') targeting phpspider\core\queue::get() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
2138
        $server_list = array();
2139
        if (!$server_list_json) 
0 ignored issues
show
introduced by
$server_list_json is of type void, thus it always evaluates to false.
Loading history...
2140
        {
2141
            $server_list[$serverid] = array(
2142
                'serverid' => $serverid,
2143
                'tasknum' => $tasknum,
2144
                'time' => time(),
2145
            );
2146
        }
2147
        else 
2148
        {
2149
            $server_list = json_decode($server_list_json, true);
2150
            $server_list[$serverid] = array(
2151
                'serverid' => $serverid,
2152
                'tasknum' => $tasknum,
2153
                'time' => time(),
2154
            );
2155
            ksort($server_list);
2156
        }
2157
        queue::set("server_list", json_encode($server_list));
2158
    }
2159
2160
    /**
2161
     * 从服务器列表中删除当前服务器信息
2162
     * 
2163
     * @return void
2164
     * @author seatle <[email protected]> 
2165
     * @created time :2016-11-16 11:06
2166
     */
2167
    public function del_server_list($serverid)
2168
    {
2169
        if (!self::$use_redis) 
2170
        {
2171
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
2172
        }
2173
2174
        $server_list_json = queue::get("server_list");
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $server_list_json is correct as phpspider\core\queue::get('server_list') targeting phpspider\core\queue::get() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
2175
        $server_list = array();
0 ignored issues
show
Unused Code introduced by
The assignment to $server_list is dead and can be removed.
Loading history...
2176
        if ($server_list_json) 
0 ignored issues
show
introduced by
$server_list_json is of type void, thus it always evaluates to false.
Loading history...
2177
        {
2178
            $server_list = json_decode($server_list_json, true);
2179
            if (isset($server_list[$serverid])) 
2180
            {
2181
                unset($server_list[$serverid]);
2182
            }
2183
2184
            // 删除完当前的任务列表如果还存在,就更新一下Redis
2185
            if (!empty($server_list)) 
2186
            {
2187
                ksort($server_list);
2188
                queue::set("server_list", json_encode($server_list));
2189
            }
2190
        }
2191
    }
2192
2193
    /**
2194
     * 获取等待爬取页面数量
2195
     * 
2196
     * @param mixed $url
2197
     * @return void
2198
     * @author seatle <[email protected]> 
2199
     * @created time :2016-09-23 17:13
2200
     */
2201
    public function get_collect_url_num()
2202
    {
2203
        if (self::$use_redis)
2204
        {
2205
            $count = queue::get("collect_urls_num"); 
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $count is correct as phpspider\core\queue::get('collect_urls_num') targeting phpspider\core\queue::get() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
2206
        }
2207
        else 
2208
        {
2209
            $count = self::$collect_urls_num;
2210
        }
2211
        return $count;
0 ignored issues
show
Bug Best Practice introduced by
The expression return $count returns the type integer which is incompatible with the documented return type void.
Loading history...
2212
    }
2213
2214
    /**
2215
     * 获取已经爬取页面数量
2216
     * 
2217
     * @param mixed $url
2218
     * @return void
2219
     * @author seatle <[email protected]> 
2220
     * @created time :2016-09-23 17:13
2221
     */
2222
    public function get_collected_url_num()
2223
    {
2224
        if (self::$use_redis)
2225
        {
2226
            $count = queue::get("collected_urls_num"); 
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $count is correct as phpspider\core\queue::get('collected_urls_num') targeting phpspider\core\queue::get() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
2227
        }
2228
        else 
2229
        {
2230
            $count = self::$collected_urls_num;
2231
        }
2232
        return $count;
0 ignored issues
show
Bug Best Practice introduced by
The expression return $count returns the type integer which is incompatible with the documented return type void.
Loading history...
2233
    }
2234
2235
    /**
2236
     * 已采集页面数量加一
2237
     * 
2238
     * @param mixed $url
2239
     * @return void
2240
     * @author seatle <[email protected]> 
2241
     * @created time :2016-09-23 17:13
2242
     */
2243
    public function incr_collected_url_num($url)
2244
    {
2245
        if (self::$use_redis)
2246
        {
2247
            queue::incr("collected_urls_num"); 
2248
        }
2249
        else 
2250
        {
2251
            self::$collected_urls_num++;
2252
        }
2253
    }
2254
2255
    /**
2256
     * 从队列左边插入
2257
     * 
2258
     * @return void
2259
     * @author seatle <[email protected]> 
2260
     * @created time :2016-09-23 17:13
2261
     */
2262
    public function queue_lpush($link = array(), $allowed_repeat = false)
2263
    {
2264
        if (empty($link) || empty($link['url'])) 
2265
        {
2266
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
2267
        }
2268
2269
        $url = $link['url'];
2270
        $link = $this->link_compress($link);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $link is correct as $this->link_compress($link) targeting phpspider\core\phpspider::link_compress() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
2271
2272
        $status = false;
2273
        if (self::$use_redis)
2274
        {
2275
            $key = "collect_urls-".md5($url);
2276
            $lock = "lock-".$key;
2277
            // 加锁: 一个进程一个进程轮流处理
2278
            if (queue::lock($lock))
0 ignored issues
show
Bug introduced by
Are you sure the usage of phpspider\core\queue::lock($lock) is correct, as the targeted phpspider\core\queue::lock() seems to always return null?

This check looks for function or method calls that always return null and whose return value is used.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
if ($a->getObject()) {

The method getObject() can return nothing but null, so it makes no sense to use the return value.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
2279
            {
2280
                $exists = queue::exists($key); 
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $exists is correct as phpspider\core\queue::exists($key) targeting phpspider\core\queue::exists() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
2281
                // 不存在或者当然URL可重复入
2282
                if (!$exists || $allowed_repeat) 
0 ignored issues
show
introduced by
$exists is of type void, thus it always evaluates to false.
Loading history...
2283
                {
2284
                    // 待爬取网页记录数加一
2285
                    queue::incr("collect_urls_num"); 
2286
                    // 先标记为待爬取网页
2287
                    queue::set($key, time()); 
2288
                    // 入队列
2289
                    $link = json_encode($link);
2290
                    queue::lpush("collect_queue", $link); 
2291
                    $status = true;
2292
                }
2293
                // 解锁
2294
                queue::unlock($lock);
2295
            }
2296
        }
2297
        else 
2298
        {
2299
            $key = md5($url);
2300
            if (!array_key_exists($key, self::$collect_urls))
2301
            {
2302
                self::$collect_urls_num++;
2303
                self::$collect_urls[$key] = time();
2304
                array_push(self::$collect_queue, $link);
2305
                $status = true;
2306
            }
2307
        }
2308
        return $status;
0 ignored issues
show
Bug Best Practice introduced by
The expression return $status returns the type boolean which is incompatible with the documented return type void.
Loading history...
2309
    }
2310
2311
    /**
2312
     * 从队列右边插入
2313
     * 
2314
     * @return void
2315
     * @author seatle <[email protected]> 
2316
     * @created time :2016-09-23 17:13
2317
     */
2318
    public function queue_rpush($link = array(), $allowed_repeat = false)
2319
    {
2320
        if (empty($link) || empty($link['url'])) 
2321
        {
2322
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
2323
        }
2324
2325
        $url = $link['url'];
2326
2327
        $status = false;
2328
        if (self::$use_redis)
2329
        {
2330
            $key = "collect_urls-".md5($url);
2331
            $lock = "lock-".$key;
2332
            // 加锁: 一个进程一个进程轮流处理
2333
            if (queue::lock($lock))
0 ignored issues
show
Bug introduced by
Are you sure the usage of phpspider\core\queue::lock($lock) is correct, as the targeted phpspider\core\queue::lock() seems to always return null?

This check looks for function or method calls that always return null and whose return value is used.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
if ($a->getObject()) {

The method getObject() can return nothing but null, so it makes no sense to use the return value.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
2334
            {
2335
                $exists = queue::exists($key); 
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $exists is correct as phpspider\core\queue::exists($key) targeting phpspider\core\queue::exists() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
2336
                // 不存在或者当然URL可重复入
2337
                if (!$exists || $allowed_repeat) 
0 ignored issues
show
introduced by
$exists is of type void, thus it always evaluates to false.
Loading history...
2338
                {
2339
                    // 待爬取网页记录数加一
2340
                    queue::incr("collect_urls_num"); 
2341
                    // 先标记为待爬取网页
2342
                    queue::set($key, time()); 
2343
                    // 入队列
2344
                    $link = json_encode($link);
2345
                    queue::rpush("collect_queue", $link); 
2346
                    $status = true;
2347
                }
2348
                // 解锁
2349
                queue::unlock($lock);
2350
            }
2351
        }
2352
        else 
2353
        {
2354
            $key = md5($url);
2355
            if (!array_key_exists($key, self::$collect_urls))
2356
            {
2357
                self::$collect_urls_num++;
2358
                self::$collect_urls[$key] = time();
2359
                array_unshift(self::$collect_queue, $link);
2360
                $status = true;
2361
            }
2362
        }
2363
        return $status;
0 ignored issues
show
Bug Best Practice introduced by
The expression return $status returns the type boolean which is incompatible with the documented return type void.
Loading history...
2364
    }
2365
2366
    /**
     * Pop one link from the left end of the collect queue (last in, first out).
     *
     * Taking the newest entry first keeps the pages of a paginated content
     * page together, so a failed fetch does not leave partially assembled
     * data, and lets list pages be collected in order.
     *
     * With Redis enabled the entry is read from the "collect_queue" list and
     * JSON-decoded into an associative array. Otherwise the last element of
     * the in-memory self::$collect_queue array is removed and returned.
     *
     * @return mixed The popped link (presumably an array; confirm against the
     *               push side), or null when nothing could be popped.
     * @author seatle <[email protected]> 
     * @created time :2016-09-23 17:13
     */
    public function queue_lpop()
    {
        if (!self::$use_redis)
        {
            // array_pop() already returns null for an empty array.
            return array_pop(self::$collect_queue);
        }

        $raw = queue::lpop("collect_queue");
        // No payload (empty queue): return null directly instead of passing
        // null/false to json_decode(), which is deprecated for null since
        // PHP 8.1 and would only produce null anyway.
        if ($raw === null || $raw === false)
        {
            return null;
        }

        return json_decode($raw, true);
    }
2389
2390
    /**
     * Pop one link from the right end of the collect queue.
     *
     * With Redis enabled the entry is read from the "collect_queue" list and
     * JSON-decoded into an associative array. Otherwise the first element of
     * the in-memory self::$collect_queue array is removed and returned.
     *
     * @return mixed The popped link (presumably an array; confirm against the
     *               push side), or null when nothing could be popped.
     * @author seatle <[email protected]> 
     * @created time :2016-09-23 17:13
     */
    public function queue_rpop()
    {
        if (!self::$use_redis)
        {
            // array_shift() already returns null for an empty array.
            return array_shift(self::$collect_queue);
        }

        $raw = queue::rpop("collect_queue");
        // No payload (empty queue): return null directly instead of passing
        // null/false to json_decode(), which is deprecated for null since
        // PHP 8.1 and would only produce null anyway.
        if ($raw === null || $raw === false)
        {
            return null;
        }

        return json_decode($raw, true);
    }
2410
2411
    /**
     * Number of links currently waiting in the collect queue.
     *
     * @return mixed count() of self::$collect_queue in array mode; in Redis
     *               mode whatever queue::lsize("collect_queue") returns
     *               (presumably an int; confirm in the queue class).
     * @author seatle <[email protected]> 
     * @created time :2016-09-23 17:13
     */
    public function queue_lsize()
    {
        if (self::$use_redis)
        {
            return queue::lsize("collect_queue");
        }

        return count(self::$collect_queue);
    }
2430
2431
    /**
     * Raise the recorded collect depth to $depth when $depth is larger.
     *
     * In Redis mode the "depth_num" key is compared and updated while holding
     * the "lock-depth_num" lock (requested with a value of 2, i.e. 2 seconds
     * per the original comment). If the lock cannot be taken the update is
     * skipped. In array mode self::$depth_num is updated directly.
     *
     * @param mixed $depth Depth of the page that was just handled.
     * @return void
     * @author seatle <[email protected]> 
     * @created time :2016-09-23 17:13
     */
    public function incr_depth_num($depth)
    {
        if (!self::$use_redis)
        {
            if (self::$depth_num < $depth)
            {
                self::$depth_num = $depth;
            }
            return;
        }

        $lock_key = "lock-depth_num";
        // Lock for 2 seconds; give up silently when the lock is not obtained.
        if (!queue::lock($lock_key, time(), 2))
        {
            return;
        }

        if (queue::get("depth_num") < $depth)
        {
            queue::set("depth_num", $depth);
        }

        queue::unlock($lock_key);
    }
2462
2463
    /**
     * Read the recorded collect depth.
     *
     * @return mixed In Redis mode the "depth_num" value, or 0 when it is
     *               falsy/missing; in array mode self::$depth_num as is.
     * @author seatle <[email protected]> 
     * @created time :2016-09-23 17:13
     */
    public function get_depth_num()
    {
        if (!self::$use_redis)
        {
            return self::$depth_num;
        }

        // Any falsy stored value is reported as 0.
        return queue::get("depth_num") ?: 0;
    }
2482
2483
    /**
     * Add one to the number of extracted fields.
     *
     * @return mixed In Redis mode the result of queue::incr("fields_num");
     *               in array mode the new value of self::$fields_num.
     * @author seatle <[email protected]> 
     * @created time :2016-09-23 17:13
     */
    public function incr_fields_num()
    {
        if (self::$use_redis)
        {
            return queue::incr("fields_num");
        }

        // Pre-increment yields the already updated counter.
        return ++self::$fields_num;
    }
2503
2504
    /**
     * Read the number of extracted fields.
     *
     * @return mixed The "fields_num" value from Redis or self::$fields_num,
     *               depending on the mode; 0 when that value is falsy.
     * @author seatle <[email protected]> 
     * @created time :2016-09-23 17:13
     */
    public function get_fields_num()
    {
        $count = self::$use_redis
            ? queue::get("fields_num")
            : self::$fields_num;

        // Any falsy value (null, false, "", 0) is reported as 0.
        return $count ?: 0;
    }
2523
2524
    /**
     * Extract a field from HTML with selector::select() using its default
     * selector type (XPath, going by this method's name).
     *
     * When selector::$error is set after the call, it is logged together
     * with the field name; the result is returned either way.
     *
     * @param mixed $html      Document to search.
     * @param mixed $selector  Selector expression.
     * @param mixed $fieldname Field name, used only in the error log line.
     * @return mixed Whatever selector::select() returned.
     * @author seatle <[email protected]> 
     * @created time :2016-09-18 10:17
     */
    public function get_fields_xpath($html, $selector, $fieldname) 
    {
        $extracted = selector::select($html, $selector);

        if (selector::$error)
        {
            log::error('Field("' . $fieldname . '") ' . selector::$error . "\n");
        }

        return $extracted;
    }
2542
2543
    /**
     * Extract a field from HTML with selector::select() in 'regex' mode.
     *
     * When selector::$error is set after the call, it is logged together
     * with the field name; the result is returned either way.
     *
     * @param mixed $html      Document to search.
     * @param mixed $selector  Regular expression passed to the selector.
     * @param mixed $fieldname Field name, used only in the error log line.
     * @return mixed Whatever selector::select() returned.
     * @author seatle <[email protected]> 
     * @created time :2016-09-18 10:17
     */
    public function get_fields_regex($html, $selector, $fieldname) 
    {
        $extracted = selector::select($html, $selector, 'regex');

        if (selector::$error)
        {
            log::error('Field("' . $fieldname . '") ' . selector::$error . "\n");
        }

        return $extracted;
    }
2561
2562
    /**
     * Extract a field from HTML with selector::select() in 'css' mode.
     *
     * When selector::$error is set after the call, it is logged together
     * with the field name; the result is returned either way.
     *
     * @param mixed $html      Document to search.
     * @param mixed $selector  CSS selector expression.
     * @param mixed $fieldname Field name, used only in the error log line.
     * @return mixed Whatever selector::select() returned.
     * @author seatle <[email protected]> 
     * @created time :2016-09-18 10:17
     */
    public function get_fields_css($html, $selector, $fieldname) 
    {
        $extracted = selector::select($html, $selector, 'css');

        if (selector::$error)
        {
            log::error('Field("' . $fieldname . '") ' . selector::$error . "\n");
        }

        return $extracted;
    }
2581
2582
    /**
     * Clear the shell output.
     *
     * Prints the bytes 27,91,72,27,91,50,74, i.e. the ANSI sequences
     * ESC[H (cursor to home position) and ESC[2J (erase display).
     *
     * @return void
     * @author seatle <[email protected]> 
     * @created time :2016-11-16 11:06
     */
    public function clear_echo()
    {
        print "\033[H\033[2J";
    }
2598
2599
    /**
     * Replace the previously printed shell output with a new message.
     *
     * The method remembers (in a static variable) how many terminal lines the
     * last message occupied, erases that many lines with ANSI escape
     * sequences, then prints $message followed by a newline.
     *
     * @param mixed $message           Text to print; may contain "\n".
     * @param mixed $force_clear_lines When not null, overrides the remembered
     *                                 number of lines to erase for this call.
     * @return void
     * @author seatle <[email protected]> 
     * @created time :2016-11-16 11:06
     */
    public function replace_echo($message, $force_clear_lines = NULL) 
    {
        static $last_lines = 0;

        if (!is_null($force_clear_lines))
        {
            $last_lines = $force_clear_lines;
        }

        // Get the terminal width. exec() returns the last output line as a
        // string (or false on failure), so validate it and convert it to a
        // positive integer before doing arithmetic with it.
        $toss = $status = null;
        $term_width = exec('tput cols', $toss, $status);
        if ($status || !is_numeric($term_width) || (int) $term_width < 1)
        {
            $term_width = 64; // Arbitrary fall-back term width.
        }
        $term_width = (int) $term_width;

        // Count the terminal rows the message will occupy. Every logical line
        // takes at least one row, even when empty; computing this from the
        // length avoids str_split(''), whose result changed in PHP 8.2
        // (empty array instead of one empty chunk).
        $line_count = 0;
        foreach (explode("\n", $message) as $line)
        {
            $line_count += max(1, (int) ceil(strlen($line) / $term_width));
        }

        // Erasure MAGIC: clear as many lines as the last output had.
        // Per line: return to line start, erase to end of line, move the
        // cursor up one line, return to line start, erase, return again.
        for ($i = 0; $i < $last_lines; $i++)
        {
            echo "\r\033[K\033[1A\r\033[K\r";
        }

        $last_lines = $line_count;

        echo $message."\n";
    }
2654
2655
    /**
     * Render the status screen and print it via replace_echo().
     *
     * Per the original note, Windows never reaches this method. The screen
     * consists of a header (version, start time, run time, spider name,
     * optional server id, task number, load average, documentation URL),
     * the task table, the server table (multi-server mode only), the collect
     * status table and a footer.
     *
     * @return void
     */
    public function display_ui()
    {
        // sys_getloadavg() returns false on failure; fall back to an empty
        // list so that foreach/implode below do not receive a non-array.
        $loadavg = sys_getloadavg();
        if (!is_array($loadavg))
        {
            $loadavg = array();
        }
        foreach ($loadavg as $k => $v)
        {
            $loadavg[$k] = round($v, 2);
        }

        $display_str = "\033[1A\n\033[K-----------------------------\033[47;30m PHPSPIDER \033[0m-----------------------------\n\033[0m";
        $run_time_str = util::time2second(time()-self::$time_start, false);
        $display_str .= 'PHPSpider version:' . self::VERSION . "          PHP version:" . PHP_VERSION . "\n";
        $display_str .= 'start time:'. date('Y-m-d H:i:s', self::$time_start).'   run ' . $run_time_str . " \n";

        $display_str .= 'spider name: ' . self::$configs['name'] . "\n";
        if (self::$multiserver)
        {
            $display_str .= 'server id: ' . self::$serverid."\n";
        }
        $display_str .= 'task number: ' . self::$tasknum . "\n";
        $display_str .= 'load average: ' . implode(", ", $loadavg) . "\n";
        $display_str .= "document: https://doc.phpspider.org\n";

        $display_str .= $this->display_task_ui();

        // The server table is only shown in multi-server mode.
        if (self::$multiserver)
        {
            $display_str .= $this->display_server_ui();
        }

        $display_str .= $this->display_collect_ui();

        $display_str .= "---------------------------------------------------------------------\n";
        $display_str .= "Press Ctrl-C to quit. Start success.";
        if (self::$terminate)
        {
            $display_str .= "\n\033[33mWait for the process exits...\033[0m";
        }

        // Overwrite the previous screen instead of clearing the terminal.
        $this->replace_echo($display_str);
    }
2703
2704
    /**
     * Build the "TASKS" table of the status screen.
     *
     * One row is produced per JSON entry returned by
     * get_task_status_list(self::$serverid, self::$tasknum); entries that
     * decode to an empty value are skipped.
     *
     * @return string The rendered table, including ANSI colour codes.
     */
    public function display_task_ui()
    {
        $output = "-------------------------------\033[47;30m TASKS \033[0m-------------------------------\n";

        // Column widths shared by the header and the data rows.
        $id_width    = self::$taskid_length+2;
        $pid_width   = self::$pid_length+2;
        $mem_width   = self::$mem_length+2;
        $url_width   = self::$urls_length;
        $speed_width = self::$speed_length+2;

        // Header: highlighted label followed by blanks up to the column width.
        $header = array(
            'taskid'       => $id_width,
            'taskpid'      => $pid_width,
            'mem'          => $mem_width,
            'collect succ' => $url_width,
            'collect fail' => $url_width,
            'speed'        => $speed_width,
        );
        foreach ($header as $label => $width)
        {
            $output .= "\033[47;30m".$label."\033[0m".str_pad('', $width-strlen($label));
        }
        $output .= "\n";

        $status_list = $this->get_task_status_list(self::$serverid, self::$tasknum);
        foreach ($status_list as $json)
        {
            $task = json_decode($json, true);
            if (empty($task))
            {
                continue;
            }

            $cells = array(
                str_pad($task['id'], $id_width),
                str_pad($task['pid'], $pid_width),
                str_pad($task['mem']."MB", $mem_width),
                str_pad($task['collect_succ'], $url_width),
                str_pad($task['collect_fail'], $url_width),
                str_pad($task['speed']."/s", $speed_width),
            );
            $output .= implode('', $cells)."\n";
        }

        return $output;
    }
2736
2737
    /**
     * Build the "SERVER" table of the status screen.
     *
     * The server list is read as JSON from the "server_list" queue key. For
     * every server the memory, speed and success/failure counters of its
     * tasks (from get_task_status_list()) are summed into one row.
     *
     * @return string The rendered table, including ANSI colour codes. Only
     *                the header is returned when no server list is stored.
     */
    public function display_server_ui()
    {
        $display_str = "-------------------------------\033[47;30m SERVER \033[0m------------------------------\n";

        // Note: the first pad subtracts strlen('serverid') although the label
        // is "server". Together with the un-padded width used for the data
        // rows below, header and rows both end up self::$server_length wide.
        $display_str .= "\033[47;30mserver\033[0m". str_pad('', self::$server_length+2-strlen('serverid')). 
            "\033[47;30mtasknum\033[0m". str_pad('', self::$tasknum_length+2-strlen('tasknum')). 
            "\033[47;30mmem\033[0m". str_pad('', self::$mem_length+2-strlen('mem')). 
            "\033[47;30mcollect succ\033[0m". str_pad('', self::$urls_length-strlen('collect succ')). 
            "\033[47;30mcollect fail\033[0m". str_pad('', self::$urls_length-strlen('collect fail')). 
            "\033[47;30mspeed\033[0m". str_pad('', self::$speed_length+2-strlen('speed')). 
            "\n";

        // A missing or malformed "server_list" must not be decoded or
        // iterated: treat it as "no servers" and render the header only.
        $server_list_json = queue::get("server_list");
        $server_list = is_string($server_list_json) ? json_decode($server_list_json, true) : null;
        if (!is_array($server_list))
        {
            return $display_str;
        }

        foreach ($server_list as $server)
        {
            $serverid = $server['serverid'];
            $tasknum = $server['tasknum'];
            $mem = 0;
            $speed = 0;
            $collect_succ = $collect_fail = 0;

            // Aggregate the per-task figures of this server.
            $task_status = $this->get_task_status_list($serverid, $tasknum);
            foreach ($task_status as $json)
            {
                $task = json_decode($json, true);
                if (empty($task))
                {
                    continue;
                }
                $mem += $task['mem'];
                $speed += $task['speed'];
                $collect_fail += $task['collect_fail'];
                $collect_succ += $task['collect_succ'];
            }

            $display_str .= str_pad($serverid, self::$server_length).
                str_pad($tasknum, self::$tasknum_length+2). 
                str_pad($mem."MB", self::$mem_length+2). 
                str_pad($collect_succ, self::$urls_length). 
                str_pad($collect_fail, self::$urls_length). 
                str_pad($speed."/s", self::$speed_length+2). 
                "\n";
        }

        return $display_str;
    }
2782
2783
    public function display_collect_ui()
2784
    {
2785
        $display_str = "---------------------------\033[47;30m COLLECT STATUS \033[0m--------------------------\n";
2786
2787
        $display_str .= "\033[47;30mfind pages\033[0m". str_pad('', 16-strlen('find pages')). 
2788
            "\033[47;30mqueue\033[0m". str_pad('', 14-strlen('queue')). 
2789
            "\033[47;30mcollected\033[0m". str_pad('', 15-strlen('collected')). 
2790
            "\033[47;30mfields\033[0m". str_pad('', 15-strlen('fields')). 
2791
            "\033[47;30mdepth\033[0m". str_pad('', 12-strlen('depth')). 
2792
            "\n";
2793
2794
        $collect   = $this->get_collect_url_num();
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $collect is correct as $this->get_collect_url_num() targeting phpspider\core\phpspider::get_collect_url_num() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is imcomplete or has been reduced for debug purposes.

Loading history...
2795
        $collected = $this->get_collected_url_num();
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $collected is correct as $this->get_collected_url_num() targeting phpspider\core\phpspider::get_collected_url_num() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is imcomplete or has been reduced for debug purposes.

Loading history...
2796
        $queue     = $this->queue_lsize();
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $queue is correct as $this->queue_lsize() targeting phpspider\core\phpspider::queue_lsize() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is imcomplete or has been reduced for debug purposes.

Loading history...
2797
        $fields    = $this->get_fields_num();
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $fields is correct as $this->get_fields_num() targeting phpspider\core\phpspider::get_fields_num() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is imcomplete or has been reduced for debug purposes.

Loading history...
2798
        $depth     = $this->get_depth_num();
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $depth is correct as $this->get_depth_num() targeting phpspider\core\phpspider::get_depth_num() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
2799
        $display_str .= str_pad($collect, 16);
0 ignored issues
show
Bug introduced by
$collect of type void is incompatible with the type string expected by parameter $input of str_pad(). ( Ignorable by Annotation )

If this is a false positive, you can also ignore this issue in your code via the ignore-type annotation:

2799
        $display_str .= str_pad(/** @scrutinizer ignore-type */ $collect, 16);
Loading history...
2800
        $display_str .= str_pad($queue, 14);
2801
        $display_str .= str_pad($collected, 15);
2802
        $display_str .= str_pad($fields, 15);
2803
        $display_str .= str_pad($depth, 12);
2804
        $display_str .= "\n";
2805
        return $display_str;
2806
    }
2807
2808
    /**
2809
     * 判断是否附件文件
2810
     * 
2811
     * @return void
2812
     * @author seatle <[email protected]> 
2813
     * @created time :2016-09-23 17:13
2814
     */
2815
    //public function is_attachment_file($url)
2816
    //{
2817
    //$mime_types = $GLOBALS['config']['mimetype'];
2818
    //$mime_types_flip = array_flip($mime_types);
2819
2820
    //$pathinfo = pathinfo($url);
2821
    //$fileext = isset($pathinfo['extension']) ? $pathinfo['extension'] : '';
2822
2823
    //$fileinfo = array();
2824
    //// 存在文件后缀并且是配置里面的后缀
2825
    //if (!empty($fileext) && isset($mime_types_flip[$fileext])) 
2826
    //{
2827
    //stream_context_set_default(
2828
    //array(
2829
    //'http' => array(
2830
    //'method' => 'HEAD'
2831
    //)
2832
    //)
2833
    //);
2834
    //// 代理和Cookie以后实现, 方法和 file_get_contents 一样 使用 stream_context_create 设置
2835
    //$headers = get_headers($url, 1);
2836
    //if (strpos($headers[0], '302')) 
2837
    //{
2838
    //$url = $headers['Location'];
2839
    //$headers = get_headers($url, 1);
2840
    //}
2841
    ////print_r($headers);
2842
    //$fileinfo = array(
2843
    //'basename' => isset($pathinfo['basename']) ? $pathinfo['basename'] : '',
2844
    //'filename' => isset($pathinfo['filename']) ? $pathinfo['filename'] : '',
2845
    //'fileext' => isset($pathinfo['extension']) ? $pathinfo['extension'] : '',
2846
    //'filesize' => isset($headers['Content-Length']) ? $headers['Content-Length'] : 0,
2847
    //'atime' => isset($headers['Date']) ? strtotime($headers['Date']) : time(),
2848
    //'mtime' => isset($headers['Last-Modified']) ? strtotime($headers['Last-Modified']) : time(),
2849
    //);
2850
2851
    //$mime_type = 'html';
2852
    //$content_type = isset($headers['Content-Type']) ? $headers['Content-Type'] : '';
2853
    //if (!empty($content_type)) 
2854
    //{
2855
    //$mime_type = isset($GLOBALS['config']['mimetype'][$content_type]) ? $GLOBALS['config']['mimetype'][$content_type] : $mime_type;
2856
    //}
2857
    //$mime_types_flip = array_flip($mime_types);
2858
    //// 判断一下是不是文件名被加什么后缀了, 比如 http://www.xxxx.com/test.jpg?token=xxxxx
2859
    //if (!isset($mime_types_flip[$fileinfo['fileext']]))
2860
    //{
2861
    //$fileinfo['fileext'] = $mime_type;
2862
    //$fileinfo['basename'] = $fileinfo['filename'].'.'.$mime_type;
2863
    //}
2864
    //}
2865
    //return $fileinfo;
2866
    //}
2867
2868
}
2869
2870
2871