phpspider::getRootDomain()   F
last analyzed

Complexity

Conditions 26
Paths 545

Size

Total Lines 123
Code Lines 67

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 26
eloc 67
nc 545
nop 3
dl 0
loc 123
rs 0.6319
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
// +----------------------------------------------------------------------
3
// | PHPSpider [ A PHP Framework For Crawler ]
4
// +----------------------------------------------------------------------
5
// | Copyright (c) 2006-2014 https://doc.phpspider.org All rights reserved.
6
// +----------------------------------------------------------------------
7
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
8
// +----------------------------------------------------------------------
9
// | Author: Seatle Yang <[email protected]>
10
// +----------------------------------------------------------------------
11
12
//----------------------------------
13
// PHPSpider核心类文件
14
// ***********
15
// 泛域名抓取优化版 BY KEN [email protected]
16
// ***********
17
// * 泛域名设置:domain = array('*')
18
// * 增加子域名数量限制 $max_sub_num = 100
19
//----------------------------------
20
21
namespace phpspider\core;
22
23
require_once __DIR__.'/constants.php';
24
25
use Exception;
26
use phpspider\core\db;
27
use phpspider\core\log;
28
use phpspider\core\queue;
29
use phpspider\core\requests;
30
use phpspider\core\selector;
31
use phpspider\core\util;
32
33
// Create the data directory and its working sub-directories at startup.
foreach (array('', '/lock', '/log', '/cache', '/status') as $data_subdir)
{
    util::path_exists(PATH_DATA.$data_subdir);
}
39
40
class phpspider
41
{
42
    /**
43
     * 版本号
44
     * @var string
45
     */
46
    const VERSION = '2.1.5';
47
48
    /**
49
     * 爬虫爬取每个网页的时间间隔,0表示不延时, 单位: 毫秒
50
     */
51
    const INTERVAL = 100;
52
53
    /**
54
     * 爬虫爬取每个网页的超时时间, 单位: 秒 
55
     */
56
    const TIMEOUT = 5;
57
58
    /**
59
     * 爬取失败次数, 不想失败重新爬取则设置为0 
60
     */
61
    const MAX_TRY = 0;
62
63
    /**
64
     * 爬虫爬取网页所使用的浏览器类型: pc/Mac、ios、android
65
     * 默认类型是PC
66
     */
67
    const AGENT_PC      = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36';
68
    const AGENT_IOS     = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_3 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13G34 Safari/601.1';
69
    const AGENT_ANDROID = 'Mozilla/5.0 (Linux; U; Android 6.0.1;zh_cn; Le X820 Build/FEXCNFN5801507014S) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/49.0.0.0 Mobile Safari/537.36 EUI Browser/5.8.015S';
70
71
    /**
72
     * pid文件的路径及名称
73
     * @var string
74
     */
75
    //public static $pid_file = '';
76
77
    /**
78
     * 日志目录, 默认在data根目录下
79
     * @var mixed
80
     */
81
    //public static $log_file = '';
82
83
    /**
84
     * 主任务进程ID 
85
     */
86
    //public static $master_pid = 0;
87
88
    /**
89
     * 所有任务进程ID 
90
     */
91
    //public static $taskpids = array();
92
93
    /**
94
     * Daemonize.
95
     *
96
     * @var bool
97
     */
98
    public static $daemonize = false;
99
100
    /**
101
     * 当前进程是否终止 
102
     */
103
    public static $terminate = false;
104
105
    /**
106
     * 是否分布式 
107
     */
108
    public static $multiserver = false;
109
110
    /**
111
     * 当前服务器ID 
112
     */
113
    public static $serverid = 1;
114
115
    /**
116
     * 主任务进程 
117
     */
118
    public static $taskmaster = true;
119
120
    /**
121
     * 当前任务ID 
122
     */
123
    public static $taskid = 1;
124
125
    /**
126
     * 当前任务进程ID 
127
     */
128
    public static $taskpid = 1;
129
130
    /**
131
     * 并发任务数
132
     */
133
    public static $tasknum = 1;
134
135
    /**
136
     * 生成 
137
     */
138
    public static $fork_task_complete = false;
139
140
    /**
141
     * 是否使用Redis 
142
     */
143
    public static $use_redis = false;
144
145
    /**
146
     * 是否保存爬虫运行状态 
147
     */
148
    public static $save_running_state = false;
149
150
    /**
151
     * 配置 
152
     */
153
    public static $configs = array();
154
155
    /**
156
     * 要抓取的URL队列 
157
     md5(url) => array(
158
         'url'         => '',      // 要爬取的URL
159
         'url_type'    => '',      // 要爬取的URL类型,scan_page、list_page、content_page
160
         'method'      => 'get',   // 默认为"GET"请求, 也支持"POST"请求
161
         'headers'     => array(), // 此url的Headers, 可以为空
162
         'params'      => array(), // 发送请求时需添加的参数, 可以为空
163
         'context_data'=> '',      // 此url附加的数据, 可以为空
164
         'proxy'       => false,   // 是否使用代理
165
         'try_num'     => 0,       // 抓取次数
166
         'max_try'     => 0        // 允许抓取失败次数
167
     ) 
168
     */
169
    public static $collect_queue = array();
170
171
    /**
172
     * 要抓取的URL数组
173
     * md5($url) => time()
174
     */
175
    public static $collect_urls = array();
176
177
    /**
178
     * 要抓取的URL数量
179
     */
180
    public static $collect_urls_num = 0;
181
182
    /**
183
     * 已经抓取的URL数量
184
     */
185
    public static $collected_urls_num = 0;
186
187
    /**
188
     * 当前进程采集成功数 
189
     */
190
    public static $collect_succ = 0;
191
192
    /**
193
     * 当前进程采集失败数 
194
     */
195
    public static $collect_fail = 0;
196
197
    /**
198
     * 提取到的字段数 
199
     */
200
    public static $fields_num = 0;
201
202
    /**
203
     * 【KEN】提取到的页面数按域名计数容器 结构为 domain => number
204
     */
205
    public static $pages_num = array();
206
207
    /**
208
     * 【KEN】单域名允许抓取的最大页面数,0为不限制
209
     */
210
    public static $max_pages = 0;
211
212
    /**
213
     * 【KEN】花费的抓取时长计数容器 结构为 domain => number
214
     */
215
    public static $duration = array();
216
217
    /**
218
     * 【KEN】单域名允许抓取的最大时长,单位秒,0为不限制
219
     */
220
    public static $max_duration = 0;
221
222
    /**
223
     * 【KEN】单域名最大子域名发现数量 防止掉进蜘蛛池,推荐值:3000(多数大型网站上限)
224
     */
225
    public static $max_sub_num = 3000; //建议值 3000
226
227
    /**
228
     * 【KEN】子进程未获取任务,超时退出前,等待计时器
229
     */
230
231
    public static $stand_by_time = 0;
232
233
    /**
234
     * 【KEN】子进程未获取任务,超时退出前,最大等待时长/秒,全部任务结束后,子进程将会等待的时间,以便有缓冲时间,获得新的任务
235
     */
236
    public static $max_stand_by_time = 60; //建议值 60
237
238
    /**
239
     * 【KEN】每个主机并发上限,降低对方网站流量压力和减少被阻挡概率,建议值 6 ,须与 queue_order = rand 一起使用
240
     */
241
    public static $max_task_per_host     = 0; //0值和非0值会使用不同类型的队列缓存库,从0改为非0值或从非0值改为0需清空队列缓存库再运行,否则任务无法添加
242
    public static $task_per_host_counter = array(); //计数容器
243
244
    /**
245
     * 采集深度
246
     */
247
    public static $depth_num = 0;
248
249
    /**
250
     * 爬虫开始时间 
251
     */
252
    public static $time_start = 0;
253
254
    /**
255
     * 任务状态
256
     */
257
    public static $task_status = array();
258
259
    // 导出类型配置
260
    public static $export_type  = '';
261
    public static $export_file  = '';
262
    public static $export_conf  = '';
263
    public static $export_table = '';
264
265
    // 数据库配置
266
    public static $db_config = array();
267
    // 队列配置
268
    public static $queue_config = array();
269
270
    // 运行面板参数长度
271
    public static $server_length  = 10;
272
    public static $tasknum_length = 8;
273
    public static $taskid_length  = 8;
274
    public static $pid_length     = 8;
275
    public static $mem_length     = 8;
276
    public static $urls_length    = 15;
277
    public static $speed_length   = 6;
278
279
    /**
280
     * 爬虫初始化时调用, 用来指定一些爬取前的操作 
281
     * 
282
     * @var mixed
283
     * @access public
284
     */
285
    public $on_start = null;
286
287
    /**
288
     * URL采集前调用 
289
     * 比如有时需要根据某个特定的URL,来决定这次的请求是否使用代理 / 或使用哪个代理
290
     * 
291
     * @var mixed
292
     * @access public
293
     */
294
    public $on_before_download_page = null;
295
296
    /**
297
     * 网页状态码回调 
298
     * 
299
     * @var mixed
300
     * @access public
301
     */
302
    public $on_status_code = null;
303
304
    /**
305
     * 判断当前网页是否被反爬虫, 需要开发者实现 
306
     * 
307
     * @var mixed
308
     * @access public
309
     */
310
    public $is_anti_spider = null;
311
312
    /**
313
     * 在一个网页下载完成之后调用, 主要用来对下载的网页进行处理 
314
     * 
315
     * @var mixed
316
     * @access public
317
     */
318
    public $on_download_page = null;
319
320
    /**
321
     * 在一个attached_url对应的网页下载完成之后调用. 主要用来对下载的网页进行处理 
322
     * 
323
     * @var mixed
324
     * @access public
325
     */
326
    public $on_download_attached_page = null;
327
328
    /**
329
     * 当前页面抽取到URL 
330
     * 
331
     * @var mixed
332
     * @access public
333
     */
334
    public $on_fetch_url = null;
335
336
    /**
337
     * URL属于入口页 
338
     * 在爬取到入口url的内容之后, 添加新的url到待爬队列之前调用 
339
     * 主要用来发现新的待爬url, 并且能给新发现的url附加数据
340
     * 
341
     * @var mixed
342
     * @access public
343
     */
344
    public $on_scan_page = null;
345
346
    /**
347
     * URL属于列表页
348
     * 在爬取到列表页url的内容之后, 添加新的url到待爬队列之前调用 
349
     * 主要用来发现新的待爬url, 并且能给新发现的url附加数据
350
     * 
351
     * @var mixed
352
     * @access public
353
     */
354
    public $on_list_page = null;
355
356
    /**
357
     * URL属于内容页 
358
     * 在爬取到内容页url的内容之后, 添加新的url到待爬队列之前调用 
359
     * 主要用来发现新的待爬url, 并且能给新发现的url附加数据
360
     * 
361
     * @var mixed
362
     * @access public
363
     */
364
    public $on_content_page = null;
365
366
    /**
367
     * 在抽取到field内容之后调用, 对其中包含的img标签进行回调处理 
368
     * 
369
     * @var mixed
370
     * @access public
371
     */
372
    public $on_handle_img = null;
373
374
    /**
375
     * 当一个field的内容被抽取到后进行的回调, 在此回调中可以对网页中抽取的内容作进一步处理 
376
     * 
377
     * @var mixed
378
     * @access public
379
     */
380
    public $on_extract_field = null;
381
382
    /**
383
     * 在一个网页的所有field抽取完成之后, 可能需要对field进一步处理, 以发布到自己的网站 
384
     * 
385
     * @var mixed
386
     * @access public
387
     */
388
    public $on_extract_page = null;
389
390
    /**
391
     * 如果抓取的页面是一个附件文件, 比如图片、视频、二进制文件、apk、ipad、exe 
392
     * 就不去分析他的内容提取field了, 提取field只针对HTML
393
     * 
394
     * @var mixed
395
     * @access public
396
     */
397
    public $on_attachment_file = null;
398
399
    /**
     * Build a spider from a user-supplied configuration array.
     *
     * Every option that is missing (or null) receives its default, the
     * export / database / queue settings and the process-related flags are
     * copied into the static properties, and the normalised array is stored
     * in self::$configs.
     *
     * @param array $configs Spider options keyed by option name.
     */
    public function __construct($configs = array())
    {
        // Enable ticks (original note: works around Ctrl+C not stopping the spider under PHP 7).
        declare(ticks = 1);

        // Switch log output on first so that validation errors are visible.
        log::$log_show = true;
        if (isset($configs['log_file']))
        {
            log::$log_file = $configs['log_file'];
        }
        else
        {
            log::$log_file = PATH_DATA.'/phpspider.log';
        }
        if (isset($configs['log_type']))
        {
            log::$log_type = $configs['log_type'];
        }
        else
        {
            log::$log_type = false;
        }

        // Easter egg: the first included file must still contain both marker comments.
        $loaded_files = get_included_files();
        $entry_source = file_get_contents($loaded_files[0]);
        if ( ! (preg_match("#/\* Do NOT delete this comment \*/#", $entry_source) && preg_match("#/\* 不要删除这段注释 \*/#", $entry_source)))
        {
            log::error("Unknown error...");
            // NOTE(review): exit ends the whole process; that is only appropriate
            // when running in a scripting context such as a CLI script.
            exit;
        }

        // Option => default value, applied in this order to every option that is not set.
        $defaults = array(
            'name'              => 'phpspider',
            'proxy'             => false,
            'user_agent'        => self::AGENT_PC,
            'client_ip'         => array(),
            'interval'          => self::INTERVAL,
            'timeout'           => self::TIMEOUT,
            'max_try'           => self::MAX_TRY,
            'max_depth'         => 0,
            'max_fields'        => 0,
            'export'            => array(),
            // Options added by KEN.
            'max_pages'         => self::$max_pages,
            'max_duration'      => self::$max_duration,
            'max_sub_num'       => self::$max_sub_num,
            'max_stand_by_time' => self::$max_stand_by_time,
            'max_task_per_host' => self::$max_task_per_host,
        );
        foreach ($defaults as $option => $fallback)
        {
            if ( ! isset($configs[$option]))
            {
                $configs[$option] = $fallback;
            }
        }

        // A per-host concurrency limit forces the queue order to random.
        if ($configs['max_task_per_host'] > 0)
        {
            $configs['queue_order'] = 'rand';
        }
        elseif ( ! isset($configs['queue_order']))
        {
            $configs['queue_order'] = 'list';
        }

        // Export target: csv, sql or db.
        $export             = $configs['export'];
        self::$export_type  = isset($export['type'])  ? $export['type']  : '';
        self::$export_file  = isset($export['file'])  ? $export['file']  : '';
        self::$export_table = isset($export['table']) ? $export['table'] : '';

        self::$db_config = array();
        if (isset($configs['db_config']))
        {
            self::$db_config = $configs['db_config'];
        }
        self::$queue_config = array();
        if (isset($configs['queue_config']))
        {
            self::$queue_config = $configs['queue_config'];
        }

        // Use the requested task count only when it is above 1 and we are not on Windows.
        $wants_multi_task = isset($configs['tasknum']) && $configs['tasknum'] > 1;
        if ($wants_multi_task && ! util::is_win())
        {
            self::$tasknum = $configs['tasknum'];
        }

        // Whether the running state should be kept.
        if (isset($configs['save_running_state']))
        {
            self::$save_running_state = $configs['save_running_state'];
        }

        // Whether the spider runs distributed across servers.
        if (isset($configs['multiserver']))
        {
            self::$multiserver = $configs['multiserver'];
        }

        // ID of the current server.
        if (isset($configs['serverid']))
        {
            self::$serverid = $configs['serverid'];
        }

        // Separate projects by suffixing the queue prefix with the first four
        // hex characters of md5(spider name); the short hash keeps keys small.
        if (isset(self::$queue_config['prefix']))
        {
            self::$queue_config['prefix'] .= '-'.substr(md5($configs['name']), 0, 4);
        }

        self::$configs = $configs;
    }
484
485
    /**
     * Look up one entry of the spider configuration.
     *
     * @param mixed $name Configuration key.
     * @return mixed The stored value, or an empty array when the key is
     *               missing or its value is empty().
     */
    public function get_config($name)
    {
        if (empty(self::$configs[$name]))
        {
            return array();
        }

        return self::$configs[$name];
    }
489
490
    /**
     * Queue an entry (scan) URL for crawling.
     *
     * The link starts out typed as 'scan_page' and is re-typed as
     * 'content_page' or 'list_page' when is_content_page() / is_list_page()
     * accepts the URL. When 'max_sub_num' is configured and
     * sub_domain_count($url) exceeds it, the URL is skipped.
     *
     * NOTE(review): a static-analysis report flagged link_uncompress(),
     * is_content_page(), is_list_page() and queue_lpush() as always returning
     * null. Their bodies are not visible here; confirm they return values.
     *
     * @param mixed $url            URL to enqueue.
     * @param array $options        Extra link fields; 'url' and 'url_type' are overwritten.
     * @param bool  $allowed_repeat Passed through to queue_lpush() (presumably
     *                              allows re-queuing a known URL; confirm).
     * @return mixed false when the URL is skipped, otherwise the result of queue_lpush().
     */
    public function add_scan_url($url, $options = array(), $allowed_repeat = true)
    {
        // Drop the URL once its domain has produced more sub-domains than allowed.
        if ( ! empty(self::$configs['max_sub_num']))
        {
            $found = $this->sub_domain_count($url);
            $limit = self::$configs['max_sub_num'];
            if ($found > $limit)
            {
                log::debug('Task('.self::$taskid.') subdomin = '.$found.' more than '.$limit.",add_scan_url $url [Skip]");
                return false;
            }
        }

        $link             = $options;
        $link['url']      = $url;
        $link['url_type'] = 'scan_page';
        $link             = $this->link_uncompress($link);

        // A scan URL may also qualify as a content or list page; the content check wins.
        if ($this->is_content_page($url))
        {
            $link['url_type'] = 'content_page';
        }
        elseif ($this->is_list_page($url))
        {
            $link['url_type'] = 'list_page';
        }

        // Delivery status of the link.
        $status = $this->queue_lpush($link, $allowed_repeat);
        if ( ! $status)
        {
            return $status;
        }

        switch ($link['url_type'])
        {
            case 'scan_page':
                log::debug("Find scan page: {$url}");
                break;
            case 'content_page':
                log::debug("Find content page: {$url}");
                break;
            case 'list_page':
                log::debug("Find list page: {$url}");
                break;
        }

        return $status;
    }
544
545
    /**
546
     * 一般在 on_scan_page 和 on_list_page 回调函数中调用, 用来往待爬队列中添加url
547
     * 两个进程同时调用这个方法, 传递相同url的时候, 就会出现url重复进入队列
548
     * 
549
     * @param mixed $url
550
     * @param mixed $options
551
     * @return void
552
     * @author seatle <[email protected]> 
553
     * @created time :2016-09-18 10:17
554
     */
555
    public function add_url($url, $options = array(), $depth = 0)
556
    {
557
        // 投递状态
558
        $status = false;
559
        //限制最大子域名数量
560
        if ( ! empty(self::$configs['max_sub_num']))
561
        {
562
            //抓取超过 max_sub_num 子域名的,就丢掉
563
            $sub_domain_count = $this->sub_domain_count($url);
564
            if ($sub_domain_count > self::$configs['max_sub_num'])
565
            {
566
                log::debug('Task('.self::$taskid.') subdomin = '.$sub_domain_count.' more than '.self::$configs['max_sub_num'].",add_url $url [Skip]");
567
                //echo '[on_download_page] ' . $domain . "'s subdomin > 1000 ,Skip!\n";
568
                return $status;
0 ignored issues
show
Bug Best Practice introduced by
The expression return $status returns the type false which is incompatible with the documented return type void.
Loading history...
569
            }
570
        }
571
        $link          = $options;
572
        $link['url']   = $url;
573
        $link['depth'] = $depth;
574
        $link = $this->link_uncompress($link);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $link is correct as $this->link_uncompress($link) targeting phpspider\core\phpspider::link_uncompress() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is imcomplete or has been reduced for debug purposes.

Loading history...
575
576
        if ($this->is_content_page($url))
0 ignored issues
show
Bug introduced by
Are you sure the usage of $this->is_content_page($url) targeting phpspider\core\phpspider::is_content_page() seems to always return null.

This check looks for function or method calls that always return null and whose return value is used.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
if ($a->getObject()) {

The method getObject() can return nothing but null, so it makes no sense to use the return value.

The reason is most likely that a function or method is imcomplete or has been reduced for debug purposes.

Loading history...
577
        {
578
            $link['url_type'] = 'content_page';
579
            $status           = $this->queue_lpush($link);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $status is correct as $this->queue_lpush($link) targeting phpspider\core\phpspider::queue_lpush() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is imcomplete or has been reduced for debug purposes.

Loading history...
580
        }
581
        elseif ($this->is_list_page($url))
0 ignored issues
show
Bug introduced by
Are you sure the usage of $this->is_list_page($url) targeting phpspider\core\phpspider::is_list_page() seems to always return null.

This check looks for function or method calls that always return null and whose return value is used.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
if ($a->getObject()) {

The method getObject() can return nothing but null, so it makes no sense to use the return value.

The reason is most likely that a function or method is imcomplete or has been reduced for debug purposes.

Loading history...
582
        {
583
            $link['url_type'] = 'list_page';
584
            $status           = $this->queue_lpush($link);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $status is correct as $this->queue_lpush($link) targeting phpspider\core\phpspider::queue_lpush() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
585
        }
586
587
        if ($status)
0 ignored issues
show
introduced by
The condition $status is always false.
Loading history...
588
        {
589
            if ($link['url_type'] == 'scan_page')
590
            {
591
                log::debug("Find scan page: {$url}");
592
            }
593
            elseif ($link['url_type'] == 'content_page')
594
            {
595
                log::debug("Find content page: {$url}");
596
            }
597
            elseif ($link['url_type'] == 'list_page')
598
            {
599
                log::debug("Find list page: {$url}");
600
            }
601
        }
602
603
        return $status;
0 ignored issues
show
Bug Best Practice introduced by
The expression return $status returns the type false which is incompatible with the documented return type void.
Loading history...
604
    }
605
606
    /**
607
     * 是否入口页面
608
     * 
609
     * @param mixed $url
610
     * @return void
611
     * @author seatle <[email protected]> 
612
     * @created time :2016-10-12 19:06
613
     */
614
    public function is_scan_page($url)
615
    {
616
        $parse_url = parse_url($url);
617
        //2018-1-3 通配所有域名
618
        if ( ! empty($parse_url['host']) and self::$configs['domains'][0] == '*')
619
        {
620
            return true;
0 ignored issues
show
Bug Best Practice introduced by
The expression return true returns the type true which is incompatible with the documented return type void.
Loading history...
621
        }
622
        //限定域名
623
        if (empty($parse_url['host']) || ! in_array($parse_url['host'], self::$configs['domains']))
624
        {
625
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
626
        }
627
        return true;
0 ignored issues
show
Bug Best Practice introduced by
The expression return true returns the type true which is incompatible with the documented return type void.
Loading history...
628
    }
629
630
    /**
631
     * 是否列表页面
632
     * 
633
     * @param mixed $url
634
     * @return void
635
     * @author seatle <[email protected]> 
636
     * @created time :2016-10-12 19:06
637
     */
638
    public function is_list_page($url)
639
    {
640
        $result = false;
641
        //过滤下载类型文件 20180209
642
        if (preg_match('/\.(zip|7z|cab|rar|iso|gho|jar|ace|tar|gz|bz2|z|xml|pdf|doc|txt|rtf|snd|xls|xlsx|docx|apk|ipa|flv|midi|mps|pls|pps|ppa|pwz|mp3|mp4|mpeg|mpe|asf|asx|mpg|3gp|mov|m4v|mkv|vob|vod|mod|ogg|rm|rmvb|wmv|avi|dat|exe|wps|js|css|bmp|jpg|png|gif|ico|tiff|jpeg|svg|webp|mpa|mdb|bin)$/iu', $url))
643
        {
644
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
645
        }
646
647
        //增加 要排除的列表页特征正则 BY KEN <[email protected]>
648
        if ( ! empty(self::$configs['list_url_regexes_remove']))
649
        {
650
            foreach (self::$configs['list_url_regexes_remove'] as $regex)
651
            {
652
                if (preg_match("#{$regex}#i", $url))
653
                {
654
                    return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
655
                }
656
            }
657
        }
658
659
        //增加无列表页选项,即所有页面都要抓取内容,包含列表页
660
        if (empty(self::$configs['list_url_regexes']) or self::$configs['list_url_regexes'][0] == 'x')
661
        {
662
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
663
        }
664
665
        //增加泛列表页,即所有页面都是列表页,只抓取链接,不抓取内容
666
        if (self::$configs['list_url_regexes'][0] == '*')
667
        {
668
            return true;
0 ignored issues
show
Bug Best Practice introduced by
The expression return true returns the type true which is incompatible with the documented return type void.
Loading history...
669
        }
670
671
        if ( ! empty(self::$configs['list_url_regexes']))
672
        {
673
            foreach (self::$configs['list_url_regexes'] as $regex) 
674
            {
675
                if (preg_match("#{$regex}#i", $url))
676
                {
677
                    $result = true;
678
                    break;
679
                }
680
            }
681
        }
682
        return $result;
0 ignored issues
show
Bug Best Practice introduced by
The expression return $result returns the type boolean which is incompatible with the documented return type void.
Loading history...
683
    }
684
685
    /**
686
     * 是否内容页面
687
     * 
688
     * @param mixed $url
689
     * @return void
690
     * @author seatle <[email protected]> 
691
     * @created time :2016-10-12 19:06
692
     */
693
    public function is_content_page($url)
694
    {
695
        $result = false;
696
        //过滤下载类型文件 20180209
697
        if (preg_match('/\.(zip|7z|cab|rar|iso|gho|jar|ace|tar|gz|bz2|z|xml|pdf|doc|txt|rtf|snd|xls|xlsx|docx|apk|ipa|flv|midi|mps|pls|pps|ppa|pwz|mp3|mp4|mpeg|mpe|asf|asx|mpg|3gp|mov|m4v|mkv|vob|vod|mod|ogg|rm|rmvb|wmv|avi|dat|exe|wps|js|css|bmp|jpg|png|gif|ico|tiff|jpeg|svg|webp|mpa|mdb|bin)$/iu', $url))
698
        {
699
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
700
        }
701
702
        //增加 要排除的内容页特征正则 BY KEN <[email protected]>
703
        if ( ! empty(self::$configs['content_url_regexes_remove']))
704
        {
705
            foreach (self::$configs['content_url_regexes_remove'] as $regex)
706
            {
707
                if (preg_match("#{$regex}#i", $url))
708
                {
709
                    return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
710
                }
711
            }
712
        }
713
714
        //增加泛内容模式,即所有页面都要提取内容
715
        if (empty(self::$configs['content_url_regexes']) or self::$configs['content_url_regexes'][0] == '*')
716
        {
717
            return true;
0 ignored issues
show
Bug Best Practice introduced by
The expression return true returns the type true which is incompatible with the documented return type void.
Loading history...
718
        }
719
        //无内容,泛列表模式,即所有页面都不提取内容
720
        if (self::$configs['content_url_regexes'][0] == 'x')
721
        {
722
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
723
        }
724
725
        if ( ! empty(self::$configs['content_url_regexes']))
726
        {
727
            foreach (self::$configs['content_url_regexes'] as $regex) 
728
            {
729
                if (preg_match("#{$regex}#i", $url))
730
                {
731
                    $result = true;
732
                    break;
733
                }
734
            }
735
        }
736
        return $result;
0 ignored issues
show
Bug Best Practice introduced by
The expression return $result returns the type boolean which is incompatible with the documented return type void.
Loading history...
737
    }
738
739
    /**
740
     * Parse command.
741
     * php yourfile.php start | stop | status | kill
742
     *
743
     * @return void
744
     */
745
    public function parse_command()
746
    {
747
        // 检查运行命令的参数
748
        global $argv;
749
        $start_file = $argv[0]; 
750
751
        // 命令
752
        $command = isset($argv[1]) ? trim($argv[1]) : 'start';
753
754
        // 子命令, 目前只支持-d
755
        $command2 = isset($argv[2]) ? $argv[2] : '';
756
757
        // 根据命令做相应处理
758
        switch($command)
759
        {
760
            // 启动 phpspider
761
        case 'start':
762
            if ($command2 === '-d') 
763
            {
764
                self::$daemonize = true;
765
            }
766
            break;
767
        case 'stop':
768
            exec("ps aux | grep $start_file | grep -v grep | awk '{print $2}'", $info);
769
            if (count($info) <= 1)
770
            {
771
                echo "PHPSpider[$start_file] not run\n";
772
            }
773
            else 
774
            {
775
                //echo "PHPSpider[$start_file] is stoping ...\n";
776
                echo "PHPSpider[$start_file] stop success";
777
                exec("ps aux | grep $start_file | grep -v grep | awk '{print $2}' |xargs kill -SIGINT", $info);
778
            }
779
            exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
780
            break;
781
        case 'kill':
782
            exec("ps aux | grep $start_file | grep -v grep | awk '{print $2}' |xargs kill -SIGKILL");
783
            break;
784
            // 显示 phpspider 运行状态
785
        case 'status':
786
            exit(0);
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
787
            // 未知命令
788
        default :
789
            exit("Usage: php yourfile.php {start|stop|status|kill}\n");
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
790
        }
791
    }
792
793
    /**
794
     * Signal handler.
795
     *
796
     * @param int $signal
797
     */
798
    public function signal_handler($signal)
799
    {
800
        switch ($signal)
801
        {
802
            // Stop.
803
            case SIGINT:
804
                log::warn('Program stopping...');
805
                self::$terminate = true;
806
                break;
807
            // Show status.
808
            case SIGUSR2:
809
                echo "show status\n";
810
                break;
811
        }
812
    }
813
814
    /**
815
     * Install signal handler.
816
     *
817
     * @return void
818
     */
819
    public function install_signal()
820
    {
821
        if (function_exists('pcntl_signal')) 
822
        {
823
            // stop
824
            // static调用方式
825
            //pcntl_signal(SIGINT, array(__CLASS__, 'signal_handler'), false);
826
            pcntl_signal(SIGINT, array(&$this, 'signal_handler'), false);
827
            // status
828
            pcntl_signal(SIGUSR2, array(&$this, 'signal_handler'), false);
829
            // ignore
830
            pcntl_signal(SIGPIPE, SIG_IGN, false);
831
        }
832
    }
833
834
    /**
835
     * Run in daemon mode.
836
     *
837
     * @throws Exception
838
     */
839
    protected static function daemonize()
840
    {
841
        if (!self::$daemonize) 
842
        {
843
            return;
844
        }
845
846
        // fork前一定要关闭redis
847
        queue::clear_link();
848
849
        umask(0);
850
        $pid = pcntl_fork();
851
        if (-1 === $pid) 
852
        {
853
            throw new Exception('fork fail');
854
        } 
855
        elseif ($pid > 0) 
856
        {
857
            exit(0);
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
858
        }
859
        if (-1 === posix_setsid()) 
860
        {
861
            throw new Exception('setsid fail');
862
        }
863
        // Fork again avoid SVR4 system regain the control of terminal.
864
        $pid = pcntl_fork();
865
        if (-1 === $pid)
866
        {
867
            throw new Exception('fork fail');
868
        }
869
        elseif (0 !== $pid)
870
        {
871
            exit(0);
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
872
        }
873
    }
874
875
    /**
876
     * 检查是否终止当前进程
877
     * 
878
     * @return void
879
     * @author seatle <[email protected]> 
880
     * @created time :2016-11-16 11:06
881
     */
882
    public function check_terminate()
883
    {
884
        if (!self::$terminate) 
885
        {
886
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
887
        }
888
889
        // 删除当前任务状态
890
        $this->del_task_status(self::$serverid, self::$taskid);
891
892
        if (self::$taskmaster) 
893
        {
894
            // 检查子进程是否都退出
895
            while (true)
896
            {
897
                $all_stop = true;
898
                for ($i = 2; $i <= self::$tasknum; $i++) 
899
                {
900
                    // 只要一个还活着就说明没有完全退出
901
                    $task_status = $this->get_task_status(self::$serverid, $i);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $task_status is correct? The call $this->get_task_status(self::$serverid, $i), targeting phpspider\core\phpspider::get_task_status(), seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
902
                    if ($task_status)
903
                    {
904
                        $all_stop = false;
905
                    }
906
                }
907
                if ($all_stop)
908
                {
909
                    break;
910
                }
911
                else
912
                {
913
                    log::warn('Task stop waiting...');
914
                }
915
                sleep(1);
916
            }
917
918
            $this->del_server_list(self::$serverid);
919
920
            // 显示最后结果
921
            log::$log_show = true;
922
923
            $spider_time_run = util::time2second(intval(microtime(true) - self::$time_start));
924
            log::note("Spider finished in {$spider_time_run}");
925
926
            $get_collected_url_num = $this->get_collected_url_num();
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $get_collected_url_num is correct? The call $this->get_collected_url_num(), targeting phpspider\core\phpspider::get_collected_url_num(), seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
927
            log::note("Total pages: {$get_collected_url_num} \n");
928
        }
929
        exit();
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
930
    }
931
932
    public function start()
933
    {
934
        $this->parse_command();
935
936
        // 爬虫开始时间
937
        self::$time_start = time();
938
        // 当前任务ID
939
        self::$taskid = 1;
940
        // 当前任务进程ID
941
        self::$taskpid      = function_exists('posix_getpid') ? posix_getpid() : 1;
942
        self::$collect_succ = 0;
943
        self::$collect_fail = 0;
944
945
        //--------------------------------------------------------------------------------
946
        // 运行前验证
947
        //--------------------------------------------------------------------------------
948
949
        // 检查PHP版本
950
        if (version_compare(PHP_VERSION, '5.3.0', 'lt')) 
951
        {
952
            log::error('PHP 5.3+ is required, currently installed version is: ' . phpversion());
953
            exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
954
        }
955
956
        // 检查CURL扩展
957
        if(!function_exists('curl_init'))
958
        {
959
            log::error('The curl extension was not found');
960
            exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
961
        }
962
963
        // 多任务需要pcntl扩展支持
964
        if (self::$tasknum > 1 && !function_exists('pcntl_fork')) 
965
        {
966
            log::error('Multitasking needs pcntl, the pcntl extension was not found');
967
            exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
968
        }
969
970
        // 守护进程需要pcntl扩展支持
971
        if (self::$daemonize && !function_exists('pcntl_fork')) 
972
        {
973
            log::error('Daemonize needs pcntl, the pcntl extension was not found');
974
            exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
975
        }
976
977
        // 集群、保存运行状态、多任务都需要Redis支持
978
        if ( self::$multiserver || self::$save_running_state || self::$tasknum > 1 ) 
979
        {
980
            self::$use_redis = true;
981
982
            queue::set_connect('default', self::$queue_config);
983
            if (!queue::init()) 
984
            {
985
                if ( self::$multiserver ) 
986
                {
987
                    log::error('Multiserver needs Redis support, '.queue::$error);
988
                    exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
989
                }
990
991
                if ( self::$tasknum > 1 ) 
992
                {
993
                    log::error('Multitasking needs Redis support, '.queue::$error);
994
                    exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
995
                }
996
997
                if ( self::$save_running_state ) 
998
                {
999
                    log::error('Spider kept running state needs Redis support, '.queue::$error);
1000
                    exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
1001
                }
1002
            }
1003
        }
1004
1005
        // 检查导出
1006
        $this->check_export();
1007
1008
        // 检查缓存
1009
        $this->check_cache();
1010
1011
        // 检查 scan_urls 
1012
        if (empty(self::$configs['scan_urls'])) 
1013
        {
1014
            log::error('No scan url to start');
1015
            exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
1016
        }
1017
1018
        foreach ( self::$configs['scan_urls'] as $url ) 
1019
        {
1020
            // 只检查配置中的入口URL, 通过 add_scan_url 添加的不检查了.
1021
            if (!$this->is_scan_page($url))
1022
            {
1023
                log::error("Domain of scan_urls (\"{$url}\") does not match the domains of the domain name");
1024
                exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
1025
            }
1026
        }
1027
1028
        // windows 下没法显示面板, 强制显示日志
1029
        if (util::is_win()) 
1030
        {
1031
            self::$configs['name'] = iconv('UTF-8', 'GB2312//IGNORE', self::$configs['name']);
1032
            log::$log_show         = true;
1033
        }
1034
        // 守护进程下也显示日志
1035
        elseif (self::$daemonize) 
1036
        {
1037
            log::$log_show = true;
1038
        }
1039
        else 
1040
        {
1041
            log::$log_show = isset(self::$configs['log_show']) ? self::$configs['log_show'] : false;
1042
        }
1043
1044
        if (log::$log_show)
1045
        {
1046
            global $argv;
1047
            $start_file = $argv[0]; 
1048
1049
            $header = '';
1050
            if ( ! util::is_win())
1051
            {
1052
                $header .= "\033[33m";
1053
            }
1054
1055
            $header .= "\n[ ".self::$configs['name']." Spider ] is started...\n\n";
1056
            $header .= '  * PHPSpider Version: '.self::VERSION."\n";
1057
            $header .= "  * Documentation: https://doc.phpspider.org\n";
1058
            $header .= '  * Task Number: '.self::$tasknum."\n\n";
1059
            $header .= "Input \"php $start_file stop\" to quit. Start success.\n";
1060
            if ( ! util::is_win())
1061
            {
1062
                $header .= "\033[0m";
1063
            }
1064
1065
            log::note($header);
1066
        }
1067
1068
        // 如果是守护进程,恢复日志状态
1069
        //if (self::$daemonize) 
1070
        //{
1071
            //log::$log_show = isset(self::$configs['log_show']) ? self::$configs['log_show'] : false;
1072
        //}
1073
1074
        // 多任务和分布式都要清掉, 当然分布式只清自己的
1075
        $this->init_redis();
1076
1077
        //--------------------------------------------------------------------------------
1078
        // 生成多任务
1079
        //--------------------------------------------------------------------------------
1080
1081
        // 添加入口URL到队列
1082
        foreach ( self::$configs['scan_urls'] as $url ) 
1083
        {
1084
            // false 表示不允许重复
1085
            $this->add_scan_url($url, null, false);
1086
        }
1087
1088
        // 放这个位置, 可以添加入口页面
1089
        if ($this->on_start) 
1090
        {
1091
            call_user_func($this->on_start, $this);
1092
        }
1093
1094
        if (!self::$daemonize) 
1095
        {
1096
            if (!log::$log_show) 
1097
            {
1098
                // 第一次先清屏
1099
                $this->clear_echo();
1100
1101
                // 先显示一次面板, 然后下面再每次采集成功显示一次
1102
                $this->display_ui();
1103
            }
1104
        }
1105
        else 
1106
        {
1107
            $this->daemonize();
1108
        }
1109
1110
        // 安装信号
1111
        $this->install_signal();
1112
1113
        // 开始采集
1114
        $this->do_collect_page();
1115
1116
        // 从服务器列表中删除当前服务器信息
1117
        $this->del_server_list(self::$serverid);
1118
    }
1119
1120
    /**
1121
     * 创建一个子进程
1122
     * @param Worker $worker
0 ignored issues
show
Bug introduced by
The type phpspider\core\Worker was not found. Did you mean Worker? If so, make sure to prefix the type with \.
Loading history...
1123
     * @throws Exception
1124
     */
1125
    public function fork_one_task($taskid)
1126
    {
1127
        $pid = pcntl_fork();
1128
1129
        // 主进程记录子进程pid
1130
        if($pid > 0)
1131
        {
1132
            // 暂时没用
1133
            //self::$taskpids[$taskid] = $pid;
1134
        }
1135
        // 子进程运行
1136
        elseif (0 === $pid)
1137
        {
1138
            log::warn("Fork children task({$taskid}) successful...");
1139
1140
            // 初始化子进程参数
1141
            self::$time_start   = microtime(true);
1142
            self::$taskid       = $taskid;
1143
            self::$taskmaster   = false;
1144
            self::$taskpid      = posix_getpid();
1145
            self::$collect_succ = 0;
1146
            self::$collect_fail = 0;
1147
1148
            queue::set_connect('default', self::$queue_config);
1149
            queue::init();
1150
1151
            //退出前计时,等待1分钟,如果获取不到新任务,再退出
1152
            self::$stand_by_time = 0;
1153
            while (self::$stand_by_time < self::$configs['max_stand_by_time'])
1154
            {
1155
                $this->do_collect_page();
1156
                log::warn('Task('.self::$taskid.') Stand By '.self::$stand_by_time.'/'.self::$configs['max_stand_by_time'].' s');
1157
                self::$stand_by_time++;
1158
                sleep(1);
1159
            }
1160
            $queue_lsize = $this->queue_lsize();
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $queue_lsize is correct? The call $this->queue_lsize(), targeting phpspider\core\phpspider::queue_lsize(), seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1161
            log::warn('Task('.self::$taskid.') exit : queue_lsize = '.$queue_lsize);
0 ignored issues
show
Bug introduced by
Are you sure $queue_lsize of type void can be used in concatenation? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

1161
            log::warn('Task('.self::$taskid.') exit : queue_lsize = './** @scrutinizer ignore-type */ $queue_lsize);
Loading history...
1162
            $this->del_task_status(self::$serverid, $taskid);
1163
1164
            // 这里用0表示子进程正常退出
1165
            exit(0);
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
1166
        }
1167
        else
1168
        {
1169
            log::error("Fork children task({$taskid}) fail...");
1170
            exit;
0 ignored issues
show
Best Practice introduced by
Using exit here is not recommended.

In general, usage of exit should be done with care and only when running in a scripting context like a CLI script.

Loading history...
1171
        }
1172
    }
1173
1174
    public function do_collect_page() 
1175
    {
1176
        while( $queue_lsize = $this->queue_lsize() )
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $queue_lsize is correct? The call $this->queue_lsize(), targeting phpspider\core\phpspider::queue_lsize(), seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1177
        { 
1178
            // 如果是主任务
1179
            if (self::$taskmaster) 
1180
            {
1181
                // 多任务下主任务未准备就绪
1182
                if (self::$tasknum > 1 && !self::$fork_task_complete) 
1183
                {
1184
                    // 主进程采集到多于任务数2个时, 生成子任务一起采集
1185
                    if ($queue_lsize > self::$tasknum + 2)
1186
                    {
1187
                        self::$fork_task_complete = true;
1188
1189
                        // fork 子进程前一定要先干掉redis连接fd, 不然会存在进程互抢redis fd 问题
1190
                        queue::clear_link();
1191
                        // task进程从2开始, 1被master进程所使用
1192
                        for ($i = 2; $i <= self::$tasknum; $i++) 
1193
                        {
1194
                            $this->fork_one_task($i);
1195
                        }
1196
                    }
1197
                }
1198
                //在主进程中,保存当前配置到缓存,以使子进程可实时读取动态修改后的配置 20180209
1199
                if (self::$use_redis and ! empty(self::$configs))
1200
                {
1201
                    queue::set('configs_'.self::$configs['name'], json_encode(self::$configs));
1202
                }
1203
                // 抓取页面
1204
                $this->collect_page();
1205
                // 保存任务状态
1206
                $this->set_task_status();
1207
1208
                // 每采集成功一次页面, 就刷新一次面板
1209
                if (!log::$log_show && !self::$daemonize) 
1210
                {
1211
                    $this->display_ui();
1212
                }
1213
            }
1214
            // 如果是子任务
1215
            else 
1216
            {
1217
                // 主进程采集到多于任务数2个时, 子任务可以采集, 否则等待...
1218
                if ($queue_lsize > self::$taskid + 2)
1219
                {
1220
                    //在子进程中,从内存中实时读取当前最新配置,用于适应主进程常驻内存模式,无限循环后的配置变动 20180209
1221
                    if (self::$use_redis and ! empty(self::$configs))
1222
                    {
1223
                        if ($configs_active = queue::get('configs_'.self::$configs['name']))
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $configs_active is correct? The call phpspider\core\queue::get('configs_'.self::$configs['name']), targeting phpspider\core\queue::get(), seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1224
                        {
1225
                            self::$configs = json_decode($configs_active, true);
0 ignored issues
show
Bug introduced by
$configs_active of type void is incompatible with the type string expected by parameter $json of json_decode(). ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

1225
                            self::$configs = json_decode(/** @scrutinizer ignore-type */ $configs_active, true);
Loading history...
1226
                        }
1227
                    }
1228
                    // 抓取页面
1229
                    $this->collect_page();
1230
                    // 保存任务状态
1231
                    $this->set_task_status();
1232
                }
1233
                else 
1234
                {
1235
                    log::warn('Task('.self::$taskid.') waiting...reason: queue_lsize = '.$queue_lsize.' < tasknum  = '.self::$tasknum);
0 ignored issues
show
Bug introduced by
Are you sure $queue_lsize of type void can be used in concatenation? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

1235
                    log::warn('Task('.self::$taskid.') waiting...reason: queue_lsize = './** @scrutinizer ignore-type */ $queue_lsize.' < tasknum  = '.self::$tasknum);
Loading history...
1236
                    sleep(1);
1237
                }
1238
            }
1239
1240
            // 检查进程是否收到关闭信号
1241
            $this->check_terminate();
1242
        } 
1243
    }
1244
1245
    /**
     * Take one link from the queue, download it and run the page callbacks.
     *
     * Steps, in order: optional info-level statistics, pop and uncompress a
     * link, apply the per-domain limits (max_pages, max_duration,
     * max_task_per_host), download through request_url(), invoke
     * is_anti_spider / on_download_page / the url_type specific callback,
     * extract further URLs and fields, then sleep for the configured interval.
     *
     * @return bool|void false when no page was processed (nothing usable in
     *                   the queue, a limit was hit, the download failed or
     *                   is_anti_spider reported a block); nothing is returned
     *                   once the page has been processed
     * @author seatle <[email protected]> 
     * @created time :2016-09-18 10:17
     */
    public function collect_page() 
    {
        // Skip the counter lookups unless info logging is enabled
        // (reduces unnecessary queue_lsize queries, 20180214)
        if (isset(self::$configs['log_type']) and strstr(self::$configs['log_type'], 'info'))
        {
            $get_collect_url_num = $this->get_collect_url_num();
            log::info('task id: '.self::$taskid." Find pages: {$get_collect_url_num} ");

            $queue_lsize = $this->queue_lsize();
            log::info('task id: '.self::$taskid." Waiting for collect pages: {$queue_lsize} ");

            $get_collected_url_num = $this->get_collected_url_num();
            log::info('task id: '.self::$taskid." Collected pages: {$get_collected_url_num} ");

            // With several tasks running, also log which one this is
            if (self::$tasknum > 1)
            {
                log::info('Current task id: '.self::$taskid);
            }
        }

        // Take tasks in order, first in first out (with queue_order = rand
        // the order does not apply and tasks are taken at random)
        $link = $this->queue_rpop();
        if (empty($link))
        {
            log::warn('Task('.self::$taskid.') Get Task link Fail...Stand By...');
            return false;
        }

        $link = $this->link_uncompress($link);
        if (empty($link['url']))
        {
            log::warn('Task('.self::$taskid.') Get Task url Fail...Stand By...');
            return false;
        }

        // A task was received, so reset the stand-by timeout counter
        self::$stand_by_time = 0;

        $url = $link['url'];

        // Limit the number of URLs per domain (20180213)
        if (isset(self::$configs['max_pages']) and self::$configs['max_pages'] > 0)
        {
            $domain_pages_num = $this->incr_pages_num($url);
            if ($domain_pages_num > self::$configs['max_pages'])
            {
                log::debug('Task('.self::$taskid.') pages = '.$domain_pages_num.' more than '.self::$configs['max_pages'].", $url [Skip]");
                return false;
            }
        }

        // Limit the time spent per domain (20180213)
        if (isset(self::$configs['max_duration']) and self::$configs['max_duration'] > 0)
        {
            $domain_duration = $this->get_duration_num($url);
            if ($domain_duration > self::$configs['max_duration'])
            {
                log::debug('Task('.self::$taskid.') duration = '.$domain_duration.' more than '.self::$configs['max_duration'].", $url [Skip]");
                return false;
            }
        }

        // Per-host concurrency check (2018-5 BY KEN). empty() guards against
        // the option being absent from the configuration.
        if (!empty(self::$configs['max_task_per_host']) and self::$configs['max_task_per_host'] > 0)
        {
            $task_per_host = $this->get_task_per_host_num($url);
            if ($task_per_host >= self::$configs['max_task_per_host'])
            {
                log::warn('Task('.self::$taskid.') task_per_host = '.$task_per_host.' > '.self::$configs['max_task_per_host'].' ; URL: '.$url.' will be retry later...');
                // Put the link back into the queue and back off briefly
                $this->queue_lpush($link);
                usleep(100000);
                return false;
            }
            // The matching decrement is done in request_url() after the request
            $this->incr_task_per_host($url);
        }

        // Collected page count +1
        $this->incr_collected_url_num($url);

        // Download start time
        $page_time_start = microtime(true);

        // Hook executed before the download, e.g. to decide per URL whether
        // (and which) proxy to use. Only an array return replaces the link:
        // any other value would break the array accesses below.
        if ($this->on_before_download_page) 
        {
            $return = call_user_func($this->on_before_download_page, $url, $link, $this);
            if (is_array($return))
            {
                $link = $return;
            }
            unset($return);
        }

        requests::$input_encoding = null;
        $html = $this->request_url($url, $link);

        // Record the time spent on slow domains (20180213)
        $time_run = round(microtime(true) - $page_time_start);
        if ($time_run > 1)
        {
            $this->incr_duration_num($url, $time_run);
        }

        // Download finished, processing starts now
        $page_time_start = microtime(true);

        if (!$html) 
        {
            return false;
        }

        // Object describing the page currently being processed
        $page = array(
            'url'     => $url,
            'raw'     => $html,
            'request' => array(
                'url'          => $url,
                'method'       => $link['method'],
                'headers'      => $link['headers'],
                'params'       => $link['params'],
                'context_data' => $link['context_data'],
                'try_num'      => $link['try_num'],
                'max_try'      => $link['max_try'],
                'depth'        => $link['depth'],
                'taskid'       => self::$taskid,
            ),
        );
        unset($html);

        //--------------------------------------------------------------------------------
        // Callbacks
        //--------------------------------------------------------------------------------

        // Let the developer-supplied callback decide whether the page was
        // blocked by anti-crawler measures; a truthy result aborts processing
        if ($this->is_anti_spider) 
        {
            $is_anti_spider = call_user_func($this->is_anti_spider, $url, $page['raw'], $this);
            if ($is_anti_spider) 
            {
                return false;
            }
        }

        // Called after a page has been downloaded, mainly to post-process it
        // (for example to add HTML tags to the body)
        if ($this->on_download_page)
        {
            $return = call_user_func($this->on_download_page, $page, $this);
            // Keep the page unchanged when the callback forgot to return it
            if (isset($return))
            {
                $page = $return;
            }
            unset($return);
        }

        // Whether URLs should be extracted from this page; the url_type
        // specific callback may return false to disable the extraction
        $url_type = (isset($link['url_type']) and is_string($link['url_type'])) ? $link['url_type'] : '';
        $type_callbacks = array(
            'scan_page'    => $this->on_scan_page,
            'content_page' => $this->on_content_page,
            'list_page'    => $this->on_list_page,
        );

        $is_find_url = true;
        if (!empty($type_callbacks[$url_type]))
        {
            $return = call_user_func($type_callbacks[$url_type], $page, $page['raw'], $this);
            if (isset($return))
            {
                $is_find_url = $return;
            }
            unset($return);
        }

        // Extract next-level URLs unless a callback disabled it or the
        // maximum depth (0 or unset = unlimited) has been reached
        if ($is_find_url) 
        {
            if (empty(self::$configs['max_depth']) || $link['depth'] < self::$configs['max_depth']) 
            {
                $this->get_urls($page['raw'], $url, $link['depth'] + 1);
            }
        }

        // Content pages additionally get their fields extracted
        // (list pages could also provide data, source_type: urlcontext, not implemented)
        if ($url_type === 'content_page') 
        {
            $this->get_html_fields($page['raw'], $url, $page);
        }

        // Update the cached depth if the current depth is greater
        $this->incr_depth_num($link['depth']);

        // Processing time of this page
        $time_run = round(microtime(true) - $page_time_start, 3);
        log::debug('task id: '.self::$taskid." Success process page {$url} in {$time_run} s");

        $spider_time_run = util::time2second(intval(microtime(true) - self::$time_start));
        log::info('task id: '.self::$taskid." Spider running in {$spider_time_run}");

        // Interval between two pages in milliseconds; defaults to 100 ms,
        // crawling faster may be mistaken for a DDoS
        if (!isset(self::$configs['interval'])) 
        {
            self::$configs['interval'] = 100;
        }
        usleep(self::$configs['interval'] * 1000);
    }
1485
1486
    /**
     * Download a page and return its content.
     *
     * Configures the requests class (encoding, timeout, user agent, client
     * IP, proxy, headers), issues a GET or POST, lets on_status_code adjust
     * the result, follows a single 301/302 redirect and re-queues the link
     * for the retryable status codes handled below.
     *
     * @param mixed $url   URL to download
     * @param mixed $link  link options; the keys read here are proxy, headers,
     *                     method, params, context_data, try_num and max_try
     * @return mixed|false the content returned by requests::get()/post()
     *                     (with context_data appended when present), or false
     *                     when the download failed
     * @author seatle <[email protected]> 
     * @created time :2016-09-18 10:17
     */
    public function request_url($url, $link = array())
    {
        $time_start = microtime(true);

        // When an encoding is configured, do not let requests detect it
        if (isset(self::$configs['input_encoding'])) 
        {
            requests::$input_encoding = self::$configs['input_encoding'];
        }
        // Content that is not utf-8 must be converted, xpath only supports utf-8
        requests::$output_encoding = 'utf-8';
        requests::set_timeout(self::$configs['timeout']);
        requests::set_useragent(self::$configs['user_agent']);

        // Clear any forged client IP first, then apply the configured one
        requests::del_client_ip();
        if (!empty(self::$configs['client_ip'])) 
        {
            requests::set_client_ip(self::$configs['client_ip']);
        }

        // Clear the proxy first so the one of the previous URL is not reused
        requests::del_proxy();
        if (!empty($link['proxy'])) 
        {
            requests::set_proxy($link['proxy']);
        }

        // Apply per-link HTTP headers
        if (!empty($link['headers'])) 
        {
            foreach ($link['headers'] as $k=>$v) 
            {
                requests::set_header($k, $v);
            }
        }

        // Restrict the HTTP method to get or post; a missing method means get
        $method = isset($link['method']) ? trim(strtolower($link['method'])) : '';
        $method = ($method == 'post') ? 'post' : 'get';
        $params = empty($link['params']) ? array() : $link['params'];
        $html = requests::$method($url, $params);
        // Data attached to this URL (e.g. list-page data needed by a content
        // page) is appended to the downloaded content
        if ($html && !empty($link['context_data'])) 
        {
            $html .= $link['context_data'];
        }

        $http_code = requests::$status_code;

        // Request finished: decrease the per-host concurrency counter (2018-5 BY KEN)
        if (!empty(self::$configs['max_task_per_host']) and self::$configs['max_task_per_host'] > 0)
        {
            $this->incr_task_per_host($url, 'decr');
        }

        if ($this->on_status_code)
        {
            $return = call_user_func($this->on_status_code, $http_code, $url, $html, $this);
            if (isset($return)) 
            {
                $html = $return;
            }
            unset($return);
            if ( ! $html)
            {
                return false;
            }
        }

        if ($http_code != 200)
        {
            // For 301/302 fetch the content of the redirect target
            if ($http_code == 301 || $http_code == 302) 
            {
                $info = requests::$info;
                if (empty($info['redirect_url'])) 
                {
                    return false;
                }

                $url = $info['redirect_url'];
                requests::$input_encoding = null;
                // Fetch the target directly instead of recursing into this
                // method, which could loop forever. $method and $params are
                // reused so the get/post restriction above also applies here.
                $html = requests::$method($url, $params);
                if ($html && !empty($link['context_data'])) 
                {
                    $html .= $link['context_data'];
                }
            }
            else 
            {
                if ( ! empty(self::$configs['max_try']) and $http_code == 407)
                {
                    // Push the link back onto the queue so it is collected again
                    $this->queue_rpush($link);
                    log::error("Failed to download page {$url}");
                    self::$collect_fail++;
                }
                elseif ( ! empty(self::$configs['max_try']) and in_array($http_code, array('0', '502', '503', '429')))
                {
                    // One more attempt was used
                    $link['try_num']++;
                    // Re-queue while the attempts do not exceed the allowed number
                    if ( $link['try_num'] <= $link['max_try'] ) 
                    {
                        $this->queue_rpush($link);
                    }
                    log::error("Failed to download page {$url}, retry({$link['try_num']})");
                }
                else 
                {
                    log::error("Failed to download page {$url}");
                    self::$collect_fail++;
                }
                log::error("HTTP CODE: {$http_code}");
                return false;
            }
        }

        // Download time of this page
        $time_run = round(microtime(true) - $time_start, 3);
        log::debug("Success download page {$url} in {$time_run} s");
        self::$collect_succ++;

        return $html;
    }
1632
1633
    /**
1634
     * 分析提取HTML页面中的URL
1635
     * 
1636
     * @param mixed $html           HTML内容
1637
     * @param mixed $collect_url    抓取的URL, 用来拼凑完整页面的URL
1638
     * @return void
1639
     * @author seatle <[email protected]> 
1640
     * @created time :2016-09-18 10:17
1641
     */
1642
    public function get_urls($html, $collect_url, $depth = 0) 
1643
    { 
1644
        //--------------------------------------------------------------------------------
1645
        // 正则匹配出页面中的URL
1646
        //--------------------------------------------------------------------------------
1647
        $urls = selector::select($html, '//a/@href');             
1648
        //preg_match_all("/<a.*href=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isU", $html, $matchs); 
1649
        //$urls = array();
1650
        //if (!empty($matchs[1])) 
1651
        //{
1652
            //foreach ($matchs[1] as $url) 
1653
            //{
1654
                //$urls[] = str_replace(array("\"", "'",'&amp;'), array("",'','&'), $url);
1655
            //}
1656
        //}
1657
1658
        if (empty($urls)) 
1659
        {
1660
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
1661
        }
1662
1663
        // 如果页面上只有一个url,要把他转为数组,否则下面会报警告
1664
        if (!is_array($urls)) 
1665
        {
1666
            $urls = array($urls);
1667
        }
1668
1669
        foreach ($urls as $key=>$url) 
1670
        {
1671
            //限制最大子域名数量
1672
            if ( ! empty(self::$configs['max_sub_num']))
1673
            {
1674
                //抓取子域名超过超过指定值,就丢掉
1675
                $sub_domain_count = $this->sub_domain_count($url);
1676
                if ($sub_domain_count > self::$configs['max_sub_num'])
1677
                {
1678
                    unset($urls[$key]);
1679
                    log::debug('Task('.self::$taskid.') subdomin = '.$sub_domain_count.' more than '.self::$configs['max_sub_num'].",get_urls $url [Skip]");
1680
                    continue;
1681
                }
1682
            }
1683
            $urls[$key] = str_replace(array('"', "'", '&amp;'), array('', '', '&'), $url);
1684
        }
1685
1686
        //--------------------------------------------------------------------------------
1687
        // 过滤和拼凑URL
1688
        //--------------------------------------------------------------------------------
1689
        // 去除重复的URL
1690
        $urls = array_unique($urls);
1691
        foreach ($urls as $k=>$url) 
1692
        {
1693
            $url = trim($url);
1694
            if (empty($url)) 
1695
            {
1696
                continue;
1697
            }
1698
1699
            $val = $this->fill_url($url, $collect_url);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $val is correct as $this->fill_url($url, $collect_url) targeting phpspider\core\phpspider::fill_url() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1700
1701
            //限制单域名最大url数量 20180213
1702
            if ($val and isset(self::$configs['max_pages']) and self::$configs['max_pages'] > 0)
1703
            {
1704
                $domain_pages_num = $this->incr_pages_num($val);
0 ignored issues
show
Bug introduced by
Are you sure the assignment to $domain_pages_num is correct as $this->incr_pages_num($val) targeting phpspider\core\phpspider::incr_pages_num() seems to always return null.

This check looks for function or method calls that always return null and whose return value is assigned to a variable.

class A
{
    function getObject()
    {
        return null;
    }

}

$a = new A();
$object = $a->getObject();

The method getObject() can return nothing but null, so it makes no sense to assign that value to a variable.

The reason is most likely that a function or method is incomplete or has been reduced for debug purposes.

Loading history...
1705
                if ($domain_pages_num > self::$configs['max_pages'])
1706
                {
1707
                    continue;
1708
                }
1709
            }
1710
1711
            if ($val)
1712
            {
1713
                $urls[$k] = $val;
1714
            }
1715
            else 
1716
            {
1717
                unset($urls[$k]);
1718
            }
1719
        }
1720
1721
        if (empty($urls)) 
1722
        {
1723
            return false;
0 ignored issues
show
Bug Best Practice introduced by
The expression return false returns the type false which is incompatible with the documented return type void.
Loading history...
1724
        }
1725
1726
        //--------------------------------------------------------------------------------
1727
        // 把抓取到的URL放入队列
1728
        //--------------------------------------------------------------------------------
1729
        foreach ($urls as $url) 
1730
        {
1731
            if ($this->on_fetch_url) 
1732
            {
1733
                $return = call_user_func($this->on_fetch_url, $url, $this);
1734
                $url = isset($return) ? $return : $url;
1735
                unset($return);
1736
1737
                // 如果 on_fetch_url 返回 false,此URL不入队列
1738
                if (!$url) 
1739
                {
1740
                    continue;
1741
                }
1742
            }
1743
1744
            // 把当前页当做找到的url的Referer页
1745
            $options = array(
1746
                'headers' => array(
1747
                    'Referer' => $collect_url,
1748
                )
1749
            );
1750
            $this->add_url($url, $options, $depth);
1751
        }
1752
    }
1753
1754
    /**
     * Resolve a link found on a page into an absolute http(s) URL.
     *
     * Handles absolute (http:// / https://), protocol-relative (//host/x),
     * root-relative (/x), dot-relative (./x, ../x) and plain relative (x.html)
     * links. The resolved URL is then checked against self::$configs['domains']:
     * when that list is empty or its first entry is '*', every host is accepted;
     * otherwise the host must be listed.
     *
     * @param mixed $url            Link as it appeared in the page
     * @param mixed $collect_url    Absolute URL of the page the link was found on
     * @return string|false         Absolute URL, or false when the link must be skipped
     *                              (empty, pseudo link, template placeholder, non-http(s)
     *                              scheme, unresolvable "..", or host not in the domain list)
     * @author seatle <[email protected]> 
     * @created time :2016-09-23 17:13
     */
    public function fill_url($url, $collect_url)
    {
        $url         = trim($url);
        $collect_url = trim($collect_url);

        // Skip empty links, mail/javascript pseudo links, bare anchors and stray quotes
        if ($url == '' || preg_match("@^(mailto|javascript:|#|'|\")@i", $url))
        {
            return false;
        }
        // Skip template tags that were never rendered (the url is already trimmed,
        // so a leading-space variant cannot occur here)
        if (substr($url, 0, 3) == '<%=' or substr($url, 0, 1) == '{')
        {
            return false;
        }

        $base = @parse_url($collect_url);
        if (empty($base['scheme']) || empty($base['host'])) 
        {
            return false;
        }
        // The collecting page itself must be http(s)
        if ( ! in_array($base['scheme'], array('http', 'https')))
        {
            return false;
        }
        // Filter tel, sms, wechat, sinaweibo, weixin ... links: anything that carries
        // an explicit scheme other than http(s) cannot be crawled
        if (preg_match('#^([a-z][a-z0-9+.\-]*):#i', $url, $scheme_match)
            && ! in_array(strtolower($scheme_match[1]), array('http', 'https'), true))
        {
            return false;
        }

        $scheme = $base['scheme'];
        // Keep a non-default port so relative links stay on the same origin
        $domain = $base['host'];
        if ( ! empty($base['port']))
        {
            $domain .= ':'.$base['port'];
        }
        $path = empty($base['path']) ? '' : $base['path'];

        // Directory of the collecting page: a last path segment that contains a dot is
        // treated as a file name and dropped; dotted directories further up are kept
        $base_url_path = preg_replace('#/[^/]*\.[^/]*$#', '/', $domain.$path);
        $base_url_path = preg_replace('#/$#', '', $base_url_path);

        // Drop the fragment (a leading '#' was already rejected above)
        $pos = strpos($url, '#');
        if ($pos !== false)
        {
            $url = substr($url, 0, $pos);
        }

        // Protocol-relative: //www.jd.com/111.html
        if (substr($url, 0, 2) == '//')
        {
            $url = substr($url, 2);
        }
        // Root-relative: /1234.html
        elseif ($url[0] == '/')
        {
            $url = $domain.$url;
        }
        // Dot-relative: ./1234.html, ../1234.html
        elseif ($url[0] == '.')
        {
            if ( ! isset($url[2]))
            {
                return false;
            }

            $segments  = explode('/', $url);
            $last      = count($segments) - 1;
            $path_step = 0;
            $dstr      = '';
            foreach ($segments as $i => $segment)
            {
                if ($segment === '..')
                {
                    // Each ".." climbs one directory of the base path
                    $path_step++;
                    continue;
                }
                if ($segment === '.')
                {
                    // "." refers to the current directory and adds nothing
                    continue;
                }
                $dstr .= ($i < $last) ? $segment.'/' : $segment;
            }

            $base_segments = explode('/', $base_url_path);
            $keep          = count($base_segments) - $path_step;
            if ($keep <= 0)
            {
                // More ".." than directories available (would climb above the host)
                return false;
            }
            $url = implode('/', array_slice($base_segments, 0, $keep)).'/'.$dstr;
        }
        else 
        {
            if (strtolower(substr($url, 0, 7)) == 'http://')
            {
                $url    = substr($url, 7);
                $scheme = 'http';
            }
            elseif (strtolower(substr($url, 0, 8)) == 'https://')
            {
                $url    = substr($url, 8);
                $scheme = 'https';
            }
            // Plain relative path such as 1111.html
            else
            {
                // Remove empty segments only; a directory literally named "0" must survive
                $base_segments = array_filter(explode('/', $base_url_path), function ($segment)
                {
                    return $segment !== '';
                });
                $url = implode('/', $base_segments).'/'.$url;
            }
        }

        // Collapse runs of "/" in the host/path part only, so URLs carried inside the
        // query string (e.g. ?to=http://x) are left intact
        $query_pos = strpos($url, '?');
        if ($query_pos === false)
        {
            $url = preg_replace('#/+#', '/', $url);
        }
        else
        {
            $url = preg_replace('#/+#', '/', substr($url, 0, $query_pos)).substr($url, $query_pos);
        }
        $url = $scheme.'://'.$url;

        $parsed = @parse_url($url);
        // When a host could be parsed, check whether it is one we are allowed to crawl
        if ( ! empty($parsed['host']))
        {
            // Wildcard: accept every domain
            if (empty(self::$configs['domains']) or self::$configs['domains'][0] == '*')
            {
                return $url;
            }
            // Reject hosts outside the configured domains to keep the crawl focused
            if ( ! in_array($parsed['host'], self::$configs['domains'])) 
            {
                return false;
            }
        }

        return $url;
    }
1901
1902
    /**
     * Compress a link record by removing entries that only hold default/empty values.
     *
     * Every optional key whose value is empty() is removed; 'method' is also removed
     * when it is "get" in any letter case. All other entries are left untouched.
     *
     * @param mixed $link   Link record (array with 'url' and optional settings)
     * @return mixed        The same record without the removable entries
     * @author seatle <[email protected]> 
     * @created time :2016-11-05 18:58
     */
    public function link_compress($link)
    {
        $optional_keys = array(
            'url_type', 'method', 'headers', 'params', 'context_data',
            'proxy', 'try_num', 'max_try', 'depth',
        );

        foreach ($optional_keys as $key)
        {
            $removable = empty($link[$key]);
            // GET is the default request method, so it does not need to be stored
            if ( ! $removable && $key === 'method')
            {
                $removable = (strtolower($link['method']) == 'get');
            }

            if ($removable)
            {
                unset($link[$key]);
            }
        }

        return $link;
    }
1959
1960
    /**
     * Expand a compressed link record into a complete one.
     *
     * Every known key is present in the result, in a fixed order. A key that is not
     * set in the input receives its default; the defaults for 'proxy' and 'max_try'
     * are read from self::$configs, and only when they are actually needed.
     *
     * @param mixed $link   Compressed link record
     * @return array        Complete link record
     * @author seatle <[email protected]> 
     * @created time :2016-11-05 18:58
     */
    public function link_uncompress($link)
    {
        // Key order defines the layout of the resulting record; null marks the
        // entries whose default comes from the spider configuration
        $defaults = array(
            'url'          => '',
            'url_type'     => '',
            'method'       => 'get',
            'headers'      => array(),
            'params'       => array(),
            'context_data' => '',
            'proxy'        => null,
            'try_num'      => 0,
            'max_try'      => null,
            'depth'        => 0,
        );

        $expanded = array();
        foreach ($defaults as $key => $fallback)
        {
            if (isset($link[$key]))
            {
                $expanded[$key] = $link[$key];
                continue;
            }

            if ($key === 'proxy' || $key === 'max_try')
            {
                $fallback = self::$configs[$key];
            }
            $expanded[$key] = $fallback;
        }

        return $expanded;
    }
1985
1986
    /**
     * Extract the configured fields from a page, log them and export them.
     *
     * Runs get_fields() with self::$configs['fields'], lets the on_extract_page
     * callback replace or veto the result, counts the record, logs it as JSON and
     * writes it to the configured export target (csv file, sql file or database).
     * The process exits once the record counter exceeds self::$configs['max_fields']
     * (when that limit is not 0).
     *
     * @param mixed $html   HTML of the page
     * @param mixed $url    URL of the page
     * @param mixed $page   Page data handed to the callbacks
     * @return false|null   false when on_extract_page returned false, otherwise nothing
     * @author seatle <[email protected]> 
     * @created time :2016-09-18 10:17
     */
    public function get_html_fields($html, $url, $page) 
    {
        $fields = $this->get_fields(self::$configs['fields'], $html, $url, $page);
        if (empty($fields)) 
        {
            return;
        }

        if ($this->on_extract_page) 
        {
            $replacement = call_user_func($this->on_extract_page, $page, $fields);
            // false means: skip this page, nothing gets stored
            if ($replacement === false)
            {
                return false;
            }

            if (is_array($replacement))
            {
                $fields = $replacement;
            }
            elseif (isset($replacement))
            {
                log::warn('on_extract_page return value must be an array');
            }
            else 
            {
                log::warn("on_extract_page return value can't be empty");
            }
        }

        if ( ! is_array($fields)) 
        {
            return;
        }

        $fields_num = $this->incr_fields_num();
        if (self::$configs['max_fields'] != 0 && $fields_num > self::$configs['max_fields']) 
        {
            exit(0);
        }

        if (version_compare(PHP_VERSION, '5.4.0', '>='))
        {
            $fields_str = json_encode($fields, JSON_UNESCAPED_UNICODE);
        }
        else
        {
            // JSON_UNESCAPED_UNICODE is unavailable: turn the \uXXXX escapes back into UTF-8
            $fields_str = json_encode($fields);
            $fields_str = preg_replace_callback("#\\\u([0-9a-f]{4})#i", function ($matchs)
            {
                return @iconv('UCS-2BE', 'UTF-8', pack('H4', $matchs[1]));
            }, $fields_str);
        }

        if (util::is_win()) 
        {
            $fields_str = mb_convert_encoding($fields_str, 'gb2312', 'utf-8');
        }
        log::info("Result[{$fields_num}]: ".$fields_str);

        // Nothing more to do unless an export target is configured
        if (empty(self::$configs['export'])) 
        {
            return;
        }

        self::$export_type = isset(self::$configs['export']['type']) ? self::$configs['export']['type'] : '';
        switch (self::$export_type)
        {
            case 'csv':
                util::put_file(self::$export_file, util::format_csv($fields)."\n", FILE_APPEND);
                break;

            case 'sql':
                $sql = db::insert(self::$export_table, $fields, true);
                util::put_file(self::$export_file, $sql.";\n", FILE_APPEND);
                break;

            case 'db':
                db::insert(self::$export_table, $fields);
                break;
        }
    }
2070
2071
    /**
     * Extract fields from an HTML fragment according to a list of field configs.
     *
     * For each config the selector (xpath by default, css or regex) is applied to
     * the current HTML. A config with source_type "attached_url" first downloads
     * the page referenced by a previously extracted field and continues on that
     * page's HTML. Configs with "children" are resolved recursively on every
     * extracted fragment. When a required field is missing the whole result is
     * emptied. Afterwards the on_handle_img and on_extract_field callbacks may
     * rewrite individual values.
     *
     * @param mixed $confs  List of field configs (name, selector, selector_type, required, repeated, children ...)
     * @param mixed $html   HTML to extract from
     * @param mixed $url    URL of the page, used to resolve attached URLs
     * @param mixed $page   Page data handed to the on_extract_field callback
     * @return array        Map of field name => extracted value; empty when a required field is missing
     * @author seatle <[email protected]> 
     * @created time :2016-09-23 17:13
     */
    public function get_fields($confs, $html, $url, $page) 
    {
        $fields = array();
        foreach ($confs as $conf) 
        {
            // Whether this field may hold several values
            $repeated = isset($conf['repeated']) && $conf['repeated'] ? true : false;
            // Whether this field must have a value
            $required = isset($conf['required']) && $conf['required'] ? true : false;

            if (empty($conf['name'])) 
            {
                log::error("The field name is null, please check your \"fields\" and add the name of the field\n");
                exit;
            }

            $values = NULL;
            // An extraction rule is defined
            if (!empty($conf['selector'])) 
            {
                // This field lives on a page linked from a previously extracted field
                if (isset($conf['source_type']) && $conf['source_type']=='attached_url') 
                {
                    // Use the earlier field's content as the link; attached pages are
                    // downloaded right away instead of going through the queue
                    if (!empty($fields[$conf['attached_url']])) 
                    {
                        $collect_url = $this->fill_url($fields[$conf['attached_url']], $url);
                        log::debug("Find attached content page: {$collect_url}");
                        // Build a fresh link record for every attached page instead of
                        // reusing whatever a previous iteration left behind
                        $link = $this->link_uncompress(array('url' => $collect_url));
                        requests::$input_encoding = null;
                        $html                     = $this->request_url($collect_url, $link);
                        // Called after an attached page was downloaded so the callback can post-process it
                        if ($this->on_download_attached_page) 
                        {
                            $return = call_user_func($this->on_download_attached_page, $html, $this);
                            if (isset($return)) 
                            {
                                $html = $return;
                            }
                        }

                        // The link field has served its purpose once the attached page is fetched
                        unset($fields[$conf['attached_url']]);
                    }
                }

                // No selector type given, or xpath
                if (!isset($conf['selector_type']) || $conf['selector_type']=='xpath') 
                {
                    $values = $this->get_fields_xpath($html, $conf['selector'], $conf['name']);
                }
                elseif ($conf['selector_type']=='css') 
                {
                    $values = $this->get_fields_css($html, $conf['selector'], $conf['name']);
                }
                elseif ($conf['selector_type']=='regex') 
                {
                    $values = $this->get_fields_regex($html, $conf['selector'], $conf['name']);
                }

                // A value was found and child configs exist
                if (isset($values) && !empty($conf['children'])) 
                {
                    // Normalise a single result to an array so the loop below handles both cases
                    if (!is_array($values)) 
                    {
                        $values = array($values);
                    }
                    $child_values = array();
                    // Each fragment extracted for the parent is the HTML the children are extracted from
                    foreach ($values as $child_html) 
                    {
                        // Recursive call, so any nesting depth is supported
                        $child_value = $this->get_fields($conf['children'], $child_html, $url, $page);
                        if (!empty($child_value)) 
                        {
                            $child_values[] = $child_value;
                        }
                    }
                    // Store the children's result when there is one, otherwise keep the HTML fragments
                    if (!empty($child_values)) 
                    {
                        $values = $child_values;
                    }
                }
            }

            if (!isset($values)) 
            {
                // A missing required field invalidates the whole page: stop extracting
                if ($required) 
                {
                    log::warn("Selector {$conf['name']}[{$conf['selector']}] not found, It's a must");
                    // Emptying $fields makes the caller skip this page
                    $fields = array();
                    break;
                }
                // Store an empty string (not an array) so a later attached_url lookup never mixes string and array
                $fields[$conf['name']] = '';
            }
            else 
            {
                if (is_array($values)) 
                {
                    // Non-repeated fields keep only the first extracted element
                    $fields[$conf['name']] = $repeated ? $values : $values[0];
                }
                else 
                {
                    $fields[$conf['name']] = $values;
                }
            }
        }

        if (!empty($fields)) 
        {
            $pattern = "/<img\s+.*?src=[\"']{0,1}(.*)[\"']{0,1}[> \r\n\t]{1,}/isu";
            foreach ($fields as $fieldname => $data) 
            {
                // Called after a field was extracted, for content that contains img tags.
                // Only string values can be matched; repeated/child fields are arrays and
                // passing them to preg_match() raises a TypeError on PHP 8
                if ($this->on_handle_img && is_string($data) && preg_match($pattern, $data)) 
                {
                    $return = call_user_func($this->on_handle_img, $fieldname, $data);
                    if (!isset($return))
                    {
                        log::warn("on_handle_img return value can't be empty\n");
                    }
                    else 
                    {
                        // Keep the callback's result even when no on_extract_field callback follows
                        $data               = $return;
                        $fields[$fieldname] = $data;
                    }
                }

                // Called for every extracted field so its content can be processed further
                if ($this->on_extract_field) 
                {
                    $return = call_user_func($this->on_extract_field, $fieldname, $data, $page);
                    if (!isset($return))
                    {
                        log::warn("on_extract_field return value can't be empty\n");
                    }
                    else 
                    {
                        // Only a non-null result replaces the extracted value
                        $fields[$fieldname] = $return;
                    }
                }
            }
        }

        return $fields;
    }
2246
2247
    /**
     * Validate the export settings before crawling starts.
     *
     * Does nothing when self::$configs['export'] is empty. Otherwise, depending on
     * self::$export_type:
     *  - csv / sql: an export file path must be set;
     *  - db: the mysqli extension and a connection config must be available, a test
     *    connection must succeed and the export table must exist.
     * Any failed check is logged and terminates the process.
     *
     * @return void
     * @author seatle <[email protected]> 
     * @created time :2016-10-02 23:37
     */
    public function check_export()
    {
        // Nothing to verify unless an export target is configured
        if (empty(self::$configs['export'])) 
        {
            return;
        }

        if (self::$export_type == 'csv') 
        {
            if (empty(self::$export_file)) 
            {
                log::error('Export data into CSV files need to Set the file path.');
                exit;
            }
        }
        elseif (self::$export_type == 'sql') 
        {
            if (empty(self::$export_file)) 
            {
                log::error('Export data into SQL files need to Set the file path.');
                exit;
            }
        }
        elseif (self::$export_type == 'db') 
        {
            if (!function_exists('mysqli_connect'))
            {
                log::error('Export data to a database need Mysql support, unable to load mysqli extension.');
                exit;
            }

            if (empty(self::$db_config)) 
            {
                log::error('Export data to a database need Mysql support, you have not set a config array for connect.');
                exit;
            }

            $config = self::$db_config;
            // Probe connection. Since PHP 8.1 mysqli throws on failure by default and
            // "@" does not silence exceptions, so both failure styles are handled
            try
            {
                $probe = @mysqli_connect($config['host'], $config['user'], $config['pass'], $config['name'], $config['port']);
            }
            catch (\Exception $e)
            {
                log::error('Export data to a database need Mysql support, '.$e->getMessage());
                exit;
            }

            if (!$probe || mysqli_connect_errno())
            {
                log::error('Export data to a database need Mysql support, '.mysqli_connect_error());
                exit;
            }
            // The probe only proves the server is reachable; release it right away
            mysqli_close($probe);

            db::set_connect('default', $config);
            db::_init();

            if (!db::table_exists(self::$export_table))
            {
                log::error('Table '.self::$export_table.' does not exist');
                exit;
            }
        }
    }
2308
2309
    /**
     * Ask the operator whether to resume from crawl data already stored in Redis.
     *
     * Skipped unless Redis is enabled and the running state is not being saved.
     * When the current Redis database is not empty, a prompt is written to
     * STDOUT; answering "n" flushes the whole database, any other answer
     * (including an empty one) keeps the data.
     *
     * @return false|null false when the check is skipped, null otherwise
     */
    public function check_cache()
    {
        if (self::$save_running_state || !self::$use_redis)
        {
            return false;
        }

        // Inspect the size of the whole Redis database instead of listing keys.
        // Flushing also clears the whole database, so it must not be shared
        // with other projects.
        $stored = queue::dbsize();
        if (!($stored > 0))
        {
            return;
        }

        fwrite(
            STDOUT,
            "Found that the data of Redis, no continue will empty Redis data start again\n"
            .'Do you want to continue? [Y/n]'
        );

        // Only an explicit "n"/"N" means "do not continue"; everything else is a yes.
        $answer = strtolower(trim(fgets(STDIN)));
        if ($answer === 'n')
        {
            log::warn('Clear redis data...');
            // Flush in one call; deleting the keys one by one is far too slow.
            queue::flushdb();
        }
    }
2344
2345
    /**
     * Register this server in Redis and clear its leftover task status entries.
     *
     * @return false|null false when Redis is disabled, null otherwise
     */
    public function init_redis()
    {
        if (!self::$use_redis)
        {
            return false;
        }

        // Add the current server to the server list.
        $this->add_server_list(self::$serverid, self::$tasknum);

        // Remove the status of every task slot of this server;
        // useful when the processes were killed forcibly.
        $taskid = 1;
        while ($taskid <= self::$tasknum)
        {
            $this->del_task_status(self::$serverid, $taskid);
            $taskid++;
        }
    }
2362
2363
    /**
     * Publish the status of the current task.
     * Called by the master and the child processes after each collected page.
     *
     * The snapshot (task id, pid, memory in MB, success/failure counters and
     * pages per second) is JSON-encoded and stored under a per-task Redis key
     * when Redis is enabled, otherwise it is kept in self::$task_status.
     *
     * @return void
     * @author seatle <[email protected]>
     * @created time :2016-10-30 23:56
     */
    public function set_task_status()
    {
        $mem      = round(memory_get_usage(true) / (1024 * 1024), 2);
        $use_time = microtime(true) - self::$time_start;

        // Guard the division: with a non-positive elapsed time the division
        // would raise DivisionByZeroError on PHP 8 (or yield INF on PHP 7,
        // which json_encode() cannot encode).
        $speed = 0;
        if ($use_time > 0)
        {
            $speed = round((self::$collect_succ + self::$collect_fail) / $use_time, 2);
        }

        $status = array(
            'id' => self::$taskid,
            'pid' => self::$taskpid,
            'mem' => $mem,
            'collect_succ' => self::$collect_succ,
            'collect_fail' => self::$collect_fail,
            'speed' => $speed,
        );
        $task_status = json_encode($status);

        if (self::$use_redis)
        {
            $key = 'server-'.self::$serverid.'-task_status-'.self::$taskid;
            queue::set($key, $task_status);
        }
        else 
        {
            self::$task_status = array($task_status);
        }
    }
2396
2397
    /**
     * Delete the stored status of one task.
     *
     * @param mixed $serverid server id used in the Redis key
     * @param mixed $taskid   task id used in the Redis key
     * @return false|null false when Redis is disabled, null otherwise
     * @author seatle <[email protected]>
     * @created time :2016-11-16 11:06
     */
    public function del_task_status($serverid, $taskid)
    {
        if (!self::$use_redis)
        {
            return false;
        }

        queue::del('server-'.$serverid.'-task_status-'.$taskid);
    }
2413
2414
    /**
     * Fetch the stored status of one task; only the master process calls this.
     *
     * @param mixed $serverid server id used in the Redis key
     * @param mixed $taskid   task id used in the Redis key
     * @return mixed false when Redis is disabled, otherwise whatever
     *               queue::get() returns for the task status key
     * @author seatle <[email protected]>
     * @created time :2016-10-30 23:56
     */
    public function get_task_status($serverid, $taskid)
    {
        return self::$use_redis
            ? queue::get('server-'.$serverid.'-task_status-'.$taskid)
            : false;
    }
2432
2433
    /**
     * Fetch the status of every task of a server; only the master process calls this.
     *
     * With Redis enabled, one entry per task id from 1 to $tasknum is read from
     * Redis; otherwise the locally stored self::$task_status is returned.
     *
     * The former default `$serverid = 1` was dropped: an optional parameter in
     * front of a required one could never be omitted and is deprecated since
     * PHP 8.0. All existing calls already pass both arguments.
     *
     * @param mixed $serverid server id used in the Redis keys
     * @param int   $tasknum  number of tasks to read
     * @return array list of task status values
     * @author seatle <[email protected]>
     * @created time :2016-10-30 23:56
     */
    public function get_task_status_list($serverid, $tasknum)
    {
        if (!self::$use_redis)
        {
            return self::$task_status;
        }

        $task_status = array();
        for ($i = 1; $i <= $tasknum; $i++) 
        {
            $key           = "server-{$serverid}-task_status-".$i;
            $task_status[] = queue::get($key);
        }
        return $task_status;
    }
2457
2458
    /**
     * Add (or refresh) the given server in the server list kept in Redis.
     *
     * @param mixed $serverid id of the server, used as the list key
     * @param mixed $tasknum  number of tasks running on that server
     * @return false|null false when Redis is disabled, null otherwise
     * @author seatle <[email protected]>
     * @created time :2016-11-16 11:06
     */
    public function add_server_list($serverid, $tasknum)
    {
        if (!self::$use_redis) 
        {
            return false;
        }

        // Start from the stored list when there is one, otherwise from scratch.
        $stored  = queue::get('server_list');
        $servers = $stored ? json_decode($stored, true) : array();

        $servers[$serverid] = array(
            'serverid' => $serverid,
            'tasknum'  => $tasknum,
            'time'     => time(),
        );
        // Keep the list ordered by server id (a no-op for a single entry).
        ksort($servers);

        queue::set('server_list', json_encode($servers));
    }
2495
2496
    /**
     * Remove the given server from the server list kept in Redis.
     *
     * When other servers remain, the shortened list is written back. When the
     * removed server was the last one, the `server_list` key is deleted;
     * previously nothing was written in that case, so the stale entry of the
     * last server stayed in Redis.
     *
     * @param mixed $serverid id of the server to remove
     * @return false|null false when Redis is disabled, null otherwise
     * @author seatle <[email protected]>
     * @created time :2016-11-16 11:06
     */
    public function del_server_list($serverid)
    {
        if (!self::$use_redis) 
        {
            return false;
        }

        $server_list_json = queue::get('server_list');
        if (!$server_list_json)
        {
            return;
        }

        $server_list = json_decode($server_list_json, true);
        // Leave an undecodable value untouched rather than guessing at it.
        if (!is_array($server_list))
        {
            return;
        }

        unset($server_list[$serverid]);

        if (!empty($server_list)) 
        {
            // Other servers are still registered: store the updated list.
            ksort($server_list);
            queue::set('server_list', json_encode($server_list));
        }
        else
        {
            // The last server is gone: drop the key so no stale entry remains.
            queue::del('server_list');
        }
    }
2528
2529
    /**
     * Get the number of pages waiting to be crawled.
     *
     * @return mixed the Redis counter `collect_urls_num` when Redis is enabled,
     *               otherwise the local counter self::$collect_urls_num
     * @author seatle <[email protected]>
     * @created time :2016-09-23 17:13
     */
    public function get_collect_url_num()
    {
        return self::$use_redis
            ? queue::get('collect_urls_num')
            : self::$collect_urls_num;
    }
2549
2550
    /**
     * Get the number of pages that have already been crawled.
     *
     * @return mixed the Redis counter `collected_urls_num` when Redis is enabled,
     *               otherwise the local counter self::$collected_urls_num
     * @author seatle <[email protected]>
     * @created time :2016-09-23 17:13
     */
    public function get_collected_url_num()
    {
        return self::$use_redis
            ? queue::get('collected_urls_num')
            : self::$collected_urls_num;
    }
2570
2571
    /**
     * Increase the number of collected pages by one.
     *
     * @param mixed $url not used by this method
     * @return void
     * @author seatle <[email protected]>
     * @created time :2016-09-23 17:13
     */
    public function incr_collected_url_num($url)
    {
        if (!self::$use_redis)
        {
            // No Redis: count locally.
            self::$collected_urls_num++;
            return;
        }

        queue::incr('collected_urls_num');
    }
2590
2591
    /**
     * Push a link onto the left end of the collect queue.
     *
     * A URL that is already marked as queued is skipped unless $allowed_repeat
     * is true. The in-memory queue now honours $allowed_repeat as well; it used
     * to ignore the flag, so repeated pushes were silently dropped whenever
     * Redis was disabled.
     *
     * @param array $link           link description; must contain a non-empty 'url'
     * @param bool  $allowed_repeat whether an already queued URL may be queued again
     * @return bool true when the link was queued, false otherwise
     * @author seatle <[email protected]>
     * @created time :2016-09-23 17:13
     */
    public function queue_lpush($link = array(), $allowed_repeat = false)
    {
        if (empty($link) || empty($link['url'])) 
        {
            return false;
        }

        $url  = $link['url'];
        $link = $this->link_compress($link);

        $status = false;
        if (self::$use_redis)
        {
            $key  = 'collect_urls-'.md5($url);
            $lock = 'lock-'.$key;
            // Lock: the processes handle the same URL one after another.
            if (queue::lock($lock))
            {
                $exists = queue::exists($key); 
                // Not queued yet, or this URL may be queued repeatedly.
                if (!$exists || $allowed_repeat) 
                {
                    // One more page waiting to be crawled.
                    queue::incr('collect_urls_num');
                    // Mark the URL as waiting to be crawled first.
                    queue::set($key, time()); 
                    // Enqueue.
                    $link = json_encode($link);
                    // Random crawl order uses a set, sequential order uses a list.
                    if (self::$configs['queue_order'] == 'rand')
                    {
                        queue::sadd('collect_queue', $link);
                    }
                    else
                    {
                        queue::lpush('collect_queue', $link);
                    }
                    $status = true;
                }
                // Unlock.
                queue::unlock($lock);
            }
        }
        else 
        {
            $key = md5($url);
            // Same rule as the Redis branch: skip known URLs unless repeats are allowed.
            if ($allowed_repeat || !array_key_exists($key, self::$collect_urls))
            {
                self::$collect_urls_num++;
                self::$collect_urls[$key] = time();
                array_push(self::$collect_queue, $link);
                $status = true;
            }
        }
        return $status;
    }
2654
2655
    /**
     * Push a link onto the right end of the collect queue.
     *
     * A URL that is already marked as queued is skipped unless $allowed_repeat
     * is true. The in-memory queue now honours $allowed_repeat as well; it used
     * to ignore the flag, so repeated pushes were silently dropped whenever
     * Redis was disabled.
     *
     * NOTE(review): unlike queue_lpush(), the link is not passed through
     * link_compress() here - confirm whether that is intentional.
     *
     * @param array $link           link description; must contain a non-empty 'url'
     * @param bool  $allowed_repeat whether an already queued URL may be queued again
     * @return bool true when the link was queued, false otherwise
     * @author seatle <[email protected]>
     * @created time :2016-09-23 17:13
     */
    public function queue_rpush($link = array(), $allowed_repeat = false)
    {
        if (empty($link) || empty($link['url'])) 
        {
            return false;
        }

        $url = $link['url'];

        $status = false;
        if (self::$use_redis)
        {
            $key  = 'collect_urls-'.md5($url);
            $lock = 'lock-'.$key;
            // Lock: the processes handle the same URL one after another.
            if (queue::lock($lock))
            {
                $exists = queue::exists($key);
                // Not queued yet, or this URL may be queued repeatedly.
                if ( ! $exists || $allowed_repeat)
                {
                    // One more page waiting to be crawled.
                    queue::incr('collect_urls_num');
                    // Mark the URL as waiting to be crawled first.
                    queue::set($key, time());
                    // Enqueue.
                    $link = json_encode($link);
                    // Random crawl order uses a set, sequential order uses a list.
                    if (self::$configs['queue_order'] == 'rand')
                    {
                        queue::sadd('collect_queue', $link); // unordered set
                    }
                    else
                    {
                        queue::rpush('collect_queue', $link); // ordered list
                    }
                    $status = true;
                }
                // Unlock.
                queue::unlock($lock);
            }
        }
        else 
        {
            $key = md5($url);
            // Same rule as the Redis branch: skip known URLs unless repeats are allowed.
            if ($allowed_repeat || !array_key_exists($key, self::$collect_urls))
            {
                self::$collect_urls_num++;
                self::$collect_urls[$key] = time();
                array_unshift(self::$collect_queue, $link);
                $status = true;
            }
        }
        return $status;
    }
2717
2718
    /**
     * Take a link from the left end of the collect queue (last in, first out).
     *
     * Taking the most recently pushed link first avoids incomplete data when a
     * content page is split over several pages, and allows list pages to be
     * collected in order.
     *
     * @return mixed the link; with Redis it is the JSON-decoded queue entry
     *               (null when nothing could be decoded), without Redis the
     *               last element of the in-memory queue (null when it is empty)
     * @author seatle <[email protected]>
     * @created time :2016-09-23 17:13
     */
    public function queue_lpop()
    {
        if (!self::$use_redis)
        {
            return array_pop(self::$collect_queue);
        }

        // Random crawl order reads from a set, sequential order from a list.
        $encoded = self::$configs['queue_order'] == 'rand'
            ? queue::spop('collect_queue')
            : queue::lpop('collect_queue');

        return json_decode($encoded, true);
    }
2749
2750
    /**
     * Take a link from the right end of the collect queue.
     *
     * @return mixed the link; with Redis it is the JSON-decoded queue entry
     *               (null when nothing could be decoded), without Redis the
     *               first element of the in-memory queue (null when it is empty)
     * @author seatle <[email protected]>
     * @created time :2016-09-23 17:13
     */
    public function queue_rpop()
    {
        if (!self::$use_redis)
        {
            return array_shift(self::$collect_queue);
        }

        // Random crawl order reads from a set, sequential order from a list.
        $encoded = self::$configs['queue_order'] == 'rand'
            ? queue::spop('collect_queue')
            : queue::rpop('collect_queue');

        return json_decode($encoded, true);
    }
2778
2779
    /**
     * Get the length of the collect queue.
     *
     * @return mixed the size reported by Redis (set cardinality for random
     *               order, list length otherwise), or the element count of
     *               the in-memory queue when Redis is disabled
     * @author seatle <[email protected]>
     * @created time :2016-09-23 17:13
     */
    public function queue_lsize()
    {
        if (!self::$use_redis)
        {
            return count(self::$collect_queue);
        }

        // Random crawl order stores the queue in a set, sequential order in a list.
        return self::$configs['queue_order'] == 'rand'
            ? queue::scard('collect_queue')
            : queue::lsize('collect_queue');
    }
2806
2807
    /**
     * Record a newly reached crawl depth.
     *
     * Despite the "incr" name the stored value is not incremented: it is
     * raised to $depth only when $depth is greater than the value stored so far.
     * In Redis mode the compare-and-set runs under the 'lock-depth_num' lock
     * and is skipped entirely when the lock cannot be obtained.
     *
     * @param mixed $depth depth of the page that was just processed
     * @return void
     * @author seatle <[email protected]> 
     * @created time :2016-09-23 17:13
     */
    public function incr_depth_num($depth)
    {
        if ( ! self::$use_redis)
        {
            if (self::$depth_num < $depth)
            {
                self::$depth_num = $depth;
            }
            return;
        }

        $lock = 'lock-depth_num';
        // Lock for 2 seconds (third argument; presumably the lock expiry - confirm in queue::lock)
        if ( ! queue::lock($lock, time(), 2))
        {
            return;
        }

        if (queue::get('depth_num') < $depth)
        {
            queue::set('depth_num', $depth);
        }

        queue::unlock($lock);
    }
2838
2839
    /**
     * Read the crawl depth recorded so far.
     *
     * Uses the 'depth_num' queue key when Redis is enabled, otherwise the
     * in-process self::$depth_num value.
     *
     * @return mixed recorded depth; 0 when the Redis key holds no truthy value
     * @author seatle <[email protected]> 
     * @created time :2016-09-23 17:13
     */
    public function get_depth_num()
    {
        if ( ! self::$use_redis)
        {
            return self::$depth_num;
        }

        $stored = queue::get('depth_num');
        if ($stored)
        {
            return $stored;
        }

        return 0;
    }
2858
2859
    /**
     * Increase the count of extracted fields by one.
     *
     * Increments the 'fields_num' queue key in Redis mode, otherwise the
     * in-process self::$fields_num counter.
     *
     * @return mixed the counter value after the increment
     * @author seatle <[email protected]> 
     * @created time :2016-09-23 17:13
     */
    public function incr_fields_num()
    {
        if (self::$use_redis)
        {
            return queue::incr('fields_num');
        }

        // Pre-increment yields the updated value directly
        return ++self::$fields_num;
    }
2879
2880
    /**
     * Read the count of extracted fields.
     *
     * Reads the 'fields_num' queue key in Redis mode, otherwise the
     * in-process self::$fields_num counter.
     *
     * @return mixed the stored count, or 0 when the stored value is falsy
     * @author seatle <[email protected]> 
     * @created time :2016-09-23 17:13
     */
    public function get_fields_num()
    {
        $total = self::$use_redis ? queue::get('fields_num') : self::$fields_num;

        if ( ! $total)
        {
            return 0;
        }

        return $total;
    }
2899
2900
    /**
     * Increase the page counter for the domain of $url by one.
     *
     * Per the original note this counter is used to cap the number of pages
     * collected per domain. The counter key is the value returned by
     * getRootDomain($url, 'host'); when $url is empty or no domain can be
     * derived, the shared key 'all' is used.
     *
     * @param string $url URL of the collected page
     * @return mixed the counter value after the increment
     * @author KEN <[email protected]>
     * @created time :2018-05
     */
    public function incr_pages_num($url = '')
    {
        $domain = empty($url) ? '' : $this->getRootDomain($url, 'host');
        if (empty($domain))
        {
            $domain = 'all';
        }

        if (self::$use_redis)
        {
            return queue::incr('pages_num:'.$domain);
        }

        // First hit for this domain starts the counter at 1
        self::$pages_num[$domain] = empty(self::$pages_num[$domain])
            ? 1
            : self::$pages_num[$domain] + 1;

        return self::$pages_num[$domain];
    }
2935
2936
    /**
     * Add $time_run to the accumulated collect time of the domain of $url.
     *
     * Per the original note this is meant for slow requests (over one second)
     * and is used to cap the total collect time per domain; the threshold is
     * not checked here. The counter key is the value returned by
     * getRootDomain($url); when $url is empty or no domain can be derived,
     * the shared key 'all' is used.
     *
     * @param string $url      URL of the collected page
     * @param mixed  $time_run amount of time to add (default 1)
     * @return mixed the accumulated value after the addition
     * @author KEN <[email protected]>
     * @created time :2018-05
     */
    public function incr_duration_num($url = '', $time_run = 1)
    {
        $domain = empty($url) ? '' : $this->getRootDomain($url);
        if (empty($domain))
        {
            $domain = 'all';
        }

        if (self::$use_redis)
        {
            return queue::incr('duration:'.$domain, $time_run);
        }

        // First sample for this domain is stored as-is, later ones are added on top
        self::$duration[$domain] = empty(self::$duration[$domain])
            ? $time_run
            : self::$duration[$domain] + $time_run;

        return self::$duration[$domain];
    }
2971
2972
    /**
     * Read the accumulated slow-collect time of the domain of $url.
     *
     * The counter key is the value returned by getRootDomain($url); when $url
     * is empty or no domain can be derived, the shared key 'all' is used.
     *
     * @param string $url URL whose domain is looked up
     * @return mixed the accumulated value, or 0 when nothing truthy is stored
     * @author KEN <[email protected]>
     * @created time :2018-04
     */
    public function get_duration_num($url = '')
    {
        $domain = empty($url) ? '' : $this->getRootDomain($url);
        if (empty($domain))
        {
            $domain = 'all';
        }

        if ( ! self::$use_redis)
        {
            return empty(self::$duration[$domain]) ? 0 : self::$duration[$domain];
        }

        $elapsed = queue::get('duration:'.$domain);

        return $elapsed ? $elapsed : 0;
    }
2999
3000
    /**
     * Increase or decrease the counter of concurrent tasks for the host of $url.
     *
     * The counter key is the value returned by getRootDomain($url, 'host').
     * In Redis mode the 'task_per_host:<domain>' key is passed to
     * queue::incr() / queue::decr(). In in-memory mode the counter lives in
     * self::$task_per_host_counter and never drops below 0.
     *
     * @param string $url  URL whose host counter is updated
     * @param string $type 'decr' to decrease; any other value increases
     * @return int|false the counter value after the update (Redis mode: whatever
     *                   the queue backend returns); false when $url is empty
     *                   or no domain can be derived from it
     * @author KEN <[email protected]>
     * @created time :2018-05-28 16:40
     */
    public function incr_task_per_host($url = '', $type = 'incr')
    {
        if (empty($url))
        {
            return false;
        }
        $domain = $this->getRootDomain($url, 'host');
        if (empty($domain))
        {
            return false;
        }

        $is_decr = ($type == 'decr');

        if (self::$use_redis)
        {
            $key = 'task_per_host:'.$domain;
            return $is_decr ? queue::decr($key) : queue::incr($key);
        }

        $current = empty(self::$task_per_host_counter[$domain])
            ? 0
            : self::$task_per_host_counter[$domain];

        // A 'decr' on a missing or zero counter must stay at 0; it previously
        // fell into the "first use" branch and set the counter to 1.
        self::$task_per_host_counter[$domain] = $is_decr ? max(0, $current - 1) : $current + 1;

        return self::$task_per_host_counter[$domain];
    }
3050
3051
    /**
     * Get the current number of concurrent tasks for the host of $url.
     *
     * The counter key is the value returned by getRootDomain($url, 'host').
     * Reads the 'task_per_host:<domain>' queue key in Redis mode, otherwise
     * self::$task_per_host_counter.
     *
     * @param string $url URL whose host counter is read
     * @return mixed the stored count; 0 when $url is empty, no domain can be
     *               derived, or no truthy count is stored for the host
     * @author KEN <[email protected]>
     */
    public function get_task_per_host_num($url)
    {
        if (empty($url))
        {
            return 0;
        }
        $domain = $this->getRootDomain($url, 'host');
        if (empty($domain))
        {
            return 0;
        }

        if (self::$use_redis)
        {
            $count = queue::get('task_per_host:'.$domain);
        }
        else
        {
            // A host that was never counted has no entry yet; avoid the undefined-index read
            $count = isset(self::$task_per_host_counter[$domain])
                ? self::$task_per_host_counter[$domain]
                : 0;
        }

        // Normalise a missing/empty backend value to 0 so callers always get a number
        return $count ? $count : 0;
    }
3073
3074
    /**
     * Extract a field from $html with selector::select() using its default
     * selector type (XPath, per the original note).
     *
     * When selector::$error is set after the call, it is logged together with
     * the field name; the selection result is returned either way.
     *
     * @param mixed $html      document to select from
     * @param mixed $selector  selector expression
     * @param mixed $fieldname field name, used only in the error log message
     * @return mixed whatever selector::select() returned
     * @author seatle <[email protected]> 
     * @created time :2016-09-18 10:17
     */
    public function get_fields_xpath($html, $selector, $fieldname)
    {
        $matched = selector::select($html, $selector);

        if ( ! selector::$error)
        {
            return $matched;
        }

        log::error("Field(\"{$fieldname}\") ".selector::$error."\n");

        return $matched;
    }
3092
3093
    /**
     * Extract a field from $html with selector::select() in 'regex' mode.
     *
     * When selector::$error is set after the call, it is logged together with
     * the field name; the selection result is returned either way.
     *
     * @param mixed $html      document to select from
     * @param mixed $selector  regular expression
     * @param mixed $fieldname field name, used only in the error log message
     * @return mixed whatever selector::select() returned
     * @author seatle <[email protected]> 
     * @created time :2016-09-18 10:17
     */
    public function get_fields_regex($html, $selector, $fieldname)
    {
        $matched = selector::select($html, $selector, 'regex');

        if ( ! selector::$error)
        {
            return $matched;
        }

        log::error("Field(\"{$fieldname}\") ".selector::$error."\n");

        return $matched;
    }
3111
3112
    /**
     * Extract a field from $html with selector::select() in 'css' mode.
     *
     * When selector::$error is set after the call, it is logged together with
     * the field name; the selection result is returned either way.
     *
     * @param mixed $html      document to select from
     * @param mixed $selector  CSS selector
     * @param mixed $fieldname field name, used only in the error log message
     * @return mixed whatever selector::select() returned
     * @author seatle <[email protected]> 
     * @created time :2016-09-18 10:17
     */
    public function get_fields_css($html, $selector, $fieldname)
    {
        $matched = selector::select($html, $selector, 'css');

        if ( ! selector::$error)
        {
            return $matched;
        }

        log::error("Field(\"{$fieldname}\") ".selector::$error."\n");

        return $matched;
    }
3131
3132
    /**
     * Clear the shell output.
     *
     * Emits the bytes 27 91 72 27 91 50 74, i.e. the escape sequences
     * ESC[H followed by ESC[2J.
     *
     * @return void
     * @author seatle <[email protected]> 
     * @created time :2016-11-16 11:06
     */
    public function clear_echo()
    {
        $bytes = array(27, 91, 72, 27, 91, 50, 74);

        print implode('', array_map('chr', $bytes));
    }
3148
3149
    /**
     * Print $message in place of the previously printed message.
     *
     * Remembers (in a function-static variable) how many terminal rows the
     * last message occupied, erases that many rows with ANSI escape
     * sequences, then prints the new message followed by a newline.
     *
     * @param mixed $message           text to print; may contain "\n"
     * @param mixed $force_clear_lines when not null, overrides the remembered
     *                                 number of rows to erase
     * @return void
     * @author seatle <[email protected]> 
     * @created time :2016-11-16 11:06
     */
    public function replace_echo($message, $force_clear_lines = NULL)
    {
        static $last_lines = 0;

        if ( ! is_null($force_clear_lines))
        {
            $last_lines = $force_clear_lines;
        }

        // Ask the terminal for its width. exec() returns a string (or false),
        // so cast it and reject anything that is not a positive column count.
        $toss = $status = null;
        $term_width = (int) exec('tput cols', $toss, $status);
        if ($status || $term_width < 1)
        {
            $term_width = 64; // Arbitrary fall-back term width.
        }

        // Count the terminal rows the message occupies once long lines wrap.
        // Computed arithmetically so that a blank line always counts as one
        // row (str_split('') returns an empty array on PHP >= 8.2).
        $line_count = 0;
        foreach (explode("\n", $message) as $line)
        {
            $line_count += max(1, (int) ceil(strlen($line) / $term_width));
        }

        // Erase as many rows as the last output had. Per row: return to line
        // start, erase to end of line, move the cursor up one line, return to
        // line start, erase to end of line, return to line start.
        for ($i = 0; $i < $last_lines; $i++)
        {
            echo "\r\033[K\033[1A\r\033[K\r";
        }

        $last_lines = $line_count;

        echo $message."\n";
    }
3204
3205
    /**
     * Render the start-up/status screen (per the original note, never
     * reached on Windows).
     *
     * Builds a header with version, start time, run time, spider name, task
     * number and load average, appends the task table, the server table
     * (multi-server mode only) and the collect-status table, then prints
     * everything through replace_echo() so the previous screen is overwritten.
     *
     * @return void
     */
    public function display_ui()
    {
        // sys_getloadavg() returns false on failure; fall back to an empty
        // list so the loop and implode() below stay valid.
        $loadavg = sys_getloadavg();
        if ( ! is_array($loadavg))
        {
            $loadavg = array();
        }
        foreach ($loadavg as $k => $v)
        {
            $loadavg[$k] = round($v, 2);
        }

        $display_str = "\033[1A\n\033[K-----------------------------\033[47;30m PHPSPIDER \033[0m-----------------------------\n\033[0m";
        $run_time_str = util::time2second(time() - self::$time_start, false);
        $display_str .= 'PHPSpider version:'.self::VERSION.'          PHP version:'.PHP_VERSION."\n";
        $display_str .= 'start time:'.date('Y-m-d H:i:s', self::$time_start).'   run '.$run_time_str." \n";

        $display_str .= 'spider name: '.self::$configs['name']."\n";
        if (self::$multiserver)
        {
            $display_str .= 'server id: '.self::$serverid."\n";
        }
        $display_str .= 'task number: '.self::$tasknum."\n";
        $display_str .= 'load average: '.implode(', ', $loadavg)."\n";
        $display_str .= "document: https://doc.phpspider.org\n";

        $display_str .= $this->display_task_ui();

        if (self::$multiserver)
        {
            $display_str .= $this->display_server_ui();
        }

        $display_str .= $this->display_collect_ui();

        $display_str .= "---------------------------------------------------------------------\n";
        $display_str .= 'Press Ctrl-C to quit. Start success.'.date('Y-m-d H:i:s').' - '.round(memory_get_usage() / 1024 / 1024, 2).'MB'."\n";
        if (self::$terminate)
        {
            $display_str .= "\n\033[33mWait for the process exits...\033[0m";
        }

        // Overwrite the previously printed screen instead of scrolling
        $this->replace_echo($display_str);
    }
3253
3254
    /**
     * Build the "TASKS" table of the status screen.
     *
     * Prints one header row followed by one row per entry returned by
     * get_task_status_list() for the current server; each entry is a JSON
     * string, and entries that decode to an empty value are skipped.
     *
     * @return string the rendered table
     */
    public function display_task_ui()
    {
        $display_str = "-------------------------------\033[47;30m TASKS \033[0m-------------------------------\n";

        // Column title => column width
        $columns = array(
            'taskid'       => self::$taskid_length + 2,
            'taskpid'      => self::$pid_length + 2,
            'mem'          => self::$mem_length + 2,
            'collect succ' => self::$urls_length,
            'collect fail' => self::$urls_length,
            'speed'        => self::$speed_length + 2,
        );

        // Header: highlighted title, then blanks up to the column width
        foreach ($columns as $title => $width)
        {
            $display_str .= "\033[47;30m".$title."\033[0m".str_pad('', $width - strlen($title));
        }
        $display_str .= "\n";

        $widths = array_values($columns);
        $task_status = $this->get_task_status_list(self::$serverid, self::$tasknum);
        foreach ($task_status as $json)
        {
            $task = json_decode($json, true);
            if (empty($task))
            {
                continue;
            }

            // Cell values in the same order as $columns
            $cells = array(
                $task['id'],
                $task['pid'],
                $task['mem'].'MB',
                $task['collect_succ'],
                $task['collect_fail'],
                $task['speed'].'/s',
            );
            foreach ($cells as $i => $cell)
            {
                $display_str .= str_pad($cell, $widths[$i]);
            }
            $display_str .= "\n";
        }

        return $display_str;
    }
3286
3287
    /**
     * Build the "SERVER" table of the status screen (multi-server mode).
     *
     * Reads the JSON server list from the 'server_list' queue key and prints
     * one row per server, summing mem, speed and collect counters over the
     * task status entries returned by get_task_status_list() for that server.
     * When the server list is missing or cannot be decoded, only the header
     * is returned.
     *
     * @return string the rendered table
     */
    public function display_server_ui()
    {
        $display_str = "-------------------------------\033[47;30m SERVER \033[0m------------------------------\n";

        // The 'server' title is padded using strlen('serverid'), which makes
        // the header column self::$server_length wide - the same width the
        // data rows below use for the server id.
        $display_str .= "\033[47;30mserver\033[0m". str_pad('', self::$server_length+2-strlen('serverid')). 
            "\033[47;30mtasknum\033[0m". str_pad('', self::$tasknum_length+2-strlen('tasknum')). 
            "\033[47;30mmem\033[0m". str_pad('', self::$mem_length+2-strlen('mem')). 
            "\033[47;30mcollect succ\033[0m". str_pad('', self::$urls_length-strlen('collect succ')). 
            "\033[47;30mcollect fail\033[0m". str_pad('', self::$urls_length-strlen('collect fail')). 
            "\033[47;30mspeed\033[0m". str_pad('', self::$speed_length+2-strlen('speed')). 
            "\n";

        $server_list_json = queue::get('server_list');
        $server_list      = json_decode($server_list_json, true);
        // json_decode() yields null for a missing or invalid list; nothing to iterate then
        if ( ! is_array($server_list))
        {
            return $display_str;
        }

        foreach ($server_list as $server)
        {
            $serverid     = $server['serverid'];
            $tasknum      = $server['tasknum'];
            $mem          = 0;
            $speed        = 0;
            $collect_succ = $collect_fail = 0;

            // Aggregate the per-task figures of this server
            $task_status  = $this->get_task_status_list($serverid, $tasknum);
            foreach ($task_status as $json)
            {
                $task = json_decode($json, true);
                if (empty($task))
                {
                    continue;
                }
                $mem += $task['mem'];
                $speed += $task['speed'];
                $collect_fail += $task['collect_fail'];
                $collect_succ += $task['collect_succ'];
            }

            $display_str .= str_pad($serverid, self::$server_length).
                str_pad($tasknum, self::$tasknum_length + 2).
                str_pad($mem.'MB', self::$mem_length + 2).
                str_pad($collect_succ, self::$urls_length).
                str_pad($collect_fail, self::$urls_length).
                str_pad($speed.'/s', self::$speed_length + 2).
                "\n";
        }

        return $display_str;
    }
3332
3333
    /**
     * Build the "COLLECT STATUS" panel of the console UI.
     *
     * The panel consists of a title bar, a header row of highlighted column
     * titles and a single row holding the current counters, each padded to a
     * fixed column width.
     *
     * @return string The rendered panel, terminated by a newline.
     */
    public function display_collect_ui()
    {
        $panel = "---------------------------\033[47;30m COLLECT STATUS \033[0m--------------------------\n";

        // Counters are fetched up front, in this exact order, before any rendering.
        $found     = $this->get_collect_url_num();
        $done      = $this->get_collected_url_num();
        $pending   = $this->queue_lsize();
        $field_num = $this->get_fields_num();
        $depth_num = $this->get_depth_num();

        // One entry per column, in display order: title, column width, value.
        $columns = array(
            array('find pages', 16, $found),
            array('queue',      14, $pending),
            array('collected',  15, $done),
            array('fields',     15, $field_num),
            array('depth',      12, $depth_num),
        );

        $header_row = '';
        $value_row  = '';
        foreach ($columns as $column)
        {
            list($title, $width, $value) = $column;
            // The escape sequences take no screen space, so the gap after a
            // title is computed from the visible title text only.
            $header_row .= "\033[47;30m".$title."\033[0m".str_pad('', $width - strlen($title));
            $value_row  .= str_pad($value, $width);
        }

        return $panel.$header_row."\n".$value_row."\n";
    }
3357
3358
    /**
3359
     * 判断是否附件文件
3360
     * 
3361
     * @return void
3362
     * @author seatle <[email protected]> 
3363
     * @created time :2016-09-23 17:13
3364
     */
3365
    //public function is_attachment_file($url)
3366
    //{
3367
    //$mime_types = $GLOBALS['config']['mimetype'];
3368
    //$mime_types_flip = array_flip($mime_types);
3369
3370
    //$pathinfo = pathinfo($url);
3371
    //$fileext = isset($pathinfo['extension']) ? $pathinfo['extension'] : '';
3372
3373
    //$fileinfo = array();
3374
    //// 存在文件后缀并且是配置里面的后缀
3375
    //if (!empty($fileext) && isset($mime_types_flip[$fileext])) 
3376
    //{
3377
    //stream_context_set_default(
3378
    //array(
3379
    //'http' => array(
3380
    //'method' => 'HEAD'
3381
    //)
3382
    //)
3383
    //);
3384
    //// 代理和Cookie以后实现, 方法和 file_get_contents 一样 使用 stream_context_create 设置
3385
    //$headers = get_headers($url, 1);
3386
    //if (strpos($headers[0], '302')) 
3387
    //{
3388
    //$url = $headers['Location'];
3389
    //$headers = get_headers($url, 1);
3390
    //}
3391
    ////print_r($headers);
3392
    //$fileinfo = array(
3393
    //'basename' => isset($pathinfo['basename']) ? $pathinfo['basename'] : '',
3394
    //'filename' => isset($pathinfo['filename']) ? $pathinfo['filename'] : '',
3395
    //'fileext' => isset($pathinfo['extension']) ? $pathinfo['extension'] : '',
3396
    //'filesize' => isset($headers['Content-Length']) ? $headers['Content-Length'] : 0,
3397
    //'atime' => isset($headers['Date']) ? strtotime($headers['Date']) : time(),
3398
    //'mtime' => isset($headers['Last-Modified']) ? strtotime($headers['Last-Modified']) : time(),
3399
    //);
3400
3401
    //$mime_type = 'html';
3402
    //$content_type = isset($headers['Content-Type']) ? $headers['Content-Type'] : '';
3403
    //if (!empty($content_type)) 
3404
    //{
3405
    //$mime_type = isset($GLOBALS['config']['mimetype'][$content_type]) ? $GLOBALS['config']['mimetype'][$content_type] : $mime_type;
3406
    //}
3407
    //$mime_types_flip = array_flip($mime_types);
3408
    //// 判断一下是不是文件名被加什么后缀了, 比如 http://www.xxxx.com/test.jpg?token=xxxxx
3409
    //if (!isset($mime_types_flip[$fileinfo['fileext']]))
3410
    //{
3411
    //$fileinfo['fileext'] = $mime_type;
3412
    //$fileinfo['basename'] = $fileinfo['filename'].'.'.$mime_type;
3413
    //}
3414
    //}
3415
    //return $fileinfo;
3416
    //}
3417
3418
    //返回当前是否是主进程
3419
    /**
     * Tell whether the current process is the task master (main process).
     *
     * This docblock exists so that the method no longer inherits the orphaned
     * "@return void" comment that precedes it in the file.
     *
     * @return bool The value of self::$taskmaster.
     */
    public function is_taskmaster()
    {
        return self::$taskmaster;
    }
3423
3424
    // Return the ID of the current task (self::$taskid)
3425
    /**
     * Return the identifier of the current task.
     *
     * @return mixed The value of self::$taskid (presumably an integer task
     *               number; confirm against where the property is assigned).
     */
    public function get_task_id()
    {
        return self::$taskid;
    }
3429
3430
    //检测子域名数量
3431
    /**
     * Report, and when needed update, the number of distinct hosts recorded
     * under the root domain of a URL.
     *
     * Counting only happens when self::$use_redis is truthy. The counter is
     * stored in the queue backend under the root domain itself, and each host
     * that has been counted gets a marker key "sub_d-<host>" (the host is
     * replaced by its md5 when it is longer than 32 bytes).
     *
     * NOTE(review): the exact return values of queue::get()/queue::incr() are
     * not visible here; they are passed through unchanged.
     *
     * @param string $url URL whose host should be counted.
     * @return mixed 0 when the URL is empty, when no root domain or host can
     *               be derived, or when Redis is not in use; otherwise the
     *               counter value reported by the queue backend.
     */
    public function sub_domain_count($url)
    {
        if (empty($url))
        {
            return 0;
        }

        $root_domain = $this->getRootDomain($url, 'root');
        if (empty($root_domain))
        {
            return 0;
        }

        $host = $this->getRootDomain($url, 'host');
        if (empty($host) || ! self::$use_redis)
        {
            return 0;
        }

        $seen = queue::get($root_domain);

        // Once the configured limit is exceeded, stop registering new hosts.
        if ( ! empty(self::$configs['max_sub_num']) && $seen > self::$configs['max_sub_num'])
        {
            return $seen;
        }

        // Long hosts are hashed so the marker key stays short.
        $marker = 'sub_d-'.(strlen($host) > 32 ? md5($host) : $host);
        if (queue::exists($marker))
        {
            return $seen;
        }

        // First time this host is seen: bump the counter, then mark the host.
        $seen = queue::incr($root_domain);
        queue::set($marker, 1);

        return $seen;
    }
3470
3471
    //提取url的根域名 host domain subdomain name tld
3472
    /**
     * Extract the root (registrable) domain, or another part, of a URL.
     *
     * The host is split on dots and its right-most labels are matched against
     * a built-in list of known suffix labels. This is a heuristic, not a full
     * public-suffix lookup.
     *
     * @param string $url          URL or bare host. "http://" is prepended when
     *                             the value does not start with "http://" or
     *                             "https://".
     * @param string $type         Which part to return:
     *                             - 'host': the lower-cased host;
     *                             - 'tld': the matched suffix, e.g. "com" or "com.cn";
     *                             - 'subdomain': the label directly left of the
     *                               matched suffix (the 'name' entry of the result);
     *                             - 'arr': array with the keys scheme, host, path,
     *                               name, domain and tld;
     *                             - anything else (default 'root'): the root
     *                               domain, e.g. "example.com".
     * @param bool   $domain_check When false and $type is 'host', the host is
     *                             returned straight from parse_url() without
     *                             checking its suffix.
     * @return string|array|false  The requested part; $url itself when it is
     *                             empty; '' when no valid domain can be derived;
     *                             false when a host with more than two labels
     *                             ends in a suffix that is not in the list.
     */
    public function getRootDomain($url = '', $type = 'root', $domain_check = false)
    {
        if (empty($url))
        {
            return $url;
        }

        $url = trim($url);
        // Test for the complete "http(s)://" prefix. Testing for "http" alone
        // would leave bare hosts such as "httpbin.org" without a scheme, and
        // parse_url() would then report no host at all.
        if ( ! preg_match('/^https?:\/\//i', $url))
        {
            $url = 'http://'.$url;
        }

        // Cut the URL at the first character that cannot be part of the
        // scheme, host or path (query string, port, fragment, ...).
        $matches = array();
        if (preg_match_all('/(^https?:\/\/[\p{Han}a-zA-Z0-9\-\.\/]+)/iu', $url, $matches))
        {
            $url = $matches[0][0];
        }

        $url_parse = parse_url(strtolower($url));
        if (empty($url_parse['host']))
        {
            return '';
        }

        // Fast path: the caller only wants the host and no validation.
        if ($domain_check === false and $type == 'host')
        {
            return $url_parse['host'];
        }

        // Known suffix labels, flipped once into a lookup map so that each
        // membership test is a hash lookup instead of a scan of the list.
        static $suffix_lookup = null;
        if ($suffix_lookup === null)
        {
            $suffix_lookup = array_flip(array(
                'com', 'edu', 'gov', 'int', 'mil', 'net', 'org', 'biz', 'info', 'pro', 'name', 'coop', 'aero', 'xxx', 'idv', 'mobi',
                'cc', 'me', 'jp', 'uk', 'ws', 'eu', 'pw', 'kr', 'io', 'us', 'cn', 'al', 'dz', 'af', 'ar', 'ae', 'aw', 'om', 'az',
                'eg', 'et', 'ie', 'ee', 'ad', 'ao', 'ai', 'ag', 'at', 'au', 'mo', 'bb', 'pg', 'bs', 'pk', 'py', 'ps', 'bh', 'pa',
                'br', 'by', 'bm', 'bg', 'mp', 'bj', 'be', 'is', 'pr', 'ba', 'pl', 'bo', 'bz', 'bw', 'bt', 'bf', 'bi', 'bv', 'kp',
                'gq', 'dk', 'de', 'tl', 'tp', 'tg', 'dm', 'do', 'ru', 'ec', 'er', 'fr', 'fo', 'pf', 'gf', 'tf', 'va', 'ph', 'fj',
                'fi', 'cv', 'fk', 'gm', 'cg', 'cd', 'co', 'cr', 'gg', 'gd', 'gl', 'ge', 'cu', 'gp', 'gu', 'gy', 'kz', 'ht', 'nl',
                'an', 'hm', 'hn', 'ki', 'dj', 'kg', 'gn', 'gw', 'ca', 'gh', 'ga', 'kh', 'cz', 'zw', 'cm', 'qa', 'ky', 'km', 'ci',
                'kw', 'hr', 'ke', 'ck', 'lv', 'ls', 'la', 'lb', 'lt', 'lr', 'ly', 'li', 're', 'lu', 'rw', 'ro', 'mg', 'im', 'mv',
                'mt', 'mw', 'my', 'ml', 'mk', 'mh', 'mq', 'yt', 'mu', 'mr', 'um', 'as', 'vi', 'mn', 'ms', 'bd', 'pe', 'fm', 'mm',
                'md', 'ma', 'mc', 'mz', 'mx', 'nr', 'np', 'ni', 'ne', 'ng', 'nu', 'no', 'nf', 'na', 'za', 'aq', 'gs', 'pn', 'pt',
                'se', 'ch', 'sv', 'yu', 'sl', 'sn', 'cy', 'sc', 'sa', 'cx', 'st', 'sh', 'kn', 'lc', 'sm', 'pm', 'vc', 'lk', 'sk',
                'si', 'sj', 'sz', 'sd', 'sr', 'sb', 'so', 'tj', 'tw', 'th', 'tz', 'to', 'tc', 'tt', 'tn', 'tv', 'tr', 'tm', 'tk',
                'wf', 'vu', 'gt', 've', 'bn', 'ug', 'ua', 'uy', 'uz', 'es', 'eh', 'gr', 'hk', 'sg', 'nc', 'nz', 'hu', 'sy', 'jm',
                'am', 'ac', 'ye', 'iq', 'ir', 'il', 'it', 'in', 'id', 'vg', 'jo', 'vn', 'zm', 'je', 'td', 'gi', 'cl', 'cf', 'yr',
                'arpa', 'museum', 'asia', 'ax', 'bl', 'bq', 'cat', 'cw', 'gb', 'jobs', 'mf', 'rs', 'su', 'sx', 'tel', 'travel',
                'shop', 'ltd', 'store', 'vip', '网店', '中国', '公司', '网络', 'co.il', 'co.nz', 'co.uk', 'me.uk', 'org.uk', 'com.sb',
                '在线', '中文网', '移动', 'wang', 'club', 'ren', 'top', 'website', 'cool', 'company', 'city', 'email', 'market',
                'software', 'ninja', '我爱你', 'bike',
                'today', 'life', 'space', 'pub', 'site', 'help', 'link', 'photo', 'video', 'click', 'pics', 'sexy', 'audio', 'gift',
                'tech', '网址', 'online', 'win', 'download', 'party', 'bid', 'loan', 'date', 'trade', 'red', 'blue', 'pink', 'poker',
                'green', 'farm', 'zone', 'guru', 'tips', 'land', 'care', 'camp', 'cab', 'cash', 'limo', 'toys', 'tax', 'town',
                'fish', 'fund', 'fail', 'house', 'shoes', 'media', 'guide', 'tools', 'solar', 'watch', 'cheap', 'rocks', 'news',
                'live', 'lawyer', 'host', 'wiki', 'ink', 'design', 'lol', 'hiphop', 'hosting', 'diet', 'flowers', 'car', 'cars',
                'auto', 'mom', 'cq', 'he', 'nm', 'ln', 'jl', 'hl', 'js', 'zj', 'ah', 'jx', 'ha', 'hb', 'gx', 'hi', 'gz', 'yn',
                'xz', 'qh', 'nx', 'xj', 'xyz', 'xin', 'science', 'press', 'band', 'engineer', 'social', 'studio', 'work', 'game',
                'kim', 'games', 'group', '集团',
            ));
        }
        // Generic labels that get special treatment in the rules below.
        $generic_labels = array('com', 'net', 'org', 'edu', 'gov');

        $res = array(
            'scheme' => isset($url_parse['scheme']) ? $url_parse['scheme'] : '',
            'host'   => $url_parse['host'],
            'path'   => empty($url_parse['path']) ? '' : $url_parse['path'],
            'name'   => '',
            'domain' => '',
            'tld'    => '',
        );

        $labels = explode('.', $url_parse['host']);
        $count  = count($labels);
        if ($count < 2)
        {
            // A single label ("localhost", "com") can never form a domain.
            return '';
        }

        $last   = array_pop($labels);
        $last_1 = array_pop($labels);
        if ( ! isset($suffix_lookup[$last]))
        {
            // Unknown suffix: hosts with more than two labels report false,
            // two-label hosts report '' (kept as is for existing callers).
            return $count > 2 ? false : '';
        }

        // Default: "name.tld".
        $res['domain'] = $last_1.'.'.$last;
        $res['name']   = $last_1;
        $res['tld']    = $last;

        if ($count > 2)
        {
            $last_2 = array_pop($labels);

            // The second-to-last label is itself a known suffix and the last
            // one is not a generic label: treat both as the suffix, giving
            // "name.sld.tld" (e.g. "sina.com.cn").
            if ($last_1 !== $last and isset($suffix_lookup[$last_1]) and ! in_array($last, $generic_labels, true))
            {
                $res['domain'] = $last_2.'.'.$last_1.'.'.$last;
                $res['name']   = $last_2;
                $res['tld']    = $last_1.'.'.$last;
            }

            // Under "cn", a second-to-last label longer than two characters
            // counts as part of the suffix only when it is a generic label;
            // anything else (e.g. "top.cn") is an ordinary "name.cn" domain.
            if ($last === 'cn' and $last_1 !== $last and strlen($last_1) > 2 and ! in_array($last_1, $generic_labels, true))
            {
                $res['domain'] = $last_1.'.'.$last;
                $res['name']   = $last_1;
                $res['tld']    = $last;
            }
        }

        // Final sanity check that the result has the shape of a domain name.
        if ( ! preg_match('/^([\p{Han}a-zA-Z0-9])+([\p{Han}a-zA-Z0-9\-])*\.[a-zA-Z\.\p{Han}]+$/iu', $res['domain']))
        {
            return '';
        }

        switch ($type)
        {
            case 'arr':
                return $res;
            case 'host':
                return $res['host'];
            case 'tld':
                return $res['tld'];
            case 'subdomain':
                return $res['name'];
            default:
                return $res['domain'];
        }
    }
3597
3598
}
3599