Passed
Branch master (489407)
by Ricardo
02:43
created
core/phpspider.bak20170807.php 1 patch
Spacing   +104 added lines, -104 removed lines patch added patch discarded remove patch
@@ -15,7 +15,7 @@  discard block
 block discarded – undo
15 15
 
16 16
 namespace phpspider\core;
17 17
 
18
-require_once __DIR__ . '/constants.php';
18
+require_once __DIR__.'/constants.php';
19 19
 
20 20
 use phpspider\core\requests;
21 21
 use phpspider\core\selector;
@@ -352,7 +352,7 @@  discard block
 block discarded – undo
352 352
     function __construct($configs = array())
353 353
     {
354 354
         // 产生时钟云,解决php7下面ctrl+c无法停止bug
355
-        declare(ticks = 1);
355
+        declare(ticks=1);
356 356
 
357 357
         // 先打开以显示验证报错内容
358 358
         log::$log_show = true;
@@ -369,25 +369,25 @@  discard block
 block discarded – undo
369 369
             exit;
370 370
         }
371 371
 
372
-        $configs['name']        = isset($configs['name'])        ? $configs['name']        : 'phpspider';
373
-        $configs['proxies']     = isset($configs['proxies'])     ? $configs['proxies']     : '';
374
-        $configs['user_agent']  = isset($configs['user_agent'])  ? $configs['user_agent']  : self::AGENT_PC;
372
+        $configs['name']        = isset($configs['name']) ? $configs['name'] : 'phpspider';
373
+        $configs['proxies']     = isset($configs['proxies']) ? $configs['proxies'] : '';
374
+        $configs['user_agent']  = isset($configs['user_agent']) ? $configs['user_agent'] : self::AGENT_PC;
375 375
         $configs['user_agents'] = isset($configs['user_agents']) ? $configs['user_agents'] : null;
376
-        $configs['client_ip']   = isset($configs['client_ip'])   ? $configs['client_ip']   : null;
377
-        $configs['client_ips']  = isset($configs['client_ips'])  ? $configs['client_ips']  : null;
378
-        $configs['interval']    = isset($configs['interval'])    ? $configs['interval']    : self::INTERVAL;
379
-        $configs['timeout']     = isset($configs['timeout'])     ? $configs['timeout']     : self::TIMEOUT;
380
-        $configs['max_try']     = isset($configs['max_try'])     ? $configs['max_try']     : self::MAX_TRY;
381
-        $configs['max_depth']   = isset($configs['max_depth'])   ? $configs['max_depth']   : 0;
382
-        $configs['max_fields']  = isset($configs['max_fields'])  ? $configs['max_fields']  : 0;
383
-        $configs['export']      = isset($configs['export'])      ? $configs['export']      : array();
376
+        $configs['client_ip']   = isset($configs['client_ip']) ? $configs['client_ip'] : null;
377
+        $configs['client_ips']  = isset($configs['client_ips']) ? $configs['client_ips'] : null;
378
+        $configs['interval']    = isset($configs['interval']) ? $configs['interval'] : self::INTERVAL;
379
+        $configs['timeout']     = isset($configs['timeout']) ? $configs['timeout'] : self::TIMEOUT;
380
+        $configs['max_try']     = isset($configs['max_try']) ? $configs['max_try'] : self::MAX_TRY;
381
+        $configs['max_depth']   = isset($configs['max_depth']) ? $configs['max_depth'] : 0;
382
+        $configs['max_fields']  = isset($configs['max_fields']) ? $configs['max_fields'] : 0;
383
+        $configs['export']      = isset($configs['export']) ? $configs['export'] : array();
384 384
 
385 385
         // csv、sql、db
386
-        self::$export_type  = isset($configs['export']['type'])  ? $configs['export']['type']  : '';
387
-        self::$export_file  = isset($configs['export']['file'])  ? $configs['export']['file']  : '';
386
+        self::$export_type  = isset($configs['export']['type']) ? $configs['export']['type'] : '';
387
+        self::$export_file  = isset($configs['export']['file']) ? $configs['export']['file'] : '';
388 388
         self::$export_table = isset($configs['export']['table']) ? $configs['export']['table'] : '';
389
-        self::$db_config    = isset($configs['db_config'])       ? $configs['db_config']       : array();
390
-        self::$queue_config = isset($configs['queue_config'])    ? $configs['queue_config']    : array();
389
+        self::$db_config    = isset($configs['db_config']) ? $configs['db_config'] : array();
390
+        self::$queue_config = isset($configs['queue_config']) ? $configs['queue_config'] : array();
391 391
 
392 392
         // 是否设置了并发任务数, 并且大于1, 而且不是windows环境
393 393
         if (isset($configs['tasknum']) && $configs['tasknum'] > 1 && !util::is_win()) 
@@ -604,7 +604,7 @@  discard block
 block discarded – undo
604 604
         $command2 = isset($argv[2]) ? $argv[2] : '';
605 605
 
606 606
         // 根据命令做相应处理
607
-        switch($command)
607
+        switch ($command)
608 608
         {
609 609
             // 启动 phpspider
610 610
         case 'start':
@@ -766,7 +766,7 @@  discard block
 block discarded – undo
766 766
             // 显示最后结果
767 767
             log::$log_show = true;
768 768
 
769
-            $spider_time_run = util::time2second(intval(microtime(true) - self::$time_start));
769
+            $spider_time_run = util::time2second(intval(microtime(true)-self::$time_start));
770 770
             log::note("Spider finished in {$spider_time_run}");
771 771
 
772 772
             $get_collected_url_num = $this->get_collected_url_num();
@@ -795,12 +795,12 @@  discard block
 block discarded – undo
795 795
         // 检查PHP版本
796 796
         if (version_compare(PHP_VERSION, '5.3.0', 'lt')) 
797 797
         {
798
-            log::error('PHP 5.3+ is required, currently installed version is: ' . phpversion());
798
+            log::error('PHP 5.3+ is required, currently installed version is: '.phpversion());
799 799
             exit;
800 800
         }
801 801
 
802 802
         // 检查CURL扩展
803
-        if(!function_exists('curl_init'))
803
+        if (!function_exists('curl_init'))
804 804
         {
805 805
             log::error("The curl extension was not found");
806 806
             exit;
@@ -861,7 +861,7 @@  discard block
 block discarded – undo
861 861
             exit;
862 862
         }
863 863
 
864
-        foreach ( self::$configs['scan_urls'] as $url ) 
864
+        foreach (self::$configs['scan_urls'] as $url) 
865 865
         {
866 866
             // 只检查配置中的入口URL, 通过 add_scan_url 添加的不检查了.
867 867
             if (!$this->is_scan_page($url))
@@ -917,7 +917,7 @@  discard block
 block discarded – undo
917 917
         //--------------------------------------------------------------------------------
918 918
 
919 919
         // 添加入口URL到队列
920
-        foreach ( self::$configs['scan_urls'] as $url ) 
920
+        foreach (self::$configs['scan_urls'] as $url) 
921 921
         {
922 922
             // false 表示不允许重复
923 923
             $this->add_scan_url($url, null, false);
@@ -965,13 +965,13 @@  discard block
 block discarded – undo
965 965
         $pid = pcntl_fork();
966 966
 
967 967
         // 主进程记录子进程pid
968
-        if($pid > 0)
968
+        if ($pid > 0)
969 969
         {
970 970
             // 暂时没用
971 971
             //self::$taskpids[$taskid] = $pid;
972 972
         }
973 973
         // 子进程运行
974
-        elseif(0 === $pid)
974
+        elseif (0 === $pid)
975 975
         {
976 976
             log::warn("Fork children task({$taskid}) successful...");
977 977
 
@@ -999,7 +999,7 @@  discard block
 block discarded – undo
999 999
     {
1000 1000
         queue::set_connect('default', self::$queue_config);
1001 1001
         queue::init(); 
1002
-        while( $queue_lsize = $this->queue_lsize() )
1002
+        while ($queue_lsize = $this->queue_lsize())
1003 1003
         { 
1004 1004
             // 如果是主任务
1005 1005
             if (self::$taskmaster) 
@@ -1008,7 +1008,7 @@  discard block
 block discarded – undo
1008 1008
                 if (self::$tasknum > 1 && !self::$fork_task_complete) 
1009 1009
                 {
1010 1010
                     // 主进程采集到两倍于任务数时, 生成子任务一起采集
1011
-                    if ( $queue_lsize > self::$tasknum*2 ) 
1011
+                    if ($queue_lsize > self::$tasknum * 2) 
1012 1012
                     {
1013 1013
                         self::$fork_task_complete = true;
1014 1014
 
@@ -1037,7 +1037,7 @@  discard block
 block discarded – undo
1037 1037
             else 
1038 1038
             {
1039 1039
                 // 如果队列中的网页比任务数2倍多, 子任务可以采集, 否则等待...
1040
-                if ( $queue_lsize > self::$tasknum*2 ) 
1040
+                if ($queue_lsize > self::$tasknum * 2) 
1041 1041
                 {
1042 1042
                     // 抓取页面
1043 1043
                     $this->collect_page();
@@ -1176,7 +1176,7 @@  discard block
 block discarded – undo
1176 1176
             if (self::$configs['max_depth'] == 0 || $link['depth'] < self::$configs['max_depth']) 
1177 1177
             {
1178 1178
                 // 分析提取HTML页面中的URL
1179
-                $this->get_urls($page['raw'], $url, $link['depth'] + 1);
1179
+                $this->get_urls($page['raw'], $url, $link['depth']+1);
1180 1180
             }
1181 1181
         }
1182 1182
 
@@ -1191,10 +1191,10 @@  discard block
 block discarded – undo
1191 1191
         $this->incr_depth_num($link['depth']);
1192 1192
 
1193 1193
         // 处理页面耗时时间
1194
-        $time_run = round(microtime(true) - $page_time_start, 3);
1194
+        $time_run = round(microtime(true)-$page_time_start, 3);
1195 1195
         log::debug("Success process page {$url} in {$time_run} s");
1196 1196
 
1197
-        $spider_time_run = util::time2second(intval(microtime(true) - self::$time_start));
1197
+        $spider_time_run = util::time2second(intval(microtime(true)-self::$time_start));
1198 1198
         log::info("Spider running in {$spider_time_run}");
1199 1199
 
1200 1200
         // 爬虫爬取每个网页的时间间隔, 单位: 毫秒
@@ -1314,12 +1314,12 @@  discard block
 block discarded – undo
1314 1314
                     log::error("Failed to download page {$url}");
1315 1315
                     self::$collect_fail++;
1316 1316
                 }
1317
-                elseif (in_array($http_code, array('0','502','503','429'))) 
1317
+                elseif (in_array($http_code, array('0', '502', '503', '429'))) 
1318 1318
                 {
1319 1319
                     // 采集次数加一
1320 1320
                     $link['try_num']++;
1321 1321
                     // 抓取次数 小于 允许抓取失败次数
1322
-                    if ( $link['try_num'] <= $link['max_try'] ) 
1322
+                    if ($link['try_num'] <= $link['max_try']) 
1323 1323
                     {
1324 1324
                         // 扔到队列头部去, 继续采集
1325 1325
                         $this->queue_rpush($link);
@@ -1337,7 +1337,7 @@  discard block
 block discarded – undo
1337 1337
         }
1338 1338
 
1339 1339
         // 爬取页面耗时时间
1340
-        $time_run = round(microtime(true) - $time_start, 3);
1340
+        $time_run = round(microtime(true)-$time_start, 3);
1341 1341
         log::debug("Success download page {$url} in {$time_run} s");
1342 1342
         self::$collect_succ++;
1343 1343
 
@@ -1382,7 +1382,7 @@  discard block
 block discarded – undo
1382 1382
 
1383 1383
         foreach ($urls as $key=>$url) 
1384 1384
         {
1385
-            $urls[$key] = str_replace(array("\"", "'",'&amp;'), array("",'','&'), $url);
1385
+            $urls[$key] = str_replace(array("\"", "'", '&amp;'), array("", '', '&'), $url);
1386 1386
         }
1387 1387
 
1388 1388
         //--------------------------------------------------------------------------------
@@ -1458,12 +1458,12 @@  discard block
 block discarded – undo
1458 1458
 
1459 1459
         // 排除JavaScript的连接
1460 1460
         //if (strpos($url, "javascript:") !== false) 
1461
-        if( preg_match("@^(javascript:|#|'|\")@i", $url) || $url == '')
1461
+        if (preg_match("@^(javascript:|#|'|\")@i", $url) || $url == '')
1462 1462
         {
1463 1463
             return false;
1464 1464
         }
1465 1465
         // 排除没有被解析成功的语言标签
1466
-        if(substr($url, 0, 3) == '<%=')
1466
+        if (substr($url, 0, 3) == '<%=')
1467 1467
         {
1468 1468
             return false;
1469 1469
         }
@@ -1482,46 +1482,46 @@  discard block
 block discarded – undo
1482 1482
         $domain = $parse_url['host'];
1483 1483
         $path = empty($parse_url['path']) ? '' : $parse_url['path'];
1484 1484
         $base_url_path = $domain.$path;
1485
-        $base_url_path = preg_replace("/\/([^\/]*)\.(.*)$/","/",$base_url_path);
1486
-        $base_url_path = preg_replace("/\/$/",'',$base_url_path);
1485
+        $base_url_path = preg_replace("/\/([^\/]*)\.(.*)$/", "/", $base_url_path);
1486
+        $base_url_path = preg_replace("/\/$/", '', $base_url_path);
1487 1487
 
1488 1488
         $i = $path_step = 0;
1489 1489
         $dstr = $pstr = '';
1490
-        $pos = strpos($url,'#');
1491
-        if($pos > 0)
1490
+        $pos = strpos($url, '#');
1491
+        if ($pos > 0)
1492 1492
         {
1493 1493
             // 去掉#和后面的字符串
1494 1494
             $url = substr($url, 0, $pos);
1495 1495
         }
1496 1496
 
1497 1497
         // 京东变态的都是 //www.jd.com/111.html
1498
-        if(substr($url, 0, 2) == '//')
1498
+        if (substr($url, 0, 2) == '//')
1499 1499
         {
1500 1500
             $url = str_replace("//", "", $url);
1501 1501
         }
1502 1502
         // /1234.html
1503
-        elseif($url[0] == '/')
1503
+        elseif ($url[0] == '/')
1504 1504
         {
1505 1505
             $url = $domain.$url;
1506 1506
         }
1507 1507
         // ./1234.html、../1234.html 这种类型的
1508
-        elseif($url[0] == '.')
1508
+        elseif ($url[0] == '.')
1509 1509
         {
1510
-            if(!isset($url[2]))
1510
+            if (!isset($url[2]))
1511 1511
             {
1512 1512
                 return false;
1513 1513
             }
1514 1514
             else
1515 1515
             {
1516
-                $urls = explode('/',$url);
1517
-                foreach($urls as $u)
1516
+                $urls = explode('/', $url);
1517
+                foreach ($urls as $u)
1518 1518
                 {
1519
-                    if( $u == '..' )
1519
+                    if ($u == '..')
1520 1520
                     {
1521 1521
                         $path_step++;
1522 1522
                     }
1523 1523
                     // 遇到 ., 不知道为什么不直接写$u == '.', 貌似一样的
1524
-                    else if( $i < count($urls)-1 )
1524
+                    else if ($i < count($urls)-1)
1525 1525
                     {
1526 1526
                         $dstr .= $urls[$i].'/';
1527 1527
                     }
@@ -1531,29 +1531,29 @@  discard block
 block discarded – undo
1531 1531
                     }
1532 1532
                     $i++;
1533 1533
                 }
1534
-                $urls = explode('/',$base_url_path);
1535
-                if(count($urls) <= $path_step)
1534
+                $urls = explode('/', $base_url_path);
1535
+                if (count($urls) <= $path_step)
1536 1536
                 {
1537 1537
                     return false;
1538 1538
                 }
1539 1539
                 else
1540 1540
                 {
1541 1541
                     $pstr = '';
1542
-                    for($i=0;$i<count($urls)-$path_step;$i++){ $pstr .= $urls[$i].'/'; }
1542
+                    for ($i = 0; $i < count($urls)-$path_step; $i++) { $pstr .= $urls[$i].'/'; }
1543 1543
                     $url = $pstr.$dstr;
1544 1544
                 }
1545 1545
             }
1546 1546
         }
1547 1547
         else 
1548 1548
         {
1549
-            if( strtolower(substr($url, 0, 7))=='http://' )
1549
+            if (strtolower(substr($url, 0, 7)) == 'http://')
1550 1550
             {
1551
-                $url = preg_replace('#^http://#i','',$url);
1551
+                $url = preg_replace('#^http://#i', '', $url);
1552 1552
                 $scheme = "http";
1553 1553
             }
1554
-            else if( strtolower(substr($url, 0, 8))=='https://' )
1554
+            else if (strtolower(substr($url, 0, 8)) == 'https://')
1555 1555
             {
1556
-                $url = preg_replace('#^https://#i','',$url);
1556
+                $url = preg_replace('#^https://#i', '', $url);
1557 1557
                 $scheme = "https";
1558 1558
             }
1559 1559
             else
@@ -1650,16 +1650,16 @@  discard block
 block discarded – undo
1650 1650
     public function link_uncompress($link)
1651 1651
     {
1652 1652
         $link = array(
1653
-            'url'          => isset($link['url'])          ? $link['url']          : '',             
1654
-            'url_type'     => isset($link['url_type'])     ? $link['url_type']     : '',             
1655
-            'method'       => isset($link['method'])       ? $link['method']       : 'get',             
1656
-            'headers'      => isset($link['headers'])      ? $link['headers']      : array(),    
1657
-            'params'       => isset($link['params'])       ? $link['params']       : array(),           
1653
+            'url'          => isset($link['url']) ? $link['url'] : '',             
1654
+            'url_type'     => isset($link['url_type']) ? $link['url_type'] : '',             
1655
+            'method'       => isset($link['method']) ? $link['method'] : 'get',             
1656
+            'headers'      => isset($link['headers']) ? $link['headers'] : array(),    
1657
+            'params'       => isset($link['params']) ? $link['params'] : array(),           
1658 1658
             'context_data' => isset($link['context_data']) ? $link['context_data'] : '',                
1659
-            'proxies'      => isset($link['proxies'])      ? $link['proxies']      : self::$configs['proxies'],             
1660
-            'try_num'      => isset($link['try_num'])      ? $link['try_num']      : 0,                 
1661
-            'max_try'      => isset($link['max_try'])      ? $link['max_try']      : self::$configs['max_try'],
1662
-            'depth'        => isset($link['depth'])        ? $link['depth']        : 0,             
1659
+            'proxies'      => isset($link['proxies']) ? $link['proxies'] : self::$configs['proxies'],             
1660
+            'try_num'      => isset($link['try_num']) ? $link['try_num'] : 0,                 
1661
+            'max_try'      => isset($link['max_try']) ? $link['max_try'] : self::$configs['max_try'],
1662
+            'depth'        => isset($link['depth']) ? $link['depth'] : 0,             
1663 1663
         );
1664 1664
 
1665 1665
         return $link;
@@ -1704,12 +1704,12 @@  discard block
 block discarded – undo
1704 1704
                     exit(0);
1705 1705
                 }
1706 1706
 
1707
-                if (version_compare(PHP_VERSION,'5.4.0','<'))
1707
+                if (version_compare(PHP_VERSION, '5.4.0', '<'))
1708 1708
                 {
1709 1709
                     $fields_str = json_encode($fields);
1710
-                    $fields_str = preg_replace_callback( "#\\\u([0-9a-f]{4})#i", function($matchs) {
1710
+                    $fields_str = preg_replace_callback("#\\\u([0-9a-f]{4})#i", function($matchs) {
1711 1711
                         return iconv('UCS-2BE', 'UTF-8', pack('H4', $matchs[1]));
1712
-                    }, $fields_str ); 
1712
+                    }, $fields_str); 
1713 1713
                 } 
1714 1714
                 else
1715 1715
                 {
@@ -1775,7 +1775,7 @@  discard block
 block discarded – undo
1775 1775
             if (!empty($conf['selector'])) 
1776 1776
             {
1777 1777
                 // 如果这个field是上一个field的附带连接
1778
-                if (isset($conf['source_type']) && $conf['source_type']=='attached_url') 
1778
+                if (isset($conf['source_type']) && $conf['source_type'] == 'attached_url') 
1779 1779
                 {
1780 1780
                     // 取出上个field的内容作为连接, 内容分页是不进队列直接下载网页的
1781 1781
                     if (!empty($fields[$conf['attached_url']])) 
@@ -1802,15 +1802,15 @@  discard block
 block discarded – undo
1802 1802
                 }
1803 1803
 
1804 1804
                 // 没有设置抽取规则的类型 或者 设置为 xpath
1805
-                if (!isset($conf['selector_type']) || $conf['selector_type']=='xpath') 
1805
+                if (!isset($conf['selector_type']) || $conf['selector_type'] == 'xpath') 
1806 1806
                 {
1807 1807
                     $values = $this->get_fields_xpath($html, $conf['selector'], $conf['name']);
1808 1808
                 }
1809
-                elseif ($conf['selector_type']=='css') 
1809
+                elseif ($conf['selector_type'] == 'css') 
1810 1810
                 {
1811 1811
                     $values = $this->get_fields_css($html, $conf['selector'], $conf['name']);
1812 1812
                 }
1813
-                elseif ($conf['selector_type']=='regex') 
1813
+                elseif ($conf['selector_type'] == 'regex') 
1814 1814
                 {
1815 1815
                     $values = $this->get_fields_regex($html, $conf['selector'], $conf['name']);
1816 1816
                 }
@@ -1957,7 +1957,7 @@  discard block
 block discarded – undo
1957 1957
 
1958 1958
                 $config = self::$db_config;
1959 1959
                 @mysqli_connect($config['host'], $config['user'], $config['pass'], $config['name'], $config['port']);
1960
-                if(mysqli_connect_errno())
1960
+                if (mysqli_connect_errno())
1961 1961
                 {
1962 1962
                     log::error("Export data to a database need Mysql support, Error: ".mysqli_connect_error());
1963 1963
                     exit;
@@ -1994,7 +1994,7 @@  discard block
 block discarded – undo
1994 1994
             $msg .= "Do you want to continue? [Y/n]";
1995 1995
             fwrite(STDOUT, $msg);
1996 1996
             $arg = strtolower(trim(fgets(STDIN)));
1997
-            $arg = empty($arg) || !in_array($arg, array('y','n')) ? 'y' : $arg;
1997
+            $arg = empty($arg) || !in_array($arg, array('y', 'n')) ? 'y' : $arg;
1998 1998
             if ($arg == 'n') 
1999 1999
             {
2000 2000
                 foreach ($keys as $key) 
@@ -2034,9 +2034,9 @@  discard block
 block discarded – undo
2034 2034
     public function set_task_status()
2035 2035
     {
2036 2036
         // 每采集成功一个页面, 生成当前进程状态到文件, 供主进程使用
2037
-        $mem = round(memory_get_usage(true)/(1024*1024),2);
2038
-        $use_time = microtime(true) - self::$time_start; 
2039
-        $speed = round((self::$collect_succ + self::$collect_fail) / $use_time, 2);
2037
+        $mem = round(memory_get_usage(true) / (1024 * 1024), 2);
2038
+        $use_time = microtime(true)-self::$time_start; 
2039
+        $speed = round((self::$collect_succ+self::$collect_fail) / $use_time, 2);
2040 2040
         $status = array(
2041 2041
             'id' => self::$taskid,
2042 2042
             'pid' => self::$taskpid,
@@ -2609,7 +2609,7 @@  discard block
 block discarded – undo
2609 2609
     {
2610 2610
         static $last_lines = 0;
2611 2611
 
2612
-        if(!is_null($force_clear_lines)) 
2612
+        if (!is_null($force_clear_lines)) 
2613 2613
         {
2614 2614
             $last_lines = $force_clear_lines;
2615 2615
         }
@@ -2617,19 +2617,19 @@  discard block
 block discarded – undo
2617 2617
         // 获取终端宽度
2618 2618
         $toss = $status = null;
2619 2619
         $term_width = exec('tput cols', $toss, $status);
2620
-        if($status || empty($term_width)) 
2620
+        if ($status || empty($term_width)) 
2621 2621
         {
2622 2622
             $term_width = 64; // Arbitrary fall-back term width.
2623 2623
         }
2624 2624
 
2625 2625
         $line_count = 0;
2626
-        foreach(explode("\n", $message) as $line) 
2626
+        foreach (explode("\n", $message) as $line) 
2627 2627
         {
2628 2628
             $line_count += count(str_split($line, $term_width));
2629 2629
         }
2630 2630
 
2631 2631
         // Erasure MAGIC: Clear as many lines as the last output had.
2632
-        for($i = 0; $i < $last_lines; $i++) 
2632
+        for ($i = 0; $i < $last_lines; $i++) 
2633 2633
         {
2634 2634
             // Return to the beginning of the line
2635 2635
             echo "\r";
@@ -2666,16 +2666,16 @@  discard block
 block discarded – undo
2666 2666
         $display_str = "\033[1A\n\033[K-----------------------------\033[47;30m PHPSPIDER \033[0m-----------------------------\n\033[0m";
2667 2667
         //$display_str = "-----------------------------\033[47;30m PHPSPIDER \033[0m-----------------------------\n\033[0m";
2668 2668
         $run_time_str = util::time2second(time()-self::$time_start, false);
2669
-        $display_str .= 'PHPSpider version:' . self::VERSION . "          PHP version:" . PHP_VERSION . "\n";
2670
-        $display_str .= 'start time:'. date('Y-m-d H:i:s', self::$time_start).'   run ' . $run_time_str . " \n";
2669
+        $display_str .= 'PHPSpider version:'.self::VERSION."          PHP version:".PHP_VERSION."\n";
2670
+        $display_str .= 'start time:'.date('Y-m-d H:i:s', self::$time_start).'   run '.$run_time_str." \n";
2671 2671
 
2672
-        $display_str .= 'spider name: ' . self::$configs['name'] . "\n";
2672
+        $display_str .= 'spider name: '.self::$configs['name']."\n";
2673 2673
         if (self::$multiserver) 
2674 2674
         {
2675
-            $display_str .= 'server id: ' . self::$serverid."\n";
2675
+            $display_str .= 'server id: '.self::$serverid."\n";
2676 2676
         }
2677
-        $display_str .= 'task number: ' . self::$tasknum . "\n";
2678
-        $display_str .= 'load average: ' . implode(", ", $loadavg) . "\n";
2677
+        $display_str .= 'task number: '.self::$tasknum."\n";
2678
+        $display_str .= 'load average: '.implode(", ", $loadavg)."\n";
2679 2679
         $display_str .= "document: https://doc.phpspider.org\n";
2680 2680
 
2681 2681
         $display_str .= $this->display_task_ui();
@@ -2705,12 +2705,12 @@  discard block
 block discarded – undo
2705 2705
     {
2706 2706
         $display_str = "-------------------------------\033[47;30m TASKS \033[0m-------------------------------\n";
2707 2707
 
2708
-        $display_str .= "\033[47;30mtaskid\033[0m". str_pad('', self::$taskid_length+2-strlen('taskid')). 
2709
-            "\033[47;30mtaskpid\033[0m". str_pad('', self::$pid_length+2-strlen('taskpid')). 
2710
-            "\033[47;30mmem\033[0m". str_pad('', self::$mem_length+2-strlen('mem')). 
2711
-            "\033[47;30mcollect succ\033[0m". str_pad('', self::$urls_length-strlen('collect succ')). 
2712
-            "\033[47;30mcollect fail\033[0m". str_pad('', self::$urls_length-strlen('collect fail')). 
2713
-            "\033[47;30mspeed\033[0m". str_pad('', self::$speed_length+2-strlen('speed')). 
2708
+        $display_str .= "\033[47;30mtaskid\033[0m".str_pad('', self::$taskid_length+2-strlen('taskid')). 
2709
+            "\033[47;30mtaskpid\033[0m".str_pad('', self::$pid_length+2-strlen('taskpid')). 
2710
+            "\033[47;30mmem\033[0m".str_pad('', self::$mem_length+2-strlen('mem')). 
2711
+            "\033[47;30mcollect succ\033[0m".str_pad('', self::$urls_length-strlen('collect succ')). 
2712
+            "\033[47;30mcollect fail\033[0m".str_pad('', self::$urls_length-strlen('collect fail')). 
2713
+            "\033[47;30mspeed\033[0m".str_pad('', self::$speed_length+2-strlen('speed')). 
2714 2714
             "\n";
2715 2715
 
2716 2716
         // "\033[32;40m [OK] \033[0m"
@@ -2738,12 +2738,12 @@  discard block
 block discarded – undo
2738 2738
     {
2739 2739
         $display_str = "-------------------------------\033[47;30m SERVER \033[0m------------------------------\n";
2740 2740
 
2741
-        $display_str .= "\033[47;30mserver\033[0m". str_pad('', self::$server_length+2-strlen('serverid')). 
2742
-            "\033[47;30mtasknum\033[0m". str_pad('', self::$tasknum_length+2-strlen('tasknum')). 
2743
-            "\033[47;30mmem\033[0m". str_pad('', self::$mem_length+2-strlen('mem')). 
2744
-            "\033[47;30mcollect succ\033[0m". str_pad('', self::$urls_length-strlen('collect succ')). 
2745
-            "\033[47;30mcollect fail\033[0m". str_pad('', self::$urls_length-strlen('collect fail')). 
2746
-            "\033[47;30mspeed\033[0m". str_pad('', self::$speed_length+2-strlen('speed')). 
2741
+        $display_str .= "\033[47;30mserver\033[0m".str_pad('', self::$server_length+2-strlen('serverid')). 
2742
+            "\033[47;30mtasknum\033[0m".str_pad('', self::$tasknum_length+2-strlen('tasknum')). 
2743
+            "\033[47;30mmem\033[0m".str_pad('', self::$mem_length+2-strlen('mem')). 
2744
+            "\033[47;30mcollect succ\033[0m".str_pad('', self::$urls_length-strlen('collect succ')). 
2745
+            "\033[47;30mcollect fail\033[0m".str_pad('', self::$urls_length-strlen('collect fail')). 
2746
+            "\033[47;30mspeed\033[0m".str_pad('', self::$speed_length+2-strlen('speed')). 
2747 2747
             "\n";
2748 2748
 
2749 2749
         $server_list_json = queue::get("server_list");
@@ -2784,11 +2784,11 @@  discard block
 block discarded – undo
2784 2784
     {
2785 2785
         $display_str = "---------------------------\033[47;30m COLLECT STATUS \033[0m--------------------------\n";
2786 2786
 
2787
-        $display_str .= "\033[47;30mfind pages\033[0m". str_pad('', 16-strlen('find pages')). 
2788
-            "\033[47;30mqueue\033[0m". str_pad('', 14-strlen('queue')). 
2789
-            "\033[47;30mcollected\033[0m". str_pad('', 15-strlen('collected')). 
2790
-            "\033[47;30mfields\033[0m". str_pad('', 15-strlen('fields')). 
2791
-            "\033[47;30mdepth\033[0m". str_pad('', 12-strlen('depth')). 
2787
+        $display_str .= "\033[47;30mfind pages\033[0m".str_pad('', 16-strlen('find pages')). 
2788
+            "\033[47;30mqueue\033[0m".str_pad('', 14-strlen('queue')). 
2789
+            "\033[47;30mcollected\033[0m".str_pad('', 15-strlen('collected')). 
2790
+            "\033[47;30mfields\033[0m".str_pad('', 15-strlen('fields')). 
2791
+            "\033[47;30mdepth\033[0m".str_pad('', 12-strlen('depth')). 
2792 2792
             "\n";
2793 2793
 
2794 2794
         $collect   = $this->get_collect_url_num();
Please login to merge, or discard this patch.
core/requests.php 1 patch
Spacing   +60 added lines, -60 removed lines patch added patch discarded remove patch
@@ -72,21 +72,21 @@  discard block
 block discarded – undo
72 72
     public static $output_encoding = null;
73 73
     public static $cookies         = array(); // array of cookies to pass
74 74
     // $cookies['username'] = "seatle";
75
-    public static $rawheaders = array();                        // array of raw headers to send
76
-    public static $domain_cookies = array();                    // array of cookies for domain to pass
77
-    public static $hosts = array();                             // random host binding for make request faster
78
-    public static $headers = array();                           // headers returned from server sent here
79
-    public static $useragents = array("requests/2.0.0");        // random agent we masquerade as
80
-    public static $client_ips = array();                        // random ip we masquerade as
81
-    public static $proxies = array();                           // random proxy ip
82
-    public static $raw = "";                                    // head + body content returned from server sent here
83
-    public static $head = "";                                   // head content
84
-    public static $content = "";                                // The body before encoding
85
-    public static $text = "";                                   // The body after encoding
86
-    public static $info = array();                              // curl info
87
-    public static $history = 302;                               // http request status before redirect. ex:30x
88
-    public static $status_code = 0;                             // http request status
89
-    public static $error = "";                                  // error messages sent here
75
+    public static $rawheaders = array(); // array of raw headers to send
76
+    public static $domain_cookies = array(); // array of cookies for domain to pass
77
+    public static $hosts = array(); // random host binding for make request faster
78
+    public static $headers = array(); // headers returned from server sent here
79
+    public static $useragents = array("requests/2.0.0"); // random agent we masquerade as
80
+    public static $client_ips = array(); // random ip we masquerade as
81
+    public static $proxies = array(); // random proxy ip
82
+    public static $raw = ""; // head + body content returned from server sent here
83
+    public static $head = ""; // head content
84
+    public static $content = ""; // The body before encoding
85
+    public static $text = ""; // The body after encoding
86
+    public static $info = array(); // curl info
87
+    public static $history = 302; // http request status before redirect. ex:30x
88
+    public static $status_code = 0; // http request status
89
+    public static $error = ""; // error messages sent here
90 90
 
91 91
     /**
92 92
      * set timeout
@@ -289,7 +289,7 @@  discard block
 block discarded – undo
289 289
         {
290 290
             return false;
291 291
         }
292
-        if ( empty($domain) ) 
292
+        if (empty($domain)) 
293 293
         {
294 294
             self::$cookies = array();
295 295
         }
@@ -541,17 +541,17 @@  discard block
 block discarded – undo
541 541
      */
542 542
     public static function init()
543 543
     {
544
-        if (!is_resource ( self::$ch ))
544
+        if (!is_resource(self::$ch))
545 545
         {
546
-            self::$ch = curl_init ();
547
-            curl_setopt( self::$ch, CURLOPT_RETURNTRANSFER, true );
548
-            curl_setopt( self::$ch, CURLOPT_HEADER, false );
549
-            curl_setopt( self::$ch, CURLOPT_USERAGENT, "phpspider-requests/".self::VERSION );
546
+            self::$ch = curl_init();
547
+            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, true);
548
+            curl_setopt(self::$ch, CURLOPT_HEADER, false);
549
+            curl_setopt(self::$ch, CURLOPT_USERAGENT, "phpspider-requests/".self::VERSION);
550 550
             // 如果设置了两个时间,就分开设置
551 551
             if (is_array(self::$timeout)) 
552 552
             {
553
-                curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout[0] );
554
-                curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout[1]);
553
+                curl_setopt(self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout[0]);
554
+                curl_setopt(self::$ch, CURLOPT_TIMEOUT, self::$timeout[1]);
555 555
             }
556 556
             else 
557 557
             {
@@ -560,7 +560,7 @@  discard block
 block discarded – undo
560 560
             }
561 561
             curl_setopt(self::$ch, CURLOPT_MAXREDIRS, 5); //maximum number of redirects allowed
562 562
             // 在多线程处理场景下使用超时选项时,会忽略signals对应的处理函数,但是无耐的是还有小概率的crash情况发生
563
-            curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true);
563
+            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
564 564
         }
565 565
         return self::$ch;
566 566
     }
@@ -570,7 +570,7 @@  discard block
 block discarded – undo
570 570
      */
571 571
     public static function get($url, $fields = array(), $allow_redirects = true, $cert = NULL)
572 572
     {
573
-        self::init ();
573
+        self::init();
574 574
         return self::request($url, 'get', $fields, NULL, $allow_redirects, $cert);
575 575
     }
576 576
 
@@ -593,19 +593,19 @@  discard block
 block discarded – undo
593 593
      */
594 594
     public static function post($url, $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL)
595 595
     {
596
-        self::init ();
596
+        self::init();
597 597
         return self::request($url, 'POST', $fields, $files, $allow_redirects, $cert);
598 598
     }
599 599
 
600 600
     public static function put($url, $fields = array(), $allow_redirects = true, $cert = NULL)
601 601
     {
602
-        self::init ();
602
+        self::init();
603 603
         return self::request($url, 'PUT', $fields, $allow_redirects, $cert);
604 604
     }
605 605
 
606 606
     public static function delete($url, $fields = array(), $allow_redirects = true, $cert = NULL)
607 607
     {
608
-        self::init ();
608
+        self::init();
609 609
         return self::request($url, 'DELETE', $fields, $allow_redirects, $cert);
610 610
     }
611 611
 
@@ -614,19 +614,19 @@  discard block
 block discarded – undo
614 614
     // 此方法经常被用来测试超文本链接的有效性,可访问性,和最近的改变。.
615 615
     public static function head($url, $fields = array(), $allow_redirects = true, $cert = NULL)
616 616
     {
617
-        self::init ();
617
+        self::init();
618 618
         self::request($url, 'HEAD', $fields, $allow_redirects, $cert);
619 619
     }
620 620
 
621 621
     public static function options($url, $fields = array(), $allow_redirects = true, $cert = NULL)
622 622
     {
623
-        self::init ();
623
+        self::init();
624 624
         return self::request($url, 'OPTIONS', $fields, $allow_redirects, $cert);
625 625
     }
626 626
 
627 627
     public static function patch($url, $fields = array(), $allow_redirects = true, $cert = NULL)
628 628
     {
629
-        self::init ();
629
+        self::init();
630 630
         return self::request($url, 'PATCH', $fields, $allow_redirects, $cert);
631 631
     }
632 632
 
@@ -645,7 +645,7 @@  discard block
 block discarded – undo
645 645
     public static function request($url, $method = 'GET', $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL)
646 646
     {
647 647
         $method = strtoupper($method);
648
-        if(!self::_is_url($url))
648
+        if (!self::_is_url($url))
649 649
         {
650 650
             self::$error = "You have requested URL ({$url}) is not a valid HTTP address";
651 651
             return false;
@@ -679,7 +679,7 @@  discard block
 block discarded – undo
679 679
             }
680 680
         }
681 681
 
682
-        curl_setopt( self::$ch, CURLOPT_URL, $url );
682
+        curl_setopt(self::$ch, CURLOPT_URL, $url);
683 683
 
684 684
         if ($method != 'GET')
685 685
         {
@@ -692,13 +692,13 @@  discard block
 block discarded – undo
692 692
                 // CURLOPT_POST会把上傳的文件类型设为 multipart/form-data
693 693
                 // 把CURLOPT_POSTFIELDS的内容按multipart/form-data 的形式编码
694 694
                 // CURLOPT_CUSTOMREQUEST可以按指定内容上传
695
-                if ( isset($tmpheaders['content-type']) && $tmpheaders['content-type'] == 'application/json' ) 
695
+                if (isset($tmpheaders['content-type']) && $tmpheaders['content-type'] == 'application/json') 
696 696
                 {
697
-                    curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method ); 
697
+                    curl_setopt(self::$ch, CURLOPT_CUSTOMREQUEST, $method); 
698 698
                 }
699 699
                 else 
700 700
                 {
701
-                    curl_setopt( self::$ch, CURLOPT_POST, true );
701
+                    curl_setopt(self::$ch, CURLOPT_POST, true);
702 702
                 }
703 703
 
704 704
                 $file_fields = array();
@@ -724,16 +724,16 @@  discard block
 block discarded – undo
724 724
             else
725 725
             {
726 726
                 self::$rawheaders['X-HTTP-Method-Override'] = $method;
727
-                curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method ); 
727
+                curl_setopt(self::$ch, CURLOPT_CUSTOMREQUEST, $method); 
728 728
             }
729 729
 
730
-            if ( $method == 'POST' ) 
730
+            if ($method == 'POST') 
731 731
             {
732 732
                 // 不是上传文件的,用http_build_query, 能实现更好的兼容性,更小的请求数据包
733
-                if ( empty($file_fields) ) 
733
+                if (empty($file_fields)) 
734 734
                 {
735 735
                     // post方式
736
-                    if ( is_array($fields) ) 
736
+                    if (is_array($fields)) 
737 737
                     {
738 738
                         $fields = http_build_query($fields);
739 739
                     }
@@ -741,7 +741,7 @@  discard block
 block discarded – undo
741 741
                 else 
742 742
                 {
743 743
                     // 有post数据
744
-                    if ( is_array($fields) && !empty($fields) ) 
744
+                    if (is_array($fields) && !empty($fields)) 
745 745
                     {
746 746
                         // 某些server可能会有问题
747 747
                         $fields = array_merge($fields, $file_fields);
@@ -753,13 +753,13 @@  discard block
 block discarded – undo
753 753
                 }
754 754
 
755 755
                 // 不能直接传数组,不知道是什么Bug,会非常慢
756
-                curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields );
756
+                curl_setopt(self::$ch, CURLOPT_POSTFIELDS, $fields);
757 757
             }
758 758
         }
759 759
 
760 760
         $cookies = self::get_cookies();
761 761
         $domain_cookies = self::get_cookies($domain);
762
-        $cookies =  array_merge($cookies, $domain_cookies);
762
+        $cookies = array_merge($cookies, $domain_cookies);
763 763
         // 是否设置了cookie
764 764
         if (!empty($cookies)) 
765 765
         {
@@ -773,13 +773,13 @@  discard block
 block discarded – undo
773 773
 
774 774
         if (!empty(self::$useragents)) 
775 775
         {
776
-            $key = rand(0, count(self::$useragents) - 1);
776
+            $key = rand(0, count(self::$useragents)-1);
777 777
             self::$rawheaders['User-Agent'] = self::$useragents[$key];
778 778
         }
779 779
 
780 780
         if (!empty(self::$client_ips)) 
781 781
         {
782
-            $key                                 = rand(0, count(self::$client_ips) - 1);
782
+            $key                                 = rand(0, count(self::$client_ips)-1);
783 783
             self::$rawheaders['CLIENT-IP']       = self::$client_ips[$key];
784 784
             self::$rawheaders['X-FORWARDED-FOR'] = self::$client_ips[$key];
785 785
         }
@@ -791,10 +791,10 @@  discard block
 block discarded – undo
791 791
             {
792 792
                 $http_headers[] = $k.': '.$v;
793 793
             }
794
-            curl_setopt( self::$ch, CURLOPT_HTTPHEADER, $http_headers );
794
+            curl_setopt(self::$ch, CURLOPT_HTTPHEADER, $http_headers);
795 795
         }
796 796
 
797
-        curl_setopt( self::$ch, CURLOPT_ENCODING, 'gzip' );
797
+        curl_setopt(self::$ch, CURLOPT_ENCODING, 'gzip');
798 798
 
799 799
         // 关闭验证
800 800
         if ($scheme == 'https') 
@@ -805,33 +805,33 @@  discard block
 block discarded – undo
805 805
 
806 806
         if (self::$proxies)
807 807
         {
808
-            $key = rand(0, count(self::$proxies) - 1);
808
+            $key = rand(0, count(self::$proxies)-1);
809 809
             $proxy = self::$proxies[$key];
810
-            curl_setopt( self::$ch, CURLOPT_PROXY, $proxy );
810
+            curl_setopt(self::$ch, CURLOPT_PROXY, $proxy);
811 811
         }
812 812
 
813 813
         // header + body,header 里面有 cookie
814
-        curl_setopt( self::$ch, CURLOPT_HEADER, true );
814
+        curl_setopt(self::$ch, CURLOPT_HEADER, true);
815 815
         // 请求跳转后的内容
816 816
         if ($allow_redirects)
817 817
         {
818
-            curl_setopt( self::$ch, CURLOPT_FOLLOWLOCATION, true);
818
+            curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, true);
819 819
         }
820 820
 
821
-        self::$raw = curl_exec ( self::$ch );
821
+        self::$raw = curl_exec(self::$ch);
822 822
         // 真实url
823 823
         //$location = curl_getinfo( self::$ch, CURLINFO_EFFECTIVE_URL);
824
-        self::$info = curl_getinfo( self::$ch );
824
+        self::$info = curl_getinfo(self::$ch);
825 825
         //print_r(self::$info);
826 826
         self::$status_code = self::$info['http_code'];
827 827
         if (self::$raw === false)
828 828
         {
829
-            self::$error = 'Curl error: ' . curl_error( self::$ch );
829
+            self::$error = 'Curl error: '.curl_error(self::$ch);
830 830
             //trigger_error(self::$error, E_USER_WARNING);
831 831
         }
832 832
 
833 833
         // 关闭句柄
834
-        curl_close( self::$ch );
834
+        curl_close(self::$ch);
835 835
 
836 836
         // 请求成功之后才把URL存起来
837 837
         list($header, $text) = self::split_header_body();
@@ -861,7 +861,7 @@  discard block
 block discarded – undo
861 861
     // 获取 mimetype
862 862
     public static function get_mimetype($filepath)
863 863
     {
864
-        $fp  = finfo_open(FILEINFO_MIME);
864
+        $fp = finfo_open(FILEINFO_MIME);
865 865
         $mime = finfo_file($fp, $filepath);
866 866
         finfo_close($fp);
867 867
         $arr  = explode(';', $mime);
@@ -883,7 +883,7 @@  discard block
 block discarded – undo
883 883
     {
884 884
         // 构造post数据
885 885
         $data = '';
886
-        $delimiter = '-------------' . uniqid();
886
+        $delimiter = '-------------'.uniqid();
887 887
         // 表单数据
888 888
         foreach ($post_fields as $name => $content) 
889 889
         {
@@ -941,11 +941,11 @@  discard block
 block discarded – undo
941 941
         {
942 942
             $out = self::$output_encoding;
943 943
         }
944
-        if ( ! isset($out))
944
+        if (!isset($out))
945 945
         {
946 946
             $out = 'UTF-8';
947 947
         }
948
-        if ( ! in_array($mode, $valid))
948
+        if (!in_array($mode, $valid))
949 949
         {
950 950
             throw new Exception('invalid mode, mode='.$mode);
951 951
         }
@@ -965,7 +965,7 @@  discard block
 block discarded – undo
965 965
         }
966 966
 
967 967
         $pattern = '/(<meta[^>]*?charset=([\"\']?))([a-z\d_\-]*)(\2[^>]*?>)/is';
968
-        if ( ! isset($in))
968
+        if (!isset($in))
969 969
         {
970 970
             $n = preg_match($pattern, $html, $in);
971 971
             if ($n > 0)
Please login to merge, or discard this patch.