@@ -607,35 +607,35 @@ discard block |
||
607 | 607 | switch($command) |
608 | 608 | { |
609 | 609 | // 启动 phpspider |
610 | - case 'start': |
|
611 | - if ($command2 === '-d') |
|
612 | - { |
|
613 | - self::$daemonize = true; |
|
614 | - } |
|
615 | - break; |
|
616 | - case 'stop': |
|
617 | - exec("ps aux | grep $start_file | grep -v grep | awk '{print $2}'", $info); |
|
618 | - if (count($info) <= 1) |
|
619 | - { |
|
620 | - echo "PHPSpider[$start_file] not run\n"; |
|
621 | - } |
|
622 | - else |
|
623 | - { |
|
624 | - //echo "PHPSpider[$start_file] is stoping ...\n"; |
|
625 | - echo "PHPSpider[$start_file] stop success"; |
|
626 | - exec("ps aux | grep $start_file | grep -v grep | awk '{print $2}' |xargs kill -SIGINT", $info); |
|
627 | - } |
|
628 | - exit; |
|
629 | - break; |
|
630 | - case 'kill': |
|
631 | - exec("ps aux | grep $start_file | grep -v grep | awk '{print $2}' |xargs kill -SIGKILL"); |
|
632 | - break; |
|
633 | - // 显示 phpspider 运行状态 |
|
634 | - case 'status': |
|
635 | - exit(0); |
|
636 | - // 未知命令 |
|
637 | - default : |
|
638 | - exit("Usage: php yourfile.php {start|stop|status|kill}\n"); |
|
610 | + case 'start': |
|
611 | + if ($command2 === '-d') |
|
612 | + { |
|
613 | + self::$daemonize = true; |
|
614 | + } |
|
615 | + break; |
|
616 | + case 'stop': |
|
617 | + exec("ps aux | grep $start_file | grep -v grep | awk '{print $2}'", $info); |
|
618 | + if (count($info) <= 1) |
|
619 | + { |
|
620 | + echo "PHPSpider[$start_file] not run\n"; |
|
621 | + } |
|
622 | + else |
|
623 | + { |
|
624 | + //echo "PHPSpider[$start_file] is stoping ...\n"; |
|
625 | + echo "PHPSpider[$start_file] stop success"; |
|
626 | + exec("ps aux | grep $start_file | grep -v grep | awk '{print $2}' |xargs kill -SIGINT", $info); |
|
627 | + } |
|
628 | + exit; |
|
629 | + break; |
|
630 | + case 'kill': |
|
631 | + exec("ps aux | grep $start_file | grep -v grep | awk '{print $2}' |xargs kill -SIGKILL"); |
|
632 | + break; |
|
633 | + // 显示 phpspider 运行状态 |
|
634 | + case 'status': |
|
635 | + exit(0); |
|
636 | + // 未知命令 |
|
637 | + default : |
|
638 | + exit("Usage: php yourfile.php {start|stop|status|kill}\n"); |
|
639 | 639 | } |
640 | 640 | } |
641 | 641 | |
@@ -648,14 +648,14 @@ discard block |
||
648 | 648 | { |
649 | 649 | switch ($signal) { |
650 | 650 | // Stop. |
651 | - case SIGINT: |
|
652 | - log::warn("Program stopping..."); |
|
653 | - self::$terminate = true; |
|
654 | - break; |
|
655 | - // Show status. |
|
656 | - case SIGUSR2: |
|
657 | - echo "show status\n"; |
|
658 | - break; |
|
651 | + case SIGINT: |
|
652 | + log::warn("Program stopping..."); |
|
653 | + self::$terminate = true; |
|
654 | + break; |
|
655 | + // Show status. |
|
656 | + case SIGUSR2: |
|
657 | + echo "show status\n"; |
|
658 | + break; |
|
659 | 659 | } |
660 | 660 | } |
661 | 661 |
@@ -15,7 +15,7 @@ discard block |
||
15 | 15 | |
16 | 16 | namespace phpspider\core; |
17 | 17 | |
18 | -require_once __DIR__ . '/constants.php'; |
|
18 | +require_once __DIR__.'/constants.php'; |
|
19 | 19 | |
20 | 20 | use phpspider\core\requests; |
21 | 21 | use phpspider\core\selector; |
@@ -352,7 +352,7 @@ discard block |
||
352 | 352 | function __construct($configs = array()) |
353 | 353 | { |
354 | 354 | // 产生时钟云,解决php7下面ctrl+c无法停止bug |
355 | - declare(ticks = 1); |
|
355 | + declare(ticks=1); |
|
356 | 356 | |
357 | 357 | // 先打开以显示验证报错内容 |
358 | 358 | log::$log_show = true; |
@@ -369,25 +369,25 @@ discard block |
||
369 | 369 | exit; |
370 | 370 | } |
371 | 371 | |
372 | - $configs['name'] = isset($configs['name']) ? $configs['name'] : 'phpspider'; |
|
373 | - $configs['proxies'] = isset($configs['proxies']) ? $configs['proxies'] : ''; |
|
374 | - $configs['user_agent'] = isset($configs['user_agent']) ? $configs['user_agent'] : self::AGENT_PC; |
|
372 | + $configs['name'] = isset($configs['name']) ? $configs['name'] : 'phpspider'; |
|
373 | + $configs['proxies'] = isset($configs['proxies']) ? $configs['proxies'] : ''; |
|
374 | + $configs['user_agent'] = isset($configs['user_agent']) ? $configs['user_agent'] : self::AGENT_PC; |
|
375 | 375 | $configs['user_agents'] = isset($configs['user_agents']) ? $configs['user_agents'] : null; |
376 | - $configs['client_ip'] = isset($configs['client_ip']) ? $configs['client_ip'] : null; |
|
377 | - $configs['client_ips'] = isset($configs['client_ips']) ? $configs['client_ips'] : null; |
|
378 | - $configs['interval'] = isset($configs['interval']) ? $configs['interval'] : self::INTERVAL; |
|
379 | - $configs['timeout'] = isset($configs['timeout']) ? $configs['timeout'] : self::TIMEOUT; |
|
380 | - $configs['max_try'] = isset($configs['max_try']) ? $configs['max_try'] : self::MAX_TRY; |
|
381 | - $configs['max_depth'] = isset($configs['max_depth']) ? $configs['max_depth'] : 0; |
|
382 | - $configs['max_fields'] = isset($configs['max_fields']) ? $configs['max_fields'] : 0; |
|
383 | - $configs['export'] = isset($configs['export']) ? $configs['export'] : array(); |
|
376 | + $configs['client_ip'] = isset($configs['client_ip']) ? $configs['client_ip'] : null; |
|
377 | + $configs['client_ips'] = isset($configs['client_ips']) ? $configs['client_ips'] : null; |
|
378 | + $configs['interval'] = isset($configs['interval']) ? $configs['interval'] : self::INTERVAL; |
|
379 | + $configs['timeout'] = isset($configs['timeout']) ? $configs['timeout'] : self::TIMEOUT; |
|
380 | + $configs['max_try'] = isset($configs['max_try']) ? $configs['max_try'] : self::MAX_TRY; |
|
381 | + $configs['max_depth'] = isset($configs['max_depth']) ? $configs['max_depth'] : 0; |
|
382 | + $configs['max_fields'] = isset($configs['max_fields']) ? $configs['max_fields'] : 0; |
|
383 | + $configs['export'] = isset($configs['export']) ? $configs['export'] : array(); |
|
384 | 384 | |
385 | 385 | // csv、sql、db |
386 | - self::$export_type = isset($configs['export']['type']) ? $configs['export']['type'] : ''; |
|
387 | - self::$export_file = isset($configs['export']['file']) ? $configs['export']['file'] : ''; |
|
386 | + self::$export_type = isset($configs['export']['type']) ? $configs['export']['type'] : ''; |
|
387 | + self::$export_file = isset($configs['export']['file']) ? $configs['export']['file'] : ''; |
|
388 | 388 | self::$export_table = isset($configs['export']['table']) ? $configs['export']['table'] : ''; |
389 | - self::$db_config = isset($configs['db_config']) ? $configs['db_config'] : array(); |
|
390 | - self::$queue_config = isset($configs['queue_config']) ? $configs['queue_config'] : array(); |
|
389 | + self::$db_config = isset($configs['db_config']) ? $configs['db_config'] : array(); |
|
390 | + self::$queue_config = isset($configs['queue_config']) ? $configs['queue_config'] : array(); |
|
391 | 391 | |
392 | 392 | // 是否设置了并发任务数, 并且大于1, 而且不是windows环境 |
393 | 393 | if (isset($configs['tasknum']) && $configs['tasknum'] > 1 && !util::is_win()) |
@@ -604,7 +604,7 @@ discard block |
||
604 | 604 | $command2 = isset($argv[2]) ? $argv[2] : ''; |
605 | 605 | |
606 | 606 | // 根据命令做相应处理 |
607 | - switch($command) |
|
607 | + switch ($command) |
|
608 | 608 | { |
609 | 609 | // 启动 phpspider |
610 | 610 | case 'start': |
@@ -766,7 +766,7 @@ discard block |
||
766 | 766 | // 显示最后结果 |
767 | 767 | log::$log_show = true; |
768 | 768 | |
769 | - $spider_time_run = util::time2second(intval(microtime(true) - self::$time_start)); |
|
769 | + $spider_time_run = util::time2second(intval(microtime(true)-self::$time_start)); |
|
770 | 770 | log::note("Spider finished in {$spider_time_run}"); |
771 | 771 | |
772 | 772 | $get_collected_url_num = $this->get_collected_url_num(); |
@@ -795,12 +795,12 @@ discard block |
||
795 | 795 | // 检查PHP版本 |
796 | 796 | if (version_compare(PHP_VERSION, '5.3.0', 'lt')) |
797 | 797 | { |
798 | - log::error('PHP 5.3+ is required, currently installed version is: ' . phpversion()); |
|
798 | + log::error('PHP 5.3+ is required, currently installed version is: '.phpversion()); |
|
799 | 799 | exit; |
800 | 800 | } |
801 | 801 | |
802 | 802 | // 检查CURL扩展 |
803 | - if(!function_exists('curl_init')) |
|
803 | + if (!function_exists('curl_init')) |
|
804 | 804 | { |
805 | 805 | log::error("The curl extension was not found"); |
806 | 806 | exit; |
@@ -861,7 +861,7 @@ discard block |
||
861 | 861 | exit; |
862 | 862 | } |
863 | 863 | |
864 | - foreach ( self::$configs['scan_urls'] as $url ) |
|
864 | + foreach (self::$configs['scan_urls'] as $url) |
|
865 | 865 | { |
866 | 866 | // 只检查配置中的入口URL, 通过 add_scan_url 添加的不检查了. |
867 | 867 | if (!$this->is_scan_page($url)) |
@@ -917,7 +917,7 @@ discard block |
||
917 | 917 | //-------------------------------------------------------------------------------- |
918 | 918 | |
919 | 919 | // 添加入口URL到队列 |
920 | - foreach ( self::$configs['scan_urls'] as $url ) |
|
920 | + foreach (self::$configs['scan_urls'] as $url) |
|
921 | 921 | { |
922 | 922 | // false 表示不允许重复 |
923 | 923 | $this->add_scan_url($url, null, false); |
@@ -965,13 +965,13 @@ discard block |
||
965 | 965 | $pid = pcntl_fork(); |
966 | 966 | |
967 | 967 | // 主进程记录子进程pid |
968 | - if($pid > 0) |
|
968 | + if ($pid > 0) |
|
969 | 969 | { |
970 | 970 | // 暂时没用 |
971 | 971 | //self::$taskpids[$taskid] = $pid; |
972 | 972 | } |
973 | 973 | // 子进程运行 |
974 | - elseif(0 === $pid) |
|
974 | + elseif (0 === $pid) |
|
975 | 975 | { |
976 | 976 | log::warn("Fork children task({$taskid}) successful..."); |
977 | 977 | |
@@ -999,7 +999,7 @@ discard block |
||
999 | 999 | { |
1000 | 1000 | queue::set_connect('default', self::$queue_config); |
1001 | 1001 | queue::init(); |
1002 | - while( $queue_lsize = $this->queue_lsize() ) |
|
1002 | + while ($queue_lsize = $this->queue_lsize()) |
|
1003 | 1003 | { |
1004 | 1004 | // 如果是主任务 |
1005 | 1005 | if (self::$taskmaster) |
@@ -1008,7 +1008,7 @@ discard block |
||
1008 | 1008 | if (self::$tasknum > 1 && !self::$fork_task_complete) |
1009 | 1009 | { |
1010 | 1010 | // 主进程采集到两倍于任务数时, 生成子任务一起采集 |
1011 | - if ( $queue_lsize > self::$tasknum*2 ) |
|
1011 | + if ($queue_lsize > self::$tasknum * 2) |
|
1012 | 1012 | { |
1013 | 1013 | self::$fork_task_complete = true; |
1014 | 1014 | |
@@ -1037,7 +1037,7 @@ discard block |
||
1037 | 1037 | else |
1038 | 1038 | { |
1039 | 1039 | // 如果队列中的网页比任务数2倍多, 子任务可以采集, 否则等待... |
1040 | - if ( $queue_lsize > self::$tasknum*2 ) |
|
1040 | + if ($queue_lsize > self::$tasknum * 2) |
|
1041 | 1041 | { |
1042 | 1042 | // 抓取页面 |
1043 | 1043 | $this->collect_page(); |
@@ -1176,7 +1176,7 @@ discard block |
||
1176 | 1176 | if (self::$configs['max_depth'] == 0 || $link['depth'] < self::$configs['max_depth']) |
1177 | 1177 | { |
1178 | 1178 | // 分析提取HTML页面中的URL |
1179 | - $this->get_urls($page['raw'], $url, $link['depth'] + 1); |
|
1179 | + $this->get_urls($page['raw'], $url, $link['depth']+1); |
|
1180 | 1180 | } |
1181 | 1181 | } |
1182 | 1182 | |
@@ -1191,10 +1191,10 @@ discard block |
||
1191 | 1191 | $this->incr_depth_num($link['depth']); |
1192 | 1192 | |
1193 | 1193 | // 处理页面耗时时间 |
1194 | - $time_run = round(microtime(true) - $page_time_start, 3); |
|
1194 | + $time_run = round(microtime(true)-$page_time_start, 3); |
|
1195 | 1195 | log::debug("Success process page {$url} in {$time_run} s"); |
1196 | 1196 | |
1197 | - $spider_time_run = util::time2second(intval(microtime(true) - self::$time_start)); |
|
1197 | + $spider_time_run = util::time2second(intval(microtime(true)-self::$time_start)); |
|
1198 | 1198 | log::info("Spider running in {$spider_time_run}"); |
1199 | 1199 | |
1200 | 1200 | // 爬虫爬取每个网页的时间间隔, 单位: 毫秒 |
@@ -1314,12 +1314,12 @@ discard block |
||
1314 | 1314 | log::error("Failed to download page {$url}"); |
1315 | 1315 | self::$collect_fail++; |
1316 | 1316 | } |
1317 | - elseif (in_array($http_code, array('0','502','503','429'))) |
|
1317 | + elseif (in_array($http_code, array('0', '502', '503', '429'))) |
|
1318 | 1318 | { |
1319 | 1319 | // 采集次数加一 |
1320 | 1320 | $link['try_num']++; |
1321 | 1321 | // 抓取次数 小于 允许抓取失败次数 |
1322 | - if ( $link['try_num'] <= $link['max_try'] ) |
|
1322 | + if ($link['try_num'] <= $link['max_try']) |
|
1323 | 1323 | { |
1324 | 1324 | // 扔到队列头部去, 继续采集 |
1325 | 1325 | $this->queue_rpush($link); |
@@ -1337,7 +1337,7 @@ discard block |
||
1337 | 1337 | } |
1338 | 1338 | |
1339 | 1339 | // 爬取页面耗时时间 |
1340 | - $time_run = round(microtime(true) - $time_start, 3); |
|
1340 | + $time_run = round(microtime(true)-$time_start, 3); |
|
1341 | 1341 | log::debug("Success download page {$url} in {$time_run} s"); |
1342 | 1342 | self::$collect_succ++; |
1343 | 1343 | |
@@ -1382,7 +1382,7 @@ discard block |
||
1382 | 1382 | |
1383 | 1383 | foreach ($urls as $key=>$url) |
1384 | 1384 | { |
1385 | - $urls[$key] = str_replace(array("\"", "'",'&'), array("",'','&'), $url); |
|
1385 | + $urls[$key] = str_replace(array("\"", "'", '&'), array("", '', '&'), $url); |
|
1386 | 1386 | } |
1387 | 1387 | |
1388 | 1388 | //-------------------------------------------------------------------------------- |
@@ -1458,12 +1458,12 @@ discard block |
||
1458 | 1458 | |
1459 | 1459 | // 排除JavaScript的连接 |
1460 | 1460 | //if (strpos($url, "javascript:") !== false) |
1461 | - if( preg_match("@^(javascript:|#|'|\")@i", $url) || $url == '') |
|
1461 | + if (preg_match("@^(javascript:|#|'|\")@i", $url) || $url == '') |
|
1462 | 1462 | { |
1463 | 1463 | return false; |
1464 | 1464 | } |
1465 | 1465 | // 排除没有被解析成功的语言标签 |
1466 | - if(substr($url, 0, 3) == '<%=') |
|
1466 | + if (substr($url, 0, 3) == '<%=') |
|
1467 | 1467 | { |
1468 | 1468 | return false; |
1469 | 1469 | } |
@@ -1482,46 +1482,46 @@ discard block |
||
1482 | 1482 | $domain = $parse_url['host']; |
1483 | 1483 | $path = empty($parse_url['path']) ? '' : $parse_url['path']; |
1484 | 1484 | $base_url_path = $domain.$path; |
1485 | - $base_url_path = preg_replace("/\/([^\/]*)\.(.*)$/","/",$base_url_path); |
|
1486 | - $base_url_path = preg_replace("/\/$/",'',$base_url_path); |
|
1485 | + $base_url_path = preg_replace("/\/([^\/]*)\.(.*)$/", "/", $base_url_path); |
|
1486 | + $base_url_path = preg_replace("/\/$/", '', $base_url_path); |
|
1487 | 1487 | |
1488 | 1488 | $i = $path_step = 0; |
1489 | 1489 | $dstr = $pstr = ''; |
1490 | - $pos = strpos($url,'#'); |
|
1491 | - if($pos > 0) |
|
1490 | + $pos = strpos($url, '#'); |
|
1491 | + if ($pos > 0) |
|
1492 | 1492 | { |
1493 | 1493 | // 去掉#和后面的字符串 |
1494 | 1494 | $url = substr($url, 0, $pos); |
1495 | 1495 | } |
1496 | 1496 | |
1497 | 1497 | // 京东变态的都是 //www.jd.com/111.html |
1498 | - if(substr($url, 0, 2) == '//') |
|
1498 | + if (substr($url, 0, 2) == '//') |
|
1499 | 1499 | { |
1500 | 1500 | $url = str_replace("//", "", $url); |
1501 | 1501 | } |
1502 | 1502 | // /1234.html |
1503 | - elseif($url[0] == '/') |
|
1503 | + elseif ($url[0] == '/') |
|
1504 | 1504 | { |
1505 | 1505 | $url = $domain.$url; |
1506 | 1506 | } |
1507 | 1507 | // ./1234.html、../1234.html 这种类型的 |
1508 | - elseif($url[0] == '.') |
|
1508 | + elseif ($url[0] == '.') |
|
1509 | 1509 | { |
1510 | - if(!isset($url[2])) |
|
1510 | + if (!isset($url[2])) |
|
1511 | 1511 | { |
1512 | 1512 | return false; |
1513 | 1513 | } |
1514 | 1514 | else |
1515 | 1515 | { |
1516 | - $urls = explode('/',$url); |
|
1517 | - foreach($urls as $u) |
|
1516 | + $urls = explode('/', $url); |
|
1517 | + foreach ($urls as $u) |
|
1518 | 1518 | { |
1519 | - if( $u == '..' ) |
|
1519 | + if ($u == '..') |
|
1520 | 1520 | { |
1521 | 1521 | $path_step++; |
1522 | 1522 | } |
1523 | 1523 | // 遇到 ., 不知道为什么不直接写$u == '.', 貌似一样的 |
1524 | - else if( $i < count($urls)-1 ) |
|
1524 | + else if ($i < count($urls)-1) |
|
1525 | 1525 | { |
1526 | 1526 | $dstr .= $urls[$i].'/'; |
1527 | 1527 | } |
@@ -1531,29 +1531,29 @@ discard block |
||
1531 | 1531 | } |
1532 | 1532 | $i++; |
1533 | 1533 | } |
1534 | - $urls = explode('/',$base_url_path); |
|
1535 | - if(count($urls) <= $path_step) |
|
1534 | + $urls = explode('/', $base_url_path); |
|
1535 | + if (count($urls) <= $path_step) |
|
1536 | 1536 | { |
1537 | 1537 | return false; |
1538 | 1538 | } |
1539 | 1539 | else |
1540 | 1540 | { |
1541 | 1541 | $pstr = ''; |
1542 | - for($i=0;$i<count($urls)-$path_step;$i++){ $pstr .= $urls[$i].'/'; } |
|
1542 | + for ($i = 0; $i < count($urls)-$path_step; $i++) { $pstr .= $urls[$i].'/'; } |
|
1543 | 1543 | $url = $pstr.$dstr; |
1544 | 1544 | } |
1545 | 1545 | } |
1546 | 1546 | } |
1547 | 1547 | else |
1548 | 1548 | { |
1549 | - if( strtolower(substr($url, 0, 7))=='http://' ) |
|
1549 | + if (strtolower(substr($url, 0, 7)) == 'http://') |
|
1550 | 1550 | { |
1551 | - $url = preg_replace('#^http://#i','',$url); |
|
1551 | + $url = preg_replace('#^http://#i', '', $url); |
|
1552 | 1552 | $scheme = "http"; |
1553 | 1553 | } |
1554 | - else if( strtolower(substr($url, 0, 8))=='https://' ) |
|
1554 | + else if (strtolower(substr($url, 0, 8)) == 'https://') |
|
1555 | 1555 | { |
1556 | - $url = preg_replace('#^https://#i','',$url); |
|
1556 | + $url = preg_replace('#^https://#i', '', $url); |
|
1557 | 1557 | $scheme = "https"; |
1558 | 1558 | } |
1559 | 1559 | else |
@@ -1650,16 +1650,16 @@ discard block |
||
1650 | 1650 | public function link_uncompress($link) |
1651 | 1651 | { |
1652 | 1652 | $link = array( |
1653 | - 'url' => isset($link['url']) ? $link['url'] : '', |
|
1654 | - 'url_type' => isset($link['url_type']) ? $link['url_type'] : '', |
|
1655 | - 'method' => isset($link['method']) ? $link['method'] : 'get', |
|
1656 | - 'headers' => isset($link['headers']) ? $link['headers'] : array(), |
|
1657 | - 'params' => isset($link['params']) ? $link['params'] : array(), |
|
1653 | + 'url' => isset($link['url']) ? $link['url'] : '', |
|
1654 | + 'url_type' => isset($link['url_type']) ? $link['url_type'] : '', |
|
1655 | + 'method' => isset($link['method']) ? $link['method'] : 'get', |
|
1656 | + 'headers' => isset($link['headers']) ? $link['headers'] : array(), |
|
1657 | + 'params' => isset($link['params']) ? $link['params'] : array(), |
|
1658 | 1658 | 'context_data' => isset($link['context_data']) ? $link['context_data'] : '', |
1659 | - 'proxies' => isset($link['proxies']) ? $link['proxies'] : self::$configs['proxies'], |
|
1660 | - 'try_num' => isset($link['try_num']) ? $link['try_num'] : 0, |
|
1661 | - 'max_try' => isset($link['max_try']) ? $link['max_try'] : self::$configs['max_try'], |
|
1662 | - 'depth' => isset($link['depth']) ? $link['depth'] : 0, |
|
1659 | + 'proxies' => isset($link['proxies']) ? $link['proxies'] : self::$configs['proxies'], |
|
1660 | + 'try_num' => isset($link['try_num']) ? $link['try_num'] : 0, |
|
1661 | + 'max_try' => isset($link['max_try']) ? $link['max_try'] : self::$configs['max_try'], |
|
1662 | + 'depth' => isset($link['depth']) ? $link['depth'] : 0, |
|
1663 | 1663 | ); |
1664 | 1664 | |
1665 | 1665 | return $link; |
@@ -1704,12 +1704,12 @@ discard block |
||
1704 | 1704 | exit(0); |
1705 | 1705 | } |
1706 | 1706 | |
1707 | - if (version_compare(PHP_VERSION,'5.4.0','<')) |
|
1707 | + if (version_compare(PHP_VERSION, '5.4.0', '<')) |
|
1708 | 1708 | { |
1709 | 1709 | $fields_str = json_encode($fields); |
1710 | - $fields_str = preg_replace_callback( "#\\\u([0-9a-f]{4})#i", function($matchs) { |
|
1710 | + $fields_str = preg_replace_callback("#\\\u([0-9a-f]{4})#i", function($matchs) { |
|
1711 | 1711 | return iconv('UCS-2BE', 'UTF-8', pack('H4', $matchs[1])); |
1712 | - }, $fields_str ); |
|
1712 | + }, $fields_str); |
|
1713 | 1713 | } |
1714 | 1714 | else |
1715 | 1715 | { |
@@ -1775,7 +1775,7 @@ discard block |
||
1775 | 1775 | if (!empty($conf['selector'])) |
1776 | 1776 | { |
1777 | 1777 | // 如果这个field是上一个field的附带连接 |
1778 | - if (isset($conf['source_type']) && $conf['source_type']=='attached_url') |
|
1778 | + if (isset($conf['source_type']) && $conf['source_type'] == 'attached_url') |
|
1779 | 1779 | { |
1780 | 1780 | // 取出上个field的内容作为连接, 内容分页是不进队列直接下载网页的 |
1781 | 1781 | if (!empty($fields[$conf['attached_url']])) |
@@ -1802,15 +1802,15 @@ discard block |
||
1802 | 1802 | } |
1803 | 1803 | |
1804 | 1804 | // 没有设置抽取规则的类型 或者 设置为 xpath |
1805 | - if (!isset($conf['selector_type']) || $conf['selector_type']=='xpath') |
|
1805 | + if (!isset($conf['selector_type']) || $conf['selector_type'] == 'xpath') |
|
1806 | 1806 | { |
1807 | 1807 | $values = $this->get_fields_xpath($html, $conf['selector'], $conf['name']); |
1808 | 1808 | } |
1809 | - elseif ($conf['selector_type']=='css') |
|
1809 | + elseif ($conf['selector_type'] == 'css') |
|
1810 | 1810 | { |
1811 | 1811 | $values = $this->get_fields_css($html, $conf['selector'], $conf['name']); |
1812 | 1812 | } |
1813 | - elseif ($conf['selector_type']=='regex') |
|
1813 | + elseif ($conf['selector_type'] == 'regex') |
|
1814 | 1814 | { |
1815 | 1815 | $values = $this->get_fields_regex($html, $conf['selector'], $conf['name']); |
1816 | 1816 | } |
@@ -1957,7 +1957,7 @@ discard block |
||
1957 | 1957 | |
1958 | 1958 | $config = self::$db_config; |
1959 | 1959 | @mysqli_connect($config['host'], $config['user'], $config['pass'], $config['name'], $config['port']); |
1960 | - if(mysqli_connect_errno()) |
|
1960 | + if (mysqli_connect_errno()) |
|
1961 | 1961 | { |
1962 | 1962 | log::error("Export data to a database need Mysql support, Error: ".mysqli_connect_error()); |
1963 | 1963 | exit; |
@@ -1994,7 +1994,7 @@ discard block |
||
1994 | 1994 | $msg .= "Do you want to continue? [Y/n]"; |
1995 | 1995 | fwrite(STDOUT, $msg); |
1996 | 1996 | $arg = strtolower(trim(fgets(STDIN))); |
1997 | - $arg = empty($arg) || !in_array($arg, array('y','n')) ? 'y' : $arg; |
|
1997 | + $arg = empty($arg) || !in_array($arg, array('y', 'n')) ? 'y' : $arg; |
|
1998 | 1998 | if ($arg == 'n') |
1999 | 1999 | { |
2000 | 2000 | foreach ($keys as $key) |
@@ -2034,9 +2034,9 @@ discard block |
||
2034 | 2034 | public function set_task_status() |
2035 | 2035 | { |
2036 | 2036 | // 每采集成功一个页面, 生成当前进程状态到文件, 供主进程使用 |
2037 | - $mem = round(memory_get_usage(true)/(1024*1024),2); |
|
2038 | - $use_time = microtime(true) - self::$time_start; |
|
2039 | - $speed = round((self::$collect_succ + self::$collect_fail) / $use_time, 2); |
|
2037 | + $mem = round(memory_get_usage(true) / (1024 * 1024), 2); |
|
2038 | + $use_time = microtime(true)-self::$time_start; |
|
2039 | + $speed = round((self::$collect_succ+self::$collect_fail) / $use_time, 2); |
|
2040 | 2040 | $status = array( |
2041 | 2041 | 'id' => self::$taskid, |
2042 | 2042 | 'pid' => self::$taskpid, |
@@ -2609,7 +2609,7 @@ discard block |
||
2609 | 2609 | { |
2610 | 2610 | static $last_lines = 0; |
2611 | 2611 | |
2612 | - if(!is_null($force_clear_lines)) |
|
2612 | + if (!is_null($force_clear_lines)) |
|
2613 | 2613 | { |
2614 | 2614 | $last_lines = $force_clear_lines; |
2615 | 2615 | } |
@@ -2617,19 +2617,19 @@ discard block |
||
2617 | 2617 | // 获取终端宽度 |
2618 | 2618 | $toss = $status = null; |
2619 | 2619 | $term_width = exec('tput cols', $toss, $status); |
2620 | - if($status || empty($term_width)) |
|
2620 | + if ($status || empty($term_width)) |
|
2621 | 2621 | { |
2622 | 2622 | $term_width = 64; // Arbitrary fall-back term width. |
2623 | 2623 | } |
2624 | 2624 | |
2625 | 2625 | $line_count = 0; |
2626 | - foreach(explode("\n", $message) as $line) |
|
2626 | + foreach (explode("\n", $message) as $line) |
|
2627 | 2627 | { |
2628 | 2628 | $line_count += count(str_split($line, $term_width)); |
2629 | 2629 | } |
2630 | 2630 | |
2631 | 2631 | // Erasure MAGIC: Clear as many lines as the last output had. |
2632 | - for($i = 0; $i < $last_lines; $i++) |
|
2632 | + for ($i = 0; $i < $last_lines; $i++) |
|
2633 | 2633 | { |
2634 | 2634 | // Return to the beginning of the line |
2635 | 2635 | echo "\r"; |
@@ -2666,16 +2666,16 @@ discard block |
||
2666 | 2666 | $display_str = "\033[1A\n\033[K-----------------------------\033[47;30m PHPSPIDER \033[0m-----------------------------\n\033[0m"; |
2667 | 2667 | //$display_str = "-----------------------------\033[47;30m PHPSPIDER \033[0m-----------------------------\n\033[0m"; |
2668 | 2668 | $run_time_str = util::time2second(time()-self::$time_start, false); |
2669 | - $display_str .= 'PHPSpider version:' . self::VERSION . " PHP version:" . PHP_VERSION . "\n"; |
|
2670 | - $display_str .= 'start time:'. date('Y-m-d H:i:s', self::$time_start).' run ' . $run_time_str . " \n"; |
|
2669 | + $display_str .= 'PHPSpider version:'.self::VERSION." PHP version:".PHP_VERSION."\n"; |
|
2670 | + $display_str .= 'start time:'.date('Y-m-d H:i:s', self::$time_start).' run '.$run_time_str." \n"; |
|
2671 | 2671 | |
2672 | - $display_str .= 'spider name: ' . self::$configs['name'] . "\n"; |
|
2672 | + $display_str .= 'spider name: '.self::$configs['name']."\n"; |
|
2673 | 2673 | if (self::$multiserver) |
2674 | 2674 | { |
2675 | - $display_str .= 'server id: ' . self::$serverid."\n"; |
|
2675 | + $display_str .= 'server id: '.self::$serverid."\n"; |
|
2676 | 2676 | } |
2677 | - $display_str .= 'task number: ' . self::$tasknum . "\n"; |
|
2678 | - $display_str .= 'load average: ' . implode(", ", $loadavg) . "\n"; |
|
2677 | + $display_str .= 'task number: '.self::$tasknum."\n"; |
|
2678 | + $display_str .= 'load average: '.implode(", ", $loadavg)."\n"; |
|
2679 | 2679 | $display_str .= "document: https://doc.phpspider.org\n"; |
2680 | 2680 | |
2681 | 2681 | $display_str .= $this->display_task_ui(); |
@@ -2705,12 +2705,12 @@ discard block |
||
2705 | 2705 | { |
2706 | 2706 | $display_str = "-------------------------------\033[47;30m TASKS \033[0m-------------------------------\n"; |
2707 | 2707 | |
2708 | - $display_str .= "\033[47;30mtaskid\033[0m". str_pad('', self::$taskid_length+2-strlen('taskid')). |
|
2709 | - "\033[47;30mtaskpid\033[0m". str_pad('', self::$pid_length+2-strlen('taskpid')). |
|
2710 | - "\033[47;30mmem\033[0m". str_pad('', self::$mem_length+2-strlen('mem')). |
|
2711 | - "\033[47;30mcollect succ\033[0m". str_pad('', self::$urls_length-strlen('collect succ')). |
|
2712 | - "\033[47;30mcollect fail\033[0m". str_pad('', self::$urls_length-strlen('collect fail')). |
|
2713 | - "\033[47;30mspeed\033[0m". str_pad('', self::$speed_length+2-strlen('speed')). |
|
2708 | + $display_str .= "\033[47;30mtaskid\033[0m".str_pad('', self::$taskid_length+2-strlen('taskid')). |
|
2709 | + "\033[47;30mtaskpid\033[0m".str_pad('', self::$pid_length+2-strlen('taskpid')). |
|
2710 | + "\033[47;30mmem\033[0m".str_pad('', self::$mem_length+2-strlen('mem')). |
|
2711 | + "\033[47;30mcollect succ\033[0m".str_pad('', self::$urls_length-strlen('collect succ')). |
|
2712 | + "\033[47;30mcollect fail\033[0m".str_pad('', self::$urls_length-strlen('collect fail')). |
|
2713 | + "\033[47;30mspeed\033[0m".str_pad('', self::$speed_length+2-strlen('speed')). |
|
2714 | 2714 | "\n"; |
2715 | 2715 | |
2716 | 2716 | // "\033[32;40m [OK] \033[0m" |
@@ -2738,12 +2738,12 @@ discard block |
||
2738 | 2738 | { |
2739 | 2739 | $display_str = "-------------------------------\033[47;30m SERVER \033[0m------------------------------\n"; |
2740 | 2740 | |
2741 | - $display_str .= "\033[47;30mserver\033[0m". str_pad('', self::$server_length+2-strlen('serverid')). |
|
2742 | - "\033[47;30mtasknum\033[0m". str_pad('', self::$tasknum_length+2-strlen('tasknum')). |
|
2743 | - "\033[47;30mmem\033[0m". str_pad('', self::$mem_length+2-strlen('mem')). |
|
2744 | - "\033[47;30mcollect succ\033[0m". str_pad('', self::$urls_length-strlen('collect succ')). |
|
2745 | - "\033[47;30mcollect fail\033[0m". str_pad('', self::$urls_length-strlen('collect fail')). |
|
2746 | - "\033[47;30mspeed\033[0m". str_pad('', self::$speed_length+2-strlen('speed')). |
|
2741 | + $display_str .= "\033[47;30mserver\033[0m".str_pad('', self::$server_length+2-strlen('serverid')). |
|
2742 | + "\033[47;30mtasknum\033[0m".str_pad('', self::$tasknum_length+2-strlen('tasknum')). |
|
2743 | + "\033[47;30mmem\033[0m".str_pad('', self::$mem_length+2-strlen('mem')). |
|
2744 | + "\033[47;30mcollect succ\033[0m".str_pad('', self::$urls_length-strlen('collect succ')). |
|
2745 | + "\033[47;30mcollect fail\033[0m".str_pad('', self::$urls_length-strlen('collect fail')). |
|
2746 | + "\033[47;30mspeed\033[0m".str_pad('', self::$speed_length+2-strlen('speed')). |
|
2747 | 2747 | "\n"; |
2748 | 2748 | |
2749 | 2749 | $server_list_json = queue::get("server_list"); |
@@ -2784,11 +2784,11 @@ discard block |
||
2784 | 2784 | { |
2785 | 2785 | $display_str = "---------------------------\033[47;30m COLLECT STATUS \033[0m--------------------------\n"; |
2786 | 2786 | |
2787 | - $display_str .= "\033[47;30mfind pages\033[0m". str_pad('', 16-strlen('find pages')). |
|
2788 | - "\033[47;30mqueue\033[0m". str_pad('', 14-strlen('queue')). |
|
2789 | - "\033[47;30mcollected\033[0m". str_pad('', 15-strlen('collected')). |
|
2790 | - "\033[47;30mfields\033[0m". str_pad('', 15-strlen('fields')). |
|
2791 | - "\033[47;30mdepth\033[0m". str_pad('', 12-strlen('depth')). |
|
2787 | + $display_str .= "\033[47;30mfind pages\033[0m".str_pad('', 16-strlen('find pages')). |
|
2788 | + "\033[47;30mqueue\033[0m".str_pad('', 14-strlen('queue')). |
|
2789 | + "\033[47;30mcollected\033[0m".str_pad('', 15-strlen('collected')). |
|
2790 | + "\033[47;30mfields\033[0m".str_pad('', 15-strlen('fields')). |
|
2791 | + "\033[47;30mdepth\033[0m".str_pad('', 12-strlen('depth')). |
|
2792 | 2792 | "\n"; |
2793 | 2793 | |
2794 | 2794 | $collect = $this->get_collect_url_num(); |
@@ -436,13 +436,11 @@ discard block |
||
436 | 436 | { |
437 | 437 | $link['url_type'] = 'list_page'; |
438 | 438 | $status = $this->queue_lpush($link, $allowed_repeat); |
439 | - } |
|
440 | - elseif ($this->is_content_page($url)) |
|
439 | + } elseif ($this->is_content_page($url)) |
|
441 | 440 | { |
442 | 441 | $link['url_type'] = 'content_page'; |
443 | 442 | $status = $this->queue_lpush($link, $allowed_repeat); |
444 | - } |
|
445 | - else |
|
443 | + } else |
|
446 | 444 | { |
447 | 445 | $status = $this->queue_lpush($link, $allowed_repeat); |
448 | 446 | } |
@@ -452,12 +450,10 @@ discard block |
||
452 | 450 | if ($link['url_type'] == 'scan_page') |
453 | 451 | { |
454 | 452 | log::debug("Find scan page: {$url}"); |
455 | - } |
|
456 | - elseif ($link['url_type'] == 'list_page') |
|
453 | + } elseif ($link['url_type'] == 'list_page') |
|
457 | 454 | { |
458 | 455 | log::debug("Find list page: {$url}"); |
459 | - } |
|
460 | - elseif ($link['url_type'] == 'content_page') |
|
456 | + } elseif ($link['url_type'] == 'content_page') |
|
461 | 457 | { |
462 | 458 | log::debug("Find content page: {$url}"); |
463 | 459 | } |
@@ -503,12 +499,10 @@ discard block |
||
503 | 499 | if ($link['url_type'] == 'scan_page') |
504 | 500 | { |
505 | 501 | log::debug("Find scan page: {$url}"); |
506 | - } |
|
507 | - elseif ($link['url_type'] == 'list_page') |
|
502 | + } elseif ($link['url_type'] == 'list_page') |
|
508 | 503 | { |
509 | 504 | log::debug("Find list page: {$url}"); |
510 | - } |
|
511 | - elseif ($link['url_type'] == 'content_page') |
|
505 | + } elseif ($link['url_type'] == 'content_page') |
|
512 | 506 | { |
513 | 507 | log::debug("Find content page: {$url}"); |
514 | 508 | } |
@@ -618,8 +612,7 @@ discard block |
||
618 | 612 | if (count($info) <= 1) |
619 | 613 | { |
620 | 614 | echo "PHPSpider[$start_file] not run\n"; |
621 | - } |
|
622 | - else |
|
615 | + } else |
|
623 | 616 | { |
624 | 617 | //echo "PHPSpider[$start_file] is stoping ...\n"; |
625 | 618 | echo "PHPSpider[$start_file] stop success"; |
@@ -697,8 +690,7 @@ discard block |
||
697 | 690 | if (-1 === $pid) |
698 | 691 | { |
699 | 692 | throw new Exception('fork fail'); |
700 | - } |
|
701 | - elseif ($pid > 0) |
|
693 | + } elseif ($pid > 0) |
|
702 | 694 | { |
703 | 695 | exit(0); |
704 | 696 | } |
@@ -711,8 +703,7 @@ discard block |
||
711 | 703 | if (-1 === $pid) |
712 | 704 | { |
713 | 705 | throw new Exception("fork fail"); |
714 | - } |
|
715 | - elseif (0 !== $pid) |
|
706 | + } elseif (0 !== $pid) |
|
716 | 707 | { |
717 | 708 | exit(0); |
718 | 709 | } |
@@ -753,8 +744,7 @@ discard block |
||
753 | 744 | if ($all_stop) |
754 | 745 | { |
755 | 746 | break; |
756 | - } |
|
757 | - else |
|
747 | + } else |
|
758 | 748 | { |
759 | 749 | log::warn("Task stop waiting..."); |
760 | 750 | } |
@@ -876,8 +866,7 @@ discard block |
||
876 | 866 | { |
877 | 867 | self::$configs['name'] = iconv("UTF-8", "GB2312//IGNORE", self::$configs['name']); |
878 | 868 | log::$log_show = true; |
879 | - } |
|
880 | - else |
|
869 | + } else |
|
881 | 870 | { |
882 | 871 | log::$log_show = isset(self::$configs['log_show']) ? self::$configs['log_show'] : false; |
883 | 872 | } |
@@ -893,13 +882,17 @@ discard block |
||
893 | 882 | $start_file = $argv[0]; |
894 | 883 | |
895 | 884 | $header = ""; |
896 | - if (!util::is_win()) $header .= "\033[33m"; |
|
885 | + if (!util::is_win()) { |
|
886 | + $header .= "\033[33m"; |
|
887 | + } |
|
897 | 888 | $header .= "\n[ ".self::$configs['name']." Spider ] is started...\n\n"; |
898 | 889 | $header .= " * PHPSpider Version: ".self::VERSION."\n"; |
899 | 890 | $header .= " * Documentation: https://doc.phpspider.org\n"; |
900 | 891 | $header .= " * Task Number: ".self::$tasknum."\n\n"; |
901 | 892 | $header .= "Input \"php $start_file stop\" to quit. Start success.\n"; |
902 | - if (!util::is_win()) $header .= "\033[0m"; |
|
893 | + if (!util::is_win()) { |
|
894 | + $header .= "\033[0m"; |
|
895 | + } |
|
903 | 896 | log::note($header); |
904 | 897 | } |
905 | 898 | |
@@ -939,8 +932,7 @@ discard block |
||
939 | 932 | // 先显示一次面板, 然后下面再每次采集成功显示一次 |
940 | 933 | $this->display_ui(); |
941 | 934 | } |
942 | - } |
|
943 | - else |
|
935 | + } else |
|
944 | 936 | { |
945 | 937 | $this->daemonize(); |
946 | 938 | } |
@@ -987,8 +979,7 @@ discard block |
||
987 | 979 | |
988 | 980 | // 这里用0表示正常退出 |
989 | 981 | exit(0); |
990 | - } |
|
991 | - else |
|
982 | + } else |
|
992 | 983 | { |
993 | 984 | log::error("Fork children task({$taskid}) fail..."); |
994 | 985 | exit; |
@@ -1043,8 +1034,7 @@ discard block |
||
1043 | 1034 | $this->collect_page(); |
1044 | 1035 | // 保存任务状态 |
1045 | 1036 | $this->set_task_status(); |
1046 | - } |
|
1047 | - else |
|
1037 | + } else |
|
1048 | 1038 | { |
1049 | 1039 | log::warn("Task(".self::$taskid.") waiting..."); |
1050 | 1040 | sleep(1); |
@@ -1138,7 +1128,9 @@ discard block |
||
1138 | 1128 | { |
1139 | 1129 | $return = call_user_func($this->on_download_page, $page, $this); |
1140 | 1130 | // 针对那些老是忘记return的人 |
1141 | - if (isset($return)) $page = $return; |
|
1131 | + if (isset($return)) { |
|
1132 | + $page = $return; |
|
1133 | + } |
|
1142 | 1134 | } |
1143 | 1135 | |
1144 | 1136 | // 是否从当前页面分析提取URL |
@@ -1149,23 +1141,27 @@ discard block |
||
1149 | 1141 | if ($this->on_scan_page) |
1150 | 1142 | { |
1151 | 1143 | $return = call_user_func($this->on_scan_page, $page, $page['raw'], $this); |
1152 | - if (isset($return)) $is_find_url = $return; |
|
1144 | + if (isset($return)) { |
|
1145 | + $is_find_url = $return; |
|
1146 | + } |
|
1153 | 1147 | } |
1154 | - } |
|
1155 | - elseif ($link['url_type'] == 'list_page') |
|
1148 | + } elseif ($link['url_type'] == 'list_page') |
|
1156 | 1149 | { |
1157 | 1150 | if ($this->on_list_page) |
1158 | 1151 | { |
1159 | 1152 | $return = call_user_func($this->on_list_page, $page, $page['raw'], $this); |
1160 | - if (isset($return)) $is_find_url = $return; |
|
1153 | + if (isset($return)) { |
|
1154 | + $is_find_url = $return; |
|
1155 | + } |
|
1161 | 1156 | } |
1162 | - } |
|
1163 | - elseif ($link['url_type'] == 'content_page') |
|
1157 | + } elseif ($link['url_type'] == 'content_page') |
|
1164 | 1158 | { |
1165 | 1159 | if ($this->on_content_page) |
1166 | 1160 | { |
1167 | 1161 | $return = call_user_func($this->on_content_page, $page, $page['raw'], $this); |
1168 | - if (isset($return)) $is_find_url = $return; |
|
1162 | + if (isset($return)) { |
|
1163 | + $is_find_url = $return; |
|
1164 | + } |
|
1169 | 1165 | } |
1170 | 1166 | } |
1171 | 1167 | |
@@ -1299,13 +1295,11 @@ discard block |
||
1299 | 1295 | { |
1300 | 1296 | $html .= $link['context_data']; |
1301 | 1297 | } |
1302 | - } |
|
1303 | - else |
|
1298 | + } else |
|
1304 | 1299 | { |
1305 | 1300 | return false; |
1306 | 1301 | } |
1307 | - } |
|
1308 | - else |
|
1302 | + } else |
|
1309 | 1303 | { |
1310 | 1304 | if ($http_code == 407) |
1311 | 1305 | { |
@@ -1313,8 +1307,7 @@ discard block |
||
1313 | 1307 | $this->queue_rpush($link); |
1314 | 1308 | log::error("Failed to download page {$url}"); |
1315 | 1309 | self::$collect_fail++; |
1316 | - } |
|
1317 | - elseif (in_array($http_code, array('0','502','503','429'))) |
|
1310 | + } elseif (in_array($http_code, array('0','502','503','429'))) |
|
1318 | 1311 | { |
1319 | 1312 | // 采集次数加一 |
1320 | 1313 | $link['try_num']++; |
@@ -1325,8 +1318,7 @@ discard block |
||
1325 | 1318 | $this->queue_rpush($link); |
1326 | 1319 | } |
1327 | 1320 | log::error("Failed to download page {$url}, retry({$link['try_num']})"); |
1328 | - } |
|
1329 | - else |
|
1321 | + } else |
|
1330 | 1322 | { |
1331 | 1323 | log::error("Failed to download page {$url}"); |
1332 | 1324 | self::$collect_fail++; |
@@ -1402,8 +1394,7 @@ discard block |
||
1402 | 1394 | if ($val) |
1403 | 1395 | { |
1404 | 1396 | $urls[$k] = $val; |
1405 | - } |
|
1406 | - else |
|
1397 | + } else |
|
1407 | 1398 | { |
1408 | 1399 | unset($urls[$k]); |
1409 | 1400 | } |
@@ -1510,8 +1501,7 @@ discard block |
||
1510 | 1501 | if(!isset($url[2])) |
1511 | 1502 | { |
1512 | 1503 | return false; |
1513 | - } |
|
1514 | - else |
|
1504 | + } else |
|
1515 | 1505 | { |
1516 | 1506 | $urls = explode('/',$url); |
1517 | 1507 | foreach($urls as $u) |
@@ -1524,8 +1514,7 @@ discard block |
||
1524 | 1514 | else if( $i < count($urls)-1 ) |
1525 | 1515 | { |
1526 | 1516 | $dstr .= $urls[$i].'/'; |
1527 | - } |
|
1528 | - else |
|
1517 | + } else |
|
1529 | 1518 | { |
1530 | 1519 | $dstr .= $urls[$i]; |
1531 | 1520 | } |
@@ -1535,28 +1524,24 @@ discard block |
||
1535 | 1524 | if(count($urls) <= $path_step) |
1536 | 1525 | { |
1537 | 1526 | return false; |
1538 | - } |
|
1539 | - else |
|
1527 | + } else |
|
1540 | 1528 | { |
1541 | 1529 | $pstr = ''; |
1542 | 1530 | for($i=0;$i<count($urls)-$path_step;$i++){ $pstr .= $urls[$i].'/'; } |
1543 | 1531 | $url = $pstr.$dstr; |
1544 | 1532 | } |
1545 | 1533 | } |
1546 | - } |
|
1547 | - else |
|
1534 | + } else |
|
1548 | 1535 | { |
1549 | 1536 | if( strtolower(substr($url, 0, 7))=='http://' ) |
1550 | 1537 | { |
1551 | 1538 | $url = preg_replace('#^http://#i','',$url); |
1552 | 1539 | $scheme = "http"; |
1553 | - } |
|
1554 | - else if( strtolower(substr($url, 0, 8))=='https://' ) |
|
1540 | + } else if( strtolower(substr($url, 0, 8))=='https://' ) |
|
1555 | 1541 | { |
1556 | 1542 | $url = preg_replace('#^https://#i','',$url); |
1557 | 1543 | $scheme = "https"; |
1558 | - } |
|
1559 | - else |
|
1544 | + } else |
|
1560 | 1545 | { |
1561 | 1546 | $url = $base_url_path.'/'.$url; |
1562 | 1547 | } |
@@ -1685,12 +1670,10 @@ discard block |
||
1685 | 1670 | if (!isset($return)) |
1686 | 1671 | { |
1687 | 1672 | log::warn("on_extract_page return value can't be empty"); |
1688 | - } |
|
1689 | - elseif (!is_array($return)) |
|
1673 | + } elseif (!is_array($return)) |
|
1690 | 1674 | { |
1691 | 1675 | log::warn("on_extract_page return value must be an array"); |
1692 | - } |
|
1693 | - else |
|
1676 | + } else |
|
1694 | 1677 | { |
1695 | 1678 | $fields = $return; |
1696 | 1679 | } |
@@ -1710,8 +1693,7 @@ discard block |
||
1710 | 1693 | $fields_str = preg_replace_callback( "#\\\u([0-9a-f]{4})#i", function($matchs) { |
1711 | 1694 | return iconv('UCS-2BE', 'UTF-8', pack('H4', $matchs[1])); |
1712 | 1695 | }, $fields_str ); |
1713 | - } |
|
1714 | - else |
|
1696 | + } else |
|
1715 | 1697 | { |
1716 | 1698 | $fields_str = json_encode($fields, JSON_UNESCAPED_UNICODE); |
1717 | 1699 | } |
@@ -1729,13 +1711,11 @@ discard block |
||
1729 | 1711 | if (self::$export_type == 'csv') |
1730 | 1712 | { |
1731 | 1713 | util::put_file(self::$export_file, util::format_csv($fields)."\n", FILE_APPEND); |
1732 | - } |
|
1733 | - elseif (self::$export_type == 'sql') |
|
1714 | + } elseif (self::$export_type == 'sql') |
|
1734 | 1715 | { |
1735 | 1716 | $sql = db::insert(self::$export_table, $fields, true); |
1736 | 1717 | util::put_file(self::$export_file, $sql.";\n", FILE_APPEND); |
1737 | - } |
|
1738 | - elseif (self::$export_type == 'db') |
|
1718 | + } elseif (self::$export_type == 'db') |
|
1739 | 1719 | { |
1740 | 1720 | db::insert(self::$export_table, $fields); |
1741 | 1721 | } |
@@ -1805,12 +1785,10 @@ discard block |
||
1805 | 1785 | if (!isset($conf['selector_type']) || $conf['selector_type']=='xpath') |
1806 | 1786 | { |
1807 | 1787 | $values = $this->get_fields_xpath($html, $conf['selector'], $conf['name']); |
1808 | - } |
|
1809 | - elseif ($conf['selector_type']=='css') |
|
1788 | + } elseif ($conf['selector_type']=='css') |
|
1810 | 1789 | { |
1811 | 1790 | $values = $this->get_fields_css($html, $conf['selector'], $conf['name']); |
1812 | - } |
|
1813 | - elseif ($conf['selector_type']=='regex') |
|
1791 | + } elseif ($conf['selector_type']=='regex') |
|
1814 | 1792 | { |
1815 | 1793 | $values = $this->get_fields_regex($html, $conf['selector'], $conf['name']); |
1816 | 1794 | } |
@@ -1849,21 +1827,18 @@ discard block |
||
1849 | 1827 | // 避免内容分页时attached_url拼接时候string + array了 |
1850 | 1828 | $fields[$conf['name']] = ''; |
1851 | 1829 | //$fields[$conf['name']] = array(); |
1852 | - } |
|
1853 | - else |
|
1830 | + } else |
|
1854 | 1831 | { |
1855 | 1832 | if (is_array($values)) |
1856 | 1833 | { |
1857 | 1834 | if ($repeated) |
1858 | 1835 | { |
1859 | 1836 | $fields[$conf['name']] = $values; |
1860 | - } |
|
1861 | - else |
|
1837 | + } else |
|
1862 | 1838 | { |
1863 | 1839 | $fields[$conf['name']] = $values[0]; |
1864 | 1840 | } |
1865 | - } |
|
1866 | - else |
|
1841 | + } else |
|
1867 | 1842 | { |
1868 | 1843 | $fields[$conf['name']] = $values; |
1869 | 1844 | } |
@@ -1885,8 +1860,7 @@ discard block |
||
1885 | 1860 | if (!isset($return)) |
1886 | 1861 | { |
1887 | 1862 | log::warn("on_handle_img return value can't be empty\n"); |
1888 | - } |
|
1889 | - else |
|
1863 | + } else |
|
1890 | 1864 | { |
1891 | 1865 | // 有数据才会执行 on_handle_img 方法, 所以这里不要被替换没了 |
1892 | 1866 | $data = $return; |
@@ -1900,8 +1874,7 @@ discard block |
||
1900 | 1874 | if (!isset($return)) |
1901 | 1875 | { |
1902 | 1876 | log::warn("on_extract_field return value can't be empty\n"); |
1903 | - } |
|
1904 | - else |
|
1877 | + } else |
|
1905 | 1878 | { |
1906 | 1879 | // 有数据才会执行 on_extract_field 方法, 所以这里不要被替换没了 |
1907 | 1880 | $fields[$fieldname] = $return; |
@@ -1932,16 +1905,14 @@ discard block |
||
1932 | 1905 | log::error("Export data into CSV files need to Set the file path."); |
1933 | 1906 | exit; |
1934 | 1907 | } |
1935 | - } |
|
1936 | - elseif (self::$export_type == 'sql') |
|
1908 | + } elseif (self::$export_type == 'sql') |
|
1937 | 1909 | { |
1938 | 1910 | if (empty(self::$export_file)) |
1939 | 1911 | { |
1940 | 1912 | log::error("Export data into SQL files need to Set the file path."); |
1941 | 1913 | exit; |
1942 | 1914 | } |
1943 | - } |
|
1944 | - elseif (self::$export_type == 'db') |
|
1915 | + } elseif (self::$export_type == 'db') |
|
1945 | 1916 | { |
1946 | 1917 | if (!function_exists('mysqli_connect')) |
1947 | 1918 | { |
@@ -2051,8 +2022,7 @@ discard block |
||
2051 | 2022 | { |
2052 | 2023 | $key = "server-".self::$serverid."-task_status-".self::$taskid; |
2053 | 2024 | queue::set($key, $task_status); |
2054 | - } |
|
2055 | - else |
|
2025 | + } else |
|
2056 | 2026 | { |
2057 | 2027 | self::$task_status = array($task_status); |
2058 | 2028 | } |
@@ -2111,8 +2081,7 @@ discard block |
||
2111 | 2081 | $key = "server-{$serverid}-task_status-".$i; |
2112 | 2082 | $task_status[] = queue::get($key); |
2113 | 2083 | } |
2114 | - } |
|
2115 | - else |
|
2084 | + } else |
|
2116 | 2085 | { |
2117 | 2086 | $task_status = self::$task_status; |
2118 | 2087 | } |
@@ -2143,8 +2112,7 @@ discard block |
||
2143 | 2112 | 'tasknum' => $tasknum, |
2144 | 2113 | 'time' => time(), |
2145 | 2114 | ); |
2146 | - } |
|
2147 | - else |
|
2115 | + } else |
|
2148 | 2116 | { |
2149 | 2117 | $server_list = json_decode($server_list_json, true); |
2150 | 2118 | $server_list[$serverid] = array( |
@@ -2203,8 +2171,7 @@ discard block |
||
2203 | 2171 | if (self::$use_redis) |
2204 | 2172 | { |
2205 | 2173 | $count = queue::get("collect_urls_num"); |
2206 | - } |
|
2207 | - else |
|
2174 | + } else |
|
2208 | 2175 | { |
2209 | 2176 | $count = self::$collect_urls_num; |
2210 | 2177 | } |
@@ -2224,8 +2191,7 @@ discard block |
||
2224 | 2191 | if (self::$use_redis) |
2225 | 2192 | { |
2226 | 2193 | $count = queue::get("collected_urls_num"); |
2227 | - } |
|
2228 | - else |
|
2194 | + } else |
|
2229 | 2195 | { |
2230 | 2196 | $count = self::$collected_urls_num; |
2231 | 2197 | } |
@@ -2245,8 +2211,7 @@ discard block |
||
2245 | 2211 | if (self::$use_redis) |
2246 | 2212 | { |
2247 | 2213 | queue::incr("collected_urls_num"); |
2248 | - } |
|
2249 | - else |
|
2214 | + } else |
|
2250 | 2215 | { |
2251 | 2216 | self::$collected_urls_num++; |
2252 | 2217 | } |
@@ -2293,8 +2258,7 @@ discard block |
||
2293 | 2258 | // 解锁 |
2294 | 2259 | queue::unlock($lock); |
2295 | 2260 | } |
2296 | - } |
|
2297 | - else |
|
2261 | + } else |
|
2298 | 2262 | { |
2299 | 2263 | $key = md5($url); |
2300 | 2264 | if (!array_key_exists($key, self::$collect_urls)) |
@@ -2348,8 +2312,7 @@ discard block |
||
2348 | 2312 | // 解锁 |
2349 | 2313 | queue::unlock($lock); |
2350 | 2314 | } |
2351 | - } |
|
2352 | - else |
|
2315 | + } else |
|
2353 | 2316 | { |
2354 | 2317 | $key = md5($url); |
2355 | 2318 | if (!array_key_exists($key, self::$collect_urls)) |
@@ -2379,8 +2342,7 @@ discard block |
||
2379 | 2342 | { |
2380 | 2343 | $link = queue::lpop("collect_queue"); |
2381 | 2344 | $link = json_decode($link, true); |
2382 | - } |
|
2383 | - else |
|
2345 | + } else |
|
2384 | 2346 | { |
2385 | 2347 | $link = array_pop(self::$collect_queue); |
2386 | 2348 | } |
@@ -2400,8 +2362,7 @@ discard block |
||
2400 | 2362 | { |
2401 | 2363 | $link = queue::rpop("collect_queue"); |
2402 | 2364 | $link = json_decode($link, true); |
2403 | - } |
|
2404 | - else |
|
2365 | + } else |
|
2405 | 2366 | { |
2406 | 2367 | $link = array_shift(self::$collect_queue); |
2407 | 2368 | } |
@@ -2420,8 +2381,7 @@ discard block |
||
2420 | 2381 | if (self::$use_redis) |
2421 | 2382 | { |
2422 | 2383 | $lsize = queue::lsize("collect_queue"); |
2423 | - } |
|
2424 | - else |
|
2384 | + } else |
|
2425 | 2385 | { |
2426 | 2386 | $lsize = count(self::$collect_queue); |
2427 | 2387 | } |
@@ -2450,8 +2410,7 @@ discard block |
||
2450 | 2410 | |
2451 | 2411 | queue::unlock($lock); |
2452 | 2412 | } |
2453 | - } |
|
2454 | - else |
|
2413 | + } else |
|
2455 | 2414 | { |
2456 | 2415 | if (self::$depth_num < $depth) |
2457 | 2416 | { |
@@ -2473,8 +2432,7 @@ discard block |
||
2473 | 2432 | { |
2474 | 2433 | $depth_num = queue::get("depth_num"); |
2475 | 2434 | return $depth_num ? $depth_num : 0; |
2476 | - } |
|
2477 | - else |
|
2435 | + } else |
|
2478 | 2436 | { |
2479 | 2437 | return self::$depth_num; |
2480 | 2438 | } |
@@ -2492,8 +2450,7 @@ discard block |
||
2492 | 2450 | if (self::$use_redis) |
2493 | 2451 | { |
2494 | 2452 | $fields_num = queue::incr("fields_num"); |
2495 | - } |
|
2496 | - else |
|
2453 | + } else |
|
2497 | 2454 | { |
2498 | 2455 | self::$fields_num++; |
2499 | 2456 | $fields_num = self::$fields_num; |
@@ -2513,8 +2470,7 @@ discard block |
||
2513 | 2470 | if (self::$use_redis) |
2514 | 2471 | { |
2515 | 2472 | $fields_num = queue::get("fields_num"); |
2516 | - } |
|
2517 | - else |
|
2473 | + } else |
|
2518 | 2474 | { |
2519 | 2475 | $fields_num = self::$fields_num; |
2520 | 2476 | } |
@@ -72,21 +72,21 @@ discard block |
||
72 | 72 | public static $output_encoding = null; |
73 | 73 | public static $cookies = array(); // array of cookies to pass |
74 | 74 | // $cookies['username'] = "seatle"; |
75 | - public static $rawheaders = array(); // array of raw headers to send |
|
76 | - public static $domain_cookies = array(); // array of cookies for domain to pass |
|
77 | - public static $hosts = array(); // random host binding for make request faster |
|
78 | - public static $headers = array(); // headers returned from server sent here |
|
79 | - public static $useragents = array("requests/2.0.0"); // random agent we masquerade as |
|
80 | - public static $client_ips = array(); // random ip we masquerade as |
|
81 | - public static $proxies = array(); // random proxy ip |
|
82 | - public static $raw = ""; // head + body content returned from server sent here |
|
83 | - public static $head = ""; // head content |
|
84 | - public static $content = ""; // The body before encoding |
|
85 | - public static $text = ""; // The body after encoding |
|
86 | - public static $info = array(); // curl info |
|
87 | - public static $history = 302; // http request status before redirect. ex:30x |
|
88 | - public static $status_code = 0; // http request status |
|
89 | - public static $error = ""; // error messages sent here |
|
75 | + public static $rawheaders = array(); // array of raw headers to send |
|
76 | + public static $domain_cookies = array(); // array of cookies for domain to pass |
|
77 | + public static $hosts = array(); // random host binding for make request faster |
|
78 | + public static $headers = array(); // headers returned from server sent here |
|
79 | + public static $useragents = array("requests/2.0.0"); // random agent we masquerade as |
|
80 | + public static $client_ips = array(); // random ip we masquerade as |
|
81 | + public static $proxies = array(); // random proxy ip |
|
82 | + public static $raw = ""; // head + body content returned from server sent here |
|
83 | + public static $head = ""; // head content |
|
84 | + public static $content = ""; // The body before encoding |
|
85 | + public static $text = ""; // The body after encoding |
|
86 | + public static $info = array(); // curl info |
|
87 | + public static $history = 302; // http request status before redirect. ex:30x |
|
88 | + public static $status_code = 0; // http request status |
|
89 | + public static $error = ""; // error messages sent here |
|
90 | 90 | |
91 | 91 | /** |
92 | 92 | * set timeout |
@@ -289,7 +289,7 @@ discard block |
||
289 | 289 | { |
290 | 290 | return false; |
291 | 291 | } |
292 | - if ( empty($domain) ) |
|
292 | + if (empty($domain)) |
|
293 | 293 | { |
294 | 294 | self::$cookies = array(); |
295 | 295 | } |
@@ -541,17 +541,17 @@ discard block |
||
541 | 541 | */ |
542 | 542 | public static function init() |
543 | 543 | { |
544 | - if (!is_resource ( self::$ch )) |
|
544 | + if (!is_resource(self::$ch)) |
|
545 | 545 | { |
546 | - self::$ch = curl_init (); |
|
547 | - curl_setopt( self::$ch, CURLOPT_RETURNTRANSFER, true ); |
|
548 | - curl_setopt( self::$ch, CURLOPT_HEADER, false ); |
|
549 | - curl_setopt( self::$ch, CURLOPT_USERAGENT, "phpspider-requests/".self::VERSION ); |
|
546 | + self::$ch = curl_init(); |
|
547 | + curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, true); |
|
548 | + curl_setopt(self::$ch, CURLOPT_HEADER, false); |
|
549 | + curl_setopt(self::$ch, CURLOPT_USERAGENT, "phpspider-requests/".self::VERSION); |
|
550 | 550 | // 如果设置了两个时间,就分开设置 |
551 | 551 | if (is_array(self::$timeout)) |
552 | 552 | { |
553 | - curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout[0] ); |
|
554 | - curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout[1]); |
|
553 | + curl_setopt(self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout[0]); |
|
554 | + curl_setopt(self::$ch, CURLOPT_TIMEOUT, self::$timeout[1]); |
|
555 | 555 | } |
556 | 556 | else |
557 | 557 | { |
@@ -560,7 +560,7 @@ discard block |
||
560 | 560 | } |
561 | 561 | curl_setopt(self::$ch, CURLOPT_MAXREDIRS, 5); //maximum number of redirects allowed |
562 | 562 | // 在多线程处理场景下使用超时选项时,会忽略signals对应的处理函数,但是无耐的是还有小概率的crash情况发生 |
563 | - curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true); |
|
563 | + curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true); |
|
564 | 564 | } |
565 | 565 | return self::$ch; |
566 | 566 | } |
@@ -570,7 +570,7 @@ discard block |
||
570 | 570 | */ |
571 | 571 | public static function get($url, $fields = array(), $allow_redirects = true, $cert = NULL) |
572 | 572 | { |
573 | - self::init (); |
|
573 | + self::init(); |
|
574 | 574 | return self::request($url, 'get', $fields, NULL, $allow_redirects, $cert); |
575 | 575 | } |
576 | 576 | |
@@ -593,19 +593,19 @@ discard block |
||
593 | 593 | */ |
594 | 594 | public static function post($url, $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL) |
595 | 595 | { |
596 | - self::init (); |
|
596 | + self::init(); |
|
597 | 597 | return self::request($url, 'POST', $fields, $files, $allow_redirects, $cert); |
598 | 598 | } |
599 | 599 | |
600 | 600 | public static function put($url, $fields = array(), $allow_redirects = true, $cert = NULL) |
601 | 601 | { |
602 | - self::init (); |
|
602 | + self::init(); |
|
603 | 603 | return self::request($url, 'PUT', $fields, $allow_redirects, $cert); |
604 | 604 | } |
605 | 605 | |
606 | 606 | public static function delete($url, $fields = array(), $allow_redirects = true, $cert = NULL) |
607 | 607 | { |
608 | - self::init (); |
|
608 | + self::init(); |
|
609 | 609 | return self::request($url, 'DELETE', $fields, $allow_redirects, $cert); |
610 | 610 | } |
611 | 611 | |
@@ -614,19 +614,19 @@ discard block |
||
614 | 614 | // 此方法经常被用来测试超文本链接的有效性,可访问性,和最近的改变。. |
615 | 615 | public static function head($url, $fields = array(), $allow_redirects = true, $cert = NULL) |
616 | 616 | { |
617 | - self::init (); |
|
617 | + self::init(); |
|
618 | 618 | self::request($url, 'HEAD', $fields, $allow_redirects, $cert); |
619 | 619 | } |
620 | 620 | |
621 | 621 | public static function options($url, $fields = array(), $allow_redirects = true, $cert = NULL) |
622 | 622 | { |
623 | - self::init (); |
|
623 | + self::init(); |
|
624 | 624 | return self::request($url, 'OPTIONS', $fields, $allow_redirects, $cert); |
625 | 625 | } |
626 | 626 | |
627 | 627 | public static function patch($url, $fields = array(), $allow_redirects = true, $cert = NULL) |
628 | 628 | { |
629 | - self::init (); |
|
629 | + self::init(); |
|
630 | 630 | return self::request($url, 'PATCH', $fields, $allow_redirects, $cert); |
631 | 631 | } |
632 | 632 | |
@@ -645,7 +645,7 @@ discard block |
||
645 | 645 | public static function request($url, $method = 'GET', $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL) |
646 | 646 | { |
647 | 647 | $method = strtoupper($method); |
648 | - if(!self::_is_url($url)) |
|
648 | + if (!self::_is_url($url)) |
|
649 | 649 | { |
650 | 650 | self::$error = "You have requested URL ({$url}) is not a valid HTTP address"; |
651 | 651 | return false; |
@@ -679,7 +679,7 @@ discard block |
||
679 | 679 | } |
680 | 680 | } |
681 | 681 | |
682 | - curl_setopt( self::$ch, CURLOPT_URL, $url ); |
|
682 | + curl_setopt(self::$ch, CURLOPT_URL, $url); |
|
683 | 683 | |
684 | 684 | if ($method != 'GET') |
685 | 685 | { |
@@ -692,13 +692,13 @@ discard block |
||
692 | 692 | // CURLOPT_POST会把上傳的文件类型设为 multipart/form-data |
693 | 693 | // 把CURLOPT_POSTFIELDS的内容按multipart/form-data 的形式编码 |
694 | 694 | // CURLOPT_CUSTOMREQUEST可以按指定内容上传 |
695 | - if ( isset($tmpheaders['content-type']) && $tmpheaders['content-type'] == 'application/json' ) |
|
695 | + if (isset($tmpheaders['content-type']) && $tmpheaders['content-type'] == 'application/json') |
|
696 | 696 | { |
697 | - curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method ); |
|
697 | + curl_setopt(self::$ch, CURLOPT_CUSTOMREQUEST, $method); |
|
698 | 698 | } |
699 | 699 | else |
700 | 700 | { |
701 | - curl_setopt( self::$ch, CURLOPT_POST, true ); |
|
701 | + curl_setopt(self::$ch, CURLOPT_POST, true); |
|
702 | 702 | } |
703 | 703 | |
704 | 704 | $file_fields = array(); |
@@ -724,16 +724,16 @@ discard block |
||
724 | 724 | else |
725 | 725 | { |
726 | 726 | self::$rawheaders['X-HTTP-Method-Override'] = $method; |
727 | - curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method ); |
|
727 | + curl_setopt(self::$ch, CURLOPT_CUSTOMREQUEST, $method); |
|
728 | 728 | } |
729 | 729 | |
730 | - if ( $method == 'POST' ) |
|
730 | + if ($method == 'POST') |
|
731 | 731 | { |
732 | 732 | // 不是上传文件的,用http_build_query, 能实现更好的兼容性,更小的请求数据包 |
733 | - if ( empty($file_fields) ) |
|
733 | + if (empty($file_fields)) |
|
734 | 734 | { |
735 | 735 | // post方式 |
736 | - if ( is_array($fields) ) |
|
736 | + if (is_array($fields)) |
|
737 | 737 | { |
738 | 738 | $fields = http_build_query($fields); |
739 | 739 | } |
@@ -741,7 +741,7 @@ discard block |
||
741 | 741 | else |
742 | 742 | { |
743 | 743 | // 有post数据 |
744 | - if ( is_array($fields) && !empty($fields) ) |
|
744 | + if (is_array($fields) && !empty($fields)) |
|
745 | 745 | { |
746 | 746 | // 某些server可能会有问题 |
747 | 747 | $fields = array_merge($fields, $file_fields); |
@@ -753,13 +753,13 @@ discard block |
||
753 | 753 | } |
754 | 754 | |
755 | 755 | // 不能直接传数组,不知道是什么Bug,会非常慢 |
756 | - curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields ); |
|
756 | + curl_setopt(self::$ch, CURLOPT_POSTFIELDS, $fields); |
|
757 | 757 | } |
758 | 758 | } |
759 | 759 | |
760 | 760 | $cookies = self::get_cookies(); |
761 | 761 | $domain_cookies = self::get_cookies($domain); |
762 | - $cookies = array_merge($cookies, $domain_cookies); |
|
762 | + $cookies = array_merge($cookies, $domain_cookies); |
|
763 | 763 | // 是否设置了cookie |
764 | 764 | if (!empty($cookies)) |
765 | 765 | { |
@@ -773,13 +773,13 @@ discard block |
||
773 | 773 | |
774 | 774 | if (!empty(self::$useragents)) |
775 | 775 | { |
776 | - $key = rand(0, count(self::$useragents) - 1); |
|
776 | + $key = rand(0, count(self::$useragents)-1); |
|
777 | 777 | self::$rawheaders['User-Agent'] = self::$useragents[$key]; |
778 | 778 | } |
779 | 779 | |
780 | 780 | if (!empty(self::$client_ips)) |
781 | 781 | { |
782 | - $key = rand(0, count(self::$client_ips) - 1); |
|
782 | + $key = rand(0, count(self::$client_ips)-1); |
|
783 | 783 | self::$rawheaders['CLIENT-IP'] = self::$client_ips[$key]; |
784 | 784 | self::$rawheaders['X-FORWARDED-FOR'] = self::$client_ips[$key]; |
785 | 785 | } |
@@ -791,10 +791,10 @@ discard block |
||
791 | 791 | { |
792 | 792 | $http_headers[] = $k.': '.$v; |
793 | 793 | } |
794 | - curl_setopt( self::$ch, CURLOPT_HTTPHEADER, $http_headers ); |
|
794 | + curl_setopt(self::$ch, CURLOPT_HTTPHEADER, $http_headers); |
|
795 | 795 | } |
796 | 796 | |
797 | - curl_setopt( self::$ch, CURLOPT_ENCODING, 'gzip' ); |
|
797 | + curl_setopt(self::$ch, CURLOPT_ENCODING, 'gzip'); |
|
798 | 798 | |
799 | 799 | // 关闭验证 |
800 | 800 | if ($scheme == 'https') |
@@ -805,33 +805,33 @@ discard block |
||
805 | 805 | |
806 | 806 | if (self::$proxies) |
807 | 807 | { |
808 | - $key = rand(0, count(self::$proxies) - 1); |
|
808 | + $key = rand(0, count(self::$proxies)-1); |
|
809 | 809 | $proxy = self::$proxies[$key]; |
810 | - curl_setopt( self::$ch, CURLOPT_PROXY, $proxy ); |
|
810 | + curl_setopt(self::$ch, CURLOPT_PROXY, $proxy); |
|
811 | 811 | } |
812 | 812 | |
813 | 813 | // header + body,header 里面有 cookie |
814 | - curl_setopt( self::$ch, CURLOPT_HEADER, true ); |
|
814 | + curl_setopt(self::$ch, CURLOPT_HEADER, true); |
|
815 | 815 | // 请求跳转后的内容 |
816 | 816 | if ($allow_redirects) |
817 | 817 | { |
818 | - curl_setopt( self::$ch, CURLOPT_FOLLOWLOCATION, true); |
|
818 | + curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, true); |
|
819 | 819 | } |
820 | 820 | |
821 | - self::$raw = curl_exec ( self::$ch ); |
|
821 | + self::$raw = curl_exec(self::$ch); |
|
822 | 822 | // 真实url |
823 | 823 | //$location = curl_getinfo( self::$ch, CURLINFO_EFFECTIVE_URL); |
824 | - self::$info = curl_getinfo( self::$ch ); |
|
824 | + self::$info = curl_getinfo(self::$ch); |
|
825 | 825 | //print_r(self::$info); |
826 | 826 | self::$status_code = self::$info['http_code']; |
827 | 827 | if (self::$raw === false) |
828 | 828 | { |
829 | - self::$error = 'Curl error: ' . curl_error( self::$ch ); |
|
829 | + self::$error = 'Curl error: '.curl_error(self::$ch); |
|
830 | 830 | //trigger_error(self::$error, E_USER_WARNING); |
831 | 831 | } |
832 | 832 | |
833 | 833 | // 关闭句柄 |
834 | - curl_close( self::$ch ); |
|
834 | + curl_close(self::$ch); |
|
835 | 835 | |
836 | 836 | // 请求成功之后才把URL存起来 |
837 | 837 | list($header, $text) = self::split_header_body(); |
@@ -861,7 +861,7 @@ discard block |
||
861 | 861 | // 获取 mimetype |
862 | 862 | public static function get_mimetype($filepath) |
863 | 863 | { |
864 | - $fp = finfo_open(FILEINFO_MIME); |
|
864 | + $fp = finfo_open(FILEINFO_MIME); |
|
865 | 865 | $mime = finfo_file($fp, $filepath); |
866 | 866 | finfo_close($fp); |
867 | 867 | $arr = explode(';', $mime); |
@@ -883,7 +883,7 @@ discard block |
||
883 | 883 | { |
884 | 884 | // 构造post数据 |
885 | 885 | $data = ''; |
886 | - $delimiter = '-------------' . uniqid(); |
|
886 | + $delimiter = '-------------'.uniqid(); |
|
887 | 887 | // 表单数据 |
888 | 888 | foreach ($post_fields as $name => $content) |
889 | 889 | { |
@@ -941,11 +941,11 @@ discard block |
||
941 | 941 | { |
942 | 942 | $out = self::$output_encoding; |
943 | 943 | } |
944 | - if ( ! isset($out)) |
|
944 | + if (!isset($out)) |
|
945 | 945 | { |
946 | 946 | $out = 'UTF-8'; |
947 | 947 | } |
948 | - if ( ! in_array($mode, $valid)) |
|
948 | + if (!in_array($mode, $valid)) |
|
949 | 949 | { |
950 | 950 | throw new Exception('invalid mode, mode='.$mode); |
951 | 951 | } |
@@ -965,7 +965,7 @@ discard block |
||
965 | 965 | } |
966 | 966 | |
967 | 967 | $pattern = '/(<meta[^>]*?charset=([\"\']?))([a-z\d_\-]*)(\2[^>]*?>)/is'; |
968 | - if ( ! isset($in)) |
|
968 | + if (!isset($in)) |
|
969 | 969 | { |
970 | 970 | $n = preg_match($pattern, $html, $in); |
971 | 971 | if ($n > 0) |
@@ -159,8 +159,7 @@ discard block |
||
159 | 159 | if (!empty($domain)) |
160 | 160 | { |
161 | 161 | self::$domain_cookies[$domain][$key] = $value; |
162 | - } |
|
163 | - else |
|
162 | + } else |
|
164 | 163 | { |
165 | 164 | self::$cookies[$key] = $value; |
166 | 165 | } |
@@ -193,8 +192,7 @@ discard block |
||
193 | 192 | if (!empty($domain)) |
194 | 193 | { |
195 | 194 | self::$domain_cookies[$domain][$key] = $value; |
196 | - } |
|
197 | - else |
|
195 | + } else |
|
198 | 196 | { |
199 | 197 | self::$cookies[$key] = $value; |
200 | 198 | } |
@@ -264,8 +262,7 @@ discard block |
||
264 | 262 | { |
265 | 263 | unset(self::$domain_cookies[$domain][$key]); |
266 | 264 | } |
267 | - } |
|
268 | - else |
|
265 | + } else |
|
269 | 266 | { |
270 | 267 | if (isset(self::$cookies[$key])) |
271 | 268 | { |
@@ -292,8 +289,7 @@ discard block |
||
292 | 289 | if ( empty($domain) ) |
293 | 290 | { |
294 | 291 | self::$cookies = array(); |
295 | - } |
|
296 | - else |
|
292 | + } else |
|
297 | 293 | { |
298 | 294 | if (isset(self::$domain_cookies[$domain])) |
299 | 295 | { |
@@ -552,8 +548,7 @@ discard block |
||
552 | 548 | { |
553 | 549 | curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout[0] ); |
554 | 550 | curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout[1]); |
555 | - } |
|
556 | - else |
|
551 | + } else |
|
557 | 552 | { |
558 | 553 | curl_setopt(self::$ch, CURLOPT_CONNECTTIMEOUT, ceil(self::$timeout / 2)); |
559 | 554 | curl_setopt(self::$ch, CURLOPT_TIMEOUT, self::$timeout); |
@@ -695,8 +690,7 @@ discard block |
||
695 | 690 | if ( isset($tmpheaders['content-type']) && $tmpheaders['content-type'] == 'application/json' ) |
696 | 691 | { |
697 | 692 | curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method ); |
698 | - } |
|
699 | - else |
|
693 | + } else |
|
700 | 694 | { |
701 | 695 | curl_setopt( self::$ch, CURLOPT_POST, true ); |
702 | 696 | } |
@@ -720,8 +714,7 @@ discard block |
||
720 | 714 | //$cfile = '@'.realpath($filename).";type=".$type.";filename=".$filename; |
721 | 715 | } |
722 | 716 | } |
723 | - } |
|
724 | - else |
|
717 | + } else |
|
725 | 718 | { |
726 | 719 | self::$rawheaders['X-HTTP-Method-Override'] = $method; |
727 | 720 | curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method ); |
@@ -737,16 +730,14 @@ discard block |
||
737 | 730 | { |
738 | 731 | $fields = http_build_query($fields); |
739 | 732 | } |
740 | - } |
|
741 | - else |
|
733 | + } else |
|
742 | 734 | { |
743 | 735 | // 有post数据 |
744 | 736 | if ( is_array($fields) && !empty($fields) ) |
745 | 737 | { |
746 | 738 | // 某些server可能会有问题 |
747 | 739 | $fields = array_merge($fields, $file_fields); |
748 | - } |
|
749 | - else |
|
740 | + } else |
|
750 | 741 | { |
751 | 742 | $fields = $file_fields; |
752 | 743 | } |
@@ -954,12 +945,10 @@ discard block |
||
954 | 945 | if (function_exists('iconv') && ($mode == 'auto' || $mode == 'iconv')) |
955 | 946 | { |
956 | 947 | $func = 'iconv'; |
957 | - } |
|
958 | - elseif ($if) |
|
948 | + } elseif ($if) |
|
959 | 949 | { |
960 | 950 | $func = 'mb_convert_encoding'; |
961 | - } |
|
962 | - else |
|
951 | + } else |
|
963 | 952 | { |
964 | 953 | throw new Exception('charsetTrans failed, no function'); |
965 | 954 | } |
@@ -971,8 +960,7 @@ discard block |
||
971 | 960 | if ($n > 0) |
972 | 961 | { |
973 | 962 | $in = $in[3]; |
974 | - } |
|
975 | - else |
|
963 | + } else |
|
976 | 964 | { |
977 | 965 | $in = null; |
978 | 966 | } |