@@ -15,7 +15,7 @@ discard block |
||
| 15 | 15 | |
| 16 | 16 | namespace phpspider\core; |
| 17 | 17 | |
| 18 | -require_once __DIR__ . '/constants.php'; |
|
| 18 | +require_once __DIR__.'/constants.php'; |
|
| 19 | 19 | |
| 20 | 20 | use phpspider\core\requests; |
| 21 | 21 | use phpspider\core\selector; |
@@ -352,7 +352,7 @@ discard block |
||
| 352 | 352 | function __construct($configs = array()) |
| 353 | 353 | { |
| 354 | 354 | // 产生时钟云,解决php7下面ctrl+c无法停止bug |
| 355 | - declare(ticks = 1); |
|
| 355 | + declare(ticks=1); |
|
| 356 | 356 | |
| 357 | 357 | // 先打开以显示验证报错内容 |
| 358 | 358 | log::$log_show = true; |
@@ -369,25 +369,25 @@ discard block |
||
| 369 | 369 | exit; |
| 370 | 370 | } |
| 371 | 371 | |
| 372 | - $configs['name'] = isset($configs['name']) ? $configs['name'] : 'phpspider'; |
|
| 373 | - $configs['proxies'] = isset($configs['proxies']) ? $configs['proxies'] : ''; |
|
| 374 | - $configs['user_agent'] = isset($configs['user_agent']) ? $configs['user_agent'] : self::AGENT_PC; |
|
| 372 | + $configs['name'] = isset($configs['name']) ? $configs['name'] : 'phpspider'; |
|
| 373 | + $configs['proxies'] = isset($configs['proxies']) ? $configs['proxies'] : ''; |
|
| 374 | + $configs['user_agent'] = isset($configs['user_agent']) ? $configs['user_agent'] : self::AGENT_PC; |
|
| 375 | 375 | $configs['user_agents'] = isset($configs['user_agents']) ? $configs['user_agents'] : null; |
| 376 | - $configs['client_ip'] = isset($configs['client_ip']) ? $configs['client_ip'] : null; |
|
| 377 | - $configs['client_ips'] = isset($configs['client_ips']) ? $configs['client_ips'] : null; |
|
| 378 | - $configs['interval'] = isset($configs['interval']) ? $configs['interval'] : self::INTERVAL; |
|
| 379 | - $configs['timeout'] = isset($configs['timeout']) ? $configs['timeout'] : self::TIMEOUT; |
|
| 380 | - $configs['max_try'] = isset($configs['max_try']) ? $configs['max_try'] : self::MAX_TRY; |
|
| 381 | - $configs['max_depth'] = isset($configs['max_depth']) ? $configs['max_depth'] : 0; |
|
| 382 | - $configs['max_fields'] = isset($configs['max_fields']) ? $configs['max_fields'] : 0; |
|
| 383 | - $configs['export'] = isset($configs['export']) ? $configs['export'] : array(); |
|
| 376 | + $configs['client_ip'] = isset($configs['client_ip']) ? $configs['client_ip'] : null; |
|
| 377 | + $configs['client_ips'] = isset($configs['client_ips']) ? $configs['client_ips'] : null; |
|
| 378 | + $configs['interval'] = isset($configs['interval']) ? $configs['interval'] : self::INTERVAL; |
|
| 379 | + $configs['timeout'] = isset($configs['timeout']) ? $configs['timeout'] : self::TIMEOUT; |
|
| 380 | + $configs['max_try'] = isset($configs['max_try']) ? $configs['max_try'] : self::MAX_TRY; |
|
| 381 | + $configs['max_depth'] = isset($configs['max_depth']) ? $configs['max_depth'] : 0; |
|
| 382 | + $configs['max_fields'] = isset($configs['max_fields']) ? $configs['max_fields'] : 0; |
|
| 383 | + $configs['export'] = isset($configs['export']) ? $configs['export'] : array(); |
|
| 384 | 384 | |
| 385 | 385 | // csv、sql、db |
| 386 | - self::$export_type = isset($configs['export']['type']) ? $configs['export']['type'] : ''; |
|
| 387 | - self::$export_file = isset($configs['export']['file']) ? $configs['export']['file'] : ''; |
|
| 386 | + self::$export_type = isset($configs['export']['type']) ? $configs['export']['type'] : ''; |
|
| 387 | + self::$export_file = isset($configs['export']['file']) ? $configs['export']['file'] : ''; |
|
| 388 | 388 | self::$export_table = isset($configs['export']['table']) ? $configs['export']['table'] : ''; |
| 389 | - self::$db_config = isset($configs['db_config']) ? $configs['db_config'] : array(); |
|
| 390 | - self::$queue_config = isset($configs['queue_config']) ? $configs['queue_config'] : array(); |
|
| 389 | + self::$db_config = isset($configs['db_config']) ? $configs['db_config'] : array(); |
|
| 390 | + self::$queue_config = isset($configs['queue_config']) ? $configs['queue_config'] : array(); |
|
| 391 | 391 | |
| 392 | 392 | // 是否设置了并发任务数, 并且大于1, 而且不是windows环境 |
| 393 | 393 | if (isset($configs['tasknum']) && $configs['tasknum'] > 1 && !util::is_win()) |
@@ -604,7 +604,7 @@ discard block |
||
| 604 | 604 | $command2 = isset($argv[2]) ? $argv[2] : ''; |
| 605 | 605 | |
| 606 | 606 | // 根据命令做相应处理 |
| 607 | - switch($command) |
|
| 607 | + switch ($command) |
|
| 608 | 608 | { |
| 609 | 609 | // 启动 phpspider |
| 610 | 610 | case 'start': |
@@ -766,7 +766,7 @@ discard block |
||
| 766 | 766 | // 显示最后结果 |
| 767 | 767 | log::$log_show = true; |
| 768 | 768 | |
| 769 | - $spider_time_run = util::time2second(intval(microtime(true) - self::$time_start)); |
|
| 769 | + $spider_time_run = util::time2second(intval(microtime(true)-self::$time_start)); |
|
| 770 | 770 | log::note("Spider finished in {$spider_time_run}"); |
| 771 | 771 | |
| 772 | 772 | $get_collected_url_num = $this->get_collected_url_num(); |
@@ -795,12 +795,12 @@ discard block |
||
| 795 | 795 | // 检查PHP版本 |
| 796 | 796 | if (version_compare(PHP_VERSION, '5.3.0', 'lt')) |
| 797 | 797 | { |
| 798 | - log::error('PHP 5.3+ is required, currently installed version is: ' . phpversion()); |
|
| 798 | + log::error('PHP 5.3+ is required, currently installed version is: '.phpversion()); |
|
| 799 | 799 | exit; |
| 800 | 800 | } |
| 801 | 801 | |
| 802 | 802 | // 检查CURL扩展 |
| 803 | - if(!function_exists('curl_init')) |
|
| 803 | + if (!function_exists('curl_init')) |
|
| 804 | 804 | { |
| 805 | 805 | log::error("The curl extension was not found"); |
| 806 | 806 | exit; |
@@ -861,7 +861,7 @@ discard block |
||
| 861 | 861 | exit; |
| 862 | 862 | } |
| 863 | 863 | |
| 864 | - foreach ( self::$configs['scan_urls'] as $url ) |
|
| 864 | + foreach (self::$configs['scan_urls'] as $url) |
|
| 865 | 865 | { |
| 866 | 866 | // 只检查配置中的入口URL, 通过 add_scan_url 添加的不检查了. |
| 867 | 867 | if (!$this->is_scan_page($url)) |
@@ -917,7 +917,7 @@ discard block |
||
| 917 | 917 | //-------------------------------------------------------------------------------- |
| 918 | 918 | |
| 919 | 919 | // 添加入口URL到队列 |
| 920 | - foreach ( self::$configs['scan_urls'] as $url ) |
|
| 920 | + foreach (self::$configs['scan_urls'] as $url) |
|
| 921 | 921 | { |
| 922 | 922 | // false 表示不允许重复 |
| 923 | 923 | $this->add_scan_url($url, null, false); |
@@ -965,13 +965,13 @@ discard block |
||
| 965 | 965 | $pid = pcntl_fork(); |
| 966 | 966 | |
| 967 | 967 | // 主进程记录子进程pid |
| 968 | - if($pid > 0) |
|
| 968 | + if ($pid > 0) |
|
| 969 | 969 | { |
| 970 | 970 | // 暂时没用 |
| 971 | 971 | //self::$taskpids[$taskid] = $pid; |
| 972 | 972 | } |
| 973 | 973 | // 子进程运行 |
| 974 | - elseif(0 === $pid) |
|
| 974 | + elseif (0 === $pid) |
|
| 975 | 975 | { |
| 976 | 976 | log::warn("Fork children task({$taskid}) successful..."); |
| 977 | 977 | |
@@ -999,7 +999,7 @@ discard block |
||
| 999 | 999 | { |
| 1000 | 1000 | queue::set_connect('default', self::$queue_config); |
| 1001 | 1001 | queue::init(); |
| 1002 | - while( $queue_lsize = $this->queue_lsize() ) |
|
| 1002 | + while ($queue_lsize = $this->queue_lsize()) |
|
| 1003 | 1003 | { |
| 1004 | 1004 | // 如果是主任务 |
| 1005 | 1005 | if (self::$taskmaster) |
@@ -1008,7 +1008,7 @@ discard block |
||
| 1008 | 1008 | if (self::$tasknum > 1 && !self::$fork_task_complete) |
| 1009 | 1009 | { |
| 1010 | 1010 | // 主进程采集到两倍于任务数时, 生成子任务一起采集 |
| 1011 | - if ( $queue_lsize > self::$tasknum*2 ) |
|
| 1011 | + if ($queue_lsize > self::$tasknum * 2) |
|
| 1012 | 1012 | { |
| 1013 | 1013 | self::$fork_task_complete = true; |
| 1014 | 1014 | |
@@ -1037,7 +1037,7 @@ discard block |
||
| 1037 | 1037 | else |
| 1038 | 1038 | { |
| 1039 | 1039 | // 如果队列中的网页比任务数2倍多, 子任务可以采集, 否则等待... |
| 1040 | - if ( $queue_lsize > self::$tasknum*2 ) |
|
| 1040 | + if ($queue_lsize > self::$tasknum * 2) |
|
| 1041 | 1041 | { |
| 1042 | 1042 | // 抓取页面 |
| 1043 | 1043 | $this->collect_page(); |
@@ -1176,7 +1176,7 @@ discard block |
||
| 1176 | 1176 | if (self::$configs['max_depth'] == 0 || $link['depth'] < self::$configs['max_depth']) |
| 1177 | 1177 | { |
| 1178 | 1178 | // 分析提取HTML页面中的URL |
| 1179 | - $this->get_urls($page['raw'], $url, $link['depth'] + 1); |
|
| 1179 | + $this->get_urls($page['raw'], $url, $link['depth']+1); |
|
| 1180 | 1180 | } |
| 1181 | 1181 | } |
| 1182 | 1182 | |
@@ -1191,10 +1191,10 @@ discard block |
||
| 1191 | 1191 | $this->incr_depth_num($link['depth']); |
| 1192 | 1192 | |
| 1193 | 1193 | // 处理页面耗时时间 |
| 1194 | - $time_run = round(microtime(true) - $page_time_start, 3); |
|
| 1194 | + $time_run = round(microtime(true)-$page_time_start, 3); |
|
| 1195 | 1195 | log::debug("Success process page {$url} in {$time_run} s"); |
| 1196 | 1196 | |
| 1197 | - $spider_time_run = util::time2second(intval(microtime(true) - self::$time_start)); |
|
| 1197 | + $spider_time_run = util::time2second(intval(microtime(true)-self::$time_start)); |
|
| 1198 | 1198 | log::info("Spider running in {$spider_time_run}"); |
| 1199 | 1199 | |
| 1200 | 1200 | // 爬虫爬取每个网页的时间间隔, 单位: 毫秒 |
@@ -1314,12 +1314,12 @@ discard block |
||
| 1314 | 1314 | log::error("Failed to download page {$url}"); |
| 1315 | 1315 | self::$collect_fail++; |
| 1316 | 1316 | } |
| 1317 | - elseif (in_array($http_code, array('0','502','503','429'))) |
|
| 1317 | + elseif (in_array($http_code, array('0', '502', '503', '429'))) |
|
| 1318 | 1318 | { |
| 1319 | 1319 | // 采集次数加一 |
| 1320 | 1320 | $link['try_num']++; |
| 1321 | 1321 | // 抓取次数 小于 允许抓取失败次数 |
| 1322 | - if ( $link['try_num'] <= $link['max_try'] ) |
|
| 1322 | + if ($link['try_num'] <= $link['max_try']) |
|
| 1323 | 1323 | { |
| 1324 | 1324 | // 扔到队列头部去, 继续采集 |
| 1325 | 1325 | $this->queue_rpush($link); |
@@ -1337,7 +1337,7 @@ discard block |
||
| 1337 | 1337 | } |
| 1338 | 1338 | |
| 1339 | 1339 | // 爬取页面耗时时间 |
| 1340 | - $time_run = round(microtime(true) - $time_start, 3); |
|
| 1340 | + $time_run = round(microtime(true)-$time_start, 3); |
|
| 1341 | 1341 | log::debug("Success download page {$url} in {$time_run} s"); |
| 1342 | 1342 | self::$collect_succ++; |
| 1343 | 1343 | |
@@ -1382,7 +1382,7 @@ discard block |
||
| 1382 | 1382 | |
| 1383 | 1383 | foreach ($urls as $key=>$url) |
| 1384 | 1384 | { |
| 1385 | - $urls[$key] = str_replace(array("\"", "'",'&'), array("",'','&'), $url); |
|
| 1385 | + $urls[$key] = str_replace(array("\"", "'", '&'), array("", '', '&'), $url); |
|
| 1386 | 1386 | } |
| 1387 | 1387 | |
| 1388 | 1388 | //-------------------------------------------------------------------------------- |
@@ -1458,12 +1458,12 @@ discard block |
||
| 1458 | 1458 | |
| 1459 | 1459 | // 排除JavaScript的连接 |
| 1460 | 1460 | //if (strpos($url, "javascript:") !== false) |
| 1461 | - if( preg_match("@^(javascript:|#|'|\")@i", $url) || $url == '') |
|
| 1461 | + if (preg_match("@^(javascript:|#|'|\")@i", $url) || $url == '') |
|
| 1462 | 1462 | { |
| 1463 | 1463 | return false; |
| 1464 | 1464 | } |
| 1465 | 1465 | // 排除没有被解析成功的语言标签 |
| 1466 | - if(substr($url, 0, 3) == '<%=') |
|
| 1466 | + if (substr($url, 0, 3) == '<%=') |
|
| 1467 | 1467 | { |
| 1468 | 1468 | return false; |
| 1469 | 1469 | } |
@@ -1482,46 +1482,46 @@ discard block |
||
| 1482 | 1482 | $domain = $parse_url['host']; |
| 1483 | 1483 | $path = empty($parse_url['path']) ? '' : $parse_url['path']; |
| 1484 | 1484 | $base_url_path = $domain.$path; |
| 1485 | - $base_url_path = preg_replace("/\/([^\/]*)\.(.*)$/","/",$base_url_path); |
|
| 1486 | - $base_url_path = preg_replace("/\/$/",'',$base_url_path); |
|
| 1485 | + $base_url_path = preg_replace("/\/([^\/]*)\.(.*)$/", "/", $base_url_path); |
|
| 1486 | + $base_url_path = preg_replace("/\/$/", '', $base_url_path); |
|
| 1487 | 1487 | |
| 1488 | 1488 | $i = $path_step = 0; |
| 1489 | 1489 | $dstr = $pstr = ''; |
| 1490 | - $pos = strpos($url,'#'); |
|
| 1491 | - if($pos > 0) |
|
| 1490 | + $pos = strpos($url, '#'); |
|
| 1491 | + if ($pos > 0) |
|
| 1492 | 1492 | { |
| 1493 | 1493 | // 去掉#和后面的字符串 |
| 1494 | 1494 | $url = substr($url, 0, $pos); |
| 1495 | 1495 | } |
| 1496 | 1496 | |
| 1497 | 1497 | // 京东变态的都是 //www.jd.com/111.html |
| 1498 | - if(substr($url, 0, 2) == '//') |
|
| 1498 | + if (substr($url, 0, 2) == '//') |
|
| 1499 | 1499 | { |
| 1500 | 1500 | $url = str_replace("//", "", $url); |
| 1501 | 1501 | } |
| 1502 | 1502 | // /1234.html |
| 1503 | - elseif($url[0] == '/') |
|
| 1503 | + elseif ($url[0] == '/') |
|
| 1504 | 1504 | { |
| 1505 | 1505 | $url = $domain.$url; |
| 1506 | 1506 | } |
| 1507 | 1507 | // ./1234.html、../1234.html 这种类型的 |
| 1508 | - elseif($url[0] == '.') |
|
| 1508 | + elseif ($url[0] == '.') |
|
| 1509 | 1509 | { |
| 1510 | - if(!isset($url[2])) |
|
| 1510 | + if (!isset($url[2])) |
|
| 1511 | 1511 | { |
| 1512 | 1512 | return false; |
| 1513 | 1513 | } |
| 1514 | 1514 | else |
| 1515 | 1515 | { |
| 1516 | - $urls = explode('/',$url); |
|
| 1517 | - foreach($urls as $u) |
|
| 1516 | + $urls = explode('/', $url); |
|
| 1517 | + foreach ($urls as $u) |
|
| 1518 | 1518 | { |
| 1519 | - if( $u == '..' ) |
|
| 1519 | + if ($u == '..') |
|
| 1520 | 1520 | { |
| 1521 | 1521 | $path_step++; |
| 1522 | 1522 | } |
| 1523 | 1523 | // 遇到 ., 不知道为什么不直接写$u == '.', 貌似一样的 |
| 1524 | - else if( $i < count($urls)-1 ) |
|
| 1524 | + else if ($i < count($urls)-1) |
|
| 1525 | 1525 | { |
| 1526 | 1526 | $dstr .= $urls[$i].'/'; |
| 1527 | 1527 | } |
@@ -1531,29 +1531,29 @@ discard block |
||
| 1531 | 1531 | } |
| 1532 | 1532 | $i++; |
| 1533 | 1533 | } |
| 1534 | - $urls = explode('/',$base_url_path); |
|
| 1535 | - if(count($urls) <= $path_step) |
|
| 1534 | + $urls = explode('/', $base_url_path); |
|
| 1535 | + if (count($urls) <= $path_step) |
|
| 1536 | 1536 | { |
| 1537 | 1537 | return false; |
| 1538 | 1538 | } |
| 1539 | 1539 | else |
| 1540 | 1540 | { |
| 1541 | 1541 | $pstr = ''; |
| 1542 | - for($i=0;$i<count($urls)-$path_step;$i++){ $pstr .= $urls[$i].'/'; } |
|
| 1542 | + for ($i = 0; $i < count($urls)-$path_step; $i++) { $pstr .= $urls[$i].'/'; } |
|
| 1543 | 1543 | $url = $pstr.$dstr; |
| 1544 | 1544 | } |
| 1545 | 1545 | } |
| 1546 | 1546 | } |
| 1547 | 1547 | else |
| 1548 | 1548 | { |
| 1549 | - if( strtolower(substr($url, 0, 7))=='http://' ) |
|
| 1549 | + if (strtolower(substr($url, 0, 7)) == 'http://') |
|
| 1550 | 1550 | { |
| 1551 | - $url = preg_replace('#^http://#i','',$url); |
|
| 1551 | + $url = preg_replace('#^http://#i', '', $url); |
|
| 1552 | 1552 | $scheme = "http"; |
| 1553 | 1553 | } |
| 1554 | - else if( strtolower(substr($url, 0, 8))=='https://' ) |
|
| 1554 | + else if (strtolower(substr($url, 0, 8)) == 'https://') |
|
| 1555 | 1555 | { |
| 1556 | - $url = preg_replace('#^https://#i','',$url); |
|
| 1556 | + $url = preg_replace('#^https://#i', '', $url); |
|
| 1557 | 1557 | $scheme = "https"; |
| 1558 | 1558 | } |
| 1559 | 1559 | else |
@@ -1650,16 +1650,16 @@ discard block |
||
| 1650 | 1650 | public function link_uncompress($link) |
| 1651 | 1651 | { |
| 1652 | 1652 | $link = array( |
| 1653 | - 'url' => isset($link['url']) ? $link['url'] : '', |
|
| 1654 | - 'url_type' => isset($link['url_type']) ? $link['url_type'] : '', |
|
| 1655 | - 'method' => isset($link['method']) ? $link['method'] : 'get', |
|
| 1656 | - 'headers' => isset($link['headers']) ? $link['headers'] : array(), |
|
| 1657 | - 'params' => isset($link['params']) ? $link['params'] : array(), |
|
| 1653 | + 'url' => isset($link['url']) ? $link['url'] : '', |
|
| 1654 | + 'url_type' => isset($link['url_type']) ? $link['url_type'] : '', |
|
| 1655 | + 'method' => isset($link['method']) ? $link['method'] : 'get', |
|
| 1656 | + 'headers' => isset($link['headers']) ? $link['headers'] : array(), |
|
| 1657 | + 'params' => isset($link['params']) ? $link['params'] : array(), |
|
| 1658 | 1658 | 'context_data' => isset($link['context_data']) ? $link['context_data'] : '', |
| 1659 | - 'proxies' => isset($link['proxies']) ? $link['proxies'] : self::$configs['proxies'], |
|
| 1660 | - 'try_num' => isset($link['try_num']) ? $link['try_num'] : 0, |
|
| 1661 | - 'max_try' => isset($link['max_try']) ? $link['max_try'] : self::$configs['max_try'], |
|
| 1662 | - 'depth' => isset($link['depth']) ? $link['depth'] : 0, |
|
| 1659 | + 'proxies' => isset($link['proxies']) ? $link['proxies'] : self::$configs['proxies'], |
|
| 1660 | + 'try_num' => isset($link['try_num']) ? $link['try_num'] : 0, |
|
| 1661 | + 'max_try' => isset($link['max_try']) ? $link['max_try'] : self::$configs['max_try'], |
|
| 1662 | + 'depth' => isset($link['depth']) ? $link['depth'] : 0, |
|
| 1663 | 1663 | ); |
| 1664 | 1664 | |
| 1665 | 1665 | return $link; |
@@ -1704,12 +1704,12 @@ discard block |
||
| 1704 | 1704 | exit(0); |
| 1705 | 1705 | } |
| 1706 | 1706 | |
| 1707 | - if (version_compare(PHP_VERSION,'5.4.0','<')) |
|
| 1707 | + if (version_compare(PHP_VERSION, '5.4.0', '<')) |
|
| 1708 | 1708 | { |
| 1709 | 1709 | $fields_str = json_encode($fields); |
| 1710 | - $fields_str = preg_replace_callback( "#\\\u([0-9a-f]{4})#i", function($matchs) { |
|
| 1710 | + $fields_str = preg_replace_callback("#\\\u([0-9a-f]{4})#i", function($matchs) { |
|
| 1711 | 1711 | return iconv('UCS-2BE', 'UTF-8', pack('H4', $matchs[1])); |
| 1712 | - }, $fields_str ); |
|
| 1712 | + }, $fields_str); |
|
| 1713 | 1713 | } |
| 1714 | 1714 | else |
| 1715 | 1715 | { |
@@ -1775,7 +1775,7 @@ discard block |
||
| 1775 | 1775 | if (!empty($conf['selector'])) |
| 1776 | 1776 | { |
| 1777 | 1777 | // 如果这个field是上一个field的附带连接 |
| 1778 | - if (isset($conf['source_type']) && $conf['source_type']=='attached_url') |
|
| 1778 | + if (isset($conf['source_type']) && $conf['source_type'] == 'attached_url') |
|
| 1779 | 1779 | { |
| 1780 | 1780 | // 取出上个field的内容作为连接, 内容分页是不进队列直接下载网页的 |
| 1781 | 1781 | if (!empty($fields[$conf['attached_url']])) |
@@ -1802,15 +1802,15 @@ discard block |
||
| 1802 | 1802 | } |
| 1803 | 1803 | |
| 1804 | 1804 | // 没有设置抽取规则的类型 或者 设置为 xpath |
| 1805 | - if (!isset($conf['selector_type']) || $conf['selector_type']=='xpath') |
|
| 1805 | + if (!isset($conf['selector_type']) || $conf['selector_type'] == 'xpath') |
|
| 1806 | 1806 | { |
| 1807 | 1807 | $values = $this->get_fields_xpath($html, $conf['selector'], $conf['name']); |
| 1808 | 1808 | } |
| 1809 | - elseif ($conf['selector_type']=='css') |
|
| 1809 | + elseif ($conf['selector_type'] == 'css') |
|
| 1810 | 1810 | { |
| 1811 | 1811 | $values = $this->get_fields_css($html, $conf['selector'], $conf['name']); |
| 1812 | 1812 | } |
| 1813 | - elseif ($conf['selector_type']=='regex') |
|
| 1813 | + elseif ($conf['selector_type'] == 'regex') |
|
| 1814 | 1814 | { |
| 1815 | 1815 | $values = $this->get_fields_regex($html, $conf['selector'], $conf['name']); |
| 1816 | 1816 | } |
@@ -1957,7 +1957,7 @@ discard block |
||
| 1957 | 1957 | |
| 1958 | 1958 | $config = self::$db_config; |
| 1959 | 1959 | @mysqli_connect($config['host'], $config['user'], $config['pass'], $config['name'], $config['port']); |
| 1960 | - if(mysqli_connect_errno()) |
|
| 1960 | + if (mysqli_connect_errno()) |
|
| 1961 | 1961 | { |
| 1962 | 1962 | log::error("Export data to a database need Mysql support, Error: ".mysqli_connect_error()); |
| 1963 | 1963 | exit; |
@@ -1994,7 +1994,7 @@ discard block |
||
| 1994 | 1994 | $msg .= "Do you want to continue? [Y/n]"; |
| 1995 | 1995 | fwrite(STDOUT, $msg); |
| 1996 | 1996 | $arg = strtolower(trim(fgets(STDIN))); |
| 1997 | - $arg = empty($arg) || !in_array($arg, array('y','n')) ? 'y' : $arg; |
|
| 1997 | + $arg = empty($arg) || !in_array($arg, array('y', 'n')) ? 'y' : $arg; |
|
| 1998 | 1998 | if ($arg == 'n') |
| 1999 | 1999 | { |
| 2000 | 2000 | foreach ($keys as $key) |
@@ -2034,9 +2034,9 @@ discard block |
||
| 2034 | 2034 | public function set_task_status() |
| 2035 | 2035 | { |
| 2036 | 2036 | // 每采集成功一个页面, 生成当前进程状态到文件, 供主进程使用 |
| 2037 | - $mem = round(memory_get_usage(true)/(1024*1024),2); |
|
| 2038 | - $use_time = microtime(true) - self::$time_start; |
|
| 2039 | - $speed = round((self::$collect_succ + self::$collect_fail) / $use_time, 2); |
|
| 2037 | + $mem = round(memory_get_usage(true) / (1024 * 1024), 2); |
|
| 2038 | + $use_time = microtime(true)-self::$time_start; |
|
| 2039 | + $speed = round((self::$collect_succ+self::$collect_fail) / $use_time, 2); |
|
| 2040 | 2040 | $status = array( |
| 2041 | 2041 | 'id' => self::$taskid, |
| 2042 | 2042 | 'pid' => self::$taskpid, |
@@ -2609,7 +2609,7 @@ discard block |
||
| 2609 | 2609 | { |
| 2610 | 2610 | static $last_lines = 0; |
| 2611 | 2611 | |
| 2612 | - if(!is_null($force_clear_lines)) |
|
| 2612 | + if (!is_null($force_clear_lines)) |
|
| 2613 | 2613 | { |
| 2614 | 2614 | $last_lines = $force_clear_lines; |
| 2615 | 2615 | } |
@@ -2617,19 +2617,19 @@ discard block |
||
| 2617 | 2617 | // 获取终端宽度 |
| 2618 | 2618 | $toss = $status = null; |
| 2619 | 2619 | $term_width = exec('tput cols', $toss, $status); |
| 2620 | - if($status || empty($term_width)) |
|
| 2620 | + if ($status || empty($term_width)) |
|
| 2621 | 2621 | { |
| 2622 | 2622 | $term_width = 64; // Arbitrary fall-back term width. |
| 2623 | 2623 | } |
| 2624 | 2624 | |
| 2625 | 2625 | $line_count = 0; |
| 2626 | - foreach(explode("\n", $message) as $line) |
|
| 2626 | + foreach (explode("\n", $message) as $line) |
|
| 2627 | 2627 | { |
| 2628 | 2628 | $line_count += count(str_split($line, $term_width)); |
| 2629 | 2629 | } |
| 2630 | 2630 | |
| 2631 | 2631 | // Erasure MAGIC: Clear as many lines as the last output had. |
| 2632 | - for($i = 0; $i < $last_lines; $i++) |
|
| 2632 | + for ($i = 0; $i < $last_lines; $i++) |
|
| 2633 | 2633 | { |
| 2634 | 2634 | // Return to the beginning of the line |
| 2635 | 2635 | echo "\r"; |
@@ -2666,16 +2666,16 @@ discard block |
||
| 2666 | 2666 | $display_str = "\033[1A\n\033[K-----------------------------\033[47;30m PHPSPIDER \033[0m-----------------------------\n\033[0m"; |
| 2667 | 2667 | //$display_str = "-----------------------------\033[47;30m PHPSPIDER \033[0m-----------------------------\n\033[0m"; |
| 2668 | 2668 | $run_time_str = util::time2second(time()-self::$time_start, false); |
| 2669 | - $display_str .= 'PHPSpider version:' . self::VERSION . " PHP version:" . PHP_VERSION . "\n"; |
|
| 2670 | - $display_str .= 'start time:'. date('Y-m-d H:i:s', self::$time_start).' run ' . $run_time_str . " \n"; |
|
| 2669 | + $display_str .= 'PHPSpider version:'.self::VERSION." PHP version:".PHP_VERSION."\n"; |
|
| 2670 | + $display_str .= 'start time:'.date('Y-m-d H:i:s', self::$time_start).' run '.$run_time_str." \n"; |
|
| 2671 | 2671 | |
| 2672 | - $display_str .= 'spider name: ' . self::$configs['name'] . "\n"; |
|
| 2672 | + $display_str .= 'spider name: '.self::$configs['name']."\n"; |
|
| 2673 | 2673 | if (self::$multiserver) |
| 2674 | 2674 | { |
| 2675 | - $display_str .= 'server id: ' . self::$serverid."\n"; |
|
| 2675 | + $display_str .= 'server id: '.self::$serverid."\n"; |
|
| 2676 | 2676 | } |
| 2677 | - $display_str .= 'task number: ' . self::$tasknum . "\n"; |
|
| 2678 | - $display_str .= 'load average: ' . implode(", ", $loadavg) . "\n"; |
|
| 2677 | + $display_str .= 'task number: '.self::$tasknum."\n"; |
|
| 2678 | + $display_str .= 'load average: '.implode(", ", $loadavg)."\n"; |
|
| 2679 | 2679 | $display_str .= "document: https://doc.phpspider.org\n"; |
| 2680 | 2680 | |
| 2681 | 2681 | $display_str .= $this->display_task_ui(); |
@@ -2705,12 +2705,12 @@ discard block |
||
| 2705 | 2705 | { |
| 2706 | 2706 | $display_str = "-------------------------------\033[47;30m TASKS \033[0m-------------------------------\n"; |
| 2707 | 2707 | |
| 2708 | - $display_str .= "\033[47;30mtaskid\033[0m". str_pad('', self::$taskid_length+2-strlen('taskid')). |
|
| 2709 | - "\033[47;30mtaskpid\033[0m". str_pad('', self::$pid_length+2-strlen('taskpid')). |
|
| 2710 | - "\033[47;30mmem\033[0m". str_pad('', self::$mem_length+2-strlen('mem')). |
|
| 2711 | - "\033[47;30mcollect succ\033[0m". str_pad('', self::$urls_length-strlen('collect succ')). |
|
| 2712 | - "\033[47;30mcollect fail\033[0m". str_pad('', self::$urls_length-strlen('collect fail')). |
|
| 2713 | - "\033[47;30mspeed\033[0m". str_pad('', self::$speed_length+2-strlen('speed')). |
|
| 2708 | + $display_str .= "\033[47;30mtaskid\033[0m".str_pad('', self::$taskid_length+2-strlen('taskid')). |
|
| 2709 | + "\033[47;30mtaskpid\033[0m".str_pad('', self::$pid_length+2-strlen('taskpid')). |
|
| 2710 | + "\033[47;30mmem\033[0m".str_pad('', self::$mem_length+2-strlen('mem')). |
|
| 2711 | + "\033[47;30mcollect succ\033[0m".str_pad('', self::$urls_length-strlen('collect succ')). |
|
| 2712 | + "\033[47;30mcollect fail\033[0m".str_pad('', self::$urls_length-strlen('collect fail')). |
|
| 2713 | + "\033[47;30mspeed\033[0m".str_pad('', self::$speed_length+2-strlen('speed')). |
|
| 2714 | 2714 | "\n"; |
| 2715 | 2715 | |
| 2716 | 2716 | // "\033[32;40m [OK] \033[0m" |
@@ -2738,12 +2738,12 @@ discard block |
||
| 2738 | 2738 | { |
| 2739 | 2739 | $display_str = "-------------------------------\033[47;30m SERVER \033[0m------------------------------\n"; |
| 2740 | 2740 | |
| 2741 | - $display_str .= "\033[47;30mserver\033[0m". str_pad('', self::$server_length+2-strlen('serverid')). |
|
| 2742 | - "\033[47;30mtasknum\033[0m". str_pad('', self::$tasknum_length+2-strlen('tasknum')). |
|
| 2743 | - "\033[47;30mmem\033[0m". str_pad('', self::$mem_length+2-strlen('mem')). |
|
| 2744 | - "\033[47;30mcollect succ\033[0m". str_pad('', self::$urls_length-strlen('collect succ')). |
|
| 2745 | - "\033[47;30mcollect fail\033[0m". str_pad('', self::$urls_length-strlen('collect fail')). |
|
| 2746 | - "\033[47;30mspeed\033[0m". str_pad('', self::$speed_length+2-strlen('speed')). |
|
| 2741 | + $display_str .= "\033[47;30mserver\033[0m".str_pad('', self::$server_length+2-strlen('serverid')). |
|
| 2742 | + "\033[47;30mtasknum\033[0m".str_pad('', self::$tasknum_length+2-strlen('tasknum')). |
|
| 2743 | + "\033[47;30mmem\033[0m".str_pad('', self::$mem_length+2-strlen('mem')). |
|
| 2744 | + "\033[47;30mcollect succ\033[0m".str_pad('', self::$urls_length-strlen('collect succ')). |
|
| 2745 | + "\033[47;30mcollect fail\033[0m".str_pad('', self::$urls_length-strlen('collect fail')). |
|
| 2746 | + "\033[47;30mspeed\033[0m".str_pad('', self::$speed_length+2-strlen('speed')). |
|
| 2747 | 2747 | "\n"; |
| 2748 | 2748 | |
| 2749 | 2749 | $server_list_json = queue::get("server_list"); |
@@ -2784,11 +2784,11 @@ discard block |
||
| 2784 | 2784 | { |
| 2785 | 2785 | $display_str = "---------------------------\033[47;30m COLLECT STATUS \033[0m--------------------------\n"; |
| 2786 | 2786 | |
| 2787 | - $display_str .= "\033[47;30mfind pages\033[0m". str_pad('', 16-strlen('find pages')). |
|
| 2788 | - "\033[47;30mqueue\033[0m". str_pad('', 14-strlen('queue')). |
|
| 2789 | - "\033[47;30mcollected\033[0m". str_pad('', 15-strlen('collected')). |
|
| 2790 | - "\033[47;30mfields\033[0m". str_pad('', 15-strlen('fields')). |
|
| 2791 | - "\033[47;30mdepth\033[0m". str_pad('', 12-strlen('depth')). |
|
| 2787 | + $display_str .= "\033[47;30mfind pages\033[0m".str_pad('', 16-strlen('find pages')). |
|
| 2788 | + "\033[47;30mqueue\033[0m".str_pad('', 14-strlen('queue')). |
|
| 2789 | + "\033[47;30mcollected\033[0m".str_pad('', 15-strlen('collected')). |
|
| 2790 | + "\033[47;30mfields\033[0m".str_pad('', 15-strlen('fields')). |
|
| 2791 | + "\033[47;30mdepth\033[0m".str_pad('', 12-strlen('depth')). |
|
| 2792 | 2792 | "\n"; |
| 2793 | 2793 | |
| 2794 | 2794 | $collect = $this->get_collect_url_num(); |
@@ -72,21 +72,21 @@ discard block |
||
| 72 | 72 | public static $output_encoding = null; |
| 73 | 73 | public static $cookies = array(); // array of cookies to pass |
| 74 | 74 | // $cookies['username'] = "seatle"; |
| 75 | - public static $rawheaders = array(); // array of raw headers to send |
|
| 76 | - public static $domain_cookies = array(); // array of cookies for domain to pass |
|
| 77 | - public static $hosts = array(); // random host binding for make request faster |
|
| 78 | - public static $headers = array(); // headers returned from server sent here |
|
| 79 | - public static $useragents = array("requests/2.0.0"); // random agent we masquerade as |
|
| 80 | - public static $client_ips = array(); // random ip we masquerade as |
|
| 81 | - public static $proxies = array(); // random proxy ip |
|
| 82 | - public static $raw = ""; // head + body content returned from server sent here |
|
| 83 | - public static $head = ""; // head content |
|
| 84 | - public static $content = ""; // The body before encoding |
|
| 85 | - public static $text = ""; // The body after encoding |
|
| 86 | - public static $info = array(); // curl info |
|
| 87 | - public static $history = 302; // http request status before redirect. ex:30x |
|
| 88 | - public static $status_code = 0; // http request status |
|
| 89 | - public static $error = ""; // error messages sent here |
|
| 75 | + public static $rawheaders = array(); // array of raw headers to send |
|
| 76 | + public static $domain_cookies = array(); // array of cookies for domain to pass |
|
| 77 | + public static $hosts = array(); // random host binding for make request faster |
|
| 78 | + public static $headers = array(); // headers returned from server sent here |
|
| 79 | + public static $useragents = array("requests/2.0.0"); // random agent we masquerade as |
|
| 80 | + public static $client_ips = array(); // random ip we masquerade as |
|
| 81 | + public static $proxies = array(); // random proxy ip |
|
| 82 | + public static $raw = ""; // head + body content returned from server sent here |
|
| 83 | + public static $head = ""; // head content |
|
| 84 | + public static $content = ""; // The body before encoding |
|
| 85 | + public static $text = ""; // The body after encoding |
|
| 86 | + public static $info = array(); // curl info |
|
| 87 | + public static $history = 302; // http request status before redirect. ex:30x |
|
| 88 | + public static $status_code = 0; // http request status |
|
| 89 | + public static $error = ""; // error messages sent here |
|
| 90 | 90 | |
| 91 | 91 | /** |
| 92 | 92 | * set timeout |
@@ -289,7 +289,7 @@ discard block |
||
| 289 | 289 | { |
| 290 | 290 | return false; |
| 291 | 291 | } |
| 292 | - if ( empty($domain) ) |
|
| 292 | + if (empty($domain)) |
|
| 293 | 293 | { |
| 294 | 294 | self::$cookies = array(); |
| 295 | 295 | } |
@@ -541,17 +541,17 @@ discard block |
||
| 541 | 541 | */ |
| 542 | 542 | public static function init() |
| 543 | 543 | { |
| 544 | - if (!is_resource ( self::$ch )) |
|
| 544 | + if (!is_resource(self::$ch)) |
|
| 545 | 545 | { |
| 546 | - self::$ch = curl_init (); |
|
| 547 | - curl_setopt( self::$ch, CURLOPT_RETURNTRANSFER, true ); |
|
| 548 | - curl_setopt( self::$ch, CURLOPT_HEADER, false ); |
|
| 549 | - curl_setopt( self::$ch, CURLOPT_USERAGENT, "phpspider-requests/".self::VERSION ); |
|
| 546 | + self::$ch = curl_init(); |
|
| 547 | + curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, true); |
|
| 548 | + curl_setopt(self::$ch, CURLOPT_HEADER, false); |
|
| 549 | + curl_setopt(self::$ch, CURLOPT_USERAGENT, "phpspider-requests/".self::VERSION); |
|
| 550 | 550 | // 如果设置了两个时间,就分开设置 |
| 551 | 551 | if (is_array(self::$timeout)) |
| 552 | 552 | { |
| 553 | - curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout[0] ); |
|
| 554 | - curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout[1]); |
|
| 553 | + curl_setopt(self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout[0]); |
|
| 554 | + curl_setopt(self::$ch, CURLOPT_TIMEOUT, self::$timeout[1]); |
|
| 555 | 555 | } |
| 556 | 556 | else |
| 557 | 557 | { |
@@ -560,7 +560,7 @@ discard block |
||
| 560 | 560 | } |
| 561 | 561 | curl_setopt(self::$ch, CURLOPT_MAXREDIRS, 5); //maximum number of redirects allowed |
| 562 | 562 | // 在多线程处理场景下使用超时选项时,会忽略signals对应的处理函数,但是无耐的是还有小概率的crash情况发生 |
| 563 | - curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true); |
|
| 563 | + curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true); |
|
| 564 | 564 | } |
| 565 | 565 | return self::$ch; |
| 566 | 566 | } |
@@ -570,7 +570,7 @@ discard block |
||
| 570 | 570 | */ |
| 571 | 571 | public static function get($url, $fields = array(), $allow_redirects = true, $cert = NULL) |
| 572 | 572 | { |
| 573 | - self::init (); |
|
| 573 | + self::init(); |
|
| 574 | 574 | return self::request($url, 'get', $fields, NULL, $allow_redirects, $cert); |
| 575 | 575 | } |
| 576 | 576 | |
@@ -593,19 +593,19 @@ discard block |
||
| 593 | 593 | */ |
| 594 | 594 | public static function post($url, $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL) |
| 595 | 595 | { |
| 596 | - self::init (); |
|
| 596 | + self::init(); |
|
| 597 | 597 | return self::request($url, 'POST', $fields, $files, $allow_redirects, $cert); |
| 598 | 598 | } |
| 599 | 599 | |
| 600 | 600 | public static function put($url, $fields = array(), $allow_redirects = true, $cert = NULL) |
| 601 | 601 | { |
| 602 | - self::init (); |
|
| 602 | + self::init(); |
|
| 603 | 603 | return self::request($url, 'PUT', $fields, $allow_redirects, $cert); |
| 604 | 604 | } |
| 605 | 605 | |
| 606 | 606 | public static function delete($url, $fields = array(), $allow_redirects = true, $cert = NULL) |
| 607 | 607 | { |
| 608 | - self::init (); |
|
| 608 | + self::init(); |
|
| 609 | 609 | return self::request($url, 'DELETE', $fields, $allow_redirects, $cert); |
| 610 | 610 | } |
| 611 | 611 | |
@@ -614,19 +614,19 @@ discard block |
||
| 614 | 614 | // 此方法经常被用来测试超文本链接的有效性,可访问性,和最近的改变。. |
| 615 | 615 | public static function head($url, $fields = array(), $allow_redirects = true, $cert = NULL) |
| 616 | 616 | { |
| 617 | - self::init (); |
|
| 617 | + self::init(); |
|
| 618 | 618 | self::request($url, 'HEAD', $fields, $allow_redirects, $cert); |
| 619 | 619 | } |
| 620 | 620 | |
| 621 | 621 | public static function options($url, $fields = array(), $allow_redirects = true, $cert = NULL) |
| 622 | 622 | { |
| 623 | - self::init (); |
|
| 623 | + self::init(); |
|
| 624 | 624 | return self::request($url, 'OPTIONS', $fields, $allow_redirects, $cert); |
| 625 | 625 | } |
| 626 | 626 | |
| 627 | 627 | public static function patch($url, $fields = array(), $allow_redirects = true, $cert = NULL) |
| 628 | 628 | { |
| 629 | - self::init (); |
|
| 629 | + self::init(); |
|
| 630 | 630 | return self::request($url, 'PATCH', $fields, $allow_redirects, $cert); |
| 631 | 631 | } |
| 632 | 632 | |
@@ -645,7 +645,7 @@ discard block |
||
| 645 | 645 | public static function request($url, $method = 'GET', $fields = array(), $files = array(), $allow_redirects = true, $cert = NULL) |
| 646 | 646 | { |
| 647 | 647 | $method = strtoupper($method); |
| 648 | - if(!self::_is_url($url)) |
|
| 648 | + if (!self::_is_url($url)) |
|
| 649 | 649 | { |
| 650 | 650 | self::$error = "You have requested URL ({$url}) is not a valid HTTP address"; |
| 651 | 651 | return false; |
@@ -679,7 +679,7 @@ discard block |
||
| 679 | 679 | } |
| 680 | 680 | } |
| 681 | 681 | |
| 682 | - curl_setopt( self::$ch, CURLOPT_URL, $url ); |
|
| 682 | + curl_setopt(self::$ch, CURLOPT_URL, $url); |
|
| 683 | 683 | |
| 684 | 684 | if ($method != 'GET') |
| 685 | 685 | { |
@@ -692,13 +692,13 @@ discard block |
||
| 692 | 692 | // CURLOPT_POST会把上傳的文件类型设为 multipart/form-data |
| 693 | 693 | // 把CURLOPT_POSTFIELDS的内容按multipart/form-data 的形式编码 |
| 694 | 694 | // CURLOPT_CUSTOMREQUEST可以按指定内容上传 |
| 695 | - if ( isset($tmpheaders['content-type']) && $tmpheaders['content-type'] == 'application/json' ) |
|
| 695 | + if (isset($tmpheaders['content-type']) && $tmpheaders['content-type'] == 'application/json') |
|
| 696 | 696 | { |
| 697 | - curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method ); |
|
| 697 | + curl_setopt(self::$ch, CURLOPT_CUSTOMREQUEST, $method); |
|
| 698 | 698 | } |
| 699 | 699 | else |
| 700 | 700 | { |
| 701 | - curl_setopt( self::$ch, CURLOPT_POST, true ); |
|
| 701 | + curl_setopt(self::$ch, CURLOPT_POST, true); |
|
| 702 | 702 | } |
| 703 | 703 | |
| 704 | 704 | $file_fields = array(); |
@@ -724,16 +724,16 @@ discard block |
||
| 724 | 724 | else |
| 725 | 725 | { |
| 726 | 726 | self::$rawheaders['X-HTTP-Method-Override'] = $method; |
| 727 | - curl_setopt( self::$ch, CURLOPT_CUSTOMREQUEST, $method ); |
|
| 727 | + curl_setopt(self::$ch, CURLOPT_CUSTOMREQUEST, $method); |
|
| 728 | 728 | } |
| 729 | 729 | |
| 730 | - if ( $method == 'POST' ) |
|
| 730 | + if ($method == 'POST') |
|
| 731 | 731 | { |
| 732 | 732 | // 不是上传文件的,用http_build_query, 能实现更好的兼容性,更小的请求数据包 |
| 733 | - if ( empty($file_fields) ) |
|
| 733 | + if (empty($file_fields)) |
|
| 734 | 734 | { |
| 735 | 735 | // post方式 |
| 736 | - if ( is_array($fields) ) |
|
| 736 | + if (is_array($fields)) |
|
| 737 | 737 | { |
| 738 | 738 | $fields = http_build_query($fields); |
| 739 | 739 | } |
@@ -741,7 +741,7 @@ discard block |
||
| 741 | 741 | else |
| 742 | 742 | { |
| 743 | 743 | // 有post数据 |
| 744 | - if ( is_array($fields) && !empty($fields) ) |
|
| 744 | + if (is_array($fields) && !empty($fields)) |
|
| 745 | 745 | { |
| 746 | 746 | // 某些server可能会有问题 |
| 747 | 747 | $fields = array_merge($fields, $file_fields); |
@@ -753,13 +753,13 @@ discard block |
||
| 753 | 753 | } |
| 754 | 754 | |
| 755 | 755 | // 不能直接传数组,不知道是什么Bug,会非常慢 |
| 756 | - curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields ); |
|
| 756 | + curl_setopt(self::$ch, CURLOPT_POSTFIELDS, $fields); |
|
| 757 | 757 | } |
| 758 | 758 | } |
| 759 | 759 | |
| 760 | 760 | $cookies = self::get_cookies(); |
| 761 | 761 | $domain_cookies = self::get_cookies($domain); |
| 762 | - $cookies = array_merge($cookies, $domain_cookies); |
|
| 762 | + $cookies = array_merge($cookies, $domain_cookies); |
|
| 763 | 763 | // 是否设置了cookie |
| 764 | 764 | if (!empty($cookies)) |
| 765 | 765 | { |
@@ -773,13 +773,13 @@ discard block |
||
| 773 | 773 | |
| 774 | 774 | if (!empty(self::$useragents)) |
| 775 | 775 | { |
| 776 | - $key = rand(0, count(self::$useragents) - 1); |
|
| 776 | + $key = rand(0, count(self::$useragents)-1); |
|
| 777 | 777 | self::$rawheaders['User-Agent'] = self::$useragents[$key]; |
| 778 | 778 | } |
| 779 | 779 | |
| 780 | 780 | if (!empty(self::$client_ips)) |
| 781 | 781 | { |
| 782 | - $key = rand(0, count(self::$client_ips) - 1); |
|
| 782 | + $key = rand(0, count(self::$client_ips)-1); |
|
| 783 | 783 | self::$rawheaders['CLIENT-IP'] = self::$client_ips[$key]; |
| 784 | 784 | self::$rawheaders['X-FORWARDED-FOR'] = self::$client_ips[$key]; |
| 785 | 785 | } |
@@ -791,10 +791,10 @@ discard block |
||
| 791 | 791 | { |
| 792 | 792 | $http_headers[] = $k.': '.$v; |
| 793 | 793 | } |
| 794 | - curl_setopt( self::$ch, CURLOPT_HTTPHEADER, $http_headers ); |
|
| 794 | + curl_setopt(self::$ch, CURLOPT_HTTPHEADER, $http_headers); |
|
| 795 | 795 | } |
| 796 | 796 | |
| 797 | - curl_setopt( self::$ch, CURLOPT_ENCODING, 'gzip' ); |
|
| 797 | + curl_setopt(self::$ch, CURLOPT_ENCODING, 'gzip'); |
|
| 798 | 798 | |
| 799 | 799 | // 关闭验证 |
| 800 | 800 | if ($scheme == 'https') |
@@ -805,33 +805,33 @@ discard block |
||
| 805 | 805 | |
| 806 | 806 | if (self::$proxies) |
| 807 | 807 | { |
| 808 | - $key = rand(0, count(self::$proxies) - 1); |
|
| 808 | + $key = rand(0, count(self::$proxies)-1); |
|
| 809 | 809 | $proxy = self::$proxies[$key]; |
| 810 | - curl_setopt( self::$ch, CURLOPT_PROXY, $proxy ); |
|
| 810 | + curl_setopt(self::$ch, CURLOPT_PROXY, $proxy); |
|
| 811 | 811 | } |
| 812 | 812 | |
| 813 | 813 | // header + body,header 里面有 cookie |
| 814 | - curl_setopt( self::$ch, CURLOPT_HEADER, true ); |
|
| 814 | + curl_setopt(self::$ch, CURLOPT_HEADER, true); |
|
| 815 | 815 | // 请求跳转后的内容 |
| 816 | 816 | if ($allow_redirects) |
| 817 | 817 | { |
| 818 | - curl_setopt( self::$ch, CURLOPT_FOLLOWLOCATION, true); |
|
| 818 | + curl_setopt(self::$ch, CURLOPT_FOLLOWLOCATION, true); |
|
| 819 | 819 | } |
| 820 | 820 | |
| 821 | - self::$raw = curl_exec ( self::$ch ); |
|
| 821 | + self::$raw = curl_exec(self::$ch); |
|
| 822 | 822 | // 真实url |
| 823 | 823 | //$location = curl_getinfo( self::$ch, CURLINFO_EFFECTIVE_URL); |
| 824 | - self::$info = curl_getinfo( self::$ch ); |
|
| 824 | + self::$info = curl_getinfo(self::$ch); |
|
| 825 | 825 | //print_r(self::$info); |
| 826 | 826 | self::$status_code = self::$info['http_code']; |
| 827 | 827 | if (self::$raw === false) |
| 828 | 828 | { |
| 829 | - self::$error = 'Curl error: ' . curl_error( self::$ch ); |
|
| 829 | + self::$error = 'Curl error: '.curl_error(self::$ch); |
|
| 830 | 830 | //trigger_error(self::$error, E_USER_WARNING); |
| 831 | 831 | } |
| 832 | 832 | |
| 833 | 833 | // 关闭句柄 |
| 834 | - curl_close( self::$ch ); |
|
| 834 | + curl_close(self::$ch); |
|
| 835 | 835 | |
| 836 | 836 | // 请求成功之后才把URL存起来 |
| 837 | 837 | list($header, $text) = self::split_header_body(); |
@@ -861,7 +861,7 @@ discard block |
||
| 861 | 861 | // 获取 mimetype |
| 862 | 862 | public static function get_mimetype($filepath) |
| 863 | 863 | { |
| 864 | - $fp = finfo_open(FILEINFO_MIME); |
|
| 864 | + $fp = finfo_open(FILEINFO_MIME); |
|
| 865 | 865 | $mime = finfo_file($fp, $filepath); |
| 866 | 866 | finfo_close($fp); |
| 867 | 867 | $arr = explode(';', $mime); |
@@ -883,7 +883,7 @@ discard block |
||
| 883 | 883 | { |
| 884 | 884 | // 构造post数据 |
| 885 | 885 | $data = ''; |
| 886 | - $delimiter = '-------------' . uniqid(); |
|
| 886 | + $delimiter = '-------------'.uniqid(); |
|
| 887 | 887 | // 表单数据 |
| 888 | 888 | foreach ($post_fields as $name => $content) |
| 889 | 889 | { |
@@ -941,11 +941,11 @@ discard block |
||
| 941 | 941 | { |
| 942 | 942 | $out = self::$output_encoding; |
| 943 | 943 | } |
| 944 | - if ( ! isset($out)) |
|
| 944 | + if (!isset($out)) |
|
| 945 | 945 | { |
| 946 | 946 | $out = 'UTF-8'; |
| 947 | 947 | } |
| 948 | - if ( ! in_array($mode, $valid)) |
|
| 948 | + if (!in_array($mode, $valid)) |
|
| 949 | 949 | { |
| 950 | 950 | throw new Exception('invalid mode, mode='.$mode); |
| 951 | 951 | } |
@@ -965,7 +965,7 @@ discard block |
||
| 965 | 965 | } |
| 966 | 966 | |
| 967 | 967 | $pattern = '/(<meta[^>]*?charset=([\"\']?))([a-z\d_\-]*)(\2[^>]*?>)/is'; |
| 968 | - if ( ! isset($in)) |
|
| 968 | + if (!isset($in)) |
|
| 969 | 969 | { |
| 970 | 970 | $n = preg_match($pattern, $html, $in); |
| 971 | 971 | if ($n > 0) |