PiedWeb /
SeoPocketCrawler
| 1 | <?php |
||
| 2 | |||
| 3 | include 'vendor/autoload.php'; |
||
| 4 | |||
| 5 | // --------------- |
||
| 6 | // Configure |
||
| 7 | // --------------- |
||
// Command-line option table, later passed to CliArgs\CliArgs.
// Each top-level key is a long option name; the nested keys
// ('alias', 'default', 'help', 'filter') are interpreted by that library.
$config = [];

// Crawl entry point; it has no default.
$config['start'] = [
    'alias' => 's',
    'help' => 'Define where the crawl start.',
];

// Depth limit for the crawl.
$config['limit'] = [
    'alias' => 'l',
    'default' => 5,
    'help' => 'Define where a depth limit for the crawler (default 5).',
    'filter' => 'int',
];

// Extra robots.txt rules, given inline or as a URL.
$config['ignore'] = [
    'alias' => 'i',
    'default' => null,
    'help' => 'Virtual Robots.txt wich will be interpreted for this crawl (could be a string or an URL).',
];

// User-agent string announced while crawling.
$config['user-agent'] = [
    'alias' => 'u',
    'default' => 'PockectCr. From PiedWeb',
    'help' => 'Define the user-agent used during the crawl',
];

// Progress/debug output switch.
$config['verbose'] = [
    'alias' => 'v',
    'default' => 1,
    'help' => 'Display debugging information (0/1, default 1).',
    'filter' => 'int',
];
||
| 37 | |||
// Parse the command line against the option table in $config.
$CliArgs = new CliArgs\CliArgs($config);

// When help is requested, print it and stop. Carrying on would reach the
// mandatory --start check and end the run with an exception instead of a
// clean exit.
// NOTE(review): the upstream CliArgs README appears to name this method
// isFlagExist() — confirm isFlagExists() exists in the installed version.
if ($CliArgs->isFlagExists('help', 'h')) {
    echo $CliArgs->getHelp('help');
    exit(0);
}

$startUrl = $CliArgs->getArg('start');
$limit = $CliArgs->getArg('limit');

// --ignore holds either the robots.txt rules themselves or a URL; when the
// value validates as a URL it is replaced by the result of requesting it.
$ignore = $CliArgs->getArg('ignore') ?? '';
if (filter_var($ignore, FILTER_VALIDATE_URL)) {
    $ignore = \PiedWeb\Curl\Request::get($ignore);
}

$userAgent = $CliArgs->getArg('user-agent');
$debug = $CliArgs->getArg('verbose');
||
| 49 | |||
| 50 | // --------------- |
||
| 51 | // Crawler working |
||
| 52 | // --------------- |
||
// A start URL is mandatory: abort when --start was not supplied.
if (null === $startUrl) {
    throw new \Exception('--start is required');
}

// Start banner, printed only when verbose output is enabled:
// three line breaks, the title, the start URL, then four line breaks.
if ($debug) {
    echo str_repeat(PHP_EOL, 3), ' Crawl starting', PHP_EOL, ' ', $startUrl, str_repeat(PHP_EOL, 4);
}

// Build the crawler from the parsed options and run it; the verbose flag
// is forwarded to crawl().
$crawl = new \PiedWeb\SeoPocketCrawler\Crawler($startUrl, $ignore, $limit, $userAgent);
$crawl->crawl($debug);
||
| 61 | |||
| 62 | if ($debug) { |
||
| 63 | echo PHP_EOL.PHP_EOL.'---------------'.PHP_EOL.PHP_EOL.' '.'Crawl succeed'.PHP_EOL; |
||
| 64 | echo ' '.'You can find your data in '.PHP_EOL.$crawl->getDataFolder().'/index.csv'.PHP_EOL; |
||
// Report where the crawled HTML sources are cached. The two trailing line
// breaks are concatenated onto the same expression so that this is a single,
// properly terminated echo statement.
echo ' You can find source code from html crawled pages in '.$crawl->getCacheFolder().PHP_EOL.PHP_EOL;
||
|
0 ignored issues
–
show
Bug
introduced
by
Loading history...
|
|||
| 67 |