<?php

/**
 * Command-line entry point for the crawler.
 *
 * Reads the options declared in $config from the command line, then runs
 * \PiedWeb\SeoPocketCrawler\Crawler from the given start URL and, when
 * verbose output is enabled, prints where the crawl data was stored.
 *
 * Options: --start/-s (required), --limit/-l, --ignore/-i, --user-agent/-u,
 * --verbose/-v, plus the --help/-h flag.
 */

// `require` (not `include`): nothing below can work without the autoloader,
// so fail right here if it is missing.
require 'vendor/autoload.php';

// ---------------
// Configure
// ---------------
$config = [
    'start' => [
        'alias' => 's',
        'help' => 'Define where the crawl start.',
    ],
    'limit' => [
        'alias' => 'l',
        'default' => 5,
        'help' => 'Define where a depth limit for the crawler (default 5).',
        'filter' => 'int',
    ],
    'ignore' => [
        'alias' => 'i',
        'default' => null,
        'help' => 'Virtual Robots.txt wich will be interpreted for this crawl (could be a string or an URL).',
    ],
    'user-agent' => [
        'alias' => 'u',
        'default' => 'PockectCr. From PiedWeb',
        'help' => 'Define the user-agent used during the crawl',
    ],
    'verbose' => [
        'alias' => 'v',
        'default' => 1,
        'help' => 'Display debugging information (0/1, default 1).',
        'filter' => 'int',
    ],
];

$CliArgs = new CliArgs\CliArgs($config);

// NOTE(review): verify that `isFlagExists` (name and second argument) matches
// the installed CliArgs API - TODO confirm.
if ($CliArgs->isFlagExists('help', 'h')) {
    echo $CliArgs->getHelp('help');

    // Asking for help must not fall through to the "--start is required" error
    // or start a crawl.
    exit(0);
}

$startUrl = $CliArgs->getArg('start');

// Validate the mandatory option before doing any network work below.
if ($startUrl === null) {
    throw new \Exception('--start is required');
}

$limit = $CliArgs->getArg('limit');

// The ignore rules may be given inline or as a URL; in the latter case the
// value is replaced by the result of fetching that URL.
$ignore = $CliArgs->getArg('ignore') ?? '';
if (filter_var($ignore, FILTER_VALIDATE_URL)) {
    $ignore = \PiedWeb\Curl\Request::get($ignore);
}

$userAgent = $CliArgs->getArg('user-agent');
$debug = $CliArgs->getArg('verbose');

// ---------------
// Crawler working
// ---------------
if ($debug) {
    echo PHP_EOL.PHP_EOL.PHP_EOL.' Crawl starting'.PHP_EOL.' '.$startUrl.PHP_EOL.PHP_EOL.PHP_EOL.PHP_EOL;
}

$crawl = new \PiedWeb\SeoPocketCrawler\Crawler($startUrl, $ignore, $limit, $userAgent);
$crawl->crawl($debug);

if ($debug) {
    echo PHP_EOL.PHP_EOL.'---------------'.PHP_EOL.PHP_EOL.' '.'Crawl succeed'.PHP_EOL;
    echo ' '.'You can find your data in '.PHP_EOL.$crawl->getDataFolder().'/index.csv'.PHP_EOL;
    // Single statement: the original split this across two `echo` lines without
    // a terminating semicolon, which does not parse.
    echo ' You can find source code from html crawled pages in '.$crawl->getCacheFolder()
        .PHP_EOL.PHP_EOL;
}