Completed
Push — master ( 31ed5b...1b5be9 )
by Dev
08:32 queued 07:20
created

Crawler.php (1 issue)

Labels
Severity
1
<?php
2
3
include 'vendor/autoload.php';
4
5
// ---------------
6
// Configure
7
// ---------------
8
// Command-line option table handed to CliArgs. Each entry is keyed by the long
// option name; 'alias' is its short form, 'default' the value used when the
// option is omitted, and 'help' the text printed by --help.
// NOTE(review): 'filter' => 'int' presumably casts the value to an integer — confirm against the CliArgs docs.
$config = [
    // Entry point of the crawl (no default: the script rejects a missing value later on).
    'start' => [
        'alias' => 's',
        'help' => 'Define where the crawl starts.',
    ],
    // Maximum crawl depth.
    'limit' => [
        'alias' => 'l',
        'default' => 5,
        'help' => 'Define a depth limit for the crawler (default 5).',
        'filter' => 'int',
    ],
    // Robots.txt rules, given inline or as a URL to fetch them from.
    'ignore' => [
        'alias' => 'i',
        'default' => null,
        'help' => 'Virtual Robots.txt which will be interpreted for this crawl (could be a string or a URL).',
    ],
    // User-agent passed to the crawler.
    'user-agent' => [
        'alias' => 'u',
        'default' => 'PockectCr. From PiedWeb',
        'help' => 'Define the user-agent used during the crawl',
    ],
    // Progress/debug output toggle.
    'verbose' => [
        'alias' => 'v',
        'default' => 1,
        'help' => 'Display debugging information (0/1, default 1).',
        'filter' => 'int',
    ],
];
37
38
$CliArgs = new CliArgs\CliArgs($config);

// Print the usage text and stop. Without the early exit the script would carry
// on: it would start a crawl, or abort with "--start is required" when only
// the help flag was given.
if ($CliArgs->isFlagExists('help', 'h')) {
    echo $CliArgs->getHelp('help');
    exit(0);
}

$startUrl = $CliArgs->getArg('start');
$limit    = $CliArgs->getArg('limit');

// --ignore carries robots.txt rules: either the rules themselves or a URL
// pointing at them. When it validates as a URL, its content is downloaded and
// used in place of the URL string.
$ignore = $CliArgs->getArg('ignore') ?? '';
if (filter_var($ignore, FILTER_VALIDATE_URL)) {
    $ignore = \PiedWeb\Curl\Request::get($ignore);
}

$userAgent = $CliArgs->getArg('user-agent');
$debug     = $CliArgs->getArg('verbose');
49
50
// ---------------
51
// Crawler working
52
// ---------------
53
// A crawl needs an entry point; refuse to continue without one.
if (null === $startUrl) {
    throw new \Exception('--start is required');
}

// Banner shown in verbose mode before the crawl begins.
if ($debug) {
    echo str_repeat(PHP_EOL, 3), '    Crawl starting', PHP_EOL, '    ', $startUrl, str_repeat(PHP_EOL, 4);
}

// Build the crawler from the parsed options and run it, forwarding the verbose flag.
$crawl = new \PiedWeb\SeoPocketCrawler\Crawler($startUrl, $ignore, $limit, $userAgent);
$crawl->crawl($debug);
61
62
if ($debug) {
63
    echo PHP_EOL.PHP_EOL.'---------------'.PHP_EOL.PHP_EOL.'    '.'Crawl succeed'.PHP_EOL;
64
    echo '    '.'You can find your data in '.PHP_EOL.$crawl->getDataFolder().'/index.csv'.PHP_EOL;
65
    // One statement: the trailing line breaks are concatenated onto the same
    // echo as the cache-folder path. Previously the first echo had no
    // terminating ';' and a second echo began with a dangling '.', which is a parse error.
    echo '    You can find source code from html crawled pages in '.$crawl->getCacheFolder().PHP_EOL.PHP_EOL;
0 ignored issues
show
A parse error occurred: Syntax error, unexpected T_ECHO, expecting ',' or ';' on line 66 at column 4
Loading history...
67