CrawlerCommand::configure()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 45
Code Lines 37

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 1
eloc 37
c 1
b 0
f 1
nc 1
nop 0
dl 0
loc 45
rs 9.328
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler\Command;
4
5
use PiedWeb\SeoPocketCrawler\Crawler;
6
use PiedWeb\SeoPocketCrawler\CrawlerContinue;
7
use PiedWeb\SeoPocketCrawler\CrawlerRestart;
8
use Symfony\Component\Console\Command\Command;
9
use Symfony\Component\Console\Input\InputArgument;
10
use Symfony\Component\Console\Input\InputInterface;
11
use Symfony\Component\Console\Input\InputOption;
12
use Symfony\Component\Console\Output\OutputInterface;
13
14
/**
 * Console command `crawler:go`: crawls a website and records SEO data.
 *
 * The `start` argument is either a URL for a fresh crawl, the id of a
 * previous crawl to continue/restart, or `last` for the most recent crawl.
 */
class CrawlerCommand extends Command
{
    protected static $defaultName = 'crawler:go';

    /**
     * Id of a previous crawl to continue or restart; null means fresh crawl.
     * Set by checkArguments() when `start` is not a valid URL.
     *
     * @var string|null
     */
    protected $id;

    /**
     * Define the command's argument and options.
     *
     * @return void
     */
    protected function configure()
    {
        $this->setDescription('Crawl a website.');

        $this
            ->addArgument(
                'start',
                InputArgument::REQUIRED,
                'Define where the crawl start. Eg: https://piedweb.com'
                .PHP_EOL.'You can specify an id from a previous crawl. Other options will then be ignored.'
                .PHP_EOL.'You can use `last` to continue the last crawl (just stopped).'
            )
            ->addOption('limit', 'l', InputOption::VALUE_REQUIRED, 'Define a depth limit', 5)
            ->addOption(
                'ignore',
                'i',
                InputOption::VALUE_REQUIRED,
                'Virtual Robots.txt to respect (could be a string or an URL).'
            )
            ->addOption(
                'user-agent',
                'u',
                InputOption::VALUE_REQUIRED,
                'Define the user-agent used during the crawl.',
                'SEO Pocket Crawler - PiedWeb.com/seo/crawler'
            )
            ->addOption(
                'wait',
                'w',
                InputOption::VALUE_REQUIRED,
                'In Microseconds, the time to wait between 2 requests. Default 0,1s.',
                100000
            )
            ->addOption(
                'cache-method',
                'c',
                InputOption::VALUE_REQUIRED,
                // Previous description was a copy-paste of --wait; this one describes the option.
                'Cache method used during the crawl (see Recorder cache constants).',
                \PiedWeb\SeoPocketCrawler\Recorder::CACHE_ID
            )
            ->addOption(
                'restart',
                'r',
                InputOption::VALUE_REQUIRED,
                'Permit to restart a previous crawl. Values 1 = fresh restart, 2 = restart from cache'
            )
        ;
    }

    /**
     * Run the crawl and print a summary (id, config details, output path, duration).
     *
     * @return int 0 on success
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        $this->checkArguments($input);

        $start = microtime(true);

        $crawler = $this->initCrawler($input);

        $output->writeln(['', '', 'Crawl starting !', '============', '', 'ID: '.$crawler->getConfig()->getId()]);
        $output->writeln([
            null !== $this->id ? ($input->getOption('restart') ? 'Restart' : 'Continue') : '',
            '',
            'Details : ',
            '- Crawl starting at '.$crawler->getConfig()->getBase().$crawler->getConfig()->getStartUrl(),
            '- User-Agent used `'.$crawler->getConfig()->getUserAgent(),
            '- `'.$crawler->getConfig()->getWait().' ms between two requests',
        ]);

        $crawler->crawl();

        $end = microtime(true);

        $output->writeln(['', '---------------', 'Crawl succeed', 'You can find your data in ']);

        // Use the console output (not bare `echo`) so --quiet and decoration are respected.
        $output->writeln(realpath($crawler->getConfig()->getDataFolder()).'/data.csv');

        $output->writeln(['', '', '----Chrono----', (round(($end - $start), 2)).'s', '', '']);

        return 0;
    }

    /**
     * If `start` is not a valid URL, treat it as the id of a previous crawl.
     *
     * @return void
     */
    public function checkArguments(InputInterface $input)
    {
        if (! filter_var($input->getArgument('start'), FILTER_VALIDATE_URL)) {
            $this->id = $input->getArgument('start');
        }
    }

    /**
     * Build the crawler matching the CLI input: a fresh Crawler when `start`
     * is a URL, otherwise a CrawlerRestart (--restart) or CrawlerContinue.
     *
     * @return Crawler
     */
    public function initCrawler(InputInterface $input)
    {
        if (null === $this->id) {
            return new Crawler(
                (string) $input->getArgument('start'),
                $this->loadVirtualRobotsTxt($input),
                intval($input->getOption('limit')),
                (string) $input->getOption('user-agent'),
                intval($input->getOption('cache-method')),
                intval($input->getOption('wait')),
                ! $input->getOption('quiet')
            );
        }

        if ($input->getOption('restart')) {
            return new CrawlerRestart(
                $this->id,
                2 === intval($input->getOption('restart')), // $fromCache
                ! $input->getOption('quiet')
            );
        }

        return new CrawlerContinue($this->id, ! $input->getOption('quiet'));
    }

    /**
     * Resolve the --ignore option into virtual robots.txt content.
     *
     * Accepts a URL (fetched), a local file path (read) or nothing (empty string).
     *
     * @return string
     *
     * @throws \Exception when the option is neither a URL nor a readable file
     */
    public function loadVirtualRobotsTxt(InputInterface $input)
    {
        if (null === $input->getOption('ignore')) {
            return '';
        }

        $ignore = (string) $input->getOption('ignore');

        if (filter_var($ignore, FILTER_VALIDATE_URL)) {
            return \PiedWeb\Curl\Request::get($ignore);
        }

        if (file_exists($ignore)) {
            $content = file_get_contents($ignore);
            // file_get_contents() returns false on read failure; fall through to the
            // exception below instead of silently returning `false` to the caller.
            if (false !== $content) {
                return $content;
            }
        }

        throw new \Exception('An error occured with your --ignore option');
    }
}
154