Passed
Push — master ( 7e03c5...6dfa53 )
by Dev
34:30 queued 19:18
created

CrawlerCommand::execute()   A

Complexity

Conditions 3
Paths 1

Size

Total Lines 29
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 3
eloc 17
c 1
b 0
f 1
nc 1
nop 2
dl 0
loc 29
rs 9.7
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler\Command;
4
5
use PiedWeb\SeoPocketCrawler\Crawler;
6
use PiedWeb\SeoPocketCrawler\CrawlerContinue;
7
use PiedWeb\SeoPocketCrawler\CrawlerRestart;
8
use Symfony\Component\Console\Command\Command;
9
use Symfony\Component\Console\Input\InputArgument;
10
use Symfony\Component\Console\Input\InputInterface;
11
use Symfony\Component\Console\Input\InputOption;
12
use Symfony\Component\Console\Output\OutputInterface;
13
14
/**
 * Console command `crawler:go`.
 *
 * Starts a new crawl when the `start` argument is a valid URL. Otherwise the
 * argument is treated as the id of a previous crawl, which is then continued
 * or (with `--restart`) restarted.
 */
class CrawlerCommand extends Command
{
    protected static $defaultName = 'crawler:go';

    /**
     * Id of a previous crawl. Set by checkArguments() when the `start`
     * argument is not a valid URL; stays null for a fresh crawl.
     *
     * @var string|null
     */
    protected $id;

    /**
     * Declares the `start` argument and the crawl options.
     */
    protected function configure()
    {
        $this->setDescription('Crawl a website.');

        $this
            ->addArgument(
                'start',
                InputArgument::REQUIRED,
                'Define where the crawl start. Eg: https://piedweb.com'
                .PHP_EOL.'You can specify an id from a previous crawl. Other options will not be listen.'
            )
            ->addOption('limit', 'l', InputOption::VALUE_REQUIRED, 'Define where a depth limit', 5)
            ->addOption(
                'ignore',
                'i',
                InputOption::VALUE_REQUIRED,
                // loadVirtualRobotsTxt() accepts an URL or an existing file, nothing else.
                'Virtual Robots.txt to respect (could be a file path or an URL).'
            )
            ->addOption(
                'user-agent',
                'u',
                InputOption::VALUE_REQUIRED,
                'Define the user-agent used during the crawl.',
                'SEO Pocket Crawler - PiedWeb.com/seo/crawler'
            )
            ->addOption(
                'wait',
                'w',
                InputOption::VALUE_REQUIRED,
                'In Microseconds, the time to wait between 2 requests. Default 0,1s.',
                100000
            )
            ->addOption(
                'cache-method',
                'c',
                InputOption::VALUE_REQUIRED,
                // Previously a copy-paste of the `wait` description.
                'Cache method used by the recorder (integer, default: Recorder::CACHE_ID).',
                \PiedWeb\SeoPocketCrawler\Recorder::CACHE_ID
            )
            ->addOption(
                'restart',
                'r',
                InputOption::VALUE_REQUIRED,
                'Permit to restart a previous crawl. Values 1 = fresh restart, 2 = restart from cache'
            )
        ;
    }

    /**
     * Runs the crawl and reports its settings, the data location and the elapsed time.
     *
     * @return int Always 0 (success exit code)
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        $this->checkArguments($input);

        $start = microtime(true);

        $crawler = $this->initCrawler($input);

        $output->writeln(['', '', 'Crawl starting !', '============', '', 'ID: '.$crawler->getConfig()->getId()]);
        $output->writeln([
            null !== $this->id ? ($input->getOption('restart') ? 'Restart' : 'Continue') : '',
            '',
            'Details : ',
            '- Crawl starting at '.$crawler->getConfig()->getBase().$crawler->getConfig()->getStartUrl(),
            '- User-Agent used `'.$crawler->getConfig()->getUserAgent().'`',
            // The `wait` option is documented in microseconds.
            // NOTE(review): assumes getWait() returns that value unchanged - confirm.
            '- '.$crawler->getConfig()->getWait().' microseconds between two requests',
        ]);

        $crawler->crawl(!$input->getOption('quiet'));

        $end = microtime(true);

        $output->writeln(['', '---------------', 'Crawl succeed', 'You can find your data in ']);

        // realpath() returns false when the folder cannot be resolved; fall back
        // to the configured value rather than printing a bare "/data.csv".
        $dataFolder = $crawler->getConfig()->getDataFolder();
        $resolvedFolder = realpath($dataFolder);
        $dataFile = (false !== $resolvedFolder ? $resolvedFolder : $dataFolder).'/data.csv';

        // Written raw (no tag interpretation) and at quiet verbosity so the path
        // is still printed with --quiet, as the former `echo` did.
        $output->writeln($dataFile, OutputInterface::OUTPUT_RAW | OutputInterface::VERBOSITY_QUIET);

        $output->writeln(['', '', '----Chrono----', (round(($end - $start), 2)).'s', '', '']);

        return 0;
    }

    /**
     * Stores the `start` argument as a previous crawl id when it is not a valid URL.
     */
    public function checkArguments(InputInterface $input)
    {
        if (!filter_var($input->getArgument('start'), FILTER_VALIDATE_URL)) {
            $this->id = $input->getArgument('start');
        }
    }

    /**
     * Builds the crawler matching the input: a fresh Crawler when no previous
     * crawl id was detected, a CrawlerRestart when `--restart` is set, a
     * CrawlerContinue otherwise.
     *
     * @return Crawler
     */
    public function initCrawler(InputInterface $input)
    {
        if (null === $this->id) {
            return new Crawler(
                (string) $input->getArgument('start'),
                $this->loadVirtualRobotsTxt($input),
                intval($input->getOption('limit')),
                (string) $input->getOption('user-agent'),
                intval($input->getOption('cache-method')),
                intval($input->getOption('wait'))
            );
        }

        if ($input->getOption('restart')) {
            return new CrawlerRestart(
                $this->id,
                // $fromCache; loose comparison on purpose, option values arrive as strings.
                2 == $input->getOption('restart')
            );
        }

        return new CrawlerContinue($this->id);
    }

    /**
     * Resolves the `--ignore` option into the virtual robots.txt content.
     *
     * Returns an empty string when the option is not set, the result of
     * \PiedWeb\Curl\Request::get() when it is an URL, and the file content
     * when it is a readable file.
     *
     * @return mixed Empty string, file content, or whatever Request::get() returns
     *
     * @throws \Exception When the option is neither an URL nor a readable file
     */
    public function loadVirtualRobotsTxt(InputInterface $input)
    {
        $ignore = $input->getOption('ignore');

        if (null === $ignore) {
            return '';
        }

        if (is_string($ignore)) {
            if (filter_var($ignore, FILTER_VALIDATE_URL)) {
                return \PiedWeb\Curl\Request::get($ignore);
            }

            // is_file() rejects directories, which file_exists() would accept.
            if (is_file($ignore) && is_readable($ignore)) {
                $content = file_get_contents($ignore);

                if (false !== $content) {
                    return $content;
                }
            }
        }

        throw new \Exception('An error occured with your --ignore option');
    }
}
151