Passed
Push — issue/615 ( d1bb44 )
by Tomas Norre
14:36
created

BuildQueueCommand::outputUrls()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 9
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 6
c 0
b 0
f 0
nc 4
nop 2
dl 0
loc 9
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Command;
6
7
/*
8
 * (c) 2020 AOE GmbH <dev@aoe.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21
22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Controller\CrawlerController;
24
use AOE\Crawler\Converter\JsonCompatibilityConverter;
25
use AOE\Crawler\Domain\Model\Reason;
26
use AOE\Crawler\Domain\Repository\QueueRepository;
27
use AOE\Crawler\Utility\MessageUtility;
28
use AOE\Crawler\Utility\SignalSlotUtility;
29
use AOE\Crawler\Value\QueueRow;
30
use Symfony\Component\Console\Command\Command;
31
use Symfony\Component\Console\Helper\ProgressBar;
32
use Symfony\Component\Console\Input\InputArgument;
33
use Symfony\Component\Console\Input\InputInterface;
34
use Symfony\Component\Console\Input\InputOption;
35
use Symfony\Component\Console\Output\OutputInterface;
36
use TYPO3\CMS\Core\Utility\GeneralUtility;
37
use TYPO3\CMS\Core\Utility\MathUtility;
38
use TYPO3\CMS\Extbase\Object\ObjectManager;
39
40
/**
 * Console command that builds the crawler queue for a page tree.
 *
 * Depending on the "mode" option the generated URLs are listed ("url"),
 * submitted to the queue ("queue", the fallback when no mode is given) or
 * requested right away ("exec").
 */
class BuildQueueCommand extends Command
{
    /**
     * Declares the description, help text, arguments and options of the command.
     */
    protected function configure(): void
    {
        $this->setDescription('Create entries in the queue that can be processed at once');

        $this->setHelp(
            'Try "typo3 help crawler:buildQueue" to see your options' . chr(10) . chr(10) .
            'Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
It can put entries in the queue from command line options, return the list of URLs and even execute
all entries right away without having to queue them up - this can be useful for immediate re-cache,
re-indexing or static publishing from command line.' . chr(10) . chr(10) .
            '
            Examples:
              --- Re-cache pages from page 7 and two levels down, executed immediately
              $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec

              --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
              $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
            '
        );

        $this->addArgument(
            'page',
            InputArgument::REQUIRED,
            'The page from where the queue building should start'
        );

        $this->addArgument(
            'conf',
            InputArgument::REQUIRED,
            'A comma separated list of crawler configurations'
        );

        $this->addOption(
            'depth',
            'd',
            InputOption::VALUE_OPTIONAL,
            'Tree depth, 0-99. How many levels under the "page" to include.',
            '0'
        );

        $this->addOption(
            'mode',
            'm',
            InputOption::VALUE_OPTIONAL,
            'Specifies output modes url : Will list URLs which wget could use as input. queue: Will put entries in queue table. exec: Will execute all entries right away!'
        );

        $this->addOption(
            'number',
            null,
            InputOption::VALUE_OPTIONAL,
            'Specifies how many items are put in the queue per minute. Only valid for output mode "queue"',
            '0'
        );
    }

    /**
     * Crawler Command - Submitting URLs to be crawled.
     *
     * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
     * It can put entries in the queue from command line options, return the list of URLs and even execute
     * all entries right away without having to queue them up - this can be useful for immediate re-cache,
     * re-indexing or static publishing from command line.
     *
     * Examples:
     *
     * --- Re-cache pages from page 7 and two levels down, executed immediately
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec
     *
     *
     * --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
     *
     * Input handling:
     *  - "page" is cast to int; a resulting id of 0 aborts the command.
     *  - "depth" is clamped to 0..99.
     *  - "number" falls back to 30 when it evaluates to 0 and is clamped to 1..1000.
     *  - "mode" falls back to "queue" when the option is not given.
     *
     * @return int 0 on success, 1 when the given page id is not valid
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
        $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
        $mode = $input->getOption('mode') ?? 'queue';
        $isQueueOrExecMode = $mode === 'queue' || $mode === 'exec';

        $extensionSettings = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class)->getExtensionConfiguration();

        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);

        /** @var CrawlerController $crawlerController */
        $crawlerController = $objectManager->get(CrawlerController::class);
        /** @var QueueRepository $queueRepository */
        $queueRepository = $objectManager->get(QueueRepository::class);

        if ($mode === 'exec') {
            $crawlerController->registerQueueEntriesInternallyOnly = true;
        }

        $pageId = MathUtility::forceIntegerInRange((int) $input->getArgument('page'), 0);
        if ($pageId === 0) {
            $message = "Page {$pageId} is not a valid page, please check you root page id and try again.";
            MessageUtility::addErrorMessage($message);
            // This is a failure path (exit code 1), so use the error style.
            $output->writeln("<error>{$message}</error>");
            return 1;
        }

        $configurationKeys = $this->getConfigurationKeys((string) $input->getArgument('conf'));

        if ($isQueueOrExecMode) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_CLI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');

            $signalPayload = ['reason' => $reason];
            // NOTE(review): static analysis reports emitSignal() as deprecated; it is
            // kept because no replacement is visible from this file.
            /** @scrutinizer ignore-deprecated */ SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_INVOKE_QUEUE_CHANGE,
                $signalPayload
            );
        }

        // !empty() also covers a missing key, which would otherwise raise a warning.
        if (! empty($extensionSettings['cleanUpOldQueueEntries'])) {
            $queueRepository->cleanUpOldQueueEntries();
        }

        $crawlerController->setID = GeneralUtility::md5int(microtime());
        $queueRows = $crawlerController->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange((int) $input->getOption('depth'), 0, 99),
            $crawlerController->getCurrentTime(),
            MathUtility::forceIntegerInRange((int) $input->getOption('number') ?: 30, 1, 1000),
            $isQueueOrExecMode,
            $mode === 'url',
            [],
            $configurationKeys
        );

        if ($mode === 'url') {
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->downloadUrls) . PHP_EOL . '</info>');
        } elseif ($mode === 'exec') {
            $output->writeln('<info>Executing ' . count($crawlerController->urlList) . ' requests right away:</info>');
            $this->outputUrls($queueRows, $output);
            $output->writeln('<info>Processing</info>' . PHP_EOL);

            $this->executeQueueEntries($crawlerController, $jsonCompatibilityConverter, $output);
            $output->writeln('');
        } elseif ($mode === 'queue') {
            $output->writeln('<info>Putting ' . count($crawlerController->urlList) . ' entries in queue:</info>' . PHP_EOL);
            $this->outputUrls($queueRows, $output);
        } else {
            // Reached for any mode other than "url", "exec" or "queue".
            $output->writeln('<info>' . count($crawlerController->urlList) . ' entries found for processing. (Use "mode" to decide action):</info>' . PHP_EOL);
            $this->outputUrls($queueRows, $output);
        }

        return 0;
    }

    /**
     * Requests every entry of $crawlerController->queueEntries immediately and
     * reports each result, keeping a progress bar below the log lines.
     */
    private function executeQueueEntries(
        CrawlerController $crawlerController,
        JsonCompatibilityConverter $jsonCompatibilityConverter,
        OutputInterface $output
    ): void {
        $progressBar = new ProgressBar($output);

        foreach ($progressBar->iterate($crawlerController->queueEntries) as $queueRec) {
            // convert() may yield a non-array (see the is_array() check on the
            // request result below), so read the parameters defensively.
            $parameters = $jsonCompatibilityConverter->convert($queueRec['parameters']);
            $url = is_array($parameters) ? ($parameters['url'] ?? '') : '';
            $procInstructions = is_array($parameters) ? (array) ($parameters['procInstructions'] ?? []) : [];

            // Hide the bar while writing a log line, then redraw it.
            $progressBar->clear();
            $output->writeln('<info>' . $url . ' (' . implode(',', $procInstructions) . ') => ' . '</info>' . PHP_EOL);
            $progressBar->display();

            $result = $crawlerController->readUrlFromArray($queueRec);

            $resultContent = $result['content'] ?? '';
            $requestResult = $jsonCompatibilityConverter->convert($resultContent);

            $progressBar->clear();
            if (is_array($requestResult)) {
                // The "log" key is optional; a missing key results in an empty log.
                $log = $requestResult['log'] ?? null;
                $resLog = is_array($log) ? PHP_EOL . chr(9) . chr(9) . implode(PHP_EOL . chr(9) . chr(9), $log) : '';
                $output->writeln('<info>OK: ' . $resLog . '</info>' . PHP_EOL);
            } else {
                $output->writeln('<error>Error checking Crawler Result:  ' . substr(preg_replace('/\s+/', ' ', strip_tags($resultContent)), 0, 30000) . '...' . PHP_EOL . '</error>' . PHP_EOL);
            }
            $progressBar->display();
        }
    }

    /**
     * Obtains configuration keys from the CLI arguments
     *
     * @param string $conf Comma separated list of configuration keys
     * @return array The trimmed keys, or an empty array for a blank input
     */
    private function getConfigurationKeys(string $conf): array
    {
        $parameter = trim($conf);
        if ($parameter === '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $parameter);
    }

    /**
     * Writes one line per queue row: the row's URLs when it carries no message,
     * otherwise its page title together with the message.
     *
     * @param array $queueRows Two-level structure; each element is itself iterated for its rows
     */
    private function outputUrls(array $queueRows, OutputInterface $output): void
    {
        foreach ($queueRows as $queueRow) {
            /** @var QueueRow $row */
            foreach ($queueRow as $row) {
                if (empty($row->message)) {
                    $output->writeln('<info>' . $row->urls . '</info>');
                    continue;
                }

                // "comment" is a predefined console style. "warning" is not, so an
                // unregistered <warning> tag would be printed verbatim.
                $output->writeln('<comment>' . $row->pageTitle . ': ' . $row->message . '</comment>');
            }
        }
    }
}
237