Issues (138)

Classes/Command/BuildQueueCommand.php (1 issue)

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Command;
6
7
/*
8
 * (c) 2020 AOE GmbH <[email protected]>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21
22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Controller\CrawlerController;
24
use AOE\Crawler\Converter\JsonCompatibilityConverter;
25
use AOE\Crawler\Domain\Model\Reason;
26
use AOE\Crawler\Domain\Repository\QueueRepository;
27
use AOE\Crawler\Utility\MessageUtility;
28
use AOE\Crawler\Utility\SignalSlotUtility;
29
use Symfony\Component\Console\Command\Command;
30
use Symfony\Component\Console\Helper\ProgressBar;
31
use Symfony\Component\Console\Input\InputArgument;
32
use Symfony\Component\Console\Input\InputInterface;
33
use Symfony\Component\Console\Input\InputOption;
34
use Symfony\Component\Console\Output\OutputInterface;
35
use TYPO3\CMS\Core\Utility\GeneralUtility;
36
use TYPO3\CMS\Core\Utility\MathUtility;
37
use TYPO3\CMS\Extbase\Object\ObjectManager;
38
39
/**
 * CLI command that builds crawler queue entries for a page tree.
 *
 * Depending on the "mode" option the collected URLs are listed ("url"),
 * written to the queue ("queue", the default) or processed immediately ("exec").
 */
class BuildQueueCommand extends Command
{
    /**
     * Declares the description, help text, arguments and options of the command.
     */
    protected function configure(): void
    {
        $this->setDescription('Create entries in the queue that can be processed at once');

        $this->setHelp(
            'Try "typo3 help crawler:buildQueue" to see your options' . chr(10) . chr(10) .
            'Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
It can put entries in the queue from command line options, return the list of URLs and even execute
all entries right away without having to queue them up - this can be useful for immediate re-cache,
re-indexing or static publishing from command line.' . chr(10) . chr(10) .
            '
            Examples:
              --- Re-cache pages from page 7 and two levels down, executed immediately
              $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec

              --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
              $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
            '
        );

        $this->addArgument(
            'page',
            InputArgument::REQUIRED,
            'The page from where the queue building should start'
        );

        $this->addArgument(
            'conf',
            InputArgument::REQUIRED,
            'A comma separated list of crawler configurations'
        );

        // The description used to contain a stray `', "` left over from merging two strings.
        $this->addOption(
            'depth',
            'd',
            InputOption::VALUE_OPTIONAL,
            'Tree depth, 0-99. How many levels under the \'page_id\' to include.',
            '0'
        );

        $this->addOption(
            'mode',
            'm',
            InputOption::VALUE_OPTIONAL,
            'Specifies output modes url : Will list URLs which wget could use as input. queue: Will put entries in queue table. exec: Will execute all entries right away!'
        );

        $this->addOption(
            'number',
            '',
            InputOption::VALUE_OPTIONAL,
            'Specifies how many items are put in the queue per minute. Only valid for output mode "queue"',
            '0'
        );
    }

    /**
     * Crawler Command - Submitting URLs to be crawled.
     *
     * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
     * It can put entries in the queue from command line options, return the list of URLs and even execute
     * all entries right away without having to queue them up - this can be useful for immediate re-cache,
     * re-indexing or static publishing from command line.
     *
     * Examples:
     *
     * --- Re-cache pages from page 7 and two levels down, executed immediately
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec
     *
     *
     * --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
     *
     * @return int 0 on success, 1 when the "page" argument does not resolve to a positive page id
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
        $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
        // "mode" has no declared default, so an absent or value-less option falls back to "queue".
        $mode = $input->getOption('mode') ?? 'queue';
        $submitCrawlUrls = $mode === 'queue' || $mode === 'exec';

        $extensionSettings = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class)->getExtensionConfiguration();

        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);

        /** @var CrawlerController $crawlerController */
        $crawlerController = $objectManager->get(CrawlerController::class);
        /** @var QueueRepository $queueRepository */
        $queueRepository = $objectManager->get(QueueRepository::class);

        if ($mode === 'exec') {
            $crawlerController->registerQueueEntriesInternallyOnly = true;
        }

        // Negative and non-numeric input collapses to 0, which is rejected below.
        $pageId = MathUtility::forceIntegerInRange((int) $input->getArgument('page'), 0);
        if ($pageId === 0) {
            // "{$var}" replaces the "${var}" interpolation form, which is deprecated as of PHP 8.2.
            $message = "Page {$pageId} is not a valid page, please check you root page id and try again.";
            MessageUtility::addErrorMessage($message);
            $output->writeln("<info>{$message}</info>");
            return 1;
        }

        $configurationKeys = $this->getConfigurationKeys((string) $input->getArgument('conf'));

        if ($submitCrawlUrls) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_CLI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');

            $signalPayload = ['reason' => $reason];
            // NOTE(review): static analysis reports emitSignal() as deprecated; the call is kept unchanged.
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_INVOKE_QUEUE_CHANGE,
                $signalPayload
            );
        }

        // empty() keeps the truthiness check but does not warn when the setting key is absent.
        if (!empty($extensionSettings['cleanUpOldQueueEntries'])) {
            $queueRepository->cleanUpOldQueueEntries();
        }

        // Depth is clamped to 0-99; "number" falls back to 30 when empty/0 and is clamped to 1-1000.
        $depth = MathUtility::forceIntegerInRange((int) $input->getOption('depth'), 0, 99);
        $number = MathUtility::forceIntegerInRange((int) $input->getOption('number') ?: 30, 1, 1000);

        $crawlerController->setID = GeneralUtility::md5int(microtime());
        $crawlerController->getPageTreeAndUrls(
            $pageId,
            $depth,
            $crawlerController->getCurrentTime(),
            $number,
            $submitCrawlUrls,
            $mode === 'url',
            [],
            $configurationKeys
        );

        if ($mode === 'url') {
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->downloadUrls) . PHP_EOL . '</info>');
        } elseif ($mode === 'exec') {
            $this->processQueueEntriesImmediately($output, $crawlerController, $jsonCompatibilityConverter);
        } elseif ($mode === 'queue') {
            $output->writeln('<info>Putting ' . count($crawlerController->urlList) . ' entries in queue:</info>' . PHP_EOL);
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
        } else {
            // Any unrecognised mode only reports what was found.
            $output->writeln('<info>' . count($crawlerController->urlList) . ' entries found for processing. (Use "mode" to decide action):</info>' . PHP_EOL);
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
        }

        return 0;
    }

    /**
     * Handles "exec" mode: requests every collected queue entry right away and prints the outcome.
     *
     * The progress bar is cleared before each message and redrawn afterwards so that
     * regular output does not get mixed into the bar.
     */
    private function processQueueEntriesImmediately(
        OutputInterface $output,
        CrawlerController $crawlerController,
        JsonCompatibilityConverter $jsonCompatibilityConverter
    ): void {
        $progressBar = new ProgressBar($output);
        $output->writeln('<info>Executing ' . count($crawlerController->urlList) . ' requests right away:</info>');
        $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
        $output->writeln('<info>Processing</info>' . PHP_EOL);

        foreach ($progressBar->iterate($crawlerController->queueEntries) as $queueRec) {
            $parameters = $jsonCompatibilityConverter->convert($queueRec['parameters']);
            // The converter result is not guaranteed to be an array (see the is_array() check on the
            // request result below), so missing data is rendered as empty instead of raising errors.
            if (!is_array($parameters)) {
                $parameters = [];
            }
            $url = (string) ($parameters['url'] ?? '');
            $procInstructions = (array) ($parameters['procInstructions'] ?? []);

            $progressBar->clear();
            $output->writeln('<info>' . $url . ' (' . implode(',', $procInstructions) . ') => </info>' . PHP_EOL);
            $progressBar->display();

            $result = $crawlerController->readUrlFromArray($queueRec);

            $resultContent = $result['content'] ?? '';
            $requestResult = $jsonCompatibilityConverter->convert($resultContent);

            $progressBar->clear();
            if (is_array($requestResult)) {
                // "log" is optional in the decoded result.
                $log = $requestResult['log'] ?? null;
                $resLog = is_array($log)
                    ? PHP_EOL . chr(9) . chr(9) . implode(PHP_EOL . chr(9) . chr(9), $log)
                    : '';
                $output->writeln('<info>OK: ' . $resLog . '</info>' . PHP_EOL);
            } else {
                // preg_replace() returns null on failure; cast so substr() always receives a string.
                $plainContent = (string) preg_replace('/\s+/', ' ', strip_tags($resultContent));
                $output->writeln('<error>Error checking Crawler Result:  ' . substr($plainContent, 0, 30000) . '...' . PHP_EOL . '</error>' . PHP_EOL);
            }
            $progressBar->display();
        }
        $output->writeln('');
    }

    /**
     * Obtains configuration keys from the CLI arguments
     *
     * @param string $conf comma separated list of configuration keys
     * @return array the trimmed keys, or an empty array when $conf is blank
     */
    private function getConfigurationKeys(string $conf): array
    {
        $parameter = trim($conf);
        return ($parameter !== '' ? GeneralUtility::trimExplode(',', $parameter) : []);
    }
}
222