1 | <?php |
||
2 | |||
3 | declare(strict_types=1); |
||
4 | |||
5 | namespace AOE\Crawler\Command; |
||
6 | |||
7 | /* |
||
8 | * (c) 2020 AOE GmbH <[email protected]> |
||
9 | * |
||
10 | * This file is part of the TYPO3 Crawler Extension. |
||
11 | * |
||
12 | * It is free software; you can redistribute it and/or modify it under |
||
13 | * the terms of the GNU General Public License, either version 2 |
||
14 | * of the License, or any later version. |
||
15 | * |
||
16 | * For the full copyright and license information, please read the |
||
17 | * LICENSE.txt file that was distributed with this source code. |
||
18 | * |
||
19 | * The TYPO3 project - inspiring people to share! |
||
20 | */ |
||
21 | |||
22 | use AOE\Crawler\Configuration\ExtensionConfigurationProvider; |
||
23 | use AOE\Crawler\Controller\CrawlerController; |
||
24 | use AOE\Crawler\Converter\JsonCompatibilityConverter; |
||
25 | use AOE\Crawler\Domain\Model\Reason; |
||
26 | use AOE\Crawler\Domain\Repository\QueueRepository; |
||
27 | use AOE\Crawler\Utility\MessageUtility; |
||
28 | use AOE\Crawler\Utility\SignalSlotUtility; |
||
29 | use Symfony\Component\Console\Command\Command; |
||
30 | use Symfony\Component\Console\Helper\ProgressBar; |
||
31 | use Symfony\Component\Console\Input\InputArgument; |
||
32 | use Symfony\Component\Console\Input\InputInterface; |
||
33 | use Symfony\Component\Console\Input\InputOption; |
||
34 | use Symfony\Component\Console\Output\OutputInterface; |
||
35 | use TYPO3\CMS\Core\Utility\GeneralUtility; |
||
36 | use TYPO3\CMS\Core\Utility\MathUtility; |
||
37 | use TYPO3\CMS\Extbase\Object\ObjectManager; |
||
38 | |||
/**
 * CLI command that builds the crawler queue for a page tree.
 *
 * Depending on the "mode" option the collected URLs are listed ("url"),
 * written to the queue ("queue", the default) or requested right away ("exec").
 */
class BuildQueueCommand extends Command
{
    /**
     * Registers description, help text, arguments and options of the command.
     */
    protected function configure(): void
    {
        $this->setDescription('Create entries in the queue that can be processed at once');

        // The help text is assembled line by line so that it does not depend on
        // the indentation of this source file.
        $this->setHelp(
            implode(chr(10), [
                'Try "typo3 help crawler:buildQueue" to see your options',
                '',
                'Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;',
                'It can put entries in the queue from command line options, return the list of URLs and even execute',
                'all entries right away without having to queue them up - this can be useful for immediate re-cache,',
                're-indexing or static publishing from command line.',
                '',
                '',
                'Examples:',
                '--- Re-cache pages from page 7 and two levels down, executed immediately',
                '$ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec',
                '',
                '--- Put entries for re-caching pages from page 7 into queue, 4 every minute.',
                '$ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4',
                '',
            ])
        );

        $this->addArgument(
            'page',
            InputArgument::REQUIRED,
            'The page from where the queue building should start'
        );

        $this->addArgument(
            'conf',
            InputArgument::REQUIRED,
            'A comma separated list of crawler configurations'
        );

        $this->addOption(
            'depth',
            'd',
            InputOption::VALUE_OPTIONAL,
            'Tree depth, 0-99. How many levels under the page to include.',
            '0'
        );

        $this->addOption(
            'mode',
            'm',
            InputOption::VALUE_OPTIONAL,
            'Specifies output modes url : Will list URLs which wget could use as input. queue: Will put entries in queue table. exec: Will execute all entries right away!'
        );

        $this->addOption(
            'number',
            null,
            InputOption::VALUE_OPTIONAL,
            'Specifies how many items are put in the queue per minute. Only valid for output mode "queue"',
            '0'
        );
    }

    /**
     * Crawler Command - Submitting URLs to be crawled.
     *
     * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
     * It can put entries in the queue from command line options, return the list of URLs and even execute
     * all entries right away without having to queue them up - this can be useful for immediate re-cache,
     * re-indexing or static publishing from command line.
     *
     * Examples:
     *
     * --- Re-cache pages from page 7 and two levels down, executed immediately
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec
     *
     *
     * --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
     *
     * @return int 0 on success, 1 when the given page argument does not resolve to a positive page id
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
        $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
        // "--mode" without a value (or no "--mode" at all) yields null, which means "queue".
        $mode = $input->getOption('mode') ?? 'queue';
        $submitsToQueue = $mode === 'queue' || $mode === 'exec';

        $extensionSettings = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class)->getExtensionConfiguration();

        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);

        /** @var CrawlerController $crawlerController */
        $crawlerController = $objectManager->get(CrawlerController::class);
        /** @var QueueRepository $queueRepository */
        $queueRepository = $objectManager->get(QueueRepository::class);

        if ($mode === 'exec') {
            $crawlerController->registerQueueEntriesInternallyOnly = true;
        }

        // Non-numeric and negative input both end up as 0 here and are rejected below.
        $pageId = MathUtility::forceIntegerInRange((int) $input->getArgument('page'), 0);
        if ($pageId === 0) {
            $message = "Page {$pageId} is not a valid page, please check your root page id and try again.";
            MessageUtility::addErrorMessage($message);
            $output->writeln("<error>{$message}</error>");
            return 1;
        }

        $configurationKeys = $this->getConfigurationKeys((string) $input->getArgument('conf'));

        if ($submitsToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_CLI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');

            $signalPayload = ['reason' => $reason];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_INVOKE_QUEUE_CHANGE,
                $signalPayload
            );
        }

        // A missing setting is treated like a disabled one instead of raising a notice/warning.
        if (! empty($extensionSettings['cleanUpOldQueueEntries'])) {
            $queueRepository->cleanUpOldQueueEntries();
        }

        $crawlerController->setID = GeneralUtility::md5int(microtime());
        $crawlerController->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange((int) $input->getOption('depth'), 0, 99),
            $crawlerController->getCurrentTime(),
            // An unset or zero "number" falls back to 30 before being clamped to 1..1000.
            MathUtility::forceIntegerInRange(((int) $input->getOption('number')) ?: 30, 1, 1000),
            $submitsToQueue,
            $mode === 'url',
            [],
            $configurationKeys
        );

        if ($mode === 'url') {
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->downloadUrls) . PHP_EOL . '</info>');
        } elseif ($mode === 'exec') {
            $this->executeQueueEntries($crawlerController, $jsonCompatibilityConverter, $output);
        } elseif ($mode === 'queue') {
            $output->writeln('<info>Putting ' . count($crawlerController->urlList) . ' entries in queue:</info>' . PHP_EOL);
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
        } else {
            // Any other mode value only reports what would be processed.
            $output->writeln('<info>' . count($crawlerController->urlList) . ' entries found for processing. (Use "mode" to decide action):</info>' . PHP_EOL);
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
        }

        return 0;
    }

    /**
     * Requests every entry collected in the controller's queueEntries right away
     * and prints the outcome of each request around a progress bar.
     */
    private function executeQueueEntries(
        CrawlerController $crawlerController,
        JsonCompatibilityConverter $jsonCompatibilityConverter,
        OutputInterface $output
    ): void {
        $progressBar = new ProgressBar($output);
        $output->writeln('<info>Executing ' . count($crawlerController->urlList) . ' requests right away:</info>');
        $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
        $output->writeln('<info>Processing</info>' . PHP_EOL);

        foreach ($progressBar->iterate($crawlerController->queueEntries) as $queueRec) {
            $parameters = $jsonCompatibilityConverter->convert($queueRec['parameters']);
            // The decoded parameters are not guaranteed to be an array carrying both keys.
            $url = is_array($parameters) ? (string) ($parameters['url'] ?? '') : '';
            $procInstructions = is_array($parameters) ? (array) ($parameters['procInstructions'] ?? []) : [];

            // The bar is cleared before and redrawn after each message so both stay readable.
            $progressBar->clear();
            $output->writeln('<info>' . $url . ' (' . implode(',', $procInstructions) . ') => ' . '</info>' . PHP_EOL);
            $progressBar->display();

            $result = $crawlerController->readUrlFromArray($queueRec);

            $resultContent = $result['content'] ?? '';
            if (! is_string($resultContent)) {
                $resultContent = '';
            }
            $requestResult = $jsonCompatibilityConverter->convert($resultContent);

            $progressBar->clear();
            if (is_array($requestResult)) {
                $log = $requestResult['log'] ?? null;
                $resLog = is_array($log) ? PHP_EOL . chr(9) . chr(9) . implode(PHP_EOL . chr(9) . chr(9), $log) : '';
                $output->writeln('<info>OK: ' . $resLog . '</info>' . PHP_EOL);
            } else {
                // preg_replace() returns null on a PCRE failure; the cast keeps substr() on a string.
                $plainContent = (string) preg_replace('/\s+/', ' ', strip_tags($resultContent));
                $output->writeln('<error>Error checking Crawler Result: ' . substr($plainContent, 0, 30000) . '...' . PHP_EOL . '</error>' . PHP_EOL);
            }
            $progressBar->display();
        }
        $output->writeln('');
    }

    /**
     * Obtains configuration keys from the CLI arguments
     *
     * @param string $conf Comma separated list of crawler configuration keys
     * @return string[] Trimmed keys; empty segments (e.g. from "a,,b" or "a,") are dropped
     */
    private function getConfigurationKeys(string $conf): array
    {
        return GeneralUtility::trimExplode(',', $conf, true);
    }
}
||
222 |