AOEpeople /
crawler
| 1 | <?php |
||
| 2 | |||
| 3 | declare(strict_types=1); |
||
| 4 | |||
| 5 | namespace AOE\Crawler\Command; |
||
| 6 | |||
| 7 | /* |
||
| 8 | * (c) 2020 AOE GmbH <[email protected]> |
||
| 9 | * |
||
| 10 | * This file is part of the TYPO3 Crawler Extension. |
||
| 11 | * |
||
| 12 | * It is free software; you can redistribute it and/or modify it under |
||
| 13 | * the terms of the GNU General Public License, either version 2 |
||
| 14 | * of the License, or any later version. |
||
| 15 | * |
||
| 16 | * For the full copyright and license information, please read the |
||
| 17 | * LICENSE.txt file that was distributed with this source code. |
||
| 18 | * |
||
| 19 | * The TYPO3 project - inspiring people to share! |
||
| 20 | */ |
||
| 21 | |||
| 22 | use AOE\Crawler\Configuration\ExtensionConfigurationProvider; |
||
| 23 | use AOE\Crawler\Controller\CrawlerController; |
||
| 24 | use AOE\Crawler\Converter\JsonCompatibilityConverter; |
||
| 25 | use AOE\Crawler\Domain\Model\Reason; |
||
| 26 | use AOE\Crawler\Domain\Repository\QueueRepository; |
||
| 27 | use AOE\Crawler\Utility\MessageUtility; |
||
| 28 | use AOE\Crawler\Utility\SignalSlotUtility; |
||
| 29 | use Symfony\Component\Console\Command\Command; |
||
| 30 | use Symfony\Component\Console\Helper\ProgressBar; |
||
| 31 | use Symfony\Component\Console\Input\InputArgument; |
||
| 32 | use Symfony\Component\Console\Input\InputInterface; |
||
| 33 | use Symfony\Component\Console\Input\InputOption; |
||
| 34 | use Symfony\Component\Console\Output\OutputInterface; |
||
| 35 | use TYPO3\CMS\Core\Utility\GeneralUtility; |
||
| 36 | use TYPO3\CMS\Core\Utility\MathUtility; |
||
| 37 | use TYPO3\CMS\Extbase\Object\ObjectManager; |
||
| 38 | |||
/**
 * CLI command "crawler:buildQueue".
 *
 * Collects the URLs below a start page for the given crawler configurations
 * and, depending on the "mode" option, lists them ("url"), puts them into the
 * queue ("queue", the default) or processes them right away ("exec").
 */
class BuildQueueCommand extends Command
{
    /**
     * Declares the description, help text, arguments and options of the command.
     */
    protected function configure(): void
    {
        $this->setDescription('Create entries in the queue that can be processed at once');

        $this->setHelp(
            'Try "typo3 help crawler:buildQueue" to see your options' . chr(10) . chr(10) .
            'Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
It can put entries in the queue from command line options, return the list of URLs and even execute
all entries right away without having to queue them up - this can be useful for immediate re-cache,
re-indexing or static publishing from command line.' . chr(10) . chr(10) .
            '
Examples:
--- Re-cache pages from page 7 and two levels down, executed immediately
$ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec

--- Put entries for re-caching pages from page 7 into queue, 4 every minute.
$ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
'
        );

        $this->addArgument(
            'page',
            InputArgument::REQUIRED,
            'The page from where the queue building should start'
        );

        $this->addArgument(
            'conf',
            InputArgument::REQUIRED,
            'A comma separated list of crawler configurations'
        );

        // The description used to contain a stray `', "` (left-over array syntax) and
        // referred to a non-existent "page_id" argument; the argument is called "page".
        $this->addOption(
            'depth',
            'd',
            InputOption::VALUE_OPTIONAL,
            'Tree depth, 0-99. How many levels under the "page" to include.',
            '0'
        );

        // No default is declared here: execute() falls back to "queue" when the option is absent.
        $this->addOption(
            'mode',
            'm',
            InputOption::VALUE_OPTIONAL,
            'Specifies output modes url : Will list URLs which wget could use as input. queue: Will put entries in queue table. exec: Will execute all entries right away!'
        );

        $this->addOption(
            'number',
            '',
            InputOption::VALUE_OPTIONAL,
            'Specifies how many items are put in the queue per minute. Only valid for output mode "queue"',
            '0'
        );
    }

    /**
     * Crawler Command - Submitting URLs to be crawled.
     *
     * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
     * It can put entries in the queue from command line options, return the list of URLs and even execute
     * all entries right away without having to queue them up - this can be useful for immediate re-cache,
     * re-indexing or static publishing from command line.
     *
     * Examples:
     *
     * --- Re-cache pages from page 7 and two levels down, executed immediately
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec
     *
     *
     * --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
     *
     * @return int 0 on success, 1 when the "page" argument does not resolve to a positive page id
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
        $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
        $mode = $input->getOption('mode') ?? 'queue';
        $isQueueOrExecMode = $mode === 'queue' || $mode === 'exec';

        $extensionSettings = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class)->getExtensionConfiguration();

        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);

        /** @var CrawlerController $crawlerController */
        $crawlerController = $objectManager->get(CrawlerController::class);
        /** @var QueueRepository $queueRepository */
        $queueRepository = $objectManager->get(QueueRepository::class);

        if ($mode === 'exec') {
            // NOTE(review): presumably keeps the entries on the controller ($queueEntries, read in
            // executeQueueEntries()) instead of persisting them - confirm in CrawlerController.
            $crawlerController->registerQueueEntriesInternallyOnly = true;
        }

        // Non-numeric and negative input collapses to 0, which is rejected below.
        $pageId = MathUtility::forceIntegerInRange((int) $input->getArgument('page'), 0);
        if ($pageId === 0) {
            // "{$var}" instead of "${var}": the latter interpolation form is deprecated as of PHP 8.2.
            $message = "Page {$pageId} is not a valid page, please check your root page id and try again.";
            MessageUtility::addErrorMessage($message);
            $output->writeln("<error>{$message}</error>");
            return 1;
        }

        $configurationKeys = $this->getConfigurationKeys((string) $input->getArgument('conf'));

        if ($isQueueOrExecMode) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_CLI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');

            // NOTE(review): the code-quality tooling flags emitSignal() as deprecated; it is kept
            // because the replacement is not visible from this file.
            $signalPayload = ['reason' => $reason];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_INVOKE_QUEUE_CHANGE,
                $signalPayload
            );
        }

        // !empty() avoids an "undefined array key" warning when the setting is not configured.
        if (!empty($extensionSettings['cleanUpOldQueueEntries'])) {
            $queueRepository->cleanUpOldQueueEntries();
        }

        // "depth" is clamped to 0-99; "number" falls back to 30 when missing or 0 and is clamped to 1-1000.
        $depth = MathUtility::forceIntegerInRange((int) $input->getOption('depth'), 0, 99);
        $itemsPerMinute = MathUtility::forceIntegerInRange(((int) $input->getOption('number')) ?: 30, 1, 1000);

        $crawlerController->setID = GeneralUtility::md5int(microtime());
        $crawlerController->getPageTreeAndUrls(
            $pageId,
            $depth,
            $crawlerController->getCurrentTime(),
            $itemsPerMinute,
            $isQueueOrExecMode,
            $mode === 'url',
            [],
            $configurationKeys
        );

        if ($mode === 'url') {
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->downloadUrls) . PHP_EOL . '</info>');
        } elseif ($mode === 'exec') {
            $this->executeQueueEntries($output, $crawlerController, $jsonCompatibilityConverter);
        } elseif ($mode === 'queue') {
            $output->writeln('<info>Putting ' . count($crawlerController->urlList) . ' entries in queue:</info>' . PHP_EOL);
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
        } else {
            // Any other mode value: only report what was found.
            $output->writeln('<info>' . count($crawlerController->urlList) . ' entries found for processing. (Use "mode" to decide action):</info>' . PHP_EOL);
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
        }

        return 0;
    }

    /**
     * Processes the entries collected on the controller right away ("exec" mode)
     * and reports the outcome of every request next to a progress bar.
     */
    private function executeQueueEntries(
        OutputInterface $output,
        CrawlerController $crawlerController,
        JsonCompatibilityConverter $jsonCompatibilityConverter
    ): void {
        $progressBar = new ProgressBar($output);
        $output->writeln('<info>Executing ' . count($crawlerController->urlList) . ' requests right away:</info>');
        $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->urlList) . '</info>' . PHP_EOL);
        $output->writeln('<info>Processing</info>' . PHP_EOL);

        foreach ($progressBar->iterate($crawlerController->queueEntries) as $queueRec) {
            // convert() may hand back a non-array (the result handling below checks for that as well),
            // so fall back to empty values rather than failing on an undecodable parameter set.
            $parameters = $jsonCompatibilityConverter->convert($queueRec['parameters']);
            $url = is_array($parameters) ? (string) ($parameters['url'] ?? '') : '';
            $procInstructions = is_array($parameters) && is_array($parameters['procInstructions'] ?? null)
                ? $parameters['procInstructions']
                : [];

            // The bar is cleared before and redrawn after each message so the output does not interleave.
            $progressBar->clear();
            $output->writeln('<info>' . $url . ' (' . implode(',', $procInstructions) . ') => </info>' . PHP_EOL);
            $progressBar->display();

            $result = $crawlerController->readUrlFromArray($queueRec);

            $resultContent = $result['content'] ?? '';
            $requestResult = $jsonCompatibilityConverter->convert($resultContent);

            $progressBar->clear();
            if (is_array($requestResult)) {
                $log = $requestResult['log'] ?? null;
                $resLog = is_array($log)
                    ? PHP_EOL . chr(9) . chr(9) . implode(PHP_EOL . chr(9) . chr(9), $log)
                    : '';
                $output->writeln('<info>OK: ' . $resLog . '</info>' . PHP_EOL);
            } else {
                // preg_replace() returns null on failure; substr() would reject that under strict_types.
                $plainContent = preg_replace('/\s+/', ' ', strip_tags($resultContent)) ?? '';
                $output->writeln('<error>Error checking Crawler Result: ' . substr($plainContent, 0, 30000) . '...' . PHP_EOL . '</error>' . PHP_EOL);
            }
            $progressBar->display();
        }
        $output->writeln('');
    }

    /**
     * Obtains configuration keys from the CLI arguments
     *
     * @param string $conf comma separated list of crawler configuration keys
     * @return array the trimmed keys, or an empty array when the argument is blank
     */
    private function getConfigurationKeys(string $conf): array
    {
        $parameter = trim($conf);
        if ($parameter === '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $parameter);
    }
}
||
| 222 |