AOEpeople /
crawler
| 1 | <?php |
||||||
| 2 | |||||||
| 3 | declare(strict_types=1); |
||||||
| 4 | |||||||
| 5 | namespace AOE\Crawler\Controller; |
||||||
| 6 | |||||||
| 7 | /*************************************************************** |
||||||
| 8 | * Copyright notice |
||||||
| 9 | * |
||||||
| 10 | * (c) 2020 AOE GmbH <[email protected]> |
||||||
| 11 | * |
||||||
| 12 | * All rights reserved |
||||||
| 13 | * |
||||||
| 14 | * This script is part of the TYPO3 project. The TYPO3 project is |
||||||
| 15 | * free software; you can redistribute it and/or modify |
||||||
| 16 | * it under the terms of the GNU General Public License as published by |
||||||
| 17 | * the Free Software Foundation; either version 3 of the License, or |
||||||
| 18 | * (at your option) any later version. |
||||||
| 19 | * |
||||||
| 20 | * The GNU General Public License can be found at |
||||||
| 21 | * http://www.gnu.org/copyleft/gpl.html. |
||||||
| 22 | * |
||||||
| 23 | * This script is distributed in the hope that it will be useful, |
||||||
| 24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
| 25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||||
| 26 | * GNU General Public License for more details. |
||||||
| 27 | * |
||||||
| 28 | * This copyright notice MUST APPEAR in all copies of the script! |
||||||
| 29 | ***************************************************************/ |
||||||
| 30 | |||||||
| 31 | use AOE\Crawler\Configuration\ExtensionConfigurationProvider; |
||||||
| 32 | use AOE\Crawler\Converter\JsonCompatibilityConverter; |
||||||
| 33 | use AOE\Crawler\Crawler; |
||||||
| 34 | use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory; |
||||||
| 35 | use AOE\Crawler\Domain\Model\Process; |
||||||
| 36 | use AOE\Crawler\Domain\Repository\ConfigurationRepository; |
||||||
| 37 | use AOE\Crawler\Domain\Repository\ProcessRepository; |
||||||
| 38 | use AOE\Crawler\Domain\Repository\QueueRepository; |
||||||
| 39 | use AOE\Crawler\QueueExecutor; |
||||||
| 40 | use AOE\Crawler\Service\ConfigurationService; |
||||||
| 41 | use AOE\Crawler\Service\UrlService; |
||||||
| 42 | use AOE\Crawler\Service\UserService; |
||||||
| 43 | use AOE\Crawler\Utility\SignalSlotUtility; |
||||||
| 44 | use AOE\Crawler\Value\QueueFilter; |
||||||
| 45 | use PDO; |
||||||
| 46 | use Psr\Http\Message\UriInterface; |
||||||
| 47 | use Psr\Log\LoggerAwareInterface; |
||||||
| 48 | use Psr\Log\LoggerAwareTrait; |
||||||
| 49 | use TYPO3\CMS\Backend\Tree\View\PageTreeView; |
||||||
| 50 | use TYPO3\CMS\Backend\Utility\BackendUtility; |
||||||
| 51 | use TYPO3\CMS\Core\Authentication\BackendUserAuthentication; |
||||||
| 52 | use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait; |
||||||
| 53 | use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait; |
||||||
| 54 | use TYPO3\CMS\Core\Core\Bootstrap; |
||||||
| 55 | use TYPO3\CMS\Core\Core\Environment; |
||||||
| 56 | use TYPO3\CMS\Core\Database\Connection; |
||||||
| 57 | use TYPO3\CMS\Core\Database\ConnectionPool; |
||||||
| 58 | use TYPO3\CMS\Core\Database\Query\QueryBuilder; |
||||||
| 59 | use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction; |
||||||
| 60 | use TYPO3\CMS\Core\Database\QueryGenerator; |
||||||
| 61 | use TYPO3\CMS\Core\Domain\Repository\PageRepository; |
||||||
| 62 | use TYPO3\CMS\Core\Exception\SiteNotFoundException; |
||||||
| 63 | use TYPO3\CMS\Core\Imaging\Icon; |
||||||
| 64 | use TYPO3\CMS\Core\Imaging\IconFactory; |
||||||
| 65 | use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException; |
||||||
| 66 | use TYPO3\CMS\Core\Site\Entity\Site; |
||||||
| 67 | use TYPO3\CMS\Core\Type\Bitmask\Permission; |
||||||
| 68 | use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser; |
||||||
| 69 | use TYPO3\CMS\Core\Utility\DebugUtility; |
||||||
| 70 | use TYPO3\CMS\Core\Utility\GeneralUtility; |
||||||
| 71 | use TYPO3\CMS\Core\Utility\MathUtility; |
||||||
| 72 | use TYPO3\CMS\Extbase\Object\ObjectManager; |
||||||
| 73 | |||||||
| 74 | /** |
||||||
| 75 | * Class CrawlerController |
||||||
| 76 | * |
||||||
| 77 | * @package AOE\Crawler\Controller |
||||||
| 78 | */ |
||||||
| 79 | class CrawlerController implements LoggerAwareInterface |
||||||
| 80 | { |
||||||
| 81 | use LoggerAwareTrait; |
||||||
| 82 | use PublicMethodDeprecationTrait; |
||||||
| 83 | use PublicPropertyDeprecationTrait; |
||||||
| 84 | |||||||
// NOTE: the misspelling "PROCCESSED" is part of the public constant name.
public const CLI_STATUS_NOTHING_PROCCESSED = 0;

// queue not empty
public const CLI_STATUS_REMAIN = 1;

// (some) queue items were processed
public const CLI_STATUS_PROCESSED = 2;

// instance didn't finish
public const CLI_STATUS_ABORTED = 4;

public const CLI_STATUS_POLLABLE_PROCESSED = 8;
||||||
| 97 | |||||||
/**
 * @var int
 */
public $setID = 0;

/**
 * @var string
 */
public $processID = '';

/**
 * @var array
 */
public $duplicateTrack = [];

/**
 * @var array
 */
public $downloadUrls = [];

/**
 * @var array
 */
public $incomingProcInstructions = [];

/**
 * @var array
 */
public $incomingConfigurationSelection = [];

/**
 * @var bool
 */
public $registerQueueEntriesInternallyOnly = false;

/**
 * @var array
 */
public $queueEntries = [];

/**
 * @var array
 */
public $urlList = [];

/**
 * @var array
 */
public $extensionSettings = [];

/**
 * Mount Point
 *
 * Defaults to false. When set, this class splits the value on "-" and appends it
 * to generated URLs as the "MP" parameter, i.e. it is handled as a string.
 *
 * @var string|bool
 * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
 */
public $MP = false;
||||||
| 155 | |||||||
/**
 * Path of the file whose existence marks the crawler as disabled.
 *
 * @var string
 * @deprecated
 */
protected $processFilename;

/**
 * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'.
 *
 * @var string
 * @deprecated
 */
protected $accessMode;

/**
 * @var QueueRepository
 */
protected $queueRepository;

/**
 * @var ProcessRepository
 */
protected $processRepository;

/**
 * @var ConfigurationRepository
 */
protected $configurationRepository;

/**
 * @var string
 * @deprecated Since v9.2.5 - This will be removed in v10
 */
protected $tableName = 'tx_crawler_queue';

/**
 * @var QueueExecutor
 */
protected $queueExecutor;

/**
 * @var int
 */
protected $maximumUrlsToCompile = 10000;

/**
 * @var IconFactory
 */
protected $iconFactory;
||||||
| 205 | |||||||
/**
 * Deprecation messages keyed by the name of the deprecated method.
 *
 * This is the property name looked up by PublicMethodDeprecationTrait (used by this class).
 *
 * @var string[]
 */
private $deprecatedPublicMethods = [
    'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
    'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
    'CLI_releaseProcesses' => 'Using CrawlerController->CLI_releaseProcesses() is deprecated since 9.2.2 and will be removed in v11.x',
    'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
    'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
    'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be remove in v11.x',
    'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
    // Fixed: this message previously named getLogEntriesForPageId() (copy/paste error).
    'hasGroupAccess' => 'Using CrawlerController->hasGroupAccess() is deprecated since 9.2.2 and will be remove in v11.x, please use UserService::hasGroupAccess() instead.',
    'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
    'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
    'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
    'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
    'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
    'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
    'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
];
||||||
| 226 | |||||||
/**
 * Deprecation messages keyed by the name of the deprecated property.
 *
 * This is the property name looked up by PublicPropertyDeprecationTrait (used by this class).
 *
 * @var string[]
 */
private $deprecatedPublicProperties = [
    'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
    // Fixed: this message previously named ->accessMode (copy/paste error).
    'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
];
||||||
| 234 | |||||||
/**
 * @var BackendUserAuthentication|null
 */
private $backendUser;

/**
 * @var int
 */
private $scheduledTime = 0;

/**
 * @var int
 */
private $reqMinute = 0;

/**
 * @var bool
 */
private $submitCrawlUrls = false;

/**
 * @var bool
 */
private $downloadCrawlUrls = false;

/**
 * @var PageRepository
 */
private $pageRepository;

/**
 * @var Crawler
 */
private $crawler;
||||||
| 269 | |||||||
| 270 | /************************************ |
||||||
| 271 | * |
||||||
| 272 | * Getting URLs based on Page TSconfig |
||||||
| 273 | * |
||||||
| 274 | ************************************/ |
||||||
| 275 | |||||||
/**
 * Instantiates the repositories and services used by this controller and loads
 * the extension settings.
 *
 * Settings that are missing or not usable fall back to defaults:
 * - countInARun: 100 when it does not convert to a positive integer
 * - processLimit: forced into 1..99, default 1
 * - maxCompileUrls: forced into 1..1000000000, default 10000
 */
public function __construct()
{
    $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
    $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
    $this->queueRepository = $objectManager->get(QueueRepository::class);
    $this->processRepository = $objectManager->get(ProcessRepository::class);
    $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
    $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
    $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
    $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
    $this->crawler = GeneralUtility::makeInstance(Crawler::class);

    $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

    /** @var ExtensionConfigurationProvider $configurationProvider */
    $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
    $settings = $configurationProvider->getExtensionConfiguration();
    $this->extensionSettings = is_array($settings) ? $settings : [];

    // The settings array may be empty (see the fallback above), so every key is read
    // with "??" to avoid "undefined array key" errors; the fallbacks equal the defaults.
    if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
        $this->extensionSettings['countInARun'] = 100;
    }

    $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
        $this->extensionSettings['processLimit'] ?? 1,
        1,
        99,
        1
    );
    $this->setMaximumUrlsToCompile(
        MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 10000,
            1,
            1000000000,
            10000
        )
    );
}
||||||
| 302 | |||||||
/**
 * Stores the given value in $this->maximumUrlsToCompile.
 */
public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
{
    $this->maximumUrlsToCompile = $maximumUrlsToCompile;
}
||||||
| 307 | |||||||
/**
 * Returns the access mode; can be 'gui', 'cli' or 'cli_im'.
 *
 * @return string
 * @deprecated
 */
public function getAccessMode()
{
    return $this->accessMode;
}
||||||
| 318 | |||||||
/**
 * Stores the access mode.
 *
 * @param string $accessMode
 * @deprecated
 */
public function setAccessMode($accessMode): void
{
    $this->accessMode = $accessMode;
}
||||||
| 327 | |||||||
/**
 * Set disabled status to prevent processes from being processed.
 *
 * A truthy $disabled writes the string 'disabled' to the process file;
 * otherwise the process file is deleted if it exists.
 *
 * @deprecated
 */
public function setDisabled(?bool $disabled = true): void
{
    if ($disabled) {
        GeneralUtility::writeFile($this->processFilename, 'disabled');
        return;
    }

    if (is_file($this->processFilename)) {
        unlink($this->processFilename);
    }
}
||||||
| 340 | |||||||
/**
 * Get disabled status: true when the process file exists.
 *
 * @deprecated
 */
public function getDisabled(): bool
{
    return is_file($this->processFilename);
}
||||||
| 349 | |||||||
/**
 * Sets the path of the process file used by setDisabled() / getDisabled().
 *
 * @param string $filenameWithPath
 * @deprecated
 */
public function setProcessFilename($filenameWithPath): void
{
    $this->processFilename = $filenameWithPath;
}
||||||
| 358 | |||||||
/**
 * Returns the path of the process file used by setDisabled() / getDisabled().
 *
 * @return string
 * @deprecated
 */
public function getProcessFilename()
{
    return $this->processFilename;
}
||||||
| 367 | |||||||
/**
 * Replaces the extension settings (unserialized pendant of
 * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']) with the given array.
 */
public function setExtensionSettings(array $extensionSettings): void
{
    $this->extensionSettings = $extensionSettings;
}
||||||
| 375 | |||||||
/**
 * Check if the given page should be crawled.
 *
 * Checks, in order: the hidden flag (unless the "crawlHiddenPages" setting is on),
 * the fixed doktype list 3,4,199,254,255, the doktype lists registered under
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'], and finally
 * the veto hooks registered under [...]['crawler']['pageVeto'].
 *
 * @param array $pageRow Page record; the "hidden" and "doktype" columns are read
 * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
 */
public function checkIfPageShouldBeSkipped(array $pageRow)
{
    // if page is hidden (empty() keeps the truthiness test but tolerates missing keys)
    if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
        return 'Because page is hidden';
    }

    $doktype = (string) ($pageRow['doktype'] ?? '');

    if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
        return 'Because doktype is not allowed';
    }

    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
        if (GeneralUtility::inList($doktypeList, $doktype)) {
            return 'Doktype was excluded by "' . $key . '"';
        }
    }

    // veto hook
    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
        $params = [
            'pageRow' => $pageRow,
        ];
        // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
        $veto = GeneralUtility::callUserFunction($func, $params, $this);
        if ($veto === false) {
            continue;
        }
        if (is_string($veto)) {
            return $veto;
        }
        // Hooks registered without an explicit key have integer keys; with strict_types
        // enabled htmlspecialchars() rejects an int, hence the cast.
        return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
    }

    return false;
}
||||||
| 415 | |||||||
/**
 * Wrapper method for getUrlsForPageId().
 * It returns an array of configurations and no urls!
 *
 * An empty array is returned (and $skipMessage is filled) when the uid is not an
 * integer or when checkIfPageShouldBeSkipped() reports a skip message.
 *
 * @param array $pageRow Page record with at least dok-type and uid columns.
 * @param string $skipMessage Receives the reason for skipping, or '' when the page is processed
 * @return array
 * @see getUrlsForPageId()
 */
public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
{
    if (! is_int($pageRow['uid'])) {
        $skipMessage = 'PageUid ' . $pageRow['uid'] . ' was not an integer';
        return [];
    }

    $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
    if ($skipReason !== false) {
        $skipMessage = $skipReason;
        return [];
    }

    $configurations = $this->getUrlsForPageId($pageRow['uid']);
    $skipMessage = '';

    return $configurations;
}
||||||
| 443 | |||||||
/**
 * Creates a list of URLs from input array (and submits them to queue if asked for)
 * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
 *
 * @param array $vv Information about URLs from pageRow to crawl.
 * @param array $pageRow Page row
 * @param int $scheduledTime Unix time to schedule indexing to, typically time()
 * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
 *                       values <= 0 result in no interleave
 * @param bool $submitCrawlUrls If set, submits the URLs to queue
 * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
 * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
 * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
 * @param array $incomingProcInstructions Array of processing instructions
 * @return string List of URLs (meant for display in backend module)
 */
public function urlListFromUrlArray(
    array $vv,
    array $pageRow,
    $scheduledTime,
    $reqMinute,
    $submitCrawlUrls,
    $downloadCrawlUrls,
    array &$duplicateTrack,
    array &$downloadUrls,
    array $incomingProcInstructions
) {
    if (! is_array($vv['URLs'] ?? null)) {
        return 'ERROR - no URL generated';
    }
    $urlLog = [];
    $pageId = (int) $pageRow['uid'];
    $configurationHash = $this->getConfigurationHash($vv);
    $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

    // Not every configuration source defines all sub-configuration keys, so read them defensively.
    $subCfg = (array) ($vv['subCfg'] ?? []);
    $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
    $userGroups = $subCfg['userGroups'] ?? '';

    // The filter does not depend on the individual URL: when it rejects the configuration
    // no URL is listed at all, so it is evaluated once instead of per URL.
    if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
        return '';
    }

    // Seconds between two scheduled requests; guarded against division by zero.
    $secondsBetweenRequests = $reqMinute > 0 ? 60 / $reqMinute : 0;

    $urlService = new UrlService();

    foreach ($vv['URLs'] as $urlQuery) {
        $url = (string) $urlService->getUrlFromPageAndQueryParameters(
            $pageId,
            $urlQuery,
            $subCfg['baseUrl'] ?? null,
            $subCfg['force_ssl'] ?? 0
        );

        // Create key by which to determine unique-ness:
        $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

        if (isset($duplicateTrack[$uKey])) {
            // if the url key is registered just display it and do not resubmit it
            $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
        } else {
            // Scheduled time, rounded down to the full minute:
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsBetweenRequests);
            $schTime = intval($schTime / 60) * 60;
            $formattedDate = BackendUtility::datetime($schTime);
            $this->urlList[] = '[' . $formattedDate . '] ' . $url;
            $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

            // Submit for crawling!
            if ($submitCrawlUrls) {
                // NOTE(review): the un-interleaved $scheduledTime (not $schTime) is passed on,
                // exactly as before - confirm whether that is intended.
                $added = $this->addUrl(
                    $pageId,
                    $url,
                    $subCfg,
                    $scheduledTime,
                    $configurationHash,
                    $skipInnerCheck
                );
                if ($added === false) {
                    $urlList .= ' (URL already existed)';
                }
            } elseif ($downloadCrawlUrls) {
                $downloadUrls[$url] = $url;
            }
            $urlLog[] = $urlList;
        }
        $duplicateTrack[$uKey] = true;
    }

    return implode('<br>', $urlLog);
}
||||||
| 528 | |||||||
/**
 * Returns true if at least one incoming processing instruction is contained in the
 * comma-separated list $piString, or if no incoming processing instructions are given.
 *
 * @param string $piString PI to test
 * @param array $incomingProcInstructions Processing instructions
 * @return bool
 */
public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
{
    // No incoming instructions means "no filtering".
    if ($incomingProcInstructions === []) {
        return true;
    }

    foreach ($incomingProcInstructions as $instruction) {
        if (! GeneralUtility::inList($piString, $instruction)) {
            continue;
        }
        return true;
    }

    return false;
}
||||||
| 549 | |||||||
/**
 * Returns the page TSconfig for the given page id, or - when $this->MP is set - for
 * the id found after the "-" in $this->MP. The hooks registered under
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] may
 * alter the result (it is handed to them by reference as "pageTSConfig").
 */
public function getPageTSconfigForId(int $id): array
{
    if (! $this->MP) {
        $pageTSconfig = BackendUtility::getPagesTSconfig($id);
    } else {
        // TODO: Please check, this makes no sense to split a boolean value.
        // Cast to string so that a non-string value cannot raise a TypeError under
        // strict_types, and tolerate a value without "-" (falls back to id 0).
        $mountPointParts = explode('-', (string) $this->MP);
        $pageTSconfig = BackendUtility::getPagesTSconfig((int) ($mountPointParts[1] ?? 0));
    }

    // Call a hook to alter configuration
    $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
    if (is_array($hooks)) {
        $params = [
            'pageId' => $id,
            'pageTSConfig' => &$pageTSconfig,
        ];
        foreach ($hooks as $userFunc) {
            GeneralUtility::callUserFunction($userFunc, $params, $this);
        }
    }

    return $pageTSconfig;
}
||||||
| 572 | |||||||
/**
 * This method returns an array of configurations, keyed by configuration name.
 * Adds no urls to the queue!
 *
 * Configurations are read from page TSconfig (tx_crawler.crawlerCfg.paramSets.) first
 * and then from the configuration records found up the rootline; a record never
 * overwrites a paramSet of the same name. Finally the "processUrls" hooks may alter
 * the result, which is handed to them by reference as "res".
 */
public function getUrlsForPageId(int $pageId): array
{
    // Get page TSconfig for page ID
    $pageTSconfig = $this->getPageTSconfigForId($pageId);
    $pageIdAsString = (string) $pageId;

    $res = [];

    // Fetch Crawler Configuration from pageTSconfig
    $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
    foreach ($crawlerCfg as $key => $values) {
        if (! is_array($values)) {
            continue;
        }
        $key = str_replace('.', '', (string) $key);
        // Sub configuration for a single configuration string:
        $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
        $subCfg['key'] = $key;

        $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
        if ($procInstrFilter !== '') {
            $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
        }

        // "pidsOnly" is optional; read it defensively instead of indexing a missing key.
        $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

        // process configuration if it is not page-specific or if the specific page is the current page:
        // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
        if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, $pageIdAsString)) {
            continue;
        }

        // Explode, process etc.; the parameter string itself may be absent when only
        // sub-properties were configured.
        $paramParsed = GeneralUtility::explodeUrl2Array((string) ($crawlerCfg[$key] ?? ''));
        $paramExpanded = $this->expandParameters($paramParsed, $pageId);

        // recognize MP value
        $startUrl = '?id=' . $pageId;
        if ($this->MP) {
            $startUrl .= '&MP=' . $this->MP;
        }
        $urls = $this->compileUrls($paramExpanded, [$startUrl]);

        $res[$key] = [
            'subCfg' => $subCfg,
            'paramParsed' => $paramParsed,
            'paramExpanded' => $paramExpanded,
            'origin' => 'pagets',
            'URLs' => $urls,
        ];
    }

    // Get configuration from tx_crawler_configuration records up the rootline
    $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
    foreach ($crawlerConfigurations as $configurationRecord) {
        // check access to the configuration record
        $hasAccess = empty($configurationRecord['begroups'])
            || $this->getBackendUser()->isAdmin()
            || UserService::hasGroupAccess(
                $this->getBackendUser()->user['usergroup_cached_list'],
                $configurationRecord['begroups']
            );
        if (! $hasAccess) {
            continue;
        }

        // Cast so that a NULL column value cannot raise a TypeError under strict_types.
        $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

        // process configuration if it is not page-specific or if the specific page is the current page:
        // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
        if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, $pageIdAsString)) {
            continue;
        }

        $key = $configurationRecord['name'];

        // don't overwrite previously defined paramSets
        if (isset($res[$key])) {
            continue;
        }

        /* @var $TSparserObject TypoScriptParser */
        $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
        $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

        $subCfg = [
            'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
            'procInstrParams.' => $TSparserObject->setup,
            'baseUrl' => $configurationRecord['base_url'],
            'force_ssl' => (int) $configurationRecord['force_ssl'],
            'userGroups' => $configurationRecord['fegroups'],
            'exclude' => $configurationRecord['exclude'],
            'key' => $key,
        ];

        if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
            continue;
        }

        $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
        $paramExpanded = $this->expandParameters($paramParsed, $pageId);
        $urls = $this->compileUrls($paramExpanded, ['?id=' . $pageId]);

        $res[$key] = [
            'subCfg' => $subCfg,
            'paramParsed' => $paramParsed,
            'paramExpanded' => $paramExpanded,
            'URLs' => $urls,
            'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
        ];
    }

    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
        $params = [
            'res' => &$res,
        ];
        GeneralUtility::callUserFunction($func, $params, $this);
    }

    return $res;
}
||||||
| 671 | |||||||
/**
 * Find all configurations of subpages of a page.
 *
 * Collects the paramSet names from the page TSconfig of $rootid, followed by the names
 * of all configuration records stored on a page of the rootline of $rootid or on a
 * page of the tree below $rootid (down to $depth levels).
 *
 * TODO: Write Functional Tests
 */
public function getConfigurationsForBranch(int $rootid, int $depth): array
{
    $configurationNames = [];

    // Names configured via page TSconfig; array values carry a trailing dot in their key.
    $rootTSconfig = $this->getPageTSconfigForId($rootid);
    $paramSets = $rootTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
    foreach ($paramSets as $setName => $setConfiguration) {
        if (is_array($setConfiguration)) {
            $configurationNames[] = substr($setName, -1) === '.' ? substr($setName, 0, -1) : $setName;
        }
    }

    // Page ids of the rootline ...
    $pageIds = [];
    foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
        $pageIds[] = $rootLinePage['uid'];
    }

    // ... and of the subtree the backend user is allowed to see.
    /* @var PageTreeView $pageTree */
    $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
    $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
    if (empty($permsClause)) {
        $pageTree->init('');
    } else {
        $pageTree->init('AND ' . $permsClause);
    }
    $pageTree->getTree($rootid, $depth, '');
    foreach ($pageTree->tree as $treeNode) {
        $pageIds[] = $treeNode['row']['uid'];
    }

    $queryBuilder = $this->getQueryBuilder(ConfigurationRepository::TABLE_NAME);
    $pidConstraint = $queryBuilder->expr()->in(
        'pid',
        $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
    );
    $statement = $queryBuilder
        ->select('name')
        ->from(ConfigurationRepository::TABLE_NAME)
        ->where($pidConstraint)
        ->execute();

    while ($record = $statement->fetch()) {
        $configurationNames[] = $record['name'];
    }

    return $configurationNames;
}
||||||
| 715 | |||||||
/**
 * Tell whether a user may access an item, based on group membership
 * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
 *
 * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
 * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
 * @return bool TRUE if the access list is empty or at least one of the user's group UIDs is part of it
 * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
 * @deprecated
 * @codeCoverageIgnore
 */
public function hasGroupAccess($groupList, $accessList)
{
    // An item without access restriction is available to everybody.
    if (empty($accessList)) {
        return true;
    }

    $accessGranted = false;
    $userGroupUids = GeneralUtility::intExplode(',', $groupList);
    foreach ($userGroupUids as $userGroupUid) {
        if (GeneralUtility::inList($accessList, $userGroupUid)) {
            // The first matching group is sufficient.
            $accessGranted = true;
            break;
        }
    }

    return $accessGranted;
}
||||||
| 739 | |||||||
/**
 * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
 * Syntax of values:
 * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
 * - Configuration is split by "|" and the parts are processed individually and finally added together
 * - For each configuration part:
 *   - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
 *   - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
 *     _ENABLELANG:1 picks only original records without their language overlays
 *     Further optional sub-parts: _RECURSIVE:[depth of the page tree below the lookup page], _PIDFIELD:[field holding the page id, default "pid"],
 *     _FIELD:[field to select, default "uid"; must be "uid" or a column defined in TCA], _WHERE:[additional where clause], _ADDTABLE:[additional "from" part]
 *   - Default: Literal value
 * - After each part the hooks registered in $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called
 *
 * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
 * @param integer $pid Current page ID
 * @return array Array with the same keys, each holding the list of expanded values
 *
 * TODO: Write Functional Tests
 */
public function expandParameters($paramArray, $pid)
{
    // Traverse parameter names:
    foreach ($paramArray as $p => $v) {
        $v = trim($v);

        // Values not encapsulated in square brackets are literal: they become the only value of the parameter.
        if (! (strpos($v, '[') === 0 && substr($v, -1) === ']')) {
            $paramArray[$p] = [$v];
            continue;
        }

        // So, find the value inside brackets and reset the paramArray value as an array.
        $v = substr($v, 1, -1);
        $paramArray[$p] = [];

        // Explode parts and traverse them:
        foreach (explode('|', $v) as $pV) {
            // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
            if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                // Traverse range, add values, but never more than 1000 values per range:
                $runAwayBrake = 1000;
                for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                    $paramArray[$p][] = $a;
                    $runAwayBrake--;
                    if ($runAwayBrake <= 0) {
                        break;
                    }
                }
            } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                // Parse the "key:value" sub-parts. A sub-part without ":" yields a null value
                // (so isset() checks below treat it as not given) instead of an undefined offset.
                $subpartParams = [];
                foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                    [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                    $subpartParams[$pKey] = $pVal;
                }
                $table = $subpartParams['_TABLE'] ?? '';

                // Table exists:
                if (isset($GLOBALS['TCA'][$table])) {
                    $lookUpPid = isset($subpartParams['_PID']) ? (int) $subpartParams['_PID'] : (int) $pid;
                    $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int) $subpartParams['_RECURSIVE'] : 0;
                    $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                    $where = $subpartParams['_WHERE'] ?? '';
                    $addTable = $subpartParams['_ADDTABLE'] ?? '';

                    // _FIELD is optional; an absent or empty value falls back to the uid.
                    $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                    if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                        $queryBuilder = $this->getQueryBuilder($table);

                        if ($recursiveDepth > 0) {
                            /** @var QueryGenerator $queryGenerator */
                            $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                            $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                            $pidArray = GeneralUtility::intExplode(',', $pidList);
                        } else {
                            $pidArray = [(string) $lookUpPid];
                        }

                        // Only deleted records are filtered out; all other default restrictions are removed.
                        $queryBuilder->getRestrictions()
                            ->removeAll()
                            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                        $queryBuilder
                            ->select($fieldName)
                            ->from($table)
                            ->where(
                                $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                $where
                            );

                        if (! empty($addTable)) {
                            // TODO: Check if this works as intended!
                            $queryBuilder->add('from', $addTable);
                        }

                        // Not every table is translatable, so the ctrl setting may be missing.
                        $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? null;
                        if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                            $queryBuilder->andWhere(
                                $queryBuilder->expr()->lte(
                                    $transOrigPointerField,
                                    0
                                )
                            );
                        }

                        $statement = $queryBuilder->execute();

                        // Use the selected field value as array key to collect each value only once.
                        $rows = [];
                        while ($row = $statement->fetch()) {
                            $rows[$row[$fieldName]] = $row;
                        }

                        $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                    }
                }
            } else {
                // Just add value:
                $paramArray[$p][] = $pV;
            }

            // Hook for processing own expandParameters place holder.
            // The registry entry does not exist unless an extension registered a hook.
            $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
            if (is_array($expandHooks)) {
                $_params = [
                    'pObj' => &$this,
                    'paramArray' => &$paramArray,
                    'currentKey' => $p,
                    'currentValue' => $pV,
                    'pid' => $pid,
                ];
                foreach ($expandHooks as $_funcRef) {
                    GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                }
            }
        }

        // Make unique set of values and sort array by key:
        $paramArray[$p] = array_unique($paramArray[$p]);
        ksort($paramArray);
    }

    return $paramArray;
}
||||||
| 887 | |||||||
/**
 * Build the URLs for a parameter array (output of expandParameters()).
 * Every URL is combined with every value of every GET var, so the number of URLs is the
 * product of the value counts, capped per recursion level by getMaximumUrlsToCompile().
 *
 * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
 * @param array $urls URLs accumulated in this array (for recursion)
 * @return array
 */
public function compileUrls($paramArray, array $urls)
{
    // Recursion ends once all GET vars have been applied.
    if (empty($paramArray)) {
        return $urls;
    }

    // Take the first GET var and its values off the array; the remaining ones are handled recursively.
    $getVarName = key($paramArray);
    $getVarValues = array_shift($paramArray);

    $expandedUrls = [];
    foreach ($urls as $baseUrl) {
        foreach ($getVarValues as $rawValue) {
            if (count($expandedUrls) >= $this->getMaximumUrlsToCompile()) {
                continue;
            }

            // An empty value leaves the URL untouched instead of adding "&name=".
            $value = (string) $rawValue;
            $queryPart = '';
            if ($value !== '') {
                $queryPart = '&' . rawurlencode($getVarName) . '=' . rawurlencode($value);
            }
            $expandedUrls[] = $baseUrl . $queryPart;
        }
    }

    return $this->compileUrls($paramArray, $expandedUrls);
}
||||||
| 915 | |||||||
| 916 | /************************************ |
||||||
| 917 | * |
||||||
| 918 | * Crawler log |
||||||
| 919 | * |
||||||
| 920 | ************************************/ |
||||||
| 921 | |||||||
/**
 * Return array of records from crawler queue for input page ID
 *
 * @param integer $id Page ID for which to look up log entries.
 * @param QueueFilter $queueFilter Compared against "pending" (only entries with exec_time = 0) and "finished" (only entries with exec_time > 0); any other value selects all entries
 * @param boolean $doFlush If TRUE, the queue repository is asked to flush the queue for $queueFilter before the entries are selected
 * @param boolean $doFullFlush Not used by this method
 * @param integer $itemsPerPage Limit the amount of entries per page default is 10; a value <= 0 disables the limit
 * @return array Queue records of the page, ordered by "scheduled" descending
 *
 * @deprecated
 */
public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
{
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
    $queryBuilder
        ->select('*')
        ->from($this->tableName)
        ->where(
            $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
        )
        ->orderBy('scheduled', 'DESC');

    // Narrow the selection down by execution state; the switch compares the filter object loosely with the state names.
    switch ($queueFilter) {
        case 'pending':
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            break;
        case 'finished':
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            break;
    }

    // The flush happens before the select is executed, so the result reflects the flushed queue.
    if ($doFlush) {
        $this->queueRepository->flushQueue($queueFilter);
    }
    if ($itemsPerPage > 0) {
        $queryBuilder
            ->setMaxResults((int) $itemsPerPage);
    }

    return $queryBuilder->execute()->fetchAll();
}
||||||
| 970 | |||||||
/**
 * Return array of records from crawler queue for input set ID
 *
 * @param int $set_id Set ID for which to look up log entries.
 * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
 * @param bool $doFlush If TRUE, the entries are flushed via flushQueue() instead of being selected; an empty array is returned then
 * @param bool $doFullFlush Only relevant together with $doFlush: if TRUE, flushQueue() is called with an empty condition
 * @param int $itemsPerPage Limit the amount of entries per page default is 10
 * @return array
 *
 * @deprecated
 */
public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
{
    // Select statement for the entries of the set, newest schedule first.
    $selectQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
    $selectQuery
        ->select('*')
        ->from($this->tableName)
        ->where(
            $selectQuery->expr()->eq('set_id', $selectQuery->createNamedParameter($set_id, PDO::PARAM_INT))
        )
        ->orderBy('scheduled', 'DESC');

    // The flush condition is collected in an AND-composite in parallel to the select constraints,
    // so that all of its parts end up combined with AND.
    $expressions = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable($this->tableName)
        ->getExpressionBuilder();
    $flushCondition = $expressions->andX();

    $addWhere = '';
    if ($filter === 'pending') {
        $selectQuery->andWhere($selectQuery->expr()->eq('exec_time', 0));
        $addWhere = $flushCondition->add($expressions->eq('exec_time', 0));
    } elseif ($filter === 'finished') {
        $selectQuery->andWhere($selectQuery->expr()->gt('exec_time', 0));
        $addWhere = $flushCondition->add($expressions->gt('exec_time', 0));
    }

    if ($doFlush) {
        // Flushing replaces selecting: restrict the flush to this set unless a full flush is requested.
        $addWhere = $flushCondition->add($expressions->eq('set_id', $set_id));
        $this->flushQueue($doFullFlush ? '' : $addWhere);

        return [];
    }

    if ($itemsPerPage > 0) {
        $selectQuery->setMaxResults($itemsPerPage);
    }

    return $selectQuery->execute()->fetchAll();
}
||||||
| 1023 | |||||||
/**
 * Add a call back entry to the queue (called from hooks typically, see indexed search class "class.crawler.php")
 *
 * @param integer $setId Set ID
 * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
 * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
 * @param integer $page_id Page ID to attach it to
 * @param integer $schedule Time at which to activate; 0 means the current time
 */
public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
{
    // The call back reference travels inside the JSON encoded parameters of the queue entry.
    $callBackParameters = is_array($params) ? $params : [];
    $callBackParameters['_CALLBACKOBJ'] = $callBack;

    $connection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable(QueueRepository::TABLE_NAME);

    $scheduledAt = (int) $schedule;
    if (! $scheduledAt) {
        $scheduledAt = $this->getCurrentTime();
    }

    $queueEntry = [
        'page_id' => (int) $page_id,
        'parameters' => json_encode($callBackParameters),
        'scheduled' => $scheduledAt,
        'exec_time' => 0,
        'set_id' => (int) $setId,
        'result_data' => '',
    ];
    $connection->insert(QueueRepository::TABLE_NAME, $queueEntry);
}
||||||
| 1053 | |||||||
| 1054 | /************************************ |
||||||
| 1055 | * |
||||||
| 1056 | * URL setting |
||||||
| 1057 | * |
||||||
| 1058 | ************************************/ |
||||||
| 1059 | |||||||
/**
 * Setting a URL for crawling:
 *
 * Builds the queue record for the URL and either registers it internally only
 * ($this->registerQueueEntriesInternallyOnly) or inserts it into the queue table, unless an
 * equal entry is reported by the queue repository. A signal is emitted for both outcomes
 * of the database path (URL added / duplicate found).
 *
 * @param integer $id Page ID
 * @param string $url Complete URL
 * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter" and "procInstrParams." are optional
 * @param integer $tstamp Scheduled-time
 * @param string $configurationHash (optional) configuration hash
 * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
 * @return bool TRUE if a new record was inserted into the queue table
 */
public function addUrl(
    $id,
    $url,
    array $subCfg,
    $tstamp,
    $configurationHash = '',
    $skipInnerDuplicationCheck = false
) {
    $urlAdded = false;
    $rows = [];

    // Creating parameters:
    $parameters = [
        'url' => $url,
    ];

    // fe user group simulation (the key is missing when the configuration defines no groups):
    $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
    if ($uGs) {
        $parameters['feUserGroupList'] = $uGs;
    }

    // Setting processing instructions
    $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string) ($subCfg['procInstrFilter'] ?? ''));
    if (is_array($subCfg['procInstrParams.'] ?? null)) {
        $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
    }

    // Compile value array:
    $parameters_serialized = json_encode($parameters);
    $fieldArray = [
        'page_id' => (int) $id,
        'parameters' => $parameters_serialized,
        'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
        'configuration_hash' => $configurationHash,
        'scheduled' => $tstamp,
        'exec_time' => 0,
        'set_id' => (int) $this->setID,
        'result_data' => '',
        'configuration' => $subCfg['key'],
    ];

    if ($this->registerQueueEntriesInternallyOnly) {
        // The entries will only be registered and not stored to the database
        $this->queueEntries[] = $fieldArray;

        return $urlAdded;
    }

    if (! $skipInnerDuplicationCheck) {
        // Check if there is already an equal entry
        $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
            (bool) $this->extensionSettings['enableTimeslot'],
            $tstamp,
            $this->getCurrentTime(),
            $fieldArray['page_id'],
            $fieldArray['parameters_hash']
        );
    }

    if (empty($rows)) {
        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME);
        $connectionForCrawlerQueue->insert(
            QueueRepository::TABLE_NAME,
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId(QueueRepository::TABLE_NAME, 'qid');
        $rows[] = $uid;
        $urlAdded = true;

        $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
            $signalPayload
        );
    } else {
        $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
            $signalPayload
        );
    }

    return $urlAdded;
}
||||||
| 1156 | |||||||
/**
 * Provides the current system time as returned by time().
 *
 * The queue code in this class obtains "now" through this method instead of calling time() directly.
 *
 * @return int Unix timestamp
 */
public function getCurrentTime()
{
    $now = time();

    return $now;
}
||||||
| 1166 | |||||||
| 1167 | /************************************ |
||||||
| 1168 | * |
||||||
| 1169 | * URL reading |
||||||
| 1170 | * |
||||||
| 1171 | ************************************/ |
||||||
| 1172 | |||||||
/**
 * Read URL for single queue entry
 *
 * Loads the queue record, marks it as being processed by setting exec_time, executes it
 * through the queue executor and finally stores the JSON encoded result in result_data.
 * Signals are emitted before the execution and before the result is stored.
 *
 * @param integer $queueId
 * @param boolean $force If set, will process even if exec_time has been set!
 *
 * @return int|null NULL if no matching queue record was found; otherwise 0, or
 *                  self::CLI_STATUS_POLLABLE_PROCESSED if a registered pollable instruction reported success
 */
public function readUrl($queueId, $force = false)
{
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
    $ret = 0;
    $this->logger->debug('crawler-readurl start ' . microtime(true));

    $queryBuilder
        ->select('*')
        ->from(QueueRepository::TABLE_NAME)
        ->where(
            $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
        );
    if (! $force) {
        // Without force only entries that are scheduled for a process and not yet executed are picked up.
        $queryBuilder
            ->andWhere('exec_time = 0')
            ->andWhere('process_scheduled > 0');
    }
    $queueRec = $queryBuilder->execute()->fetch();

    if (! is_array($queueRec)) {
        return null;
    }

    SignalSlotUtility::emitSignal(
        self::class,
        SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
        [$queueId, &$queueRec]
    );

    // Set exec_time to lock record:
    $field_array = ['exec_time' => $this->getCurrentTime()];

    if (isset($this->processID)) {
        // If multiprocessing is used we need to store the id of the process which has handled this entry
        $field_array['process_id_completed'] = $this->processID;
    }

    GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
        ->update(
            QueueRepository::TABLE_NAME,
            $field_array,
            ['qid' => (int) $queueId]
        );

    $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

    // The result is only inspected when it is an array carrying content; anything else is stored as is.
    if (is_array($result) && isset($result['content'])) {
        /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
        $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
        $resultData = $jsonCompatibilityConverter->convert($result['content']);

        // Atm there's no need to point to specific pollable extensions.
        // Both the registry and the processing instructions of the result are optional.
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;

        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // Only check the success value if the instruction is running;
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }
    }

    // Set result in log which also denotes the end of the processing of this entry.
    $field_array = ['result_data' => json_encode($result)];

    SignalSlotUtility::emitSignal(
        self::class,
        SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
        [$queueId, &$field_array]
    );

    GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
        ->update(
            QueueRepository::TABLE_NAME,
            $field_array,
            ['qid' => (int) $queueId]
        );

    $this->logger->debug('crawler-readurl stop ' . microtime(true));
    return $ret;
}
||||||
| 1270 | |||||||
/**
 * Read URL for a not-yet-inserted log entry.
 *
 * The entry is inserted into the queue table with exec_time set (which locks
 * the record), executed through the queue executor, and finally updated with
 * the JSON-encoded result, which also marks the end of its processing.
 *
 * @param array $field_array Queue field array
 *
 * @return array|bool|mixed|string The value returned by the queue executor
 */
public function readUrlFromArray($field_array)
{
    // Setting exec_time locks the record.
    $field_array['exec_time'] = $this->getCurrentTime();

    $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable(QueueRepository::TABLE_NAME);
    $queueConnection->insert(QueueRepository::TABLE_NAME, $field_array);

    // The executor receives the entry including its freshly assigned qid.
    $queueId = $queueConnection->lastInsertId(QueueRepository::TABLE_NAME, 'qid');
    $field_array['qid'] = $queueId;
    $result = $this->queueExecutor->executeQueueItem($field_array, $this);

    // Storing the result in the log also denotes the end of the processing of this entry.
    $resultFields = ['result_data' => json_encode($result)];

    // The fields are handed over by reference so the values written below can be adjusted.
    SignalSlotUtility::emitSignal(
        self::class,
        SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
        [$queueId, &$resultFields]
    );

    $queueConnection->update(
        QueueRepository::TABLE_NAME,
        $resultFields,
        ['qid' => $queueId]
    );

    return $result;
}
||||||
| 1307 | |||||||
| 1308 | /***************************** |
||||||
| 1309 | * |
||||||
| 1310 | * Compiling URLs to crawl - tools |
||||||
| 1311 | * |
||||||
| 1312 | *****************************/ |
||||||
| 1313 | |||||||
/**
 * Builds the page tree below the given page and compiles the table rows
 * (one group of rows per page) describing the URLs to crawl.
 *
 * The incoming arguments are stored on the controller first, because
 * drawURLs_addRowsForPage() reads them from there.
 *
 * @param int $id Root page id to start from.
 * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
 * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
 * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
 * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
 * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
 * @param array $incomingProcInstructions Array of processing instructions
 * @param array $configurationSelection Array of configuration keys
 * @return string Concatenated HTML table rows for all pages of the tree
 */
public function getPageTreeAndUrls(
    $id,
    $depth,
    $scheduledTime,
    $reqMinute,
    $submitCrawlUrls,
    $downloadCrawlUrls,
    array $incomingProcInstructions,
    array $configurationSelection
) {
    $this->scheduledTime = $scheduledTime;
    $this->reqMinute = $reqMinute;
    $this->submitCrawlUrls = $submitCrawlUrls;
    $this->downloadCrawlUrls = $downloadCrawlUrls;
    $this->incomingProcInstructions = $incomingProcInstructions;
    $this->incomingConfigurationSelection = $configurationSelection;

    // Start every run with empty tracking state.
    $this->duplicateTrack = [];
    $this->downloadUrls = [];

    // Drawing tree:
    /* @var PageTreeView $tree */
    $tree = GeneralUtility::makeInstance(PageTreeView::class);
    $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
    $tree->init('AND ' . $perms_clause);

    $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
    if (is_array($pageInfo)) {
        // Set root row:
        $tree->tree[] = [
            'row' => $pageInfo,
            'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
        ];
    }

    // Get branch beneath:
    if ($depth) {
        $tree->getTree($id, $depth, '');
    }

    // Traverse page tree:
    $code = '';

    foreach ($tree->tree as $data) {
        $this->MP = false;

        // Recognize mount points. The doktype is normalised to int before the
        // strict comparison, so a row carrying the value as a string still
        // matches the integer constant.
        if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
            // NOTE(review): the result is accessed as a list of rows ([0]); if
            // getPage() returns a single page row this index does not exist - confirm.
            $mountpage = $this->pageRepository->getPage($data['row']['uid']);

            // fetch mounted pages
            $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

            $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
            $mountTree->init('AND ' . $perms_clause);
            $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

            foreach ($mountTree->tree as $mountData) {
                $code .= $this->drawURLs_addRowsForPage(
                    $mountData['row'],
                    $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                );
            }

            // replace page when mount_pid_ol is enabled
            if ($mountpage[0]['mount_pid_ol']) {
                $data['row']['uid'] = $mountpage[0]['mount_pid'];
            } else {
                // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                $this->MP = false;
            }
        }

        $code .= $this->drawURLs_addRowsForPage(
            $data['row'],
            $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
        );
    }

    return $code;
}
||||||
| 1406 | |||||||
/**
 * Expands exclude string
 *
 * The exclude string is a comma separated list of parts of the form "pid" or
 * "pid+depth". A part without "+" excludes only the page itself; a part with
 * "+" but an empty depth is expanded with depth 99. Results are cached per
 * exclude string in a static variable.
 *
 * @param string $excludeString Exclude string
 * @return array List of unique page ids (integers) covered by the exclude string
 */
public function expandExcludeString($excludeString)
{
    // internal static caches;
    static $expandedExcludeStringCache;
    static $treeCache;

    $cacheKey = (string) $excludeString;

    // isset() instead of empty(): an empty result is a valid cache hit too.
    if (isset($expandedExcludeStringCache[$cacheKey])) {
        return $expandedExcludeStringCache[$cacheKey];
    }

    $pidList = [];

    if (! empty($excludeString)) {
        /** @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

        foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
            $segments = GeneralUtility::trimExplode('+', $excludePart);
            $pid = (int) $segments[0];
            // A part without "+" has no second segment; do not read a missing offset.
            $depth = $segments[1] ?? '';

            // default is "page only" = "depth=0"
            if (empty($depth)) {
                $depth = strpos($excludePart, '+') !== false ? 99 : 0;
            }
            $depth = (int) $depth;

            $pidList[] = $pid;

            if ($depth > 0) {
                if (empty($treeCache[$pid][$depth])) {
                    $tree->reset();
                    $tree->getTree($pid, $depth);
                    $treeCache[$pid][$depth] = $tree->tree;
                }

                foreach ($treeCache[$pid][$depth] as $data) {
                    $pidList[] = (int) $data['row']['uid'];
                }
            }
        }
    }

    $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

    return $expandedExcludeStringCache[$cacheKey];
}
||||||
| 1458 | |||||||
/**
 * Create the rows for display of the page tree
 * For each page a number of rows are shown displaying GET variable configuration
 *
 * @param array $pageRow Page record the rows are built for
 * @param string $pageTitle Markup for the title cell; inserted as-is (not escaped here)
 * @return string HTML table rows; a single "No entries" row when no configuration applies
 */
public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
{
    // presumably filled by getUrlsForPageRow() via reference - TODO confirm
    $skipMessage = '';

    // Get list of configurations
    $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
    $configurations = ConfigurationService::removeDisallowedConfigurations(
        $this->incomingConfigurationSelection,
        $configurations
    );

    if (empty($configurations)) {
        $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

        // Compile row:
        return '
            <tr>
                <td>' . $pageTitle . '</td>
                <td colspan="6"><em>No entries</em>' . $message . '</td>
            </tr>';
    }

    $content = '';
    $configurationCount = count($configurations);
    $isFirstRow = true;

    // Traverse parameter combinations:
    foreach ($configurations as $confKey => $confArray) {
        // Title column: only the first row carries it, spanning all rows of the page.
        $titleClm = $isFirstRow
            ? '<td rowspan="' . $configurationCount . '">' . $pageTitle . '</td>'
            : '';
        $isFirstRow = false;

        // Array keys may be integers; htmlspecialchars() needs a string under strict_types.
        $confKeyLabel = htmlspecialchars((string) $confKey);
        $subCfg = $confArray['subCfg'] ?? [];

        // expandExcludeString() returns integers only, so the uid is normalised
        // to int before the strict membership test.
        $excludedPageIds = $this->expandExcludeString($subCfg['exclude'] ?? '');
        if (in_array((int) $pageRow['uid'], $excludedPageIds, true)) {
            $content .= '<tr>
                ' . $titleClm . '
                <td>' . $confKeyLabel . '</td>
                <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
            </tr>';
            continue;
        }

        // URL list:
        $urlList = $this->urlListFromUrlArray(
            $confArray,
            $pageRow,
            $this->scheduledTime,
            $this->reqMinute,
            $this->submitCrawlUrls,
            $this->downloadCrawlUrls,
            $this->duplicateTrack,
            $this->downloadUrls,
            // if empty the urls won't be filtered by processing instructions
            $this->incomingProcInstructions
        );

        // Expanded parameters:
        $paramExpanded = '';
        $calcAccu = [];
        $calcRes = 1;
        foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
            $valueCount = count($gVal);
            $paramExpanded .= '
                <tr>
                    <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                    '(' . $valueCount . ')' .
                    '</td>
                    <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                </tr>
            ';
            $calcRes *= $valueCount;
            $calcAccu[] = $valueCount;
        }
        $paramExpanded = '<table>' . $paramExpanded . '</table>';
        $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

        // Options
        $optionValues = '';
        if (! empty($subCfg['userGroups'])) {
            $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
        }
        if (! empty($subCfg['procInstrFilter'])) {
            $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
        }

        // One "&name=value" pair per line for display.
        $parsedParameters = GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? []);
        $parsedParameters = rawurldecode(trim(str_replace('&', chr(10) . '&', $parsedParameters)));

        // Compile row:
        $content .= '
            <tr>
                ' . $titleClm . '
                <td>' . $confKeyLabel . '</td>
                <td>' . nl2br(htmlspecialchars($parsedParameters)) . '</td>
                <td>' . $paramExpanded . '</td>
                <td nowrap="nowrap">' . $urlList . '</td>
                <td nowrap="nowrap">' . $optionValues . '</td>
                <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
            </tr>';
    }

    return $content;
}
||||||
| 1562 | |||||||
| 1563 | /***************************** |
||||||
| 1564 | * |
||||||
| 1565 | * CLI functions |
||||||
| 1566 | * |
||||||
| 1567 | *****************************/ |
||||||
| 1568 | |||||||
/**
 * Running the functionality of the CLI (crawling URLs from queue)
 *
 * @param int $countInARun Passed to the queue repository when fetching the records to be crawled
 * @param int $sleepTime Microseconds to pause after each processed entry
 * @param int $sleepAfterFinish Seconds to pause once the fetched entries were handled
 * @return int Bitmask built from the CLI_STATUS_* constants
 */
public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
{
    $status = 0;
    $processedItems = 0;

    // First, run hooks:
    $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];
    foreach ($hookClassNames as $hookClassName) {
        trigger_error(
            'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
            E_USER_DEPRECATED
        );
        $hook = GeneralUtility::makeInstance($hookClassName);
        if (is_object($hook)) {
            $hook->crawler_init($this);
        }
    }

    // Clean up the queue
    $this->queueRepository->cleanupQueue();

    // Select entries:
    $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);
    if (empty($queueRows)) {
        // Nothing was processed, so no status flag is set.
        return $status;
    }

    $queueIds = [];
    foreach ($queueRows as $queueRow) {
        $queueIds[] = $queueRow['qid'];
    }

    $processId = $this->CLI_buildProcessId();

    // save the number of assigned queue entries to determine how many have been processed later
    $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
    $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

    // Not every selected entry could be assigned to this process: abort the run.
    if ($assignedCount !== count($queueIds)) {
        return $status | self::CLI_STATUS_ABORTED;
    }

    foreach ($queueRows as $queueRow) {
        $status |= $this->readUrl($queueRow['qid']);
        ++$processedItems;

        // Just to relax the system
        usleep($sleepTime);

        // if during the start and the current read url the cli has been disable we need to return from the function
        // mark the process NOT as ended.
        if ($this->crawler->isDisabled()) {
            return $status | self::CLI_STATUS_ABORTED;
        }

        if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
            $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
            $status |= self::CLI_STATUS_ABORTED;
            // possible timeout
            break;
        }
    }

    sleep($sleepAfterFinish);

    if ($processedItems > 0) {
        $status |= self::CLI_STATUS_PROCESSED;
    }

    return $status;
}
||||||
| 1642 | |||||||
/**
 * Activate hooks
 *
 * Instantiates every class registered under the crawler "cli_hooks"
 * configuration and calls crawler_init() on it with this controller.
 *
 * @deprecated
 */
public function CLI_runHooks(): void
{
    $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

    foreach ($hookClassNames as $hookClassName) {
        $hook = GeneralUtility::makeInstance($hookClassName);
        if (! is_object($hook)) {
            continue;
        }
        $hook->crawler_init($this);
    }
}
||||||
| 1656 | |||||||
/**
 * Try to acquire a new process with the given id
 * also performs some auto-cleanup for orphan processes
 *
 * Active processes whose TTL lies before the current time are treated as
 * orphans: they are marked as not active and their queue entries are released.
 *
 * @param string $id identification string for the process
 * @return bool true when the process was added, false when the system process id
 *              is unavailable or the process limit is reached
 * @todo preemption might not be the most elegant way to clean up
 */
public function CLI_checkAndAcquireNewProcess($id)
{
    $systemProcessId = getmypid();
    if (! $systemProcessId) {
        return false;
    }

    $activeProcesses = $this->processRepository->findAllActive();
    $currentTime = $this->getCurrentTime();

    $runningProcessCount = 0;
    $orphanProcessIds = [];

    /** @var Process $process */
    foreach ($activeProcesses as $process) {
        if ($process->getTtl() < $currentTime) {
            $orphanProcessIds[] = $process->getProcessId();
            continue;
        }
        ++$runningProcessCount;
    }

    // if there are less than allowed active processes then add a new one
    $acquired = $runningProcessCount < (int) $this->extensionSettings['processLimit'];
    if ($acquired) {
        $this->processRepository->addProcess($id, $systemProcessId);
    }

    // The cleanup runs regardless of whether the new process was acquired.
    $this->processRepository->deleteProcessesMarkedAsDeleted();
    $this->processRepository->markRequestedProcessesAsNotActive($orphanProcessIds);
    $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($orphanProcessIds);

    return $acquired;
}
||||||
| 1701 | |||||||
/**
 * Release a process and the required resources
 *
 * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
 * @return bool false when an empty array was passed (nothing to release), true otherwise
 * @deprecated
 */
public function CLI_releaseProcesses($releaseIds)
{
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

    // A single id is wrapped so the rest of the method always works on a list.
    $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

    if ($processIds === []) {
        // nothing to release
        return false;
    }

    // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
    // this ensures that a single process can't mess up the entire process table

    // mark all processes as deleted which have no "waiting" queue-entires and which are not active

    // ReleaseQueueEntries: detach queue entries from processes that are no longer active.
    $queryBuilder
        ->update(QueueRepository::TABLE_NAME, 'q')
        ->where(
            'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
        )
        ->set('q.process_scheduled', 0)
        ->set('q.process_id', '')
        ->execute();

    // The same builder is reused below, so the SET part of the first statement is dropped.
    // FIXME: Not entirely sure that this is equivalent to the previous version
    $queryBuilder->resetQueryPart('set');

    // ReleaseProcessEntries
    $queryBuilder
        ->update(ProcessRepository::TABLE_NAME)
        ->where(
            $queryBuilder->expr()->eq('active', 0),
            'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
        )
        ->set('system_process_id', 0)
        ->execute();

    $this->processRepository->markRequestedProcessesAsNotActive($processIds);
    $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

    return true;
}
||||||
| 1755 | |||||||
/**
 * Create a unique Id for the current process
 *
 * The id is derived from the current microtime on first use and stored on
 * the controller, so later calls return the same value.
 *
 * @return string the ID
 */
public function CLI_buildProcessId()
{
    if ($this->processID) {
        return $this->processID;
    }

    $this->processID = GeneralUtility::shortMD5(microtime(true));

    return $this->processID;
}
||||||
| 1768 | |||||||
/**
 * Prints a message to the stdout (only if debug-mode is enabled)
 *
 * Debug mode is read from the "processDebug" extension setting.
 *
 * @param string $msg the message
 * @deprecated
 * @codeCoverageIgnore
 */
public function CLI_debug($msg): void
{
    $debugEnabled = (bool) (int) $this->extensionSettings['processDebug'];
    if (! $debugEnabled) {
        return;
    }

    echo $msg . "\n";
    flush();
}
||||||
| 1783 | |||||||
/**
 * Cleans up entries that stayed for too long in the queue. These are:
 * - processed entries older than the "cleanUpProcessedAge" setting (in days)
 * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
 *
 * @deprecated
 */
public function cleanUpOldQueueEntries(): void
{
    // 24*60*60 Seconds in 24 hours
    $secondsPerDay = 86400;
    $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
    $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

    $now = time();
    $processedBefore = $now - $processedAgeInSeconds;
    $scheduledBefore = $now - $scheduledAgeInSeconds;

    $this->flushQueue(
        '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
    );
}
||||||
| 1801 | |||||||
/**
 * Removes queue entries
 *
 * Before deleting, the matching entries are collected per set_id and a
 * SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted for each set.
 *
 * @param string $where SQL related filter for the entries which should be removed;
 *                      an empty filter matches every entry
 *
 * @deprecated
 */
protected function flushQueue($where = ''): void
{
    // The filter is inserted into the statements as raw SQL.
    $realWhere = (string) $where !== '' ? $where : '1=1';

    $queryBuilder = $this->getQueryBuilder($this->tableName);

    $groups = $queryBuilder
        ->selectLiteral('DISTINCT set_id')
        ->from($this->tableName)
        ->where($realWhere)
        ->execute()
        ->fetchAll();

    if (is_array($groups)) {
        foreach ($groups as $group) {
            $setConstraint = $queryBuilder->expr()->eq('set_id', $group['set_id']);
            $subSet = $queryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where($realWhere, $setConstraint)
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }
    }

    $queryBuilder
        ->delete($this->tableName)
        ->where($realWhere)
        ->execute();
}
||||||
| 1847 | |||||||
/**
 * This method determines duplicates for a queue entry with the same parameters and this timestamp.
 * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
 * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
 *
 * @param int $tstamp Scheduled time of the entry to check
 * @param array $fieldArray Queue fields; 'page_id' and 'parameters_hash' are read
 *
 * @return array List of qid values of the duplicate entries
 * @deprecated
 */
protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
{
    $currentTime = $this->getCurrentTime();

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
    $queryBuilder
        ->select('qid')
        ->from(QueueRepository::TABLE_NAME);

    if ($tstamp <= $currentTime) {
        // if this entry is scheduled with "now"
        $dueConstraint = $queryBuilder->expr()->lte('scheduled', $currentTime);

        if ($this->extensionSettings['enableTimeslot']) {
            // With timeslots enabled, entries within +/- 100 seconds around now match as well.
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $queryBuilder
                ->where('scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . '')
                ->orWhere($dueConstraint);
        } else {
            $queryBuilder->where($dueConstraint);
        }
    } elseif ($tstamp > $currentTime) {
        // entry with a timestamp in the future need to have the same schedule time
        $queryBuilder->where(
            $queryBuilder->expr()->eq('scheduled', $tstamp)
        );
    }

    // Only entries that are neither executed nor assigned to a process, for the same page and parameters.
    $pageIdConstraint = $queryBuilder->expr()->eq(
        'page_id',
        $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT)
    );
    $parametersHashConstraint = $queryBuilder->expr()->eq(
        'parameters_hash',
        $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR)
    );

    $queryBuilder
        ->andWhere('NOT exec_time')
        ->andWhere('NOT process_id')
        ->andWhere($pageIdConstraint)
        ->andWhere($parametersHashConstraint);

    $statement = $queryBuilder->execute();

    $duplicateQueueIds = [];
    while ($row = $statement->fetch()) {
        $duplicateQueueIds[] = $row['qid'];
    }

    return $duplicateQueueIds;
}
||||||
| 1909 | |||||||
/**
 * Computes the md5 hash of the serialized configuration array.
 *
 * The 'paramExpanded' and 'URLs' entries are dropped before hashing, so
 * they never influence the resulting hash.
 *
 * @return string
 */
protected function getConfigurationHash(array $configuration)
{
    // $configuration is a by-value copy, so the caller's array stays untouched.
    unset($configuration['paramExpanded'], $configuration['URLs']);

    $serializedConfiguration = serialize($configuration);

    return md5($serializedConfiguration);
}
||||||
| 1921 | |||||||
/**
 * Builds the URL for a page and query string by delegating to
 * UrlService::getUrlFromPageAndQueryParameters(); all arguments are
 * passed through unchanged.
 *
 * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
 * @throws SiteNotFoundException
 * @throws InvalidRouteArgumentsException
 *
 * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
 * @codeCoverageIgnore
 */
protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
{
    return (new UrlService())->getUrlFromPageAndQueryParameters(
        $pageId,
        $queryString,
        $alternativeBaseUrl,
        $httpsOrHttp
    );
}
||||||
| 1938 | |||||||
/**
 * Returns the given array with the values at index 1 and index 2 exchanged
 * when the value at index 1 is larger than the value at index 2.
 * All other entries are returned as they were passed in.
 */
protected function swapIfFirstIsLargerThanSecond(array $reg): array
{
    if ($reg[1] > $reg[2]) {
        // The right-hand side is evaluated first, so this is a plain swap.
        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
    }

    return $reg;
}
||||||
| 1950 | |||||||
/**
 * Returns the value currently stored in the maximumUrlsToCompile property.
 */
private function getMaximumUrlsToCompile(): int
{
    return $this->maximumUrlsToCompile;
}
||||||
| 1955 | |||||||
/**
 * Returns the backend user, reading it from $GLOBALS['BE_USER'] on first
 * access and keeping it in the backendUser property afterwards.
 *
 * @return BackendUserAuthentication
 */
private function getBackendUser()
{
    // Always run the backend authentication bootstrap first, so that the
    // _cli_ user is loaded.
    Bootstrap::initializeBackendAuthentication();

    if ($this->backendUser !== null) {
        return $this->backendUser;
    }

    $this->backendUser = $GLOBALS['BE_USER'];

    return $this->backendUser;
}
||||||
| 1968 | |||||||
/**
 * Obtains a query builder for the given table from the ConnectionPool.
 *
 * @return QueryBuilder
 */
private function getQueryBuilder(string $table)
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    return $connectionPool->getQueryBuilderForTable($table);
}
||||||
| 1978 | } |
||||||
| 1979 |
This function has been deprecated. The supplier of the function has supplied an explanatory message.
The explanatory message should give you some clue as to whether and when the function will be removed and what other function to use instead.