Complex classes like CrawlerController often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use CrawlerController, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
68 | class CrawlerController |
||
69 | { |
||
70 | const CLI_STATUS_NOTHING_PROCCESSED = 0; |
||
71 | const CLI_STATUS_REMAIN = 1; //queue not empty |
||
72 | const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed |
||
73 | const CLI_STATUS_ABORTED = 4; //instance didn't finish |
||
74 | const CLI_STATUS_POLLABLE_PROCESSED = 8; |
||
75 | |||
76 | /** |
||
77 | * @var integer |
||
78 | */ |
||
79 | public $setID = 0; |
||
80 | |||
81 | /** |
||
82 | * @var string |
||
83 | */ |
||
84 | public $processID = ''; |
||
85 | |||
86 | /** |
||
87 | * One hour is max stalled time for the CLI |
||
88 | * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started |
||
89 | * |
||
90 | * @var integer |
||
91 | */ |
||
92 | public $max_CLI_exec_time = 3600; |
||
93 | |||
94 | /** |
||
95 | * @var array |
||
96 | */ |
||
97 | public $duplicateTrack = []; |
||
98 | |||
99 | /** |
||
100 | * @var array |
||
101 | */ |
||
102 | public $downloadUrls = []; |
||
103 | |||
104 | /** |
||
105 | * @var array |
||
106 | */ |
||
107 | public $incomingProcInstructions = []; |
||
108 | |||
109 | /** |
||
110 | * @var array |
||
111 | */ |
||
112 | public $incomingConfigurationSelection = []; |
||
113 | |||
114 | /** |
||
115 | * @var bool |
||
116 | */ |
||
117 | public $registerQueueEntriesInternallyOnly = false; |
||
118 | |||
119 | /** |
||
120 | * @var array |
||
121 | */ |
||
122 | public $queueEntries = []; |
||
123 | |||
124 | /** |
||
125 | * @var array |
||
126 | */ |
||
127 | public $urlList = []; |
||
128 | |||
129 | /** |
||
130 | * @var boolean |
||
131 | */ |
||
132 | public $debugMode = false; |
||
133 | |||
134 | /** |
||
135 | * @var array |
||
136 | */ |
||
137 | public $extensionSettings = []; |
||
138 | |||
139 | /** |
||
140 | * Mount Point |
||
141 | * |
||
142 | * @var boolean |
||
143 | */ |
||
144 | public $MP = false; |
||
145 | |||
146 | /** |
||
147 | * @var string |
||
148 | */ |
||
149 | protected $processFilename; |
||
150 | |||
151 | /** |
||
152 | * Holds the internal access mode can be 'gui','cli' or 'cli_im' |
||
153 | * |
||
154 | * @var string |
||
155 | */ |
||
156 | protected $accessMode; |
||
157 | |||
158 | /** |
||
159 | * @var BackendUserAuthentication |
||
160 | */ |
||
161 | private $backendUser; |
||
162 | |||
163 | /** |
||
164 | * @var integer |
||
165 | */ |
||
166 | private $scheduledTime = 0; |
||
167 | |||
168 | /** |
||
169 | * @var integer |
||
170 | */ |
||
171 | private $reqMinute = 0; |
||
172 | |||
173 | /** |
||
174 | * @var bool |
||
175 | */ |
||
176 | private $submitCrawlUrls = false; |
||
177 | |||
178 | /** |
||
179 | * @var bool |
||
180 | */ |
||
181 | private $downloadCrawlUrls = false; |
||
182 | |||
183 | /** |
||
184 | * @var QueueRepository |
||
185 | */ |
||
186 | protected $queueRepository; |
||
187 | |||
188 | /** |
||
189 | * @var ProcessRepository |
||
190 | */ |
||
191 | protected $processRepository; |
||
192 | |||
193 | /** |
||
194 | * @var string |
||
195 | */ |
||
196 | protected $tableName = 'tx_crawler_queue'; |
||
197 | |||
198 | /** |
||
199 | * @var QueryBuilder |
||
200 | */ |
||
201 | protected $queryBuilder = QueryBuilder::class; |
||
202 | |||
203 | /** |
||
204 | * @var array |
||
205 | */ |
||
206 | private $cliArgs; |
||
207 | |||
208 | |||
209 | /** |
||
210 | * @var Logger |
||
211 | */ |
||
212 | private $logger; |
||
213 | |||
214 | /** |
||
215 | * Method to set the accessMode can be gui, cli or cli_im |
||
216 | * |
||
217 | * @return string |
||
218 | */ |
||
219 | 1 | public function getAccessMode() |
|
223 | |||
224 | /** |
||
225 | * @param string $accessMode |
||
226 | */ |
||
227 | 1 | public function setAccessMode($accessMode) |
|
231 | |||
232 | /** |
||
233 | * Set disabled status to prevent processes from being processed |
||
234 | * |
||
235 | * @param bool $disabled (optional, defaults to true) |
||
236 | * @return void |
||
237 | */ |
||
238 | 3 | public function setDisabled($disabled = true) |
|
248 | |||
249 | /** |
||
250 | * Get disable status |
||
251 | * |
||
252 | * @return bool true if disabled |
||
253 | */ |
||
254 | 3 | public function getDisabled() |
|
258 | |||
259 | /** |
||
260 | * @param string $filenameWithPath |
||
261 | * |
||
262 | * @return void |
||
263 | */ |
||
264 | 4 | public function setProcessFilename($filenameWithPath) |
|
268 | |||
269 | /** |
||
270 | * @return string |
||
271 | */ |
||
272 | 1 | public function getProcessFilename() |
|
276 | |||
277 | /** |
||
278 | * @return Logger |
||
279 | */ |
||
280 | private function getLogger(): Logger { |
||
286 | |||
287 | /************************************ |
||
288 | * |
||
289 | * Getting URLs based on Page TSconfig |
||
290 | * |
||
291 | ************************************/ |
||
292 | |||
293 | 31 | public function __construct() |
|
316 | |||
317 | /** |
||
318 | * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']). |
||
319 | * |
||
320 | * @param array $extensionSettings |
||
321 | * @return void |
||
322 | */ |
||
323 | 40 | public function setExtensionSettings(array $extensionSettings) |
|
327 | |||
328 | /** |
||
329 | * Check if the given page should be crawled |
||
330 | * |
||
331 | * @param array $pageRow |
||
332 | * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped |
||
333 | */ |
||
334 | 8 | public function checkIfPageShouldBeSkipped(array $pageRow) |
|
391 | |||
392 | /** |
||
393 | * Wrapper method for getUrlsForPageId() |
||
394 | * It returns an array of configurations and no urls! |
||
395 | * |
||
396 | * @param array $pageRow Page record with at least dok-type and uid columns. |
||
397 | * @param string $skipMessage |
||
398 | * @return array |
||
399 | * @see getUrlsForPageId() |
||
400 | */ |
||
401 | 4 | public function getUrlsForPageRow(array $pageRow, &$skipMessage = '') |
|
416 | |||
417 | /** |
||
418 | * This method is used to count if there are ANY unprocessed queue entries |
||
419 | * of a given page_id and the configuration which matches a given hash. |
||
420 | * If there if none, we can skip an inner detail check |
||
421 | * |
||
422 | * @param int $uid |
||
423 | * @param string $configurationHash |
||
424 | * @return boolean |
||
425 | */ |
||
426 | 5 | protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash) |
|
439 | |||
/**
 * Creates a list of URLs from input array (and submits them to queue if asked for)
 * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
 *
 * Every URL in $vv['URLs'] that passes the processing instruction filter is
 * appended to the returned HTML list. URLs whose uniqueness key is already in
 * $duplicateTrack are only displayed (dimmed) and neither submitted nor downloaded.
 *
 * @param array $vv Information about URLs from pageRow to crawl (the keys "subCfg" and "URLs" are read)
 * @param array $pageRow Page row (only "uid" is read here)
 * @param integer $scheduledTime Unix time to schedule indexing to, typically time()
 * @param integer $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
 * @param boolean $submitCrawlUrls If set, submits the URLs to queue
 * @param boolean $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
 * @param array $duplicateTrack Passed by reference; holds one key per URL so that duplicates are not crawled twice
 * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
 * @param array $incomingProcInstructions Array of processing instructions
 * @return string List of URLs (meant for display in backend module)
 */
public function urlListFromUrlArray(
    array $vv,
    array $pageRow,
    $scheduledTime,
    $reqMinute,
    $submitCrawlUrls,
    $downloadCrawlUrls,
    array &$duplicateTrack,
    array &$downloadUrls,
    array $incomingProcInstructions
) {
    $urlList = '';

    // Local read-only view of the sub configuration; optional keys are read defensively below.
    $subCfg = $vv['subCfg'] ?? [];
    $procInstrFilter = $subCfg['procInstrFilter'] ?? null;

    // realurl support (thanks to Ingo Renner)
    $useRealurl = ExtensionManagementUtility::isLoaded('realurl') && !empty($subCfg['realurl']);
    if ($useRealurl) {

        /** @var tx_realurl $urlObj */
        $urlObj = GeneralUtility::makeInstance('tx_realurl');

        if (!empty($subCfg['baseUrl'])) {
            $urlParts = parse_url($subCfg['baseUrl']);
            $host = strtolower($urlParts['host']);
            $urlObj->host = $host;

            // First pass, finding configuration OR pointer string:
            $urlObj->extConf = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] ?? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

            // If it turned out to be a string pointer, then look up the real config:
            if (is_string($urlObj->extConf)) {
                $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
            }
        }

        if (!$GLOBALS['TSFE']->sys_page) {
            $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
        }

        if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
            $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
        }
    }

    if (!is_array($vv['URLs'] ?? null)) {
        return 'ERROR - no URL generated';
    }

    $configurationHash = $this->getConfigurationHash($vv);
    $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

    // Seconds between two scheduled requests; a non-positive rate would
    // otherwise cause a division by zero.
    $interleave = $reqMinute > 0 ? 60 / $reqMinute : 0;

    foreach ($vv['URLs'] as $urlQuery) {
        if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            continue;
        }

        // Calculate cHash:
        if (!empty($subCfg['cHash'])) {
            /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
            $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
            $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
        }

        // Create key by which to determine unique-ness:
        $uKey = $urlQuery . '|' . ($subCfg['userGroups'] ?? '') . '|' . ($subCfg['baseUrl'] ?? '') . '|' . $procInstrFilter;

        // realurl support (thanks to Ingo Renner)
        $urlQuery = 'index.php' . $urlQuery;
        if ($useRealurl) {
            $params = [
                'LD' => [
                    'totalURL' => $urlQuery
                ],
                'TCEmainHook' => true
            ];
            $urlObj->encodeSpURL($params);
            $urlQuery = $params['LD']['totalURL'];
        }

        // Scheduled time, spread by the interleave and rounded down to a full minute:
        $schTime = $scheduledTime + round(count($duplicateTrack) * $interleave);
        $schTime = (int)(floor($schTime / 60) * 60);

        if (isset($duplicateTrack[$uKey])) {
            // If the url key is already registered just display it and do not resubmit it.
            // Appended (not assigned) so that earlier URLs stay in the returned list.
            $urlList .= '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
        } else {
            $urlList .= '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
            $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

            $theUrl = (!empty($subCfg['baseUrl']) ? $subCfg['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

            // Submit for crawling!
            if ($submitCrawlUrls) {
                // NOTE(review): the queue entry is submitted with the unshifted
                // $scheduledTime while the displayed time is $schTime - confirm this is intended.
                $added = $this->addUrl(
                    $pageRow['uid'],
                    $theUrl,
                    $vv['subCfg'],
                    $scheduledTime,
                    $configurationHash,
                    $skipInnerCheck
                );
                if ($added === false) {
                    $urlList .= ' (Url already existed)';
                }
            } elseif ($downloadCrawlUrls) {
                $downloadUrls[$theUrl] = $theUrl;
            }

            $urlList .= '<br />';
        }
        $duplicateTrack[$uKey] = true;
    }

    return $urlList;
}
||
569 | |||
570 | /** |
||
571 | * Returns true if input processing instruction is among registered ones. |
||
572 | * |
||
573 | * @param string $piString PI to test |
||
574 | * @param array $incomingProcInstructions Processing instructions |
||
575 | * @return boolean |
||
576 | */ |
||
577 | 5 | public function drawURLs_PIfilter($piString, array $incomingProcInstructions) |
|
589 | |||
/**
 * Returns the page TSconfig for a page, after registered hooks had the chance to alter it.
 *
 * When $this->MP is set, the TSconfig is read for the id found after the "-" in
 * the MP value instead of for $id.
 *
 * @param integer $id Page ID
 * @return array Page TSconfig as returned by BackendUtility::getPagesTSconfig(), possibly modified by hooks
 */
public function getPageTSconfigForId($id)
{
    $lookupId = $id;
    if ($this->MP) {
        // The second segment of the MP value is the id whose TSconfig is used.
        list(, $lookupId) = explode('-', $this->MP);
    }
    $pageTSconfig = BackendUtility::getPagesTSconfig($lookupId);

    // Call a hook to alter configuration; the TSconfig is handed over by
    // reference so that hook functions can modify it in place.
    $hookFunctions = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'];
    if (is_array($hookFunctions)) {
        $params = [
            'pageId' => $id,
            'pageTSConfig' => &$pageTSconfig
        ];
        foreach ($hookFunctions as $userFunc) {
            GeneralUtility::callUserFunction($userFunc, $params, $this);
        }
    }

    return $pageTSconfig;
}
||
612 | |||
/**
 * This method returns an array of configurations.
 * And no urls!
 *
 * Configurations are collected from two places, in this order:
 *  1. page TSconfig (tx_crawler.crawlerCfg.paramSets.*)
 *  2. tx_crawler_configuration records found on the pages of the rootline of $id
 * A configuration record never overwrites a TSconfig paramSet with the same key.
 * Finally the "processUrls" hook may modify the result.
 *
 * Each entry of the result holds the keys "subCfg", "paramParsed",
 * "paramExpanded", "URLs" and "origin".
 *
 * @param integer $id Page ID
 * @param bool $forceSsl Use https
 * @return array Configurations indexed by configuration key
 */
public function getUrlsForPageId($id, $forceSsl = false)
{
    $res = [];

    /**
     * Get configuration from tsConfig
     */
    $pageTSconfig = $this->getPageTSconfigForId($id);
    $paramSets = is_array($pageTSconfig)
        ? ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? null)
        : null;

    if (is_array($paramSets)) {
        foreach ($paramSets as $key => $values) {
            if (!is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);

            // Sub configuration for a single configuration string:
            $subCfg = (array)($paramSets[$key . '.'] ?? null);
            $subCfg['key'] = $key;

            // Normalize the filter list (trim the individual entries) when one is given
            $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $id)) {
                continue;
            }

            // add trailing slash if not present
            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                $subCfg['baseUrl'] .= '/';
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = $this->parseParams($paramSets[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            $urlBase = '?id=' . $id . ($this->MP ? '&MP=' . $this->MP : '');
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$urlBase]);
        }
    }

    /**
     * Get configuration from tx_crawler_configuration records
     */

    // get records along the rootline
    $rootLine = BackendUtility::BEgetRootLine($id);

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_configuration');
    $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));

    foreach ($rootLine as $page) {
        $configurationRecordsForCurrentPage = $queryBuilder
            ->select('*')
            ->from('tx_crawler_configuration')
            ->where(
                // The uid is cast because the value is concatenated into the expression as-is
                $queryBuilder->expr()->eq('pid', (int)$page['uid']),
                substr(BackendUtility::BEenableFields('tx_crawler_configuration'), 4) . BackendUtility::deleteClause('tx_crawler_configuration')
            )
            ->execute()
            ->fetchAll();

        if (!is_array($configurationRecordsForCurrentPage)) {
            continue;
        }

        foreach ($configurationRecordsForCurrentPage as $configurationRecord) {

            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $GLOBALS['BE_USER']->isAdmin()
                || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (!$hasAccess) {
                continue;
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            $pidsOnly = (string)($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $id)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                    $configurationRecord['base_url'],
                    $configurationRecord['sys_domain_base_url'],
                    $isCrawlingProtocolHttps
                ),
                'realurl' => $configurationRecord['realurl'],
                'cHash' => $configurationRecord['chash'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
                'key' => $key
            ];

            // add trailing slash if not present
            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                $subCfg['baseUrl'] .= '/';
            }

            // Skip pages listed in the record's exclude string (loose comparison kept on purpose:
            // ids may arrive as int or numeric string)
            if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                continue;
            }

            // NOTE(review): unlike the page TSconfig branch above, $this->MP is not
            // appended to the URL here - confirm whether that is intended.
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }
    }

    // Hook: registered functions receive the result by reference and may alter it
    $processUrlsHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? null;
    if (is_array($processUrlsHooks)) {
        foreach ($processUrlsHooks as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }
    }

    return $res;
}
||
764 | |||
765 | /** |
||
766 | * Checks if a domain record exist and returns the base-url based on the record. If not the given baseUrl string is used. |
||
767 | * |
||
768 | * @param string $baseUrl |
||
769 | * @param integer $sysDomainUid |
||
770 | * @param bool $ssl |
||
771 | * @return string |
||
772 | */ |
||
773 | 4 | protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false) |
|
794 | |||
795 | /** |
||
796 | * @param $rootid |
||
797 | * @param $depth |
||
798 | * @return array |
||
799 | * |
||
800 | * TODO: Write Functional Tests |
||
801 | */ |
||
802 | public function getConfigurationsForBranch($rootid, $depth) |
||
854 | |||
855 | /** |
||
856 | * Get querybuilder for given table |
||
857 | * |
||
858 | * @param string $table |
||
859 | * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder |
||
860 | */ |
||
861 | 7 | private function getQueryBuilder(string $table) { |
|
867 | |||
868 | /** |
||
869 | * Check if a user has access to an item |
||
870 | * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list) |
||
871 | * |
||
872 | * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause() |
||
873 | * @param string $groupList Comma-separated list of (fe_)group UIDs from a user |
||
874 | * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access |
||
875 | * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty |
||
876 | */ |
||
877 | 3 | public function hasGroupAccess($groupList, $accessList) |
|
889 | |||
890 | /** |
||
891 | * Parse GET vars of input Query into array with key=>value pairs |
||
892 | * |
||
893 | * @param string $inputQuery Input query string |
||
894 | * @return array |
||
895 | */ |
||
896 | 5 | public function parseParams($inputQuery) |
|
912 | |||
/**
 * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
 * Syntax of values:
 * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
 * - Configuration is split by "|" and the parts are processed individually and finally added together
 * - For each configuration part:
 *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
 *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
 *        _ENABLELANG:1 picks only original records without their language overlays
 *        Further sub parameters read by the code: _PIDFIELD (default "pid"), _FIELD (default "uid"), _WHERE, _ADDTABLE
 *         - Default: Literal value
 *
 * After each part the "expandParameters" hook is called and may alter $paramArray.
 *
 * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
 * @param integer $pid Current page ID
 * @return array Same keys as the input; every value is an array of the expanded values
 *
 * TODO: Write Functional Tests
 */
public function expandParameters($paramArray, $pid)
{
    global $TCA;

    // Traverse parameter names:
    foreach ($paramArray as $p => $v) {
        $v = trim($v);

        // Without enclosing square brackets the value is literal and is the only value in the array:
        if (!(substr($v, 0, 1) === '[' && substr($v, -1) === ']')) {
            $paramArray[$p] = [$v];
            continue;
        }

        // So, find the value inside brackets and reset the paramArray value as an array.
        $v = substr($v, 1, -1);
        $paramArray[$p] = [];

        // Explode parts and traverse them:
        $parts = explode('|', $v);
        foreach ($parts as $pV) {

            // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
            if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {

                // Swap if first is larger than last:
                if ($reg[1] > $reg[2]) {
                    $temp = $reg[2];
                    $reg[2] = $reg[1];
                    $reg[1] = $temp;
                }

                // Traverse range, add values:
                $runAwayBrake = 1000; // Limit to size of range!
                for ($a = $reg[1]; $a <= $reg[2];$a++) {
                    $paramArray[$p][] = $a;
                    $runAwayBrake--;
                    if ($runAwayBrake <= 0) {
                        break;
                    }
                }
            } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {

                // Parse parameters:
                $subparts = GeneralUtility::trimExplode(';', $pV);
                $subpartParams = [];
                foreach ($subparts as $spV) {
                    list($pKey, $pVal) = GeneralUtility::trimExplode(':', $spV);
                    $subpartParams[$pKey] = $pVal;
                }

                // Table exists:
                if (isset($TCA[$subpartParams['_TABLE']])) {
                    $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                    $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                    $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                    $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                    $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                    if ($fieldName === 'uid' || !empty($TCA[$subpartParams['_TABLE']]['columns'][$fieldName])) {
                        $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);

                        $queryBuilder->getRestrictions()
                            ->removeAll()
                            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                        $queryBuilder
                            ->select($fieldName)
                            ->from($subpartParams['_TABLE'])
                            ->where(
                                $queryBuilder->expr()->eq($queryBuilder->quoteIdentifier($pidField), $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
                                $where
                            );

                        // Only touch the FROM part when an additional table was configured;
                        // doing so with an empty value would replace the table set above.
                        if (!empty($addTable)) {
                            // TODO: Check if this works as intended!
                            // NOTE(review): called without the append flag, so this presumably
                            // replaces the FROM part instead of extending it - verify.
                            $queryBuilder->add('from', $addTable);
                        }

                        $transOrigPointerField = $TCA[$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'] ?? null;

                        if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                            $queryBuilder->andWhere(
                                $queryBuilder->expr()->lte(
                                    $queryBuilder->quoteIdentifier($transOrigPointerField),
                                    0
                                )
                            );
                        }

                        $statement = $queryBuilder->execute();

                        // Index the rows by the value of the selected field so that the keys
                        // are the (unique) values to expand to. Keying by the field NAME would
                        // collapse all rows into one entry and yield the name instead of values.
                        $rows = [];
                        while ($row = $statement->fetch()) {
                            $rows[$row[$fieldName]] = $row;
                        }

                        $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                    }
                }
            } else { // Just add value:
                $paramArray[$p][] = $pV;
            }

            // Hook for processing own expandParameters place holder
            $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
            if (is_array($expandHooks)) {
                $_params = [
                    'pObj' => &$this,
                    'paramArray' => &$paramArray,
                    'currentKey' => $p,
                    'currentValue' => $pV,
                    'pid' => $pid
                ];
                foreach ($expandHooks as $key => $_funcRef) {
                    GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                }
            }
        }

        // Make unique set of values and sort array by key:
        $paramArray[$p] = array_unique($paramArray[$p]);
        ksort($paramArray);
    }

    return $paramArray;
}
||
1052 | |||
1053 | /** |
||
1054 | * Compiling URLs from parameter array (output of expandParameters()) |
||
1055 | * The number of URLs will be the multiplication of the number of parameter values for each key |
||
1056 | * |
||
1057 | * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values |
||
1058 | * @param array $urls URLs accumulated in this array (for recursion) |
||
1059 | * @return array |
||
1060 | */ |
||
1061 | 5 | public function compileUrls($paramArray, $urls = []) |
|
1086 | |||
1087 | /************************************ |
||
1088 | * |
||
1089 | * Crawler log |
||
1090 | * |
||
1091 | ************************************/ |
||
1092 | |||
1093 | /** |
||
1094 | * Return array of records from crawler queue for input page ID |
||
1095 | * |
||
1096 | * @param integer $id Page ID for which to look up log entries. |
||
1097 | * @param string$filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones |
||
1098 | * @param boolean $doFlush If TRUE, then entries selected at DELETED(!) instead of selected! |
||
1099 | * @param boolean $doFullFlush |
||
1100 | * @param integer $itemsPerPage Limit the amount of entries per page default is 10 |
||
1101 | * @return array |
||
1102 | */ |
||
1103 | 4 | public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10) |
|
1144 | |||
1145 | /** |
||
1146 | * Return array of records from crawler queue for input set ID |
||
1147 | * |
||
1148 | * @param integer $set_id Set ID for which to look up log entries. |
||
1149 | * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones |
||
1150 | * @param boolean $doFlush If TRUE, then entries selected at DELETED(!) instead of selected! |
||
1151 | * @param integer $itemsPerPage Limit the amount of entires per page default is 10 |
||
1152 | * @return array |
||
1153 | */ |
||
1154 | 4 | public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10) |
|
1194 | |||
1195 | /** |
||
1196 | * Removes queue entries |
||
1197 | * |
||
1198 | * @param string $where SQL related filter for the entries which should be removed |
||
1199 | * @return void |
||
1200 | */ |
||
1201 | 7 | protected function flushQueue($where = '') |
|
1236 | |||
1237 | /** |
||
1238 | * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php") |
||
1239 | * |
||
1240 | * @param integer $setId Set ID |
||
1241 | * @param array $params Parameters to pass to call back function |
||
1242 | * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler' |
||
1243 | * @param integer $page_id Page ID to attach it to |
||
1244 | * @param integer $schedule Time at which to activate |
||
1245 | * @return void |
||
1246 | */ |
||
1247 | public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0) |
||
1267 | |||
1268 | /************************************ |
||
1269 | * |
||
1270 | * URL setting |
||
1271 | * |
||
1272 | ************************************/ |
||
1273 | |||
1274 | /** |
||
1275 | * Setting a URL for crawling: |
||
1276 | * |
||
1277 | * @param integer $id Page ID |
||
1278 | * @param string $url Complete URL |
||
1279 | * @param array $subCfg Sub configuration array (from TS config) |
||
1280 | * @param integer $tstamp Scheduled-time |
||
1281 | * @param string $configurationHash (optional) configuration hash |
||
1282 | * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check |
||
1283 | * @return bool |
||
1284 | */ |
||
public function addUrl(
    $id,
    $url,
    array $subCfg,
    $tstamp,
    $configurationHash = '',
    $skipInnerDuplicationCheck = false
) {
    // Optional $subCfg keys are read with "??" so that a sparse configuration
    // no longer raises "undefined index" notices. The chosen fallbacks yield
    // the same stored values (and therefore the same parameters hash) as the
    // unguarded lookups did.
    $urlAdded = false;
    $rows = [];

    // Creating parameters:
    $parameters = [
        'url' => $url
    ];

    // fe user group simulation: normalise the comma list to unique integers
    $uGs = implode(
        ',',
        array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true))
    );
    if ($uGs) {
        $parameters['feUserGroupList'] = $uGs;
    }

    // Setting processing instructions
    $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
    if (is_array($subCfg['procInstrParams.'] ?? null)) {
        $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
    }

    // Possible TypoScript Template Parents
    $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'] ?? null;

    // Compile value array: the serialized parameters are stored as-is and
    // hashed, the hash being what the duplicate lookup compares against.
    $parameters_serialized = serialize($parameters);
    $fieldArray = [
        'page_id' => intval($id),
        'parameters' => $parameters_serialized,
        'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
        'configuration_hash' => $configurationHash,
        'scheduled' => $tstamp,
        'exec_time' => 0,
        'set_id' => intval($this->setID),
        'result_data' => '',
        'configuration' => $subCfg['key'] ?? null,
    ];

    if ($this->registerQueueEntriesInternallyOnly) {
        // the entries will only be registered and not stored to the database
        $this->queueEntries[] = $fieldArray;
    } else {
        if (!$skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (count($rows) === 0) {
            // No duplicate found (or the check was skipped): insert and announce the new entry.
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $rows[] = $uid;
            $urlAdded = true;
            EventDispatcher::getInstance()->post(
                'urlAddedToQueue',
                $this->setID,
                ['uid' => $uid, 'fieldArray' => $fieldArray]
            );
        } else {
            // Nothing is inserted; listeners are told which queue ids already cover this URL.
            EventDispatcher::getInstance()->post(
                'duplicateUrlInQueue',
                $this->setID,
                ['rows' => $rows, 'fieldArray' => $fieldArray]
            );
        }
    }

    return $urlAdded;
}
||
1356 | |||
1357 | /** |
||
1358 | * This method determines duplicates for a queue entry with the same parameters and this timestamp. |
||
1359 | * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past. |
||
1360 | * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp |
||
1361 | * |
||
1362 | * @param int $tstamp |
||
1363 | * @param array $fieldArray |
||
1364 | * |
||
1365 | * @return array |
||
1366 | * |
||
1367 | * TODO: Write Functional Tests |
||
1368 | */ |
||
protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
{
    $rows = [];

    $currentTime = $this->getCurrentTime();
    $this->queryBuilder
        ->select('qid')
        ->from('tx_crawler_queue');

    if ($tstamp <= $currentTime) {
        // Entry is scheduled with "now" (or in the past): any queue entry that is
        // already due counts as a candidate.
        if ($this->extensionSettings['enableTimeslot']) {
            // With timeslots enabled, entries scheduled within +/- 100 seconds
            // of the current time are candidates as well.
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $this->queryBuilder
                ->where(
                    'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                )
                ->orWhere(
                    $this->queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        } else {
            $this->queryBuilder
                ->where(
                    $this->queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        }
    } else {
        // Entries with a timestamp in the future need to have exactly the same schedule time.
        $this->queryBuilder
            ->where(
                $this->queryBuilder->expr()->eq('scheduled', $tstamp)
            );
    }

    // Only UNPROCESSED entries are duplicates: not yet executed (exec_time = 0)
    // and not yet claimed by a process (process_id = 0). The previous "!= 0"
    // comparisons selected the opposite set, so pending duplicates were missed
    // and already-crawled entries blocked re-queuing.
    // NOTE(review): process_id is compared numerically, mirroring the original
    // comparison style; if the column is a string type, confirm that an
    // unclaimed entry holds '' / 0.
    $statement = $this->queryBuilder
        ->andWhere('exec_time = 0')
        ->andWhere('process_id = 0')
        ->andWhere(
            $this->queryBuilder->expr()->eq(
                'page_id',
                $this->queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)
            )
        )
        ->andWhere(
            $this->queryBuilder->expr()->eq(
                'parameters_hash',
                $this->queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)
            )
        )
        ->execute();

    // Collect the queue ids of all matching entries.
    while ($row = $statement->fetch()) {
        $rows[] = $row['qid'];
    }

    return $rows;
}
||
1416 | |||
1417 | /** |
||
1418 | * Returns the current system time |
||
1419 | * |
||
1420 | * @return int |
||
1421 | */ |
||
1422 | public function getCurrentTime() |
||
1426 | |||
1427 | /************************************ |
||
1428 | * |
||
1429 | * URL reading |
||
1430 | * |
||
1431 | ************************************/ |
||
1432 | |||
1433 | /** |
||
1434 | * Read URL for single queue entry |
||
1435 | * |
||
1436 | * @param integer $queueId |
||
1437 | * @param boolean $force If set, will process even if exec_time has been set! |
||
1438 | * @return integer |
||
1439 | */ |
||
1440 | public function readUrl($queueId, $force = false) |
||
1543 | |||
1544 | /** |
||
1545 | * Read URL for not-yet-inserted log-entry |
||
1546 | * |
||
1547 | * @param array $field_array Queue field array, |
||
1548 | * |
||
1549 | * @return string |
||
1550 | */ |
||
1551 | public function readUrlFromArray($field_array) |
||
1582 | |||
1583 | /** |
||
1584 | * Read URL for a queue record |
||
1585 | * |
||
1586 | * @param array $queueRec Queue record |
||
1587 | * @return string |
||
1588 | */ |
||
1589 | public function readUrl_exec($queueRec) |
||
1618 | |||
1619 | /** |
||
1620 | * Gets the content of a URL. |
||
1621 | * |
||
1622 | * @param string $originalUrl URL to read |
||
1623 | * @param string $crawlerId Crawler ID string (qid + hash to verify) |
||
1624 | * @param integer $timeout Timeout time |
||
1625 | * @param integer $recursion Recursion limiter for 302 redirects |
||
1626 | * @return array|boolean |
||
1627 | */ |
||
1628 | 2 | public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10) |
|
1729 | |||
1730 | /** |
||
1731 | * Gets the base path of the website frontend. |
||
1732 | * (e.g. if you call http://mydomain.com/cms/index.php in |
||
1733 | * the browser the base path is "/cms/") |
||
1734 | * |
||
1735 | * @return string Base path of the website frontend |
||
1736 | */ |
||
1737 | protected function getFrontendBasePath() |
||
1760 | |||
1761 | /** |
||
1762 | * Executes a shell command and returns the outputted result. |
||
1763 | * |
||
1764 | * @param string $command Shell command to be executed |
||
1765 | * @return string Outputted result of the command execution |
||
1766 | */ |
||
1767 | protected function executeShellCommand($command) |
||
1771 | |||
1772 | /** |
||
1773 | * Reads HTTP response from the given stream. |
||
1774 | * |
||
1775 | * @param resource $streamPointer Pointer to connection stream. |
||
1776 | * @return array Associative array with the following items: |
||
1777 | * headers <array> Response headers sent by server. |
||
1778 | * content <array> Content, with each line as an array item. |
||
1779 | */ |
||
1780 | 1 | protected function getHttpResponseFromStream($streamPointer) |
|
1803 | |||
1804 | /** |
||
1805 | * @param message |
||
1806 | */ |
||
1807 | 2 | protected function log($message) |
|
1820 | |||
1821 | /** |
||
1822 | * Builds HTTP request headers. |
||
1823 | * |
||
1824 | * @param array $url |
||
1825 | * @param string $crawlerId |
||
1826 | * |
||
1827 | * @return array |
||
1828 | */ |
||
1829 | 6 | protected function buildRequestHeaderArray(array $url, $crawlerId) |
|
1845 | |||
1846 | /** |
||
1847 | * Checks whether the submitted HTTP header contains a redirect location and builds a new crawler URL |
||
1848 | * |
||
1849 | * @param array $headers HTTP Header |
||
1850 | * @param string $user HTTP Auth. User |
||
1851 | * @param string $pass HTTP Auth. Password |
||
1852 | * @return bool|string |
||
1853 | */ |
||
1854 | 12 | protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '') |
|
1888 | |||
1889 | /************************** |
||
1890 | * |
||
1891 | * tslib_fe hooks: |
||
1892 | * |
||
1893 | **************************/ |
||
1894 | |||
1895 | /** |
||
1896 | * Initialization hook (called after database connection) |
||
1897 | * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes) |
||
1898 | * |
||
1899 | * @param array $params Parameters from frontend |
||
1900 | * @param object $ref TSFE object (reference under PHP5) |
||
1901 | * @return void |
||
1902 | * |
||
1903 | * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public, |
||
1904 | * FIXME: I think this can be removed. (TNM) |
||
1905 | */ |
||
1906 | public function fe_init(&$params, $ref) |
||
1931 | |||
1932 | /***************************** |
||
1933 | * |
||
1934 | * Compiling URLs to crawl - tools |
||
1935 | * |
||
1936 | *****************************/ |
||
1937 | |||
1938 | /** |
||
1939 | * @param integer $id Root page id to start from. |
||
1940 | * @param integer $depth Depth of tree, 0 = only id-page, 1 = one sublevel, 99 = infinite |
||
1941 | * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue |
||
1942 | * @param integer $reqMinute Number of requests per minute (creates the interleave between requests) |
||
1943 | * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling) |
||
1944 | * @param boolean $downloadCrawlUrls If set (and submitCrawlUrls is false), will fill $downloadUrls with entries |
||
1945 | * @param array $incomingProcInstructions Array of processing instructions |
||
1946 | * @param array $configurationSelection Array of configuration keys |
||
1947 | * @return string |
||
1948 | */ |
||
1949 | public function getPageTreeAndUrls( |
||
2045 | |||
2046 | /** |
||
2047 | * Expands exclude string |
||
2048 | * |
||
2049 | * @param string $excludeString Exclude string |
||
2050 | * @return array |
||
2051 | */ |
||
public function expandExcludeString($excludeString)
{
    // internal static caches, shared by all calls within the request
    static $expandedExcludeStringCache = [];
    static $treeCache = [];

    // isset() instead of empty(): a cached empty result is a valid cache hit too.
    if (!isset($expandedExcludeStringCache[$excludeString])) {
        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

            $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

            foreach ($excludeParts as $excludePart) {
                // Each part is "<pid>" or "<pid>+<depth>". Pad to two elements so a
                // part without "+" does not read a missing offset for $depth.
                list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"; a "+" without a usable depth
                // means "all levels" (99)
                if (empty($depth)) {
                    $depth = (stristr($excludePart, '+')) ? 99 : 0;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    // Fetch the subtree once per pid/depth combination and reuse it afterwards.
                    if (empty($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
    }

    return $expandedExcludeStringCache[$excludeString];
}
||
2097 | |||
2098 | /** |
||
2099 | * Create the rows for display of the page tree |
||
2100 | * For each page a number of rows are shown displaying GET variable configuration |
||
2101 | * |
||
2102 | * @param array Page row |
||
2103 | * @param string Page icon and title for row |
||
2104 | * @return string HTML <tr> content (one or more) |
||
2105 | */ |
||
2106 | public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon) |
||
2215 | |||
2216 | /***************************** |
||
2217 | * |
||
2218 | * CLI functions |
||
2219 | * |
||
2220 | *****************************/ |
||
2221 | |||
2222 | /** |
||
2223 | * Main function for running from Command Line PHP script (cron job) |
||
2224 | * See ext/crawler/cli/crawler_cli.phpsh for details |
||
2225 | * |
||
2226 | * @return int number of remaining items or false if error |
||
2227 | */ |
||
2228 | public function CLI_main($args) |
||
2264 | |||
2265 | /** |
||
2266 | * Helper function |
||
2267 | * |
||
2268 | * @param string $option Option string, eg. "-s" |
||
2269 | * @param int $idx Value index, default is 0 (zero) = the first one... |
||
2270 | * @return string |
||
2271 | */ |
||
2272 | private function cli_argValue($option, $idx) { |
||
2275 | |||
2276 | /** |
||
2277 | * Helper function |
||
2278 | * |
||
2279 | * @param string $string The string to output |
||
2280 | */ |
||
2281 | private function cli_echo($string) { |
||
2284 | |||
2285 | /** |
||
2286 | * Set cli args |
||
2287 | * |
||
2288 | * This is a copy from the CommandLineController from TYPO3 < v9 |
||
2289 | * |
||
2290 | * TODO: Rework |
||
2291 | * |
||
2292 | * @param array $argv |
||
2293 | */ |
||
2294 | private function setCliArgs(array $argv) { |
||
2316 | |||
2317 | |||
2318 | |||
2319 | /** |
||
2320 | * Function executed by crawler_im.php cli script. |
||
2321 | * |
||
2322 | * @return void |
||
2323 | */ |
||
2324 | public function CLI_main_im($args = []) |
||
2415 | |||
2416 | /** |
||
2417 | * Function executed by crawler_im.php cli script. |
||
2418 | * |
||
2419 | * @return bool |
||
2420 | */ |
||
2421 | public function CLI_main_flush() |
||
2448 | |||
2449 | /** |
||
2450 | * Obtains configuration keys from the CLI arguments |
||
2451 | * |
||
2452 | * @return mixed Array of keys or null if no keys found |
||
2453 | */ |
||
2454 | protected function getConfigurationKeys() |
||
2459 | |||
2460 | /** |
||
2461 | * Running the functionality of the CLI (crawling URLs from queue) |
||
2462 | * |
||
2463 | * @param int $countInARun |
||
2464 | * @param int $sleepTime |
||
2465 | * @param int $sleepAfterFinish |
||
2466 | * @return string |
||
2467 | */ |
||
2468 | public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish) |
||
2585 | |||
2586 | /** |
||
2587 | * Activate hooks |
||
2588 | * |
||
2589 | * @return void |
||
2590 | */ |
||
2591 | public function CLI_runHooks() |
||
2603 | |||
2604 | /** |
||
2605 | * Try to acquire a new process with the given id |
||
2606 | * also performs some auto-cleanup for orphan processes |
||
2607 | * @todo preemption might not be the most elegant way to clean up |
||
2608 | * |
||
2609 | * @param string $id identification string for the process |
||
2610 | * @return boolean |
||
2611 | */ |
||
2612 | public function CLI_checkAndAcquireNewProcess($id) |
||
2669 | |||
2670 | /** |
||
2671 | * Release a process and the required resources |
||
2672 | * |
||
2673 | * @param mixed $releaseIds string with a single process-id or array with multiple process-ids |
||
2674 | * @param boolean $withinLock show whether the DB-actions are included within an existing lock |
||
2675 | * @return boolean |
||
2676 | */ |
||
2677 | public function CLI_releaseProcesses($releaseIds, $withinLock = false) |
||
2763 | |||
2764 | /** |
||
2765 | * Delete processes marked as deleted |
||
2766 | * |
||
2767 | * @return void |
||
2768 | * |
||
2769 | * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0. |
||
2770 | * Please Consider using $this->processRepository->deleteProcessesMarkedAsDeleted() |
||
2771 | */ |
||
2772 | public function CLI_deleteProcessesMarkedDeleted() |
||
2778 | |||
2779 | /** |
||
2780 | * Check if there are still resources left for the process with the given id |
||
2781 | * Used to determine timeouts and to ensure a proper cleanup if there's a timeout |
||
2782 | * |
||
2783 | * @param string identification string for the process |
||
2784 | * @return boolean determines if the process is still active / has resources |
||
2785 | * |
||
2786 | * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository |
||
2787 | */ |
||
2788 | 1 | public function CLI_checkIfProcessIsActive($pid) |
|
2807 | |||
2808 | /** |
||
2809 | * Create a unique Id for the current process |
||
2810 | * |
||
2811 | * @return string the ID |
||
2812 | */ |
||
2813 | 2 | public function CLI_buildProcessId() |
|
2820 | |||
2821 | /** |
||
2822 | * @param bool $get_as_float |
||
2823 | * |
||
2824 | * @return mixed |
||
2825 | */ |
||
2826 | protected function microtime($get_as_float = false) |
||
2830 | |||
2831 | /** |
||
2832 | * Prints a message to the stdout (only if debug-mode is enabled) |
||
2833 | * |
||
2834 | * @param string $msg the message |
||
2835 | */ |
||
2836 | public function CLI_debug($msg) |
||
2843 | |||
2844 | /** |
||
2845 | * Get URL content by making direct request to TYPO3. |
||
2846 | * |
||
2847 | * @param string $url Page URL |
||
2848 | * @param int $crawlerId Crawler-ID |
||
2849 | * @return array |
||
2850 | */ |
||
2851 | 2 | protected function sendDirectRequest($url, $crawlerId) |
|
2882 | |||
2883 | /** |
||
2884 | * Cleans up entries that stayed for too long in the queue. These are: |
||
2885 | * - processed entries that are over 1.5 days in age |
||
2886 | * - scheduled entries that are over 7 days old |
||
2887 | * |
||
2888 | * @return void |
||
2889 | */ |
||
2890 | public function cleanUpOldQueueEntries() |
||
2899 | |||
2900 | /** |
||
2901 | * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions |
||
2902 | * |
||
2903 | * @param int $id |
||
2904 | * @param int $typeNum |
||
2905 | * |
||
2906 | * @return void |
||
2907 | */ |
||
2908 | protected function initTSFE($id = 1, $typeNum = 0) |
||
2925 | |||
2926 | /** |
||
2927 | * Returns a md5 hash generated from a serialized configuration array. |
||
2928 | * |
||
2929 | * @param array $configuration |
||
2930 | * |
||
2931 | * @return string |
||
2932 | */ |
||
2933 | 7 | protected function getConfigurationHash(array $configuration) { |
|
2938 | |||
2939 | /** |
||
2940 | * Check whether the Crawling Protocol should be http or https |
||
2941 | * |
||
2942 | * @param $crawlerConfiguration |
||
2943 | * @param $pageConfiguration |
||
2944 | * |
||
2945 | * @return bool |
||
2946 | */ |
||
2947 | 6 | protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration) { |
|
2959 | } |
||
2960 |
If you define a variable conditionally, it can happen that it is not defined for all execution paths.
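Let’s take a look at an example:
function myFunction($a) { switch ($a) { case 'foo': $x = 1; break; case 'bar': $x = 2; break; } var_dump($x); }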
In the above example, the variable $x is defined if you pass “foo” or “bar” as argument for $a. However, since the switch statement has no default case statement, if you pass any other value, the variable $x would be undefined.
Available Fixes
Check for existence of the variable explicitly:
Define a default value for the variable:
Add a value for the missing path: