| 1 |  |  | <?php | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | declare(strict_types=1); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | namespace AOE\Crawler\Service; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | /* | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  |  * (c) 2020 AOE GmbH <dev@aoe.com> | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  |  * This file is part of the TYPO3 Crawler Extension. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |  * It is free software; you can redistribute it and/or modify it under | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  |  * the terms of the GNU General Public License, either version 2 | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  |  * of the License, or any later version. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  |  * For the full copyright and license information, please read the | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  |  * LICENSE.txt file that was distributed with this source code. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  |  * The TYPO3 project - inspiring people to share! | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  |  */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  | use AOE\Crawler\Controller\CrawlerController; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  | use AOE\Crawler\Exception\CrawlerObjectException; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  | use TYPO3\CMS\Core\Utility\GeneralUtility; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  | use TYPO3\CMS\Frontend\Page\PageRepository; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Service that adds the crawler URLs of a single page to the crawler queue.
                 */
                class QueueService
                {
                    /**
                     * Crawler instance used by this service; created on first use by findCrawler().
                     *
                     * @var CrawlerController|null
                     */
                    protected $crawlerController;

                    /**
                     * Builds the crawler URLs for one page and submits them to the crawler queue.
                     *
                     * @param int $pageUid Uid of the page whose URLs are queued
                     * @param int $time Forwarded unchanged as the third argument of
                     *                  CrawlerController::urlListFromUrlArray()
                     *                  (presumably the scheduled execution time - confirm against CrawlerController)
                     *
                     * @throws CrawlerObjectException if no crawler instance could be obtained
                     */
                    public function addPageToQueue(int $pageUid, int $time = 0): void
                    {
                        $crawler = $this->findCrawler();

                        /*
                         * Todo: Switch back to getPage(); when dropping support for TYPO3 9 LTS - TNM
                         * This switch to getPage_noCheck() is needed as TYPO3 9 LTS doesn't return dokType < 200,
                         * therefore automatically adding pages to crawler queue when editing page-titles from the
                         * page tree directly was not working.
                         */
                        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage_noCheck($pageUid, true);
                        if (! is_array($pageData) || $pageData === []) {
                            // No page row was found for this uid, so there is nothing to build URLs from.
                            return;
                        }

                        // Kept as variables (not literals) in case the callee takes them by reference.
                        $allowedConfigurations = [];
                        $configurations = ConfigurationService::removeDisallowedConfigurations(
                            $allowedConfigurations,
                            $crawler->getUrlsForPageRow($pageData)
                        );
                        if (! is_array($configurations) || $configurations === []) {
                            return;
                        }

                        $downloadUrls = [];
                        $duplicateTrack = [];
                        // The registered processing instructions do not change while looping: resolve them once.
                        $processingInstructionKeys = array_keys($this->getCrawlerProcInstructions());

                        foreach ($configurations as $configuration) {
                            // enable inserting of entries
                            $crawler->registerQueueEntriesInternallyOnly = false;
                            $crawler->urlListFromUrlArray(
                                $configuration,
                                $pageData,
                                $time,
                                300,
                                true,
                                false,
                                $duplicateTrack,
                                $downloadUrls,
                                $processingInstructionKeys
                            );

                            // reset the queue because the entries have been written to the db
                            unset($crawler->queueEntries);
                        }
                    }

                    /**
                     * Reads the registered processingInstructions of the crawler from
                     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
                     *
                     * Entries that are not arrays, or that have no usable 'key', are skipped instead of
                     * triggering warnings/errors. A missing 'value' is stored as null.
                     *
                     * @return array Map of each entry's 'key' to its 'value'
                     */
                    private function getCrawlerProcInstructions(): array
                    {
                        $registered = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] ?? [];
                        if (! is_array($registered)) {
                            return [];
                        }

                        $crawlerProcInstructions = [];
                        foreach ($registered as $configuration) {
                            if (! is_array($configuration)) {
                                continue;
                            }

                            $key = $configuration['key'] ?? null;
                            // Only strings and integers are valid array keys.
                            if (! is_string($key) && ! is_int($key)) {
                                continue;
                            }

                            $crawlerProcInstructions[$key] = $configuration['value'] ?? null;
                        }

                        return $crawlerProcInstructions;
                    }

                    /**
                     * Method to get an instance of the internal crawler singleton
                     *
                     * The instance is created on first use, given a fresh setID and then reused.
                     *
                     * @return CrawlerController Instance of the crawler lib
                     *
                     * @throws CrawlerObjectException if no CrawlerController instance could be created
                     */
                    private function findCrawler(): CrawlerController
                    {
                        if (! $this->crawlerController instanceof CrawlerController) {
                            $crawler = GeneralUtility::makeInstance(CrawlerController::class);
                            if (! $crawler instanceof CrawlerController) {
                                throw new CrawlerObjectException('no crawler object', 1608465082);
                            }

                            $crawler->setID = GeneralUtility::md5int(microtime());
                            $this->crawlerController = $crawler;
                        }

                        return $this->crawlerController;
                    }
                }
            
                                                        
            
                                    
            
            
                | 106 |  |  |  |