| 1 |  |  | <?php | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | declare(strict_types=1); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | namespace AOE\Crawler\CrawlStrategy; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | /* | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  |  * (c) 2020 AOE GmbH <[email protected]> | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  |  * This file is part of the TYPO3 Crawler Extension. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |  * It is free software; you can redistribute it and/or modify it under | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  |  * the terms of the GNU General Public License, either version 2 | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  |  * of the License, or any later version. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  |  * For the full copyright and license information, please read the | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  |  * LICENSE.txt file that was distributed with this source code. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |  * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  |  * The TYPO3 project - inspiring people to share! | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  |  */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  | use AOE\Crawler\Configuration\ExtensionConfigurationProvider; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  | use AOE\Crawler\Utility\PhpBinaryUtility; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  | use Psr\Http\Message\UriInterface; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  | use Psr\Log\LoggerAwareInterface; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  | use Psr\Log\LoggerAwareTrait; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  | use TYPO3\CMS\Core\Core\Environment; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 |  |  | use TYPO3\CMS\Core\Utility\CommandUtility; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 29 |  |  | use TYPO3\CMS\Core\Utility\ExtensionManagementUtility; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 30 |  |  | use TYPO3\CMS\Core\Utility\GeneralUtility; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 32 |  |  | /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 33 |  |  |  * Executes another process via shell_exec() to include cli/bootstrap.php which in turn | 
            
                                                                                                            
                            
            
                                    
            
            
                | 34 |  |  |  * includes the index.php for frontend. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 35 |  |  |  */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 36 |  |  | class SubProcessExecutionStrategy implements LoggerAwareInterface, CrawlStrategy | 
            
                                                                                                            
                            
            
                                    
            
            
                | 37 |  |  | { | 
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  |     use LoggerAwareTrait; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 39 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  |     /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 41 |  |  |      * @var array | 
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  |      */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |     protected $extensionSettings; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 44 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Stores the configuration returned by the configuration provider.
                 *
                 * @param ExtensionConfigurationProvider|null $configurationProvider Provider to read the settings from;
                 *        when null, an instance is obtained through GeneralUtility::makeInstance().
                 */
                public function __construct(?ExtensionConfigurationProvider $configurationProvider = null)
                {
                    if ($configurationProvider === null) {
                        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
                    }

                    $configuration = $configurationProvider->getExtensionConfiguration();
                    if (! is_array($configuration)) {
                        // Anything that is not an array is treated as "no settings".
                        $configuration = [];
                    }

                    $this->extensionSettings = $configuration;
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 51 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Fetches a URL by running the crawler's cli/bootstrap.php in a separate PHP process.
                 *
                 * The frontend base path, the URL and the serialized, base64-encoded request headers
                 * are handed to the script as shell-escaped arguments. The output of the process is
                 * unserialized and returned.
                 *
                 * @param UriInterface $url URL to fetch; only scheme-less, http and https URLs are accepted
                 * @param string $crawlerId Sent as the X-T3crawler request header and added to the log context
                 * @return array|bool|mixed Unserialized output of the sub process; false when the URL cannot
                 *         be parsed, uses an unsupported scheme, or the sub process produced no output
                 */
                public function fetchUrlContents(UriInterface $url, string $crawlerId)
                {
                    $url = (string) $url;
                    $parsedUrl = parse_url($url);

                    // Without a component argument parse_url() returns either an array or false,
                    // so past this guard $parsedUrl is always an array.
                    if ($parsedUrl === false) {
                        $this->logger->debug(
                            sprintf('Could not parse_url() for string "%s"', $url),
                            ['crawlerId' => $crawlerId]
                        );
                        return false;
                    }

                    // parse_url() omits components that are not present in the URL. A missing scheme
                    // is treated like the empty scheme, which the allow-list below accepts.
                    $scheme = $parsedUrl['scheme'] ?? '';
                    if (! in_array($scheme, ['', 'http', 'https'], true)) {
                        $this->logger->debug(
                            sprintf('Scheme does not match for url "%s"', $url),
                            ['crawlerId' => $crawlerId]
                        );
                        return false;
                    }

                    $requestHeaders = $this->buildRequestHeaders($parsedUrl, $crawlerId);

                    $commandParts = [
                        ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php',
                        $this->getFrontendBasePath(),
                        $url,
                        base64_encode(serialize($requestHeaders)),
                    ];
                    $commandParts = CommandUtility::escapeShellArguments($commandParts);
                    $cmd = escapeshellcmd(PhpBinaryUtility::getPhpBinary());
                    $cmd .= ' ' . implode(' ', $commandParts);

                    $startTime = microtime(true);
                    $content = $this->executeShellCommand($cmd);
                    $this->logger->info($url . ' ' . (microtime(true) - $startTime));

                    // The sub process may yield no output at all. Passing a non-string to
                    // unserialize() would raise a TypeError under strict_types, so report failure.
                    if (! is_string($content)) {
                        $this->logger->debug(
                            sprintf('Sub process returned no output for url "%s"', $url),
                            ['crawlerId' => $crawlerId]
                        );
                        return false;
                    }

                    // NOTE(review): unserialize() can instantiate arbitrary classes. The payload is the
                    // output of the sub process started above; consider ['allowed_classes' => false]
                    // if that output is known to contain only arrays and scalars.
                    return unserialize($content);
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Builds the HTTP/1.0 request header lines for a parsed URL.
                 *
                 * @param array $url URL components in the shape returned by parse_url(); any component may be absent
                 * @param string $crawlerId Value of the X-T3crawler header
                 * @return string[] Header lines, beginning with the "GET ... HTTP/1.0" request line
                 */
                private function buildRequestHeaders(array $url, string $crawlerId): array
                {
                    // parse_url() leaves out components that are missing from the URL, so each one
                    // is defaulted here instead of being read unguarded.
                    $path = $url['path'] ?? '/';
                    $query = $url['query'] ?? '';
                    $host = $url['host'] ?? '';
                    $user = $url['user'] ?? '';
                    $pass = $url['pass'] ?? '';

                    // Compare against '' rather than relying on truthiness, so a query of "0" is kept.
                    $requestTarget = $path . ($query !== '' ? '?' . $query : '');

                    $reqHeaders = [
                        'GET ' . $requestTarget . ' HTTP/1.0',
                        'Host: ' . $host,
                        'Connection: close',
                    ];

                    // Basic auth is only sent when both user and password are non-empty.
                    if ($user !== '' && $pass !== '') {
                        $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
                    }

                    $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
                    $reqHeaders[] = 'User-Agent: TYPO3 crawler';

                    return $reqHeaders;
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 114 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Executes a shell command and returns the outputted result.
                 *
                 * shell_exec() reports a pipe that could not be established with false;
                 * that case is mapped to null here so the return value always honours
                 * the documented string|null contract.
                 *
                 * @param string $command Shell command to be executed. It is handed to the
                 *                        shell verbatim, so any escaping is up to the caller.
                 * @return string|null Outputted result of the command execution, or null
                 *                     when shell_exec() did not return a string
                 */
                private function executeShellCommand($command)
                {
                    $output = shell_exec($command);

                    // shell_exec() may yield string, null or false; only a string is real output.
                    return is_string($output) ? $output : null;
                }
            
                                                                                                            
                            
            
                                    
            
            
                | 125 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Resolves the base path under which the website frontend is served
                 * (e.g. for http://mydomain.com/cms/index.php the base path is "/cms/").
                 *
                 * Sources are consulted in this order, the first non-empty one wins:
                 *  1. the extension setting "frontendBasePath",
                 *  2. the absRefPrefix property of the global TSFE object,
                 *  3. outside CLI mode only: the TYPO3_SITE_PATH environment value.
                 * If none of them applies, "/" is used.
                 *
                 * @return string Base path in the form "/<pathSegments>/" (or just "/")
                 */
                private function getFrontendBasePath()
                {
                    if (! empty($this->extensionSettings['frontendBasePath'])) {
                        // An explicit extension setting takes precedence over everything else.
                        $basePath = $this->extensionSettings['frontendBasePath'];
                    } elseif (! empty($GLOBALS['TSFE']->absRefPrefix)) {
                        // Otherwise fall back to the absRefPrefix of the TSFE global.
                        $basePath = $GLOBALS['TSFE']->absRefPrefix;
                    } elseif (Environment::isCli()) {
                        // In CLI mode the default root path is kept.
                        $basePath = '/';
                    } else {
                        // Outside CLI mode the path is taken from the server environment.
                        $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
                    }

                    // The plain root path needs no normalisation.
                    if ($basePath === '/') {
                        return $basePath;
                    }

                    // Collapse leading slashes to one, then do the same at the end.
                    $withLeadingSlash = '/' . ltrim($basePath, '/');

                    return rtrim($withLeadingSlash, '/') . '/';
                }
            
                                                                                                            
                                                                
            
                                    
            
            
                | 156 |  |  | } | 
            
                                                        
            
                                    
            
            
                | 157 |  |  |  |