AOEpeople /
crawler
| 1 | <?php |
||
| 2 | |||
| 3 | declare(strict_types=1); |
||
| 4 | |||
| 5 | namespace AOE\Crawler\CrawlStrategy; |
||
| 6 | |||
| 7 | /* |
||
| 8 | * (c) 2020 AOE GmbH <[email protected]> |
||
| 9 | * |
||
| 10 | * This file is part of the TYPO3 Crawler Extension. |
||
| 11 | * |
||
| 12 | * It is free software; you can redistribute it and/or modify it under |
||
| 13 | * the terms of the GNU General Public License, either version 2 |
||
| 14 | * of the License, or any later version. |
||
| 15 | * |
||
| 16 | * For the full copyright and license information, please read the |
||
| 17 | * LICENSE.txt file that was distributed with this source code. |
||
| 18 | * |
||
| 19 | * The TYPO3 project - inspiring people to share! |
||
| 20 | */ |
||
| 21 | |||
| 22 | use AOE\Crawler\Configuration\ExtensionConfigurationProvider; |
||
| 23 | use AOE\Crawler\Utility\PhpBinaryUtility; |
||
| 24 | use Psr\Http\Message\UriInterface; |
||
| 25 | use Psr\Log\LoggerAwareInterface; |
||
| 26 | use Psr\Log\LoggerAwareTrait; |
||
| 27 | use TYPO3\CMS\Core\Core\Environment; |
||
| 28 | use TYPO3\CMS\Core\Utility\CommandUtility; |
||
| 29 | use TYPO3\CMS\Core\Utility\ExtensionManagementUtility; |
||
| 30 | use TYPO3\CMS\Core\Utility\GeneralUtility; |
||
| 31 | |||
/**
 * Executes another process via shell_exec() to include cli/bootstrap.php which in turn
 * includes the index.php for frontend.
 */
class SubProcessExecutionStrategy implements LoggerAwareInterface, CrawlStrategy
{
    use LoggerAwareTrait;

    /**
     * Extension configuration as returned by the configuration provider
     * (an empty array when the provider did not return an array).
     *
     * @var array
     */
    protected $extensionSettings;

    /**
     * @param ExtensionConfigurationProvider|null $configurationProvider Provider to read the
     *        extension configuration from; when omitted one is created via GeneralUtility::makeInstance()
     */
    public function __construct(?ExtensionConfigurationProvider $configurationProvider = null)
    {
        $configurationProvider = $configurationProvider ?? GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];
    }

    /**
     * Fetches a URL by calling a shell script.
     *
     * The URL is validated (must be parseable and use no scheme, "http" or "https"),
     * then the PHP binary is invoked with cli/bootstrap.php, the frontend base path,
     * the URL and the base64-encoded, serialized request headers as arguments.
     *
     * @return array|bool|mixed The unserialized output of the sub process, or false when the
     *         URL is rejected or the sub process produced no output
     */
    public function fetchUrlContents(UriInterface $url, string $crawlerId)
    {
        $url = (string) $url;
        $parsedUrl = parse_url($url);

        if ($parsedUrl === false) {
            $this->logger->debug(
                sprintf('Could not parse_url() for string "%s"', $url),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // parse_url() omits the "scheme" key for scheme-less URLs; treat that as the
        // empty scheme, which the allow-list below explicitly permits.
        if (! in_array($parsedUrl['scheme'] ?? '', ['', 'http', 'https'], true)) {
            $this->logger->debug(
                sprintf('Scheme does not match for url "%s"', $url),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        $requestHeaders = $this->buildRequestHeaders($parsedUrl, $crawlerId);

        $commandParts = [
            ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php',
            $this->getFrontendBasePath(),
            $url,
            base64_encode(serialize($requestHeaders)),
        ];
        $commandParts = CommandUtility::escapeShellArguments($commandParts);
        $cmd = escapeshellcmd(PhpBinaryUtility::getPhpBinary());
        $cmd .= ' ' . implode(' ', $commandParts);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        // Logs the URL together with the elapsed wall-clock time in seconds.
        $this->logger->info($url . ' ' . (microtime(true) - $startTime));

        if ($content === null) {
            return false;
        }

        // NOTE(review): unserialize() is applied to the raw output of the sub process.
        // Consider restricting it via the "allowed_classes" option once the expected
        // payload shape is confirmed.
        return unserialize($content);
    }

    /**
     * Builds the raw HTTP request header lines handed over to the sub process.
     *
     * @param array $url Components as returned by parse_url(); any of them may be absent
     * @param string $crawlerId Value sent in the "X-T3crawler" header
     * @return array List of header lines, starting with the request line
     */
    private function buildRequestHeaders(array $url, string $crawlerId): array
    {
        // parse_url() only sets the keys that are present in the URL, so every
        // component needs a fallback. A missing path is requested as "/".
        $path = $url['path'] ?? '/';
        $query = $url['query'] ?? '';
        $host = $url['host'] ?? '';

        $reqHeaders = [];
        // Compare against '' instead of relying on truthiness so a query of "0" is kept.
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;
        $reqHeaders[] = 'Connection: close';
        if (isset($url['user'], $url['pass']) && $url['user'] !== '' && $url['pass'] !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($url['user'] . ':' . $url['pass']);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';
        return $reqHeaders;
    }

    /**
     * Executes a shell command and returns the outputted result.
     *
     * @param string $command Shell command to be executed
     * @return string|null Outputted result of the command execution, null when
     *         shell_exec() did not return a string
     */
    private function executeShellCommand(string $command): ?string
    {
        $output = shell_exec($command);

        // shell_exec() may return false or null instead of a string; normalise both
        // to null so the caller never passes a non-string on to unserialize().
        return is_string($output) ? $output : null;
    }

    /**
     * Gets the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * Lookup order: the "frontendBasePath" extension setting, then
     * $GLOBALS['TSFE']->absRefPrefix, then (outside CLI) TYPO3_SITE_PATH; "/" otherwise.
     *
     * @return string Base path of the website frontend
     */
    private function getFrontendBasePath()
    {
        $frontendBasePath = '/';

        if (! empty($this->extensionSettings['frontendBasePath'])) {
            // Get the path from the extension settings:
            $frontendBasePath = (string) $this->extensionSettings['frontendBasePath'];
        } elseif (! empty($GLOBALS['TSFE']->absRefPrefix)) {
            // If empty, try to use config.absRefPrefix:
            $frontendBasePath = (string) $GLOBALS['TSFE']->absRefPrefix;
        } elseif (! Environment::isCli()) {
            // If not in CLI mode the base path can be determined from $_SERVER environment:
            $frontendBasePath = (string) GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        // Base path must be '/<pathSegments>/':
        if ($frontendBasePath !== '/') {
            $frontendBasePath = '/' . ltrim($frontendBasePath, '/');
            $frontendBasePath = rtrim($frontendBasePath, '/') . '/';
        }

        return $frontendBasePath;
    }
}
||
| 160 |
This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.
This is most likely a typographical error or the method has been renamed.