1 | <?php |
||
2 | |||
3 | declare(strict_types=1); |
||
4 | |||
5 | namespace AOE\Crawler\CrawlStrategy; |
||
6 | |||
7 | /* |
||
8 | * (c) 2020 AOE GmbH <[email protected]> |
||
9 | * |
||
10 | * This file is part of the TYPO3 Crawler Extension. |
||
11 | * |
||
12 | * It is free software; you can redistribute it and/or modify it under |
||
13 | * the terms of the GNU General Public License, either version 2 |
||
14 | * of the License, or any later version. |
||
15 | * |
||
16 | * For the full copyright and license information, please read the |
||
17 | * LICENSE.txt file that was distributed with this source code. |
||
18 | * |
||
19 | * The TYPO3 project - inspiring people to share! |
||
20 | */ |
||
21 | |||
22 | use AOE\Crawler\Configuration\ExtensionConfigurationProvider; |
||
23 | use AOE\Crawler\Utility\PhpBinaryUtility; |
||
24 | use Psr\Http\Message\UriInterface; |
||
25 | use Psr\Log\LoggerAwareInterface; |
||
26 | use Psr\Log\LoggerAwareTrait; |
||
27 | use TYPO3\CMS\Core\Core\Environment; |
||
28 | use TYPO3\CMS\Core\Utility\CommandUtility; |
||
29 | use TYPO3\CMS\Core\Utility\ExtensionManagementUtility; |
||
30 | use TYPO3\CMS\Core\Utility\GeneralUtility; |
||
31 | |||
/**
 * Crawl strategy that fetches a URL in a separate PHP process.
 *
 * The sub process is started via shell_exec() and runs cli/bootstrap.php of the
 * crawler extension, which in turn includes the index.php for frontend.
 */
class SubProcessExecutionStrategy implements LoggerAwareInterface, CrawlStrategy
{
    use LoggerAwareTrait;

    /**
     * Extension configuration as returned by the configuration provider;
     * an empty array when the provider did not return an array.
     *
     * @var array
     */
    protected $extensionSettings;

    /**
     * @param ExtensionConfigurationProvider|null $configurationProvider Provider to read the extension
     *        configuration from; when omitted an instance is created via GeneralUtility::makeInstance()
     */
    public function __construct(?ExtensionConfigurationProvider $configurationProvider = null)
    {
        $configurationProvider = $configurationProvider
            ?? GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];
    }

    /**
     * Fetches a URL by calling a shell script.
     *
     * @param UriInterface $url URL to fetch; only scheme-less, http and https URLs are processed
     * @param string $crawlerId Value sent to the sub process in the X-T3crawler request header
     * @return array|bool|mixed false when the URL cannot be parsed, its scheme is not accepted or the
     *         sub process produced no output; otherwise the unserialized output of the sub process
     */
    public function fetchUrlContents(UriInterface $url, string $crawlerId)
    {
        $url = (string) $url;
        $parsedUrl = parse_url($url);

        if ($parsedUrl === false) {
            $this->logDebug(sprintf('Could not parse_url() for string "%s"', $url), $crawlerId);
            return false;
        }

        // parse_url() omits the "scheme" key for scheme-less URLs, so fall back to ''
        // to let them match the '' entry of the allow-list.
        if (! in_array($parsedUrl['scheme'] ?? '', ['', 'http', 'https'], true)) {
            $this->logDebug(sprintf('Scheme does not match for url "%s"', $url), $crawlerId);
            return false;
        }

        $requestHeaders = $this->buildRequestHeaders($parsedUrl, $crawlerId);

        // Every argument is shell-escaped individually before the command line is assembled.
        $commandParts = [
            ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php',
            $this->getFrontendBasePath(),
            $url,
            base64_encode(serialize($requestHeaders)),
        ];
        $commandParts = CommandUtility::escapeShellArguments($commandParts);
        $cmd = escapeshellcmd(PhpBinaryUtility::getPhpBinary());
        $cmd .= ' ' . implode(' ', $commandParts);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        // The logger is only available after setLogger() has been called.
        if ($this->logger !== null) {
            $this->logger->info($url . ' ' . (microtime(true) - $startTime));
        }

        if ($content === null) {
            return false;
        }

        // NOTE(review): unserialize() is applied to the raw output of the sub process. This is only
        // safe as long as that output cannot be influenced by untrusted input - confirm, and consider
        // restricting "allowed_classes" if the result is known to contain no objects.
        return unserialize($content);
    }

    /**
     * Builds the raw HTTP request header lines that are handed over to the sub process.
     *
     * @param array $url URL components as returned by parse_url()
     * @param string $crawlerId Value of the X-T3crawler header
     * @return array List of header lines, starting with the request line
     */
    private function buildRequestHeaders(array $url, string $crawlerId): array
    {
        // parse_url() only returns the components that are present in the URL.
        // An absent path is requested as "/" so that the request line stays well-formed.
        $path = $url['path'] ?? '/';
        $query = isset($url['query']) && $url['query'] !== '' ? '?' . $url['query'] : '';

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . $query . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        $reqHeaders[] = 'Connection: close';
        if (isset($url['user'], $url['pass']) && $url['user'] !== '' && $url['pass'] !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($url['user'] . ':' . $url['pass']);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';
        return $reqHeaders;
    }

    /**
     * Executes a shell command and returns the outputted result.
     *
     * @param string $command Shell command to be executed
     * @return string|null Outputted result of the command execution, null when there is none
     */
    private function executeShellCommand(string $command): ?string
    {
        $output = shell_exec($command);

        // shell_exec() may return null or false instead of a string; report both as "no output".
        return is_string($output) ? $output : null;
    }

    /**
     * Gets the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * @return string Base path of the website frontend
     */
    private function getFrontendBasePath(): string
    {
        $frontendBasePath = '/';

        // Get the path from the extension settings:
        if (! empty($this->extensionSettings['frontendBasePath'])) {
            $frontendBasePath = $this->extensionSettings['frontendBasePath'];
        // If empty, try to use config.absRefPrefix:
        } elseif (! empty($GLOBALS['TSFE']->absRefPrefix)) {
            $frontendBasePath = $GLOBALS['TSFE']->absRefPrefix;
        // If not in CLI mode the base path can be determined from $_SERVER environment:
        } elseif (! Environment::isCli()) {
            $frontendBasePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        // Base path must be '/<pathSegments>/':
        $pathSegments = trim((string) $frontendBasePath, '/');

        return $pathSegments === '' ? '/' : '/' . $pathSegments . '/';
    }

    /**
     * Writes a debug log entry tagged with the crawler id, if a logger has been set.
     */
    private function logDebug(string $message, string $crawlerId): void
    {
        if ($this->logger !== null) {
            $this->logger->debug($message, ['crawlerId' => $crawlerId]);
        }
    }
}
||
160 |
This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.
This is most likely a typographical error or the method has been renamed.