SubProcessExecutionStrategy   A
last analyzed

Complexity

Total Complexity 20

Size/Duplication

Total Lines 122
Duplicated Lines 0 %

Test Coverage

Coverage 8.62%

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 56
c 1
b 0
f 0
dl 0
loc 122
ccs 5
cts 58
cp 0.0862
rs 10
wmc 20

5 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 5 2
A buildRequestHeaders() 0 12 5
B getFrontendBasePath() 0 22 7
A executeShellCommand() 0 3 1
A fetchUrlContents() 0 45 5
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\CrawlStrategy;
6
7
/*
8
 * (c) 2020 AOE GmbH <[email protected]>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21
22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Utility\PhpBinaryUtility;
24
use Psr\Http\Message\UriInterface;
25
use Psr\Log\LoggerAwareInterface;
26
use Psr\Log\LoggerAwareTrait;
27
use TYPO3\CMS\Core\Core\Environment;
28
use TYPO3\CMS\Core\Utility\CommandUtility;
29
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
30
use TYPO3\CMS\Core\Utility\GeneralUtility;
31
32
/**
 * Crawl strategy that fetches a URL in a separate PHP process.
 *
 * Executes another process via shell_exec() to include cli/bootstrap.php which in turn
 * includes the index.php for frontend.
 */
class SubProcessExecutionStrategy implements LoggerAwareInterface, CrawlStrategy
{
    use LoggerAwareTrait;

    /**
     * Extension configuration as returned by the configuration provider;
     * an empty array when the provider did not return an array.
     *
     * @var array
     */
    protected $extensionSettings;

    /**
     * @param ExtensionConfigurationProvider|null $configurationProvider Provider to read the extension
     *        configuration from; when omitted an instance is created via GeneralUtility::makeInstance()
     */
    public function __construct(?ExtensionConfigurationProvider $configurationProvider = null)
    {
        $configurationProvider = $configurationProvider ?? GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];
    }

    /**
     * Fetches a URL by calling a shell script.
     *
     * Returns false when the URL cannot be parsed, when its scheme is missing or is neither
     * "http" nor "https", or when the sub process did not deliver any output.
     *
     * @param UriInterface $url URL to fetch
     * @param string $crawlerId Identifier sent to the sub process as "X-T3crawler" header
     * @return array|bool|mixed Unserialized output of the sub process, or false on failure
     */
    public function fetchUrlContents(UriInterface $url, string $crawlerId)
    {
        $url = (string) $url;
        $parsedUrl = parse_url($url);

        if ($parsedUrl === false) {
            $this->logDebug(sprintf('Could not parse_url() for string "%s"', $url), $crawlerId);
            return false;
        }

        // parse_url() omits the "scheme" key for scheme-less URLs; check for it explicitly
        // instead of triggering an "undefined index" notice. Such URLs are rejected as before.
        if (! isset($parsedUrl['scheme']) || ! in_array($parsedUrl['scheme'], ['', 'http', 'https'], true)) {
            $this->logDebug(sprintf('Scheme does not match for url "%s"', $url), $crawlerId);
            return false;
        }

        $requestHeaders = $this->buildRequestHeaders($parsedUrl, $crawlerId);

        // Arguments handed over to cli/bootstrap.php; each one is shell-escaped below.
        $commandParts = [
            ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php',
            $this->getFrontendBasePath(),
            $url,
            base64_encode(serialize($requestHeaders)),
        ];
        $commandParts = CommandUtility::escapeShellArguments($commandParts);
        $cmd = escapeshellcmd(PhpBinaryUtility::getPhpBinary());
        $cmd .= ' ' . implode(' ', $commandParts);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        // The logger is only available after setLogger() has been called.
        if ($this->logger !== null) {
            $this->logger->info($url . ' ' . (microtime(true) - $startTime));
        }

        // shell_exec() yields null or false when it fails or when the command printed nothing;
        // passing either to unserialize() would raise a TypeError under strict_types.
        if (! is_string($content)) {
            return false;
        }

        // NOTE(review): unserialize() can instantiate arbitrary classes. The payload here is the
        // output of the sub process started above - confirm it never relays untrusted data.
        return unserialize($content);
    }

    /**
     * Builds the raw HTTP request header lines that are passed to the sub process.
     *
     * @param array $url URL components as returned by parse_url()
     * @param string $crawlerId Value of the "X-T3crawler" header
     * @return array List of header lines, starting with the request line
     */
    private function buildRequestHeaders(array $url, string $crawlerId): array
    {
        // parse_url() leaves out components that are not part of the URL, so every key is
        // optional. A missing path falls back to "/" to keep the request line well-formed.
        $requestTarget = $url['path'] ?? '/';
        if (isset($url['query']) && $url['query'] !== '') {
            $requestTarget .= '?' . $url['query'];
        }

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $requestTarget . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        $reqHeaders[] = 'Connection: close';
        // Basic auth is only added when both user and password are present and non-empty.
        if (isset($url['user'], $url['pass']) && $url['user'] !== '' && $url['pass'] !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($url['user'] . ':' . $url['pass']);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }

    /**
     * Executes a shell command and returns the outputted result.
     *
     * @param string $command Shell command to be executed
     * @return string|false|null Outputted result of the command execution; shell_exec() returns
     *         false or null instead of a string when it fails or the command produced no output
     */
    private function executeShellCommand(string $command)
    {
        return shell_exec($command);
    }

    /**
     * Gets the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * @return string Base path of the website frontend, always starting and ending with "/"
     */
    private function getFrontendBasePath(): string
    {
        $frontendBasePath = '/';

        if (! empty($this->extensionSettings['frontendBasePath'])) {
            // Get the path from the extension settings:
            $frontendBasePath = $this->extensionSettings['frontendBasePath'];
        } elseif (! empty($GLOBALS['TSFE']->absRefPrefix)) {
            // If empty, try to use config.absRefPrefix:
            $frontendBasePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (! Environment::isCli()) {
            // If not in CLI mode the base path can be determined from $_SERVER environment:
            $frontendBasePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        // Configuration values are not guaranteed to be strings, but ltrim()/rtrim() below
        // require one under strict_types.
        $frontendBasePath = (string) $frontendBasePath;

        // Base path must be '/<pathSegments>/':
        if ($frontendBasePath !== '/') {
            $frontendBasePath = '/' . ltrim($frontendBasePath, '/');
            $frontendBasePath = rtrim($frontendBasePath, '/') . '/';
        }

        return $frontendBasePath;
    }

    /**
     * Writes a debug message with the crawler id as context, if a logger has been set.
     *
     * @param string $message Message to log
     * @param string $crawlerId Crawler id added to the log context as "crawlerId"
     */
    private function logDebug(string $message, string $crawlerId): void
    {
        if ($this->logger === null) {
            return;
        }

        $this->logger->debug($message, ['crawlerId' => $crawlerId]);
    }
}
160