Passed
Push — master ( 2432f8...918912 )
by Dispositif
02:37
created

ExternHttpClient::extractCharset()   A

Complexity

Conditions 4
Paths 6

Size

Total Lines 17
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 20

Importance

Changes 0
Metric Value
cc 4
eloc 9
nc 6
nop 1
dl 0
loc 17
ccs 0
cts 0
cp 0
crap 20
rs 9.9666
c 0
b 0
f 0
1
<?php
2
3
/*
4
 * This file is part of dispositif/wikibot application (@github)
5
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
6
 * For the full copyright and MIT license information, view the license file.
7
 */
8
declare(strict_types=1);
9
10
namespace App\Application\Http;
11
12
use App\Application\Utils\HttpUtil;
13
use App\Domain\InfrastructurePorts\ExternHttpClientInterface;
14
use DomainException;
15
use GuzzleHttp\Client;
16
use Psr\Log\LoggerInterface;
17
use Psr\Log\NullLogger;
18
19
/**
 * Http client (Guzzle) configured for web crawling.
 *
 * TODO refac as a factory for Tor client or normal Guzzle client.
 */
class ExternHttpClient implements ExternHttpClientInterface
{
    private readonly Client $client;

    /**
     * Builds the underlying Guzzle client with a 30 second timeout and
     * redirect following enabled.
     *
     * todo : inject Tor client
     *
     * @param LoggerInterface $log Receives error/debug messages emitted by getHTML().
     */
    public function __construct(private readonly LoggerInterface $log = new NullLogger())
    {
        $options = [
            'timeout' => 30,
            'allow_redirects' => true,
            // Disables TLS certificate verification (CURLOPT_SSL_VERIFYPEER / CURLOPT_SSL_VERIFYHOST).
            // NOTE(review): presumably deliberate so that misconfigured sites can still be crawled — confirm.
            'verify' => false,
            // 'proxy' => '192.192.192.192:10',
        ];

        // getenv() returns false when USER_AGENT is not defined; only override
        // the User-Agent header when a real, non-empty value is available.
        $userAgent = getenv('USER_AGENT');
        if (is_string($userAgent) && '' !== $userAgent) {
            $options['headers'] = ['User-Agent' => $userAgent];
        }

        $this->client = new Client($options);
    }

    /**
     * Exposes the raw Guzzle client.
     *
     * Hack for WikiwixAdapter. todo : rather a request() method compatible with ClientInterface ?
     */
    public function getClient(): Client
    {
        return $this->client;
    }

    /**
     * Import source from URL with Guzzle.
     * todo abstract + refac async request
     *
     * @param string    $url        URL to fetch; must be accepted by HttpUtil::isHttpURL().
     * @param bool|null $normalized When truthy, the body is passed through HttpUtil::normalizeHtml().
     *
     * @return string|null The response body, or null when the status code is not 200
     *                     or the response is declared as application/pdf.
     *
     * @throws DomainException When the URL is rejected by HttpUtil::isHttpURL().
     */
    public function getHTML(string $url, ?bool $normalized = false): ?string
    {
        // todo : check banned domains ?
        // todo : check DNS record => ban ?
        // todo : accept non-ascii URL ?
        // idn_to_ascii($url);
        // idn_to_ascii('teßt.com',IDNA_NONTRANSITIONAL_TO_ASCII,INTL_IDNA_VARIANT_UTS46)
        // checkdnsrr($string, "A") // check DNS record
        if (!HttpUtil::isHttpURL($url)) {
            throw new DomainException('URL not compatible : ' . $url);
        }

        // NOTE(review): 'http_errors' is not overridden in the constructor, so with Guzzle's
        // default setting 4xx/5xx responses presumably raise an exception here — confirm.
        $response = $this->client->get($url);

        if (200 !== $response->getStatusCode()) {
            echo 'HTTP error ' . $response->getStatusCode();
            $this->log->error('HTTP error ' . $response->getStatusCode() . ' ' . $response->getReasonPhrase());

            return null;
        }

        // A response may carry no Content-Type header at all: fall back to an empty
        // string instead of reading a missing array offset.
        $contentType = $response->getHeader('Content-Type')[0] ?? '';
        // Media types are case-insensitive and may be surrounded by whitespace
        // around the ';' parameter separator.
        $mediaTypes = array_map(
            static fn (string $part): string => strtolower(trim($part)),
            explode(';', $contentType)
        );
        if (in_array('application/pdf', $mediaTypes, true)) {
            $this->log->debug('Incompatible application/pdf content-type');

            return null;
        }

        $html = $response->getBody()->getContents();

        return $normalized ? HttpUtil::normalizeHtml($html, $url) : $html;
    }
}
82