Passed
Push — master ( cd664b...7baf30 )
by Dispositif
13:51
created

ExternHttpClient::getClient()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
ccs 0
cts 0
cp 0
cc 1
nc 1
nop 0
crap 2
1
<?php
2
3
/*
4
 * This file is part of dispositif/wikibot application (@github)
5
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
6
 * For the full copyright and MIT license information, view the license file.
7
 */
8
declare(strict_types=1);
9
10
namespace App\Application\Http;
11
12
use App\Domain\InfrastructurePorts\ExternHttpClientInterface;
13
use DomainException;
14
use GuzzleHttp\Client;
15
use Normalizer;
16
use Psr\Log\LoggerInterface;
17
use Psr\Log\NullLogger;
18
use Throwable;
19
20
/**
 * Http client (Guzzle) configured for web crawling.
 *
 * Wraps a preconfigured Guzzle client and provides helpers to download a page
 * and, on request, convert its content to normalized UTF-8.
 */
class ExternHttpClient implements ExternHttpClientInterface
{
    /** Charset assumed when none can be extracted from the markup nor detected. */
    private const DEFAULT_CHARSET = 'WINDOWS-1252';

    /** Guzzle client shared by every request made through this class. */
    private readonly Client $client;

    /**
     * @param LoggerInterface $log receives an error entry when a response status is not 200
     */
    public function __construct(private readonly LoggerInterface $log = new NullLogger())
    {
        $options = [
            'timeout' => 30,
            'allow_redirects' => true,
            // NOTE(review): disables TLS certificate verification (CURLOPT_SSL_VERIFYHOST).
            // Kept as-is because it looks deliberate for crawling; confirm it is still wanted.
            'verify' => false,
            //                'proxy'           => '192.192.192.192:10',
        ];

        // getenv() returns false when the variable is not defined. Only set the header when a
        // real value exists, so that Guzzle falls back to its own default User-Agent instead
        // of sending an empty one.
        $userAgent = getenv('USER_AGENT');
        if (is_string($userAgent) && '' !== trim($userAgent)) {
            $options['headers'] = ['User-Agent' => $userAgent];
        }

        $this->client = new Client($options);
    }

    /**
     * Expose the underlying Guzzle client.
     *
     * Hack for WikiwixAdapter. todo: rather provide a request() compatible with ClientInterface?
     */
    public function getClient(): Client
    {
        return $this->client;
    }

    /**
     * Import source from URL with Guzzle.
     * todo abstract + refac async request
     *
     * Exceptions thrown by the Guzzle client are not caught here.
     *
     * @param string    $url        URL to download; must satisfy isHttpURL()
     * @param bool|null $normalized when true, the body goes through normalizeHtml()
     *
     * @return string|null the response body, or null when the status code is not 200
     *
     * @throws DomainException when $url is rejected by isHttpURL() or when normalization fails
     */
    public function getHTML(string $url, ?bool $normalized = false): ?string
    {
        // todo : check banned domains ?
        // todo : check DNS record => ban ?
        // todo : accept non-ascii URL ?
        // idn_to_ascii($url);
        // idn_to_ascii('teßt.com',IDNA_NONTRANSITIONAL_TO_ASCII,INTL_IDNA_VARIANT_UTS46)
        // checkdnsrr($string, "A") // check DNS record
        if (!self::isHttpURL($url)) {
            throw new DomainException('URL not compatible : ' . $url);
        }

        $response = $this->client->get($url);
        $statusCode = $response->getStatusCode();

        if (200 !== $statusCode) {
            // NOTE(review): the echo duplicates the log entry; kept so console output is unchanged.
            echo 'HTTP error ' . $statusCode;
            $this->log->error('HTTP error ' . $statusCode . ' ' . $response->getReasonPhrase());

            return null;
        }

        // getContents() always returns a string, so no null fallback is needed.
        $html = $response->getBody()->getContents();

        return $normalized ? $this->normalizeHtml($html, $url) : $html;
    }

    /**
     * Tell whether $url starts with "http://" or "https://" (case-insensitive) and is followed
     * by at least one character, none of them being a space.
     *
     * Todo Move HttpUtils ?
     * Better than filter_var($url, FILTER_VALIDATE_URL) because it's not multibyte capable.
     * See for example .中国 domain name
     */
    public static function isHttpURL(string $url): bool
    {
        return (bool)preg_match('#^https?://[^ ]+$#i', $url);
    }

    /**
     * Normalize and convert to UTF-8 encoding.
     *
     * The input is first passed to Normalizer::normalize(). When that does not yield a
     * non-empty string, the content is converted with iconv() from the charset given by
     * extractCharset() (or DEFAULT_CHARSET) and normalized again.
     *
     * @param string      $html raw content
     * @param string|null $url  only used to build error messages
     *
     * @return string|null the normalized content; an empty input is returned unchanged
     *
     * @throws DomainException when the conversion or the final normalization fails
     */
    private function normalizeHtml(string $html, ?string $url = ''): ?string
    {
        if (empty($html)) {
            return $html;
        }

        // Normalizer::normalize() returns false on failure (e.g. input that is not valid UTF-8).
        $normalized = Normalizer::normalize($html);
        if (is_string($normalized) && !empty($normalized)) {
            return $normalized;
        }

        $charset = $this->extractCharset($html) ?? self::DEFAULT_CHARSET;

        try {
            $converted = iconv($charset, 'UTF-8//TRANSLIT', $html);
            if (false === $converted) {
                throw new DomainException("error iconv : $charset to UTF-8 on " . $url);
            }
            $normalized = Normalizer::normalize($converted);
            if (!is_string($normalized)) {
                throw new DomainException("error normalizer : $charset to UTF-8 on " . $url);
            }
        } catch (Throwable $e) {
            // Every failure, including the two exceptions thrown just above, is reported with
            // the same message; the specific cause stays available as the previous exception.
            throw new DomainException(
                "error converting : $charset to UTF-8 on " . $url,
                (int) $e->getCode(),
                $e
            );
        }

        return $normalized;
    }

    /**
     * Extract charset from HTML text.
     *
     * Looks for a charset declaration inside a <meta> tag (tag and attribute names are matched
     * case-insensitively); when none is found, falls back to mb_detect_encoding() in strict mode.
     *
     * @return string|null the declared charset as written in the markup, or the detected
     *                     encoding in upper case, or null when nothing could be determined
     */
    private function extractCharset(string $html): ?string
    {
        $charset = null;

        if (1 === preg_match(
            '#<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s"\']*)?([^>]*?)[\s"\';]*charset\s*=[\s"\']*([^\s"\'/>]*)#i',
            $html,
            $matches
        )) {
            // Group 2 is the charset value. Group 1 only holds the text preceding "charset"
            // (e.g. "text/html") and must not be used as a charset.
            $charset = $matches[2];
        }

        if (empty($charset)) {
            $encoding = mb_detect_encoding($html, null, true);
            $charset = is_string($encoding) ? strtoupper($encoding) : null;
        }

        return $charset;
    }
}
144