Passed
Push — master ( 077fb2...0549f4 )
by Dispositif
08:22
created

ExternHttpClient::normalizeHtml()   B

Complexity

Conditions 7
Paths 8

Size

Total Lines 28
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 56

Importance

Changes 0
Metric Value
cc 7
eloc 16
c 0
b 0
f 0
nc 8
nop 2
dl 0
loc 28
ccs 0
cts 0
cp 0
crap 56
rs 8.8333
1
<?php
2
3
/*
4
 * This file is part of dispositif/wikibot application (@github)
5
 * 2019/2020 © Philippe/Irønie  <[email protected]>
6
 * For the full copyright and MIT license information, view the license file.
7
 */
8
declare(strict_types=1);
9
10
namespace App\Application\Http;
11
12
use DomainException;
13
use GuzzleHttp\Client;
14
use Normalizer;
15
use Psr\Log\LoggerInterface;
16
use Throwable;
17
18
/**
 * HTTP client retrieving the HTML source of external web pages with Guzzle,
 * with optional Unicode normalization / conversion of the result to UTF-8.
 */
class ExternHttpClient implements HttpClientInterface
{
    /**
     * Charset assumed when none can be extracted or detected from the document.
     */
    private const DEFAULT_CHARSET = 'WINDOWS-1252';

    /**
     * @var Client
     */
    private $client;

    /**
     * @var LoggerInterface|null
     */
    private $log;

    /**
     * @param LoggerInterface|null $log Optional logger receiving HTTP error reports.
     */
    public function __construct(?LoggerInterface $log = null)
    {
        $this->log = $log;
        $this->client = new Client(
            [
                'timeout' => 30,
                'allow_redirects' => true,
                // NOTE(review): getenv() returns false when USER_AGENT is not defined — confirm it is always set.
                'headers' => ['User-Agent' => getenv('USER_AGENT')],
                // NOTE(review): disables TLS certificate verification (CURLOPT_SSL_VERIFYHOST) — confirm intentional.
                'verify' => false,
                //                'proxy'           => '192.168.16.1:10',
            ]
        );
    }

    /**
     * Import the source of a web page from its URL with Guzzle.
     * todo abstract + refac async request
     *
     * Exceptions raised by the Guzzle client itself are not caught here.
     *
     * @param string    $url        Absolute http(s) URL of the page.
     * @param bool|null $normalized When truthy, the body is normalized/converted to UTF-8.
     *
     * @return string|null The response body, or null when the status code is not 200.
     *
     * @throws DomainException When the URL is rejected by isWebURL() or the charset conversion fails.
     */
    public function getHTML(string $url, ?bool $normalized = false): ?string
    {
        // todo : check banned domains ?
        // todo : check DNS record (checkdnsrr($host, "A")) => ban ?
        // todo : accept non-ascii URL ? (idn_to_ascii with IDNA_NONTRANSITIONAL_TO_ASCII, INTL_IDNA_VARIANT_UTS46)
        if (!self::isWebURL($url)) {
            throw new DomainException('URL not compatible : '.$url);
        }

        $response = $this->client->get($url);
        $statusCode = $response->getStatusCode();

        if (200 !== $statusCode) {
            // Console feedback is kept as-is, independently of the optional logger.
            echo 'HTTP error '.$statusCode;
            if ($this->log) {
                $this->log->error('HTTP error '.$statusCode.' '.$response->getReasonPhrase());
            }

            return null;
        }

        // The cast already guarantees a string, so no null-coalescing fallback is needed.
        $html = (string) $response->getBody()->getContents();

        return $normalized ? $this->normalizeHtml($html, $url) : $html;
    }

    /**
     * Tell whether the string looks like an http:// or https:// URL without any space character.
     *
     * filter_var() is deliberately not used: FILTER_SANITIZE_URL strips characters such as "é",
     * and FILTER_VALIDATE_URL is restricted to ASCII characters (it returns false for an URL
     * containing "é" / it is not multibyte capable).
     *
     * @param string $url
     *
     * @return bool
     */
    public static function isWebURL(string $url): bool
    {
        // preg_match() gives 1 on match, 0 on no match and false on error: only 1 is accepted.
        return 1 === preg_match('#^https?://[^ ]+$#i', $url);
    }

    /**
     * Normalize (Unicode normalization) and, when needed, convert the HTML to UTF-8.
     *
     * When the raw input cannot be normalized, it is converted to UTF-8 from the charset
     * found by extractCharset() (DEFAULT_CHARSET when none is found), then normalized.
     *
     * @param string      $html
     * @param string|null $url Only used to build the error message.
     *
     * @return string|null The normalized HTML; '' when the converted text still cannot be normalized.
     *
     * @throws DomainException When the conversion to UTF-8 fails.
     */
    private function normalizeHtml(string $html, ?string $url = ''): ?string
    {
        if (empty($html)) {
            return $html;
        }

        $normalized = Normalizer::normalize($html);
        if (is_string($normalized) && !empty($normalized)) {
            return $normalized;
        }

        // Never empty: extractCharset() gives a non-empty string or null.
        $charset = $this->extractCharset($html) ?? self::DEFAULT_CHARSET;
        $errorMessage = "error converting : $charset to UTF-8 : ".$url;

        try {
            $converted = iconv($charset, 'UTF-8//TRANSLIT', $html);
        } catch (Throwable $e) {
            // Reached when an error handler turns iconv warnings/notices into exceptions.
            throw new DomainException($errorMessage, 0, $e);
        }

        // iconv() returns false on failure (unknown charset, illegal byte sequence).
        if (!is_string($converted)) {
            throw new DomainException($errorMessage);
        }

        $normalized = Normalizer::normalize($converted);

        return is_string($normalized) ? $normalized : '';
    }

    /**
     * Extract the charset declared in a <meta> tag of the HTML text, falling back
     * to mb_detect_encoding() when no declaration is found.
     *
     * @param string $html
     *
     * @return string|null A non-empty charset name, or null when nothing could be determined.
     */
    private function extractCharset(string $html): ?string
    {
        $charset = null;

        // Case-insensitive: HTML tag and attribute names may be written in upper case.
        $found = preg_match(
            '#<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s"\']*)?([^>]*?)[\s"\';]*charset\s*=[\s"\']*([^\s"\'/>]*)#i',
            $html,
            $matches
        );
        if ($found) {
            // Group 2 holds the charset value; it may be an empty string.
            $charset = $matches[2];
        }

        if (empty($charset)) {
            $encoding = mb_detect_encoding($html, mb_detect_order(), true);
            $charset = is_string($encoding) ? strtoupper($encoding) : null;
        }

        return $charset;
    }
}
155