Test Failed
Push — master ( 766a39...696a12 )
by Dispositif
09:33
created

ExternHttpClient::normalizeHtml()   B

Complexity

Conditions 7
Paths 8

Size

Total Lines 29
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 56

Importance

Changes 0
Metric Value
cc 7
eloc 17
c 0
b 0
f 0
nc 8
nop 2
dl 0
loc 29
ccs 0
cts 0
cp 0
crap 56
rs 8.8333
1
<?php
2
3
/*
4
 * This file is part of dispositif/wikibot application (@github)
5
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
6
 * For the full copyright and MIT license information, view the license file.
7
 */
8
declare(strict_types=1);
9
10
namespace App\Application\Http;
11
12
use DomainException;
13
use GuzzleHttp\Client;
14
use Normalizer;
15
use Psr\Log\LoggerInterface;
16
use Throwable;
17
18
/**
 * HTTP client for fetching external web pages with Guzzle.
 *
 * Downloads the body of an http(s) URL and can optionally return it as
 * Unicode-normalized UTF-8 text.
 */
class ExternHttpClient implements HttpClientInterface
{
    /**
     * Guzzle client used for every outgoing request.
     *
     * @var Client
     */
    private $client;
    /**
     * Optional logger for HTTP errors; nothing is logged when null.
     *
     * @var LoggerInterface|null
     */
    private $log;

    /**
     * Create the Guzzle client with a 30 second timeout, redirects enabled
     * and the User-Agent taken from the USER_AGENT environment variable.
     *
     * @param LoggerInterface|null $log logger receiving HTTP error messages
     */
    public function __construct(?LoggerInterface $log = null)
    {
        $this->log = $log;
        $this->client = new Client(
            [
                'timeout' => 30,
                'allow_redirects' => true,
                // NOTE(review): getenv() returns false when USER_AGENT is not set.
                'headers' => ['User-Agent' => getenv('USER_AGENT')],
                // NOTE(review): 'verify' => false turns off TLS certificate verification
                // for every request; kept as-is, confirm this is intended.
                'verify' => false, // CURLOPT_SSL_VERIFYHOST
                //                'proxy'           => '192.168.16.1:10',
            ]
        );
    }

    /**
     * Import source from URL with Guzzle.
     * todo abstract + refac async request
     *
     * Returns null (after echoing and logging the status) when the response
     * status is not 200.
     * NOTE(review): with Guzzle's default `http_errors` option, 4xx/5xx
     * responses presumably raise an exception from get() before the status
     * check is reached — confirm against the Guzzle version in use.
     *
     * @param string    $url        absolute http:// or https:// URL
     * @param bool|null $normalized when true, the body is passed through normalizeHtml()
     *
     * @return string|null response body, or null on a non-200 status
     *
     * @throws DomainException when $url is not an http(s) URL, or when normalization fails
     */
    public function getHTML(string $url, ?bool $normalized = false): ?string
    {
        // todo : check banned domains ?
        // todo : check DNS record => ban ?
        // todo : accept non-ascii URL ?
        // idn_to_ascii($url);
        // idn_to_ascii('teßt.com',IDNA_NONTRANSITIONAL_TO_ASCII,INTL_IDNA_VARIANT_UTS46)
        // checkdnsrr($string, "A") // check DNS record
        if (!self::isHttpURL($url)) {
            throw new DomainException('URL not compatible : '.$url);
        }
        $response = $this->client->get($url);

        $status = $response->getStatusCode();
        if (200 !== $status) {
            // NOTE(review): writes to standard output from library code; kept for
            // backward compatibility with existing console output.
            echo 'HTTP error '.$status;
            if ($this->log !== null) {
                $this->log->error('HTTP error '.$status.' '.$response->getReasonPhrase());
            }

            return null;
        }

        // getContents() already returns a string: no cast or null-coalescing needed.
        $html = $response->getBody()->getContents();

        return $normalized ? $this->normalizeHtml($html, $url) : $html;
    }

    /**
     * Tell whether the string looks like an absolute http:// or https:// URL
     * (scheme matched case-insensitively, no space character allowed).
     *
     * @param string $url
     *
     * @return bool
     */
    public static function isHttpURL(string $url): bool
    {
        //$url = filter_var($url, FILTER_SANITIZE_URL); // strip "é" !!!
        // FILTER_VALIDATE_URL is restricted to ASCII characters: it returns false
        // when the URL contains "é" (not multibyte capable), hence the regex.
        // !filter_var($url, FILTER_VALIDATE_URL)
        return 1 === preg_match('#^https?://[^ ]+$#i', $url);
    }

    /**
     * Normalize and convert to UTF-8 encoding.
     *
     * The text is first Unicode-normalized as-is. When that fails, it is
     * converted to UTF-8 from the charset found by extractCharset()
     * (WINDOWS-1252 when none is found) and normalized again.
     *
     * @param string      $html
     * @param string|null $url only used in the exception message
     *
     * @return string|null normalized text; '' when the final normalization fails
     *
     * @throws DomainException when the charset conversion fails
     */
    private function normalizeHtml(string $html, ?string $url = ''): ?string
    {
        if ('' === $html) {
            return $html;
        }

        $normalized = Normalizer::normalize($html);
        if (is_string($normalized) && '' !== $normalized) {
            return $normalized;
        }

        // extractCharset() returns either a non-empty string or null, so the
        // charset is never empty after this fallback.
        $charset = $this->extractCharset($html) ?? 'WINDOWS-1252';

        try {
            $converted = iconv($charset, 'UTF-8//TRANSLIT', $html);
        } catch (Throwable $e) {
            // Only reached when an error handler turns iconv warnings into exceptions.
            throw new DomainException("error converting : $charset to UTF-8 : ".$url, (int) $e->getCode(), $e);
        }

        // iconv() signals failure by returning false: report it explicitly
        // instead of relying on a TypeError raised by the next call.
        if (false === $converted) {
            throw new DomainException("error converting : $charset to UTF-8 : ".$url);
        }

        $normalized = Normalizer::normalize($converted);

        return is_string($normalized) ? $normalized : '';
    }

    /**
     * Extract charset from HTML text.
     *
     * Looks for a charset declaration in a <meta> tag, then falls back to
     * strict mb_detect_encoding() detection (upper-cased result).
     *
     * @param string $html
     *
     * @return string|null non-empty charset name, or null when nothing was found
     */
    private function extractCharset(string $html): ?string
    {
        $charset = null;

        if (preg_match(
            '#<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s"\']*)?([^>]*?)[\s"\';]*charset\s*=[\s"\']*([^\s"\'/>]*)#',
            $html,
            $matches
        )
        ) {
            // Group 2 holds the charset value; it is always set on a match
            // (possibly as an empty string).
            $charset = $matches[2] ?? null;
        }

        if (empty($charset)) {
            $encoding = mb_detect_encoding($html, null, true);
            // mb_detect_encoding() returns false when no encoding is detected.
            $charset = is_string($encoding) ? strtoupper($encoding) : null;
        }

        return $charset;
    }
}
153