Passed
Push — master ( dafac1...9ad278 )
by Dispositif
08:36
created

ExternHttpClient   A

Complexity

Total Complexity 19

Size/Duplication

Total Lines 131
Duplicated Lines 0 %

Test Coverage

Coverage 20%

Importance

Changes 1
Bugs 0 Features 1
Metric Value
eloc 45
dl 0
loc 131
ccs 4
cts 20
cp 0.2
rs 10
c 1
b 0
f 1
wmc 19

5 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 9 1
A isWebURL() 0 10 3
A getHTML() 0 25 5
A extractCharset() 0 15 4
A normalizeHtml() 0 26 6
1
<?php
2
3
/**
4
 * This file is part of dispositif/wikibot application (@github)
5
 * 2019/2020 © Philippe M. <[email protected]>
6
 * For the full copyright and MIT license information, please view the license file.
7
 */
8
declare(strict_types=1);
9
10
namespace App\Application\Http;
11
12
use DomainException;
13
use GuzzleHttp\Client;
14
use Normalizer;
15
use Psr\Log\LoggerInterface;
16
use Throwable;
17
18
/**
 * HTTP client used to download the HTML source of external web pages.
 *
 * Wraps a Guzzle client configured with a 30 second timeout and redirect
 * following, and can optionally convert the downloaded document to
 * normalized UTF-8.
 */
class ExternHttpClient implements HttpClientInterface
{
    /**
     * Charset assumed when none can be extracted from, or detected in, the HTML.
     */
    private const DEFAULT_CHARSET = 'WINDOWS-1252';

    /**
     * @var Client
     */
    private $client;

    /**
     * Optional logger; HTTP errors are only logged when it is provided.
     *
     * @var LoggerInterface|null
     */
    private $log;

    /**
     * @param LoggerInterface|null $log optional logger for HTTP errors
     */
    public function __construct(?LoggerInterface $log = null)
    {
        $this->log = $log;

        $options = [
            'timeout' => 30,
            'allow_redirects' => true,
            // NOTE(review): certificate verification is deliberately disabled here
            // (kept as in the original configuration). Confirm this is still wanted,
            // since it exposes requests to man-in-the-middle attacks.
            'verify' => false,
        ];

        // getenv() returns false when USER_AGENT is not defined. Only set the header
        // when a real value exists, so that an empty User-Agent is never sent.
        $userAgent = getenv('USER_AGENT');
        if (is_string($userAgent) && '' !== $userAgent) {
            $options['headers'] = ['User-Agent' => $userAgent];
        }

        $this->client = new Client($options);
    }

    /**
     * Import the HTML source from a URL with Guzzle.
     *
     * Exceptions raised by the Guzzle client itself are not caught here and
     * propagate to the caller.
     *
     * todo abstract + refac async request
     * todo : check banned domains ? check DNS record (checkdnsrr) => ban ?
     * todo : accept non-ascii URL ? (idn_to_ascii)
     *
     * @param string    $url        absolute http(s) URL
     * @param bool|null $normalized when true, the HTML is converted to normalized UTF-8
     *
     * @return string|null the HTML, or null when the response status is not 200
     *
     * @throws DomainException when the URL is rejected by isWebURL(), or when the
     *                         requested UTF-8 normalization fails
     */
    public function getHTML(string $url, ?bool $normalized = false): ?string
    {
        if (!self::isWebURL($url)) {
            throw new DomainException('URL not compatible : '.$url);
        }

        $response = $this->client->get($url);
        $status = $response->getStatusCode();

        if (200 !== $status) {
            // NOTE(review): direct output kept for backward compatibility with
            // console usage; consider relying on the logger only.
            echo 'HTTP error '.$status;
            if (null !== $this->log) {
                $this->log->error('HTTP error '.$status.' '.$response->getReasonPhrase());
            }

            return null;
        }

        $html = (string) $response->getBody()->getContents();

        return $normalized ? $this->normalizeHtml($html, $url) : $html;
    }

    /**
     * Tell whether the string is an absolute http:// or https:// URL.
     *
     * FILTER_VALIDATE_URL is restricted to ASCII characters (not multibyte
     * capable): a URL containing e.g. "é" is rejected. FILTER_SANITIZE_URL is
     * not used beforehand because it would strip such characters.
     *
     * @param string $url
     *
     * @return bool
     */
    public static function isWebURL(string $url): bool
    {
        return false !== filter_var($url, FILTER_VALIDATE_URL)
            && 1 === preg_match('#^https?://[^ ]+#i', $url);
    }

    /**
     * Normalize the HTML (Unicode normalization) and convert it to UTF-8.
     *
     * The HTML is first normalized as-is. When that fails (Normalizer rejects
     * the input), the charset is extracted from the document, with
     * WINDOWS-1252 as fallback, and the content is converted to UTF-8 before
     * being normalized again.
     *
     * @param string      $html
     * @param string|null $url only used in error messages
     *
     * @return string|null
     *
     * @throws DomainException when the conversion to UTF-8 or the final normalization fails
     */
    private function normalizeHtml(string $html, ?string $url = ''): ?string
    {
        if ('' === $html) {
            return $html;
        }

        $normalized = Normalizer::normalize($html);
        if (is_string($normalized) && '' !== $normalized) {
            return $normalized;
        }

        $charset = $this->extractCharset($html) ?? self::DEFAULT_CHARSET;
        $errorMessage = "error converting : $charset to UTF-8 : ".$url;

        try {
            // iconv() signals failure by returning false (e.g. unknown charset),
            // so the result is checked explicitly instead of relying on a TypeError.
            $converted = iconv($charset, 'UTF-8//TRANSLIT', $html);
            $result = (false === $converted) ? false : Normalizer::normalize($converted);
        } catch (Throwable $e) {
            throw new DomainException($errorMessage, 0, $e);
        }

        if (!is_string($result)) {
            throw new DomainException($errorMessage);
        }

        return $result;
    }

    /**
     * Extract the charset from HTML text.
     *
     * Looks for a charset declaration inside a <meta> tag (case-insensitively)
     * and falls back on mb_detect_encoding() in strict mode.
     *
     * @param string $html
     *
     * @return string|null the charset name, or null when nothing was found or detected
     */
    private function extractCharset(string $html): ?string
    {
        $charset = null;

        $found = preg_match(
            '#<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s"\']*)?([^>]*?)[\s"\';]*charset\s*=[\s"\']*([^\s"\'/>]*)#i',
            $html,
            $matches
        );
        if (1 === $found) {
            $charset = $matches[2] ?? null;
        }

        if (empty($charset)) {
            // In strict mode mb_detect_encoding() returns false when no encoding
            // of the detection order matches, hence the is_string() guard.
            $encoding = mb_detect_encoding($html, mb_detect_order(), true);
            $charset = is_string($encoding) ? strtoupper($encoding) : null;
        }

        return $charset;
    }
}
152