1 | <?php |
||
2 | /* |
||
3 | * This file is part of dispositif/wikibot application (@github) |
||
4 | * 2019-2023 © Philippe M./Irønie <[email protected]> |
||
5 | * For the full copyright and MIT license information, view the license file. |
||
6 | */ |
||
7 | |||
8 | declare(strict_types=1); |
||
9 | |||
10 | namespace App\Application\Utils; |
||
11 | |||
12 | use DomainException; |
||
13 | use Normalizer; |
||
14 | use Throwable; |
||
15 | |||
class HttpUtil
{
    /**
     * Tell whether the given string looks like an absolute http:// or https:// URL.
     *
     * Used instead of filter_var($url, FILTER_VALIDATE_URL), because filter_var is not
     * multibyte capable (see for example .中国 domain names).
     * Only the scheme (case-insensitive) and the absence of space, tab, CR and LF
     * characters are checked.
     *
     * @param string $url the string to test
     *
     * @return bool true when $url starts with http:// or https:// followed by at least
     *              one character and contains no space, tab, CR or LF
     */
    public static function isHttpURL(string $url): bool
    {
        // "\z" instead of "$": "$" also matches right before a trailing "\n",
        // which would let "http://example.com\n" pass.
        return 1 === preg_match('#^https?://[^ \n\t\r]+\z#i', $url);
    }

    /**
     * Normalize an HTML string with Normalizer and, when needed, convert it to UTF-8.
     *
     * The string is first passed to Normalizer::normalize(). If that fails (or gives an
     * empty result), the charset is looked up with extractCharset(), falling back on
     * WINDOWS-1252, and the HTML is converted to UTF-8 with iconv() before being
     * normalized again.
     *
     * @param string      $html the raw HTML
     * @param string|null $url  only used to give context in the exception messages
     *
     * @return string|null the normalized HTML ($html itself when it is empty)
     *
     * @throws DomainException when the conversion to UTF-8 or the second normalization fails
     */
    public static function normalizeHtml(string $html, ?string $url = ''): ?string
    {
        if (empty($html)) {
            return $html;
        }

        $normalized = Normalizer::normalize($html);
        if (is_string($normalized) && !empty($normalized)) {
            return $normalized;
        }

        // extractCharset() returns either null or a non-empty string, so after the
        // fallback a charset is always available.
        $charset = self::extractCharset($html) ?? 'WINDOWS-1252';

        try {
            // "@" : PHP Notice: iconv(): Detected an illegal character in input string
            $converted = @iconv($charset, 'UTF-8//TRANSLIT', $html);
            if (false === $converted) {
                throw new DomainException("error iconv : $charset to UTF-8 on " . $url);
            }

            $converted = Normalizer::normalize($converted);
            if (!is_string($converted)) {
                throw new DomainException("error normalizer : $charset to UTF-8 on " . $url);
            }
        } catch (Throwable $e) {
            // The two DomainException above are caught here as well: callers get the
            // generic message, the precise one stays reachable through getPrevious().
            throw new DomainException("error converting : $charset to UTF-8 on " . $url, $e->getCode(), $e);
        }

        return $converted;
    }

    /**
     * Extract the charset from an HTML text.
     *
     * Looks for a charset declared in a <meta> tag (tag and attribute names are matched
     * case-insensitively). When none is found, falls back on mb_detect_encoding() in
     * strict mode.
     *
     * @param string $html the raw HTML
     *
     * @return string|null the charset as written in the <meta> tag, or the upper-cased
     *                     encoding name detected by mbstring, or null when nothing is found
     */
    private static function extractCharset(string $html): ?string
    {
        $charset = null;

        // No "u" modifier: this method is reached with HTML that Normalizer rejected,
        // which is presumably not valid UTF-8.
        if (preg_match(
            '#<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s"\']*)?([^>]*?)[\s"\';]*charset\s*=[\s"\']*([^\s"\'/>]*)#i',
            $html,
            $matches
        )
        ) {
            // Group 2 is the charset value; group 1 only holds the text found between
            // "content=" and "charset" (e.g. "text/html").
            $charset = $matches[2] ?? null;
        }

        if (empty($charset)) {
            $encoding = mb_detect_encoding($html, null, true);
            $charset = is_string($encoding) ? strtoupper($encoding) : null;
        }

        return $charset;
    }
}
||
86 |