Dispositif /
Wikibot
| 1 | <?php |
||
| 2 | /* |
||
| 3 | * This file is part of dispositif/wikibot application (@github) |
||
| 4 | * 2019-2023 © Philippe M./Irønie <[email protected]> |
||
| 5 | * For the full copyright and MIT license information, view the license file. |
||
| 6 | */ |
||
| 7 | |||
| 8 | declare(strict_types=1); |
||
| 9 | |||
| 10 | namespace App\Application\Utils; |
||
| 11 | |||
| 12 | use DomainException; |
||
| 13 | use Normalizer; |
||
| 14 | use Throwable; |
||
| 15 | |||
class HttpUtil
{
    /**
     * Tell whether the given string looks like an absolute http:// or https:// URL.
     *
     * A regex is used instead of filter_var($url, FILTER_VALIDATE_URL) because
     * filter_var is not multibyte capable (see for example a .中国 domain name).
     *
     * @param string $url candidate URL
     *
     * @return bool true when $url starts with "http://" or "https://" (case-insensitive)
     *              and is followed only by characters other than space, tab, CR and LF
     */
    public static function isHttpURL(string $url): bool
    {
        // "\z" (absolute end of subject) rather than "$": without the D modifier,
        // "$" also matches just before a trailing "\n", which would let
        // "http://example.com\n" through despite the character class.
        return 1 === preg_match('#^https?://[^ \n\t\r]+\z#i', $url);
    }

    /**
     * Normalize an HTML document, converting it to UTF-8 first when needed.
     *
     * The input is first passed as-is to Normalizer::normalize(). When that fails
     * (it returns false on error, e.g. for input that is not valid UTF-8), the
     * charset is looked up with extractCharset(), defaulting to WINDOWS-1252, and
     * the document is converted with iconv() before being normalized again.
     *
     * @param string      $html raw HTML, in any charset
     * @param string|null $url  only used to give context in exception messages
     *
     * @return string|null the normalized UTF-8 HTML; an empty $html is returned unchanged
     *
     * @throws DomainException when the charset conversion or the final normalization fails
     */
    public static function normalizeHtml(string $html, ?string $url = ''): ?string
    {
        if ('' === $html) {
            return $html;
        }

        $normalized = Normalizer::normalize($html);
        if (is_string($normalized) && '' !== $normalized) {
            return $normalized;
        }

        // extractCharset() returns either a non-empty string or null, so after the
        // fallback below $charset is always a non-empty charset name.
        $charset = self::extractCharset($html) ?? 'WINDOWS-1252';

        try {
            // "@" silences "PHP Notice: iconv(): Detected an illegal character in input string";
            // the failure itself is detected through the false return value.
            $converted = @iconv($charset, 'UTF-8//TRANSLIT', $html);
            if (false === $converted) {
                throw new DomainException("error iconv : $charset to UTF-8 on " . $url);
            }

            $normalized = Normalizer::normalize($converted);
            if (!is_string($normalized)) {
                throw new DomainException("error normalizer : $charset to UTF-8 on " . $url);
            }
        } catch (Throwable $e) {
            // Any failure above (including the two DomainException just thrown) is
            // reported under a single message; the precise cause stays reachable
            // through getPrevious().
            throw new DomainException(
                "error converting : $charset to UTF-8 on " . $url,
                (int) $e->getCode(),
                $e
            );
        }

        return $normalized;
    }

    /**
     * Extract the charset declared in (or detected from) an HTML text.
     *
     * Looks for a charset=... declaration inside a <meta> tag, either as a direct
     * attribute or inside a content="...; charset=..." attribute. When none is
     * found, falls back to mb_detect_encoding() in strict mode.
     *
     * @param string $html raw HTML
     *
     * @return string|null the charset name as written in the <meta> tag, or the upper-cased
     *                     name detected by mbstring, or null when nothing could be determined
     */
    private static function extractCharset(string $html): ?string
    {
        $charset = null;

        // "i" modifier: HTML tag and attribute names are case-insensitive
        // (<META HTTP-EQUIV=... CONTENT="...; CHARSET=..."> is valid markup).
        // Group 2 captures the charset value; group 1 only holds the text found
        // before "charset" and is never a charset name.
        if (1 === preg_match(
            '#<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s"\']*)?([^>]*?)[\s"\';]*charset\s*=[\s"\']*([^\s"\'/>]*)#i',
            $html,
            $matches
        )) {
            $charset = $matches[2];
        }

        if (empty($charset)) {
            $encoding = mb_detect_encoding($html, null, true);
            $charset = is_string($encoding) ? strtoupper($encoding) : null;
        }

        return $charset;
    }
}
||
| 86 |