Issues (106)

src/Application/Utils/HttpUtil.php (2 issues)

Severity
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Application\Utils;
11
12
use DomainException;
13
use Normalizer;
14
use Throwable;
15
16
class HttpUtil
{
    /**
     * Tell whether a string looks like an absolute http:// or https:// URL.
     *
     * Per the original author's note, this is preferred over
     * filter_var($url, FILTER_VALIDATE_URL) because filter_var is not
     * multibyte capable (see for example the .中国 domain name).
     *
     * The check is purely syntactic: a case-insensitive "http://" or
     * "https://" prefix followed by at least one character, none of which
     * may be a space, tab, carriage return or line feed.
     */
    public static function isHttpURL(string $url): bool
    {
        // "\z" anchors at the very end of the subject. A plain "$" also matches
        // right before a trailing "\n", which let "http://host\n" slip through.
        return 1 === preg_match('#^https?://[^ \n\t\r]+\z#i', $url);
    }

    /**
     * Normalize HTML (Unicode normalization), converting it to UTF-8 first
     * when the raw input cannot be normalized as-is.
     *
     * @param string      $html raw HTML; an empty value is returned unchanged
     * @param string|null $url  only used to give context in exception messages
     *
     * @return string|null the normalized HTML
     *
     * @throws DomainException when no charset is available or when the
     *                         conversion/normalization fails
     */
    public static function normalizeHtml(string $html, ?string $url = ''): ?string
    {
        if (empty($html)) {
            return $html;
        }

        // Fast path: the input could be normalized directly.
        $normalized = Normalizer::normalize($html);
        if (is_string($normalized) && !empty($normalized)) {
            return $normalized;
        }

        // Otherwise convert from the declared/detected charset, defaulting to WINDOWS-1252.
        $charset = self::extractCharset($html) ?? 'WINDOWS-1252';
        if (empty($charset)) {
            throw new DomainException('normalized html error and no charset found : ' . $url);
        }

        try {
            // "@" silences: PHP Notice:  iconv(): Detected an illegal character in input string
            $converted = @iconv($charset, 'UTF-8//TRANSLIT', $html);
            if (false === $converted) {
                throw new DomainException("error iconv : $charset to UTF-8 on " . $url);
            }

            $normalized = Normalizer::normalize($converted);
            if (!is_string($normalized)) {
                throw new DomainException("error normalizer : $charset to UTF-8 on " . $url);
            }
        } catch (Throwable $e) {
            // Any failure above (including the two exceptions thrown in the try block)
            // is reported as a single conversion error, keeping the cause as "previous".
            throw new DomainException("error converting : $charset to UTF-8 on " . $url, $e->getCode(), $e);
        }

        return $normalized;
    }

    /**
     * Extract the charset from HTML text.
     *
     * Looks for a charset declared in a <meta> tag first; when none is found
     * (or it is empty), falls back to mb_detect_encoding() in strict mode.
     *
     * @return string|null the charset name (upper-cased when it comes from
     *                     detection), or null when nothing could be determined
     */
    private static function extractCharset(string $html): ?string
    {
        $charset = null;

        // HTML tag and attribute names are case-insensitive, hence the "i" modifier
        // (e.g. <META HTTP-EQUIV="Content-Type" CONTENT="text/html; CHARSET=...">).
        $found = preg_match(
            '#<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s"\']*)?([^>]*?)[\s"\';]*charset\s*=[\s"\']*([^\s"\'/>]*)#i',
            $html,
            $matches
        );
        if ($found) {
            // Group 2 holds the charset value; it may be an empty string.
            $charset = $matches[2] ?? null;
        }

        if (empty($charset)) {
            // mb_detect_encoding() returns false when no candidate encoding fits.
            $encoding = mb_detect_encoding($html, null, true);
            $charset = is_string($encoding) ? strtoupper($encoding) : null;
        }

        return $charset;
    }
}
86