Passed
Push — master ( dff8a4...2556d0 )
by Dispositif
08:19
created

ExternPage::parseHtmlFirstH1()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 7
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
cc 2
eloc 3
c 0
b 0
f 0
nc 2
nop 1
dl 0
loc 7
ccs 0
cts 0
cp 0
crap 6
rs 10
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
11
namespace App\Domain;
12
13
use App\Application\Http\ExternHttpClient;
14
use App\Domain\Utils\TextUtil;
15
use App\Infrastructure\InternetDomainParser;
16
use App\Infrastructure\TagParser;
17
use Exception;
18
use Psr\Log\LoggerInterface;
19
20
/**
 * Represents the web page behind an external (non-wiki) link.
 *
 * Holds the page URL together with its already-fetched HTML and extracts
 * metadata from that HTML: JSON-LD, <meta> tags, language, title, first <h1>
 * and a "pretty" domain name derived from the URL.
 *
 * @package App\Domain
 */
class ExternPage
{
    // todo move to config
    /**
     * Host suffixes tested with TextUtil::str_ends_with() in getPrettyDomainName().
     * When the host ends with one of them, the full host (minus a leading "www.")
     * is used instead of the registrable domain.
     *
     * NOTE(review): Google Sites is usually served from "sites.google.com";
     * confirm whether 'site.google.com' is intended.
     */
    protected const PRETTY_DOMAIN_EXCLUSION
        = [
            '.中国',
            '.gov',
            '.free.fr',
            '.gouv.fr',
            '.com.cn',
            'site.google.com',
            'wordpress.com',
            'blogspot.com',
        ];

    /**
     * URL of the page, validated as an HTTP URL by the constructor.
     *
     * @var string
     */
    private $url;

    /**
     * Raw HTML of the page, as passed to the constructor.
     *
     * @var string
     */
    private $html;

    /**
     * Optional logger; only used to report domain-parsing failures.
     *
     * @var LoggerInterface|null
     */
    private $log;

    /**
     * ExternPage constructor.
     *
     * @param string               $url  URL of the page; must pass ExternHttpClient::isHttpURL()
     * @param string               $html HTML source of the page
     * @param LoggerInterface|null $log  optional logger
     *
     * @throws Exception when $url is not accepted by ExternHttpClient::isHttpURL()
     */
    public function __construct(string $url, string $html, ?LoggerInterface $log = null)
    {
        if (!ExternHttpClient::isHttpURL($url)) {
            throw new Exception('string is not an URL '.$url);
        }
        $this->url = $url;
        $this->html = $html;
        $this->log = $log;
    }

    /**
     * Return the URL given at construction.
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collect all metadata extracted from the page.
     *
     * @return array two keys: 'JSON-LD' (result of parseLdJson()) and 'meta'
     *               (the <meta> tag map completed with 'html-lang', 'html-title',
     *               'html-h1', 'html-url' and 'prettyDomainName')
     *
     * @throws Exception propagated from parseLdJson() or getPrettyDomainName()
     */
    public function getData(): array
    {
        $ld = $this->parseLdJson($this->html);
        $meta = $this->parseMetaTags($this->html);

        $meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
        $meta['html-title'] = $this->parseHtmlTitle($this->html);
        $meta['html-h1'] = $this->parseHtmlFirstH1($this->html);
        $meta['html-url'] = $this->url;
        $meta['prettyDomainName'] = $this->getPrettyDomainName();

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extract LD-JSON metadata from <script type="application/ld+json">.
     *
     * Returns the first decoded script content that is an array and whose
     * string '@type' does not contain "Breadcrumb" (case-insensitive).
     * Empty script contents are skipped.
     *
     * @return array the decoded JSON-LD data, or an empty array when none qualifies
     *
     * @throws Exception json_decode() runs with JSON_THROW_ON_ERROR, so malformed
     *                   JSON raises a JsonException
     */
    private function parseLdJson(string $html): array
    {
        $parser = new TagParser();
        $results = $parser->importHtml($html)->xpathResults(
            '//script[@type="application/ld+json"]'
        );

        foreach ($results as $result) {
            $json = trim($result);
            // filter out empty values (todo?)
            if ($json === '') {
                continue;
            }
            $data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
            if (!is_array($data)
                || (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
            ) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Build a map of <meta> tags: name/property/http-equiv value => content value.
     *
     * When the same key appears several times, array_combine() keeps the last one.
     *
     * todo move? /refac/delete?
     *
     * @return array map of meta key to content, empty when no tag matches
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        if (preg_match_all($pattern, $str, $out)) {
            $combine = array_combine($out[1], $out[2]);

            return $combine ?: [];
        }

        return [];
    }

    /**
     * Return a short, human-friendly domain name for the page URL.
     *
     * test.com => test.com
     * bla.test.com => test.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     * bla.site.google.com => site.google.com (blog)
     *
     * @throws Exception propagated from getRegistrableSubDomain()
     */
    public function getPrettyDomainName(): string
    {
        // Parse custom exceptions (free.fr, gouv.fr, etc)
        $rawDomain = InternetDomainParser::extractSubdomainString($this->url);
        foreach (self::PRETTY_DOMAIN_EXCLUSION as $end) {
            if (TextUtil::str_ends_with($rawDomain, $end)) {
                return $this->sanitizeSubDomain($rawDomain);
            }
        }

        // Parse using InternetDomainParser library
        return $this->sanitizeSubDomain($this->getRegistrableSubDomain());
    }

    /**
     * "http://www.bla.co.uk/fubar" => "bla.co.uk"
     *
     * Any Exception raised while validating or parsing the URL is logged as a
     * warning (when a logger was provided) and re-thrown wrapped in a new
     * Exception that keeps the original as "previous".
     *
     * @throws Exception
     */
    public function getRegistrableSubDomain(): string
    {
        try {
            if (!ExternHttpClient::isHttpURL($this->url)) {
                throw new Exception('string is not an URL '.$this->url);
            }

            return InternetDomainParser::getRegistrableDomainFromURL($this->url);
        } catch (Exception $e) {
            if ($this->log !== null) {
                $this->log->warning('InternetDomainParser::getRegistrableDomainFromURL NULL '.$this->url);
            }
            throw new Exception('InternetDomainParser::getRegistrableDomainFromURL NULL', $e->getCode(), $e);
        }
    }

    /**
     * Extract language from <html lang="en-us"> tag.
     *
     * Only a double-quoted lang attribute made of 2 to 15 letters/hyphens is recognised.
     *
     * @return string|null the attribute value as written, or null when not found
     */
    private function parseHtmlLang(string $html): ?string
    {
        if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Extract webpage title from HTML <title>
     * not foolproof : example <!-- <title>bla</title> -->
     *
     * @return string|null trimmed title text, or null when no attribute-less
     *                     <title> with text content is found
     */
    private function parseHtmlTitle(string $html): ?string
    {
        if (preg_match('#<title>([^<]+)</title>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Extract the text of the first <h1> from HTML.
     *
     * Headings containing inline markup (<a>, <span>, ...) or line breaks are
     * supported: the inner HTML is captured lazily up to the closing tag, then
     * tags are stripped. Headings with no text at all (e.g. image-only) are
     * skipped, so the first heading with actual text wins.
     *
     * @return string|null trimmed heading text, or null when no <h1> has text
     */
    private function parseHtmlFirstH1(string $html): ?string
    {
        // "s" lets "." span newlines; "\b" keeps the pattern from matching other tag names starting with "h1".
        if (!preg_match_all('#<h1\b[^>]*>(.*?)</h1>#is', $html, $matches)) {
            return null;
        }

        foreach ($matches[1] as $innerHtml) {
            $text = trim(strip_tags($innerHtml));
            if ($text !== '') {
                return $text;
            }
        }

        return null;
    }

    /**
     * Sanitize a (sub)domain name by removing a leading "www." label.
     *
     * Only the prefix is removed: a "www." sequence appearing later in the
     * host (e.g. "awww.example.com") is part of the name and is kept.
     *
     * TODO strip not unicode characters ?
     * TODO add initial capital letter ?
     */
    protected function sanitizeSubDomain(string $subDomain): string
    {
        if (strpos($subDomain, 'www.') === 0) {
            return substr($subDomain, 4);
        }

        return $subDomain;
    }
}
239