Test Failed
Push — master ( 766a39...696a12 )
by Dispositif
09:33
created

ExternPage::getRegistrableSubDomain()   A

Complexity

Conditions 4
Paths 7

Size

Total Lines 13
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 20

Importance

Changes 0
Metric Value
cc 4
eloc 8
c 0
b 0
f 0
nc 7
nop 0
dl 0
loc 13
ccs 0
cts 0
cp 0
crap 20
rs 10
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
11
namespace App\Domain;
12
13
use App\Application\Http\ExternHttpClient;
14
use App\Domain\Utils\TextUtil;
15
use App\Infrastructure\InternetDomainParser;
16
use App\Infrastructure\TagParser;
17
use Exception;
18
use Psr\Log\LoggerInterface;
19
20
/**
 * Represents the web page behind an external (non-wiki) link.
 *
 * Holds the page URL and its raw HTML, exposes the metadata extracted from
 * that HTML (JSON-LD, <meta> tags, <html lang>, <title>) and derives a
 * human-friendly domain name from the URL.
 *
 * @package App\Domain
 */
class ExternPage
{
    // todo move to config
    /**
     * Host suffixes for which getPrettyDomainName() keeps the whole host
     * instead of reducing it to the registrable domain.
     *
     * NOTE(review): Google Sites is served from "sites.google.com"; confirm
     * whether 'site.google.com' is intentional before changing it.
     */
    protected const PRETTY_DOMAIN_EXCLUSION
        = [
            '.中国',
            '.gov',
            '.free.fr',
            '.gouv.fr',
            '.com.cn',
            'site.google.com',
        ];

    /**
     * @var string URL of the page, validated by the constructor
     */
    private $url;

    /**
     * @var string raw HTML of the page
     */
    private $html;

    /**
     * @var LoggerInterface|null optional logger, only used to report domain parsing failures
     */
    private $log;

    /**
     * ExternPage constructor.
     *
     * @param string               $url  must be accepted by ExternHttpClient::isHttpURL()
     * @param string               $html raw HTML of the page
     * @param LoggerInterface|null $log  optional logger
     *
     * @throws Exception when $url is rejected by ExternHttpClient::isHttpURL()
     */
    public function __construct(string $url, string $html, ?LoggerInterface $log = null)
    {
        if (!ExternHttpClient::isHttpURL($url)) {
            throw new Exception('string is not an URL '.$url);
        }
        $this->url = $url;
        $this->html = $html;
        $this->log = $log;
    }

    /**
     * @return string the URL given to the constructor
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collect all metadata that can be extracted from the page.
     *
     * @return array ['JSON-LD' => array, 'meta' => array]; 'meta' holds the parsed <meta> tags
     *               plus the 'html-lang', 'html-title' and 'html-url' entries
     * @throws Exception when a JSON-LD block cannot be decoded
     */
    public function getData(): array
    {
        $ld = $this->parseLdJson($this->html);
        $meta = $this->parseMetaTags($this->html);

        // These synthetic keys overwrite any <meta> tag carrying the same name.
        $meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
        $meta['html-title'] = $this->parseHtmlTitle($this->html);
        $meta['html-url'] = $this->url;

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extract LD-JSON metadata from <script type="application/ld+json">.
     *
     * Returns the first block that decodes to an array and whose '@type', when
     * it is a string, does not contain "Breadcrumb" (case-insensitive).
     * Empty blocks are skipped.
     *
     * @param string $html
     *
     * @return array the decoded block, or [] when no block qualifies
     * @throws Exception a malformed block aborts the parsing (json_decode with JSON_THROW_ON_ERROR)
     */
    private function parseLdJson(string $html): array
    {
        $parser = new TagParser();
        // presumably yields the text content of each matching <script> node — verify against TagParser
        $results = $parser->importHtml($html)->xpathResults(
            '//script[@type="application/ld+json"]'
        );

        foreach ($results as $result) {
            $json = trim($result);
            // skip empty values (todo?)
            if ($json === '') {
                continue;
            }
            $data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
            if (!is_array($data)
                || (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
            ) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extract <meta> tags as a map keyed by their name/property/http-equiv attribute.
     *
     * When the same key appears several times, the last occurrence wins
     * (array_combine semantics).
     *
     * todo move? /refac/delete?
     *
     * @param string $str HTML to scan
     *
     * @return array map of attribute value => content value, [] when nothing matches
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        if (preg_match_all($pattern, $str, $out)) {
            // $out[1] and $out[2] always have the same length: one entry per matched tag.
            $combine = array_combine($out[1], $out[2]);

            return $combine ?: [];
        }

        return [];
    }

    /**
     * Human-friendly domain name for the page URL.
     *
     * test.com => test.com
     * bla.test.com => test.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     * bla.site.google.com => site.google.com (blog)
     *
     * @throws Exception when the registrable domain cannot be determined
     */
    public function getPrettyDomainName(): string
    {
        // Parse custom exceptions (free.fr, gouv.fr, etc)
        $rawDomain = InternetDomainParser::extractSubdomainString($this->url);
        foreach (self::PRETTY_DOMAIN_EXCLUSION as $end) {
            if (TextUtil::str_ends_with($rawDomain, $end)) {
                return $this->sanitizeSubDomain($rawDomain);
            }
        }

        // Parse using InternetDomainParser library
        return $this->sanitizeSubDomain($this->getRegistrableSubDomain());
    }

    /**
     * "http://www.bla.co.uk/fubar" => "bla.co.uk"
     *
     * Any failure is logged as a warning (when a logger was provided) and
     * re-thrown as a generic Exception chaining the original one.
     *
     * @throws Exception when the URL is invalid or the domain parser fails
     */
    public function getRegistrableSubDomain(): string
    {
        try {
            if (!ExternHttpClient::isHttpURL($this->url)) {
                throw new Exception('string is not an URL '.$this->url);
            }

            return InternetDomainParser::getRegistrableDomainFromURL($this->url);
        } catch (Exception $e) {
            if ($this->log !== null) {
                $this->log->warning('InternetDomainParser::getRegistrableDomainFromURL NULL '.$this->url);
            }
            // Exception::__construct() requires an int code; some exception classes report a string code.
            throw new Exception(
                'InternetDomainParser::getRegistrableDomainFromURL NULL',
                (int) $e->getCode(),
                $e
            );
        }
    }

    /**
     * Extract language from <html lang="en-us"> tag.
     *
     * Only a double-quoted lang attribute made of letters and hyphens
     * (2 to 15 characters) is recognised.
     *
     * @param string $html
     *
     * @return string|null the attribute value as written, null when not found
     */
    private function parseHtmlLang(string $html): ?string
    {
        if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Extract webpage title from HTML <title>
     * not foolproof : example <!-- <title>bla</title> -->
     *
     * The opening tag may carry attributes (e.g. <title data-rh="true">).
     *
     * @param string $html
     *
     * @return string|null trimmed title text, null when no non-empty <title> is found
     */
    private function parseHtmlTitle(string $html): ?string
    {
        if (preg_match('#<title(?:\s[^>]*)?>([^<]+)</title>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Remove the leading "www." label from a host name.
     *
     * Only a prefix is removed, so a host such as "mywww.free.fr" is left
     * untouched instead of being mangled into "myfree.fr".
     *
     * TODO strip not unicode characters ?
     * TODO add initial capital letter ?
     *
     * @param string $subDomain host name, e.g. "www.bla.co.uk"
     *
     * @return string host name without its leading "www.", e.g. "bla.co.uk"
     */
    protected function sanitizeSubDomain(string $subDomain): string
    {
        if (strncmp($subDomain, 'www.', 4) === 0) {
            return substr($subDomain, 4);
        }

        return $subDomain;
    }
}
252