Passed
Push — master ( 9538c8...6c70b1 )
by Dispositif
07:44
created

ExternPage::parseLdJson()   B

Complexity

Conditions 8
Paths 5

Size

Total Lines 27
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 14
nc 5
nop 1
dl 0
loc 27
rs 8.4444
c 0
b 0
f 0
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
11
namespace App\Domain\ExternLink;
12
13
use App\Application\Http\ExternHttpClient;
14
use App\Domain\Utils\TextUtil;
15
use App\Infrastructure\InternetDomainParser;
16
use Exception;
17
use Psr\Log\LoggerInterface;
18
use Psr\Log\NullLogger;
19
20
/**
 * Represents the web page targeted by an external link (outside the wiki).
 *
 * Holds the page URL and its raw HTML, and extracts metadata from that HTML:
 * JSON-LD, <meta> tags, <html lang>, <title>, first <h1>, robots directives
 * and a "pretty" domain name.
 *
 * Class ExternPage
 * @package App\Domain
 */
class ExternPage
{
    // todo move to config
    /**
     * Host suffixes for which the raw host (minus a leading "www.") is kept
     * as the pretty domain name instead of asking the domain parser.
     *
     * NOTE(review): 'site.google.com' may be a typo for 'sites.google.com' — confirm.
     */
    protected const PRETTY_DOMAIN_EXCLUSION
        = [
            '.中国',
            '.gov',
            '.free.fr',
            '.gouv.fr',
            '.com.cn',
            'site.google.com',
            'wordpress.com',
            'blogspot.com',
        ];

    /**
     * @var string
     */
    private $url;

    /**
     * @var string
     */
    private $html;

    /** @var TagParserInterface|null */
    private $tagParser;

    /** @var InternetDomainParserInterface|null */
    private $domainParser;

    /** @var LoggerInterface */
    private $log;

    /**
     * ExternPage constructor.
     *
     * @param string                             $url          URL of the page; must pass ExternHttpClient::isHttpURL()
     * @param string                             $html         raw HTML of the page
     * @param TagParserInterface|null            $tagParser    used for JSON-LD extraction; without it JSON-LD is skipped
     * @param InternetDomainParserInterface|null $domainParser used to compute the registrable domain
     * @param LoggerInterface|null               $log          defaults to a NullLogger
     *
     * @throws Exception when $url is rejected by ExternHttpClient::isHttpURL()
     */
    public function __construct(
        string                         $url,
        string                         $html,
        ?TagParserInterface            $tagParser = null,
        ?InternetDomainParserInterface $domainParser = null,
        ?LoggerInterface               $log = null
    )
    {
        if (!ExternHttpClient::isHttpURL($url)) {
            throw new Exception('string is not an URL ' . $url);
        }
        $this->url = $url;
        $this->html = $html;
        $this->tagParser = $tagParser;
        $this->domainParser = $domainParser;
        $this->log = $log ?? new NullLogger();
    }

    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collect all metadata extracted from the page.
     *
     * @return array ['JSON-LD' => array, 'meta' => array]; 'meta' holds the parsed <meta> tags
     *               plus the keys 'html-lang', 'html-title', 'html-h1', 'html-url',
     *               'prettyDomainName' and 'robots'
     *
     * @throws Exception propagated from parseLdJson() and getPrettyDomainName()
     */
    public function getData(): array
    {
        $ld = $this->parseLdJson($this->html);
        $meta = $this->parseMetaTags($this->html);

        $meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
        $meta['html-title'] = $this->parseHtmlTitle($this->html);
        $meta['html-h1'] = $this->parseHtmlFirstH1($this->html);
        $meta['html-url'] = $this->url;
        $meta['prettyDomainName'] = $this->getPrettyDomainName();
        $meta['robots'] = $this->getMetaRobotsContent($this->html);

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extract LD-JSON metadata from <script type="application/ld+json">.
     *
     * Returns the first decoded block that is an array and whose '@type' (when it is a
     * string) does not contain "Breadcrumb". Returns [] when no tag parser was injected
     * or when no block qualifies.
     *
     * @throws Exception a JsonException is raised by json_decode() on malformed JSON
     */
    private function parseLdJson(string $html): array
    {
        if (!$this->tagParser instanceof TagParserInterface) {
            return [];
        }

        $results = $this->tagParser->importHtml($html)->xpathResults(
            '//script[@type="application/ld+json"]'
        );

        foreach ($results as $result) {
            $json = trim($result);
            // skip empty values (todo?)
            if ($json === '') {
                continue;
            }
            $data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
            // skip scalar payloads and breadcrumb descriptions
            if (!is_array($data)
                || (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
            ) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extract <meta> tags as a map: name/property/http-equiv value => content value.
     * When the same key appears several times, the last occurrence wins (array_combine).
     *
     * todo move? /refac/delete?
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        if (preg_match_all($pattern, $str, $out)) {
            $combine = array_combine($out[1], $out[2]);

            return $combine ?: [];
        }

        return [];
    }

    /**
     * test.com => test.com
     * bla.test.com => test.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     * bla.site.google.com => site.google.com (blog)
     *
     * @throws Exception propagated from getRegistrableSubDomain()
     */
    public function getPrettyDomainName(): string
    {
        // Parse custom exceptions (free.fr, gouv.fr, etc)
        $rawDomain = InternetDomainParser::extractSubdomainString($this->url); //only php parsing
        foreach (self::PRETTY_DOMAIN_EXCLUSION as $end) {
            if (TextUtil::str_ends_with($rawDomain, $end)) {
                return $this->sanitizeSubDomain($rawDomain);
            }
        }

        // Parse using InternetDomainParser library
        return $this->sanitizeSubDomain($this->getRegistrableSubDomain() ?? $rawDomain); // use lib and cached data
    }

    /**
     * "http://www.bla.co.uk/fubar" => "bla.co.uk"
     *
     * @return string|null null when no domain parser was injected
     *
     * @throws Exception when the URL is not an HTTP URL or the domain parser fails;
     *                   the original exception is attached as "previous"
     */
    public function getRegistrableSubDomain(): ?string
    {
        try {
            if (!ExternHttpClient::isHttpURL($this->url)) {
                throw new Exception('string is not an URL ' . $this->url);
            }
            if (!$this->domainParser instanceof InternetDomainParserInterface) {
                $this->log->notice('InternetDomainParser is not set');

                return null;
            }

            return $this->domainParser->getRegistrableDomainFromURL($this->url);
        } catch (Exception $e) {
            // The constructor always sets a logger (NullLogger fallback), so no null check is needed.
            $this->log->warning('InternetDomainParser->getRegistrableDomainFromURL NULL ' . $this->url);

            // Exception codes must be int; cast to stay safe under strict_types.
            throw new Exception('InternetDomainParser->getRegistrableDomainFromURL NULL', (int) $e->getCode(), $e);
        }
    }

    /**
     * Extract language from <html lang="en-us"> tag.
     * Only double-quoted values of 2 to 15 letters/hyphens are recognised.
     */
    private function parseHtmlLang(string $html): ?string
    {
        if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Extract webpage title from HTML <title>
     * not foolproof : example <!-- <title>bla</title> -->
     */
    private function parseHtmlTitle(string $html): ?string
    {
        if (preg_match('#<title>([^<]+)</title>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Extract first <h1> from HTML.
     * Only matches an <h1> whose content holds no nested tag.
     */
    private function parseHtmlFirstH1(string $html): ?string
    {
        if (preg_match('#<h1[^>]*>([^<]+)</h1>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Sanitize a (sub)domain name by removing a leading "www." label.
     *
     * Only the prefix is stripped: "www.test.com" => "test.com", while a host that
     * merely contains "www." elsewhere (e.g. "awww.test.com") is left untouched.
     *
     * TODO strip not unicode characters ?
     * TODO add initial capital letter ?
     */
    protected function sanitizeSubDomain(string $subDomain): string
    {
        if (strpos($subDomain, 'www.') === 0) {
            return substr($subDomain, 4);
        }

        return $subDomain;
    }

    /**
     * Extract robots meta tag content.
     * <meta name="robots" content="noindex,noarchive">
     *
     * Returns '' when no such tag is found. Only matches double-quoted attributes
     * with name placed before content.
     */
    private function getMetaRobotsContent(string $html): string
    {
        if (preg_match('#<meta[^>]+name="robots"[^>]+content="([^"]+)"#i', $html, $matches)) {
            return $matches[1];
        }

        return '';
    }
}
270