Issues (106)

src/Domain/ExternLink/ExternPage.php (1 issue)

Labels
Severity
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
11
namespace App\Domain\ExternLink;
12
13
use App\Application\Utils\HttpUtil;
14
use App\Domain\InfrastructurePorts\InternetDomainParserInterface;
15
use App\Domain\InfrastructurePorts\TagParserInterface;
16
use App\Domain\Utils\TextUtil;
17
use App\Infrastructure\InternetDomainParser;
18
use App\Infrastructure\Monitor\NullLogger;
19
use Exception;
20
use Psr\Log\LoggerInterface;
21
22
/**
23
 * Represents the web page targeted by an external link (i.e. outside the wiki).
24
 * Class ExternPage
25
 * @package App\Domain\ExternLink
26
 */
27
class ExternPage
28
{
29
    /**
     * Host suffixes treated as custom exceptions by getPrettyDomainName():
     * when the parsed host ends with one of them, that host is used as-is
     * instead of being reduced by the domain-parser library.
     *
     * NOTE(review): 'site.google.com' may have been meant as 'sites.google.com' — confirm.
     *
     * @todo move to config
     */
    protected const PRETTY_DOMAIN_EXCLUSION = [
        '.中国',
        '.gov',
        '.free.fr',
        '.gouv.fr',
        '.com.cn',
        'site.google.com',
        'wordpress.com',
        'blogspot.com',
    ];

    /** URL of the page; assigned once by the constructor after validation. */
    private readonly string $url;
43
44
    /**
     * Build the page from its URL and its raw HTML source.
     *
     * @param string                             $url          must be accepted by HttpUtil::isHttpURL()
     * @param string                             $html         raw HTML of the page
     * @param TagParserInterface|null            $tagParser    optional; without it parseLdJson() yields []
     * @param InternetDomainParserInterface|null $domainParser optional; without it getRegistrableSubDomain() yields null
     * @param LoggerInterface                    $log          defaults to a NullLogger
     *
     * @throws Exception when $url is rejected by HttpUtil::isHttpURL()
     */
    public function __construct(
        string $url,
        private readonly string $html,
        private readonly ?TagParserInterface $tagParser = null,
        private readonly ?InternetDomainParserInterface $domainParser = null,
        private readonly LoggerInterface $log = new NullLogger(),
    ) {
        // The readonly property is written exactly once, and only for a valid URL.
        $this->url = HttpUtil::isHttpURL($url)
            ? $url
            : throw new Exception('string is not an URL ' . $url);
    }
61
62
    /**
     * URL given to the constructor (already validated there).
     */
    public function getUrl(): string
    {
        return $this->url;
    }
66
67
    /**
     * Gather every piece of metadata extracted from the page.
     *
     * The 'meta' entry starts from the <meta> tags found by parseMetaTags() and is
     * then completed with the html-lang, html-title, html-h1, html-url,
     * prettyDomainName and robots keys; those assignments overwrite a <meta> tag
     * that happens to use the same name.
     *
     * @return array{'JSON-LD': array, 'meta': array}
     *
     * @throws Exception propagated from the helper methods (e.g. getRegistrableSubDomain())
     */
    public function getData(): array
    {
        $source = $this->html;

        $jsonLd = $this->parseLdJson($source);
        $tags = $this->parseMetaTags($source);

        $tags['html-lang'] = $this->parseHtmlLang($source); // <html lang="en">
        $tags['html-title'] = $this->parseHtmlTitle($source);
        $tags['html-h1'] = $this->parseHtmlFirstH1($source);
        $tags['html-url'] = $this->url;
        $tags['prettyDomainName'] = $this->getPrettyDomainName();
        $tags['robots'] = $this->getMetaRobotsContent($source);

        return [
            'JSON-LD' => $jsonLd,
            'meta' => $tags,
        ];
    }
81
82
    /**
     * Extract JSON-LD metadata from <script type="application/ld+json"> elements.
     *
     * Returns the first block that decodes to an array and whose "@type" is not a
     * breadcrumb. Blocks that are empty, not valid JSON, or not arrays are skipped,
     * so a single broken script does not abort the whole extraction.
     *
     * @param string $html raw HTML of the page
     *
     * @return array decoded JSON-LD data, or [] when no tag parser is set, the
     *               XPath query fails, or no usable block is found
     */
    private function parseLdJson(string $html): array
    {
        if (!$this->tagParser instanceof TagParserInterface) {
            return [];
        }

        try {
            $results = $this->tagParser->importHtml($html)->xpathResults(
                '//script[@type="application/ld+json"]'
            );
        } catch (Exception) {
            $this->log->warning('TagParser->xpathResults NULL ' . $this->url);

            return [];
        }

        foreach ($results as $result) {
            $json = trim((string) $result);
            // Skip empty script bodies.
            if ($json === '') {
                continue;
            }

            try {
                $data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
            } catch (\JsonException $e) {
                // Malformed JSON-LD is treated like an unusable block: log it and
                // try the next one.
                $this->log->warning('Invalid JSON-LD skipped (' . $e->getMessage() . ') ' . $this->url);

                continue;
            }

            if (!is_array($data)
                || (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
            ) {
                continue;
            }

            return $data;
        }

        return [];
    }
120
121
    /**
     * Collect <meta> tags as a map of name/property/http-equiv => content.
     *
     * When several tags share the same name, the last one wins (array_combine()).
     *
     * @todo move? refactor? delete?
     *
     * @param string $str raw HTML
     *
     * @return array<array-key, string> empty when no tag matches
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        $found = preg_match_all($pattern, $str, $groups);
        if (!$found) {
            // No match at all, or a PCRE error.
            return [];
        }

        // Group 1 holds the tag names, group 2 the matching content values.
        $pairs = array_combine($groups[1], $groups[2]);
        if (!$pairs) {
            return [];
        }

        return $pairs;
    }
149
150
    /**
     * Human-friendly domain name for the page URL.
     *
     * Examples given by the original author:
     *   test.com            => test.com
     *   bla.test.com        => test.com
     *   test.co.uk          => test.co.uk (national commercial subdomain)
     *   site.google.com     => site.google.com (blog)
     *   bla.site.google.com => site.google.com (blog)
     *
     * @throws Exception propagated from getRegistrableSubDomain()
     */
    public function getPrettyDomainName(): string
    {
        // Per the original note, this extraction is PHP-only parsing.
        $host = InternetDomainParser::extractSubdomainString($this->url);

        // Custom exceptions (free.fr, gouv.fr, etc.) bypass the parser library.
        $isExcluded = false;
        foreach (self::PRETTY_DOMAIN_EXCLUSION as $suffix) {
            if (TextUtil::str_ends_with($host, $suffix)) {
                $isExcluded = true;
                break;
            }
        }

        // Otherwise rely on the domain-parser library, falling back to the
        // parsed host when it returns null.
        $domain = $isExcluded
            ? $host
            : ($this->getRegistrableSubDomain() ?? $host);

        return $this->sanitizeSubDomain($domain);
    }
170
171
    /**
     * Registrable domain of the page URL, as computed by the injected domain parser.
     *
     * Example from the original author: "http://www.bla.co.uk/fubar" => "bla.co.uk"
     *
     * @return string|null null when no domain parser was injected
     *
     * @throws Exception when the URL is rejected by HttpUtil::isHttpURL() or the
     *                   parser throws; the original exception is chained as previous
     */
    public function getRegistrableSubDomain(): ?string
    {
        try {
            if (!HttpUtil::isHttpURL($this->url)) {
                throw new Exception('string is not an URL ' . $this->url);
            }
            if (!$this->domainParser instanceof InternetDomainParserInterface) {
                $this->log->notice('InternetDomainParser is not set');

                return null;
            }

            return $this->domainParser->getRegistrableDomainFromURL($this->url);
        } catch (Exception $e) {
            // $log is a non-nullable LoggerInterface, so no null check is needed.
            $this->log->warning('InternetDomainParser->getRegistrableDomainFromURL NULL ' . $this->url);

            throw new Exception('InternetDomainParser->getRegistrableDomainFromURL NULL', $e->getCode(), $e);
        }
    }
195
196
    /**
     * Extract the language code from the <html lang="en-us"> tag.
     *
     * Accepts double-quoted, single-quoted and unquoted attribute values, and any
     * whitespace (space, tab, newline) between attributes.
     *
     * @param string $html raw HTML of the page
     *
     * @return string|null the lang value as written in the page, or null when absent
     */
    private function parseHtmlLang(string $html): ?string
    {
        // Group 1 is the optional opening quote, back-referenced so that the
        // closing quote must be the same character; group 2 is the language code.
        $pattern = '#<html(?:\s[^>]*)?\slang\s*=\s*(["\']?)([A-Z-]{2,15})\1(?:\s[^>]*)?>#i';

        if (preg_match($pattern, $html, $matches)) {
            return $matches[2];
        }

        return null;
    }
207
208
    /**
     * Extract the page title from the HTML <title> element.
     *
     * The opening tag may carry attributes (e.g. <title data-react-helmet="true">).
     * Not foolproof: a commented-out title such as <!-- <title>bla</title> -->
     * is matched as well.
     *
     * @param string $html raw HTML of the page
     *
     * @return string|null trimmed title text, or null when no <title> is found
     */
    private function parseHtmlTitle(string $html): ?string
    {
        if (preg_match('#<title(?:\s[^>]*)?>([^<]+)</title\s*>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }
220
221
    /**
     * Extract the text of the first <h1> that has visible text.
     *
     * Nested markup (<span>, <a>, ...) inside the heading is stripped, so headings
     * that wrap their text in inline elements are supported.
     *
     * @param string $html raw HTML of the page
     *
     * @return string|null trimmed heading text, or null when no <h1> contains text
     */
    private function parseHtmlFirstH1(string $html): ?string
    {
        // "s" lets the lazy group span line breaks inside the heading.
        if (!preg_match_all('#<h1(?:\s[^>]*)?>(.*?)</h1\s*>#is', $html, $matches)) {
            return null;
        }

        foreach ($matches[1] as $inner) {
            $text = trim(strip_tags($inner));
            if ($text !== '') {
                return $text;
            }
        }

        return null;
    }
232
233
    /**
     * Remove "www" labels from a host name.
     *
     * Only a whole "www" label is removed (at the start of the string or right
     * after a dot); a label that merely ends with "www" (e.g. "awww.example.com")
     * is left untouched.
     *
     * TODO strip non-unicode characters?
     * TODO add initial capital letter?
     *
     * @param string $subDomain host name to clean up
     *
     * @return string the host name without "www." labels
     */
    protected function sanitizeSubDomain(string $subDomain): string
    {
        // (?<![^.]) : the previous character is either absent or a dot.
        // preg_replace() returns null on a PCRE error; keep the input in that case.
        return preg_replace('#(?<![^.])www\.#', '', $subDomain) ?? $subDomain;
    }
243
244
    /**
     * Extract the content of the robots meta tag.
     * Example: <meta name="robots" content="noindex,noarchive">
     *
     * The name and content attributes may appear in either order and may be
     * double-quoted, single-quoted or unquoted.
     *
     * @param string $html raw HTML of the page
     *
     * @return string the content value, or '' when there is no robots meta tag
     */
    private function getMetaRobotsContent(string $html): string
    {
        $pattern = '~<meta(?=\s)
            # the tag must carry name=robots, quoted or bare, anywhere before its closing bracket
            (?=[^>]*?\sname\s*=\s*(?:"robots"|\'robots\'|robots(?=[\s/>])))
            # capture the content value wherever it sits inside the tag
            [^>]*?\scontent\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))
            ~ix';

        if (preg_match($pattern, $html, $matches)) {
            return $matches[1];
        }

        return '';
    }
256
}
257