1 | <?php |
||
2 | /* |
||
3 | * This file is part of dispositif/wikibot application (@github) |
||
4 | * 2019-2023 © Philippe M./Irønie <[email protected]> |
||
5 | * For the full copyright and MIT license information, view the license file. |
||
6 | */ |
||
7 | |||
8 | declare(strict_types=1); |
||
9 | |||
10 | |||
11 | namespace App\Domain\ExternLink; |
||
12 | |||
13 | use App\Application\Utils\HttpUtil; |
||
14 | use App\Domain\InfrastructurePorts\InternetDomainParserInterface; |
||
15 | use App\Domain\InfrastructurePorts\TagParserInterface; |
||
16 | use App\Domain\Utils\TextUtil; |
||
17 | use App\Infrastructure\InternetDomainParser; |
||
18 | use App\Infrastructure\Monitor\NullLogger; |
||
19 | use Exception; |
||
20 | use Psr\Log\LoggerInterface; |
||
21 | |||
/**
 * Represents the web page targeted by an external link (outside the wiki).
 *
 * Holds the page URL together with its raw HTML and exposes the metadata
 * that can be extracted from that HTML (JSON-LD, meta tags, title, first H1,
 * language, robots directives) plus a "pretty" domain name for the URL.
 */
class ExternPage
{
    // todo move to config
    /**
     * Host suffixes for which getPrettyDomainName() keeps the full host
     * instead of reducing it with the domain parser.
     */
    protected const PRETTY_DOMAIN_EXCLUSION = [
        '.中国',
        '.gov',
        '.free.fr',
        '.gouv.fr',
        '.com.cn',
        'site.google.com',
        'wordpress.com',
        'blogspot.com',
    ];

    private readonly string $url;

    /**
     * @param string                             $url          URL of the page; must pass HttpUtil::isHttpURL()
     * @param string                             $html         raw HTML of the page
     * @param TagParserInterface|null            $tagParser    used for JSON-LD extraction; when null no JSON-LD is parsed
     * @param InternetDomainParserInterface|null $domainParser used to resolve the registrable domain; optional
     * @param LoggerInterface                    $log          receives notices/warnings on parsing problems
     *
     * @throws Exception when $url is not accepted by HttpUtil::isHttpURL()
     */
    public function __construct(
        string $url,
        private readonly string $html,
        private readonly ?TagParserInterface $tagParser = null,
        private readonly ?InternetDomainParserInterface $domainParser = null,
        private readonly LoggerInterface $log = new NullLogger(),
    ) {
        if (!HttpUtil::isHttpURL($url)) {
            throw new Exception('string is not an URL ' . $url);
        }
        $this->url = $url;
    }

    /**
     * URL given to the constructor.
     */
    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collect every piece of metadata extracted from the page.
     *
     * @return array{'JSON-LD': array, meta: array} 'JSON-LD' holds the first usable JSON-LD
     *                                              document (or []); 'meta' holds the meta tags
     *                                              completed with the 'html-lang', 'html-title',
     *                                              'html-h1', 'html-url', 'prettyDomainName' and
     *                                              'robots' entries
     *
     * @throws Exception propagated from the JSON-LD decoding or the domain resolution
     */
    public function getData(): array
    {
        $ld = $this->parseLdJson($this->html);
        $meta = $this->parseMetaTags($this->html);

        $meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
        $meta['html-title'] = $this->parseHtmlTitle($this->html);
        $meta['html-h1'] = $this->parseHtmlFirstH1($this->html);
        $meta['html-url'] = $this->url;
        $meta['prettyDomainName'] = $this->getPrettyDomainName();
        $meta['robots'] = $this->getMetaRobotsContent($this->html);

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extract JSON-LD metadata from <script type="application/ld+json">.
     *
     * Returns the first decoded document that is an array and whose string
     * '@type' does not contain "Breadcrumb"; returns [] when no tag parser
     * is set, when the tag parser fails, or when nothing usable is found.
     *
     * @throws Exception (JsonException) when a non-empty script content is not valid JSON
     */
    private function parseLdJson(string $html): array
    {
        if (!$this->tagParser instanceof TagParserInterface) {
            return [];
        }

        try {
            $results = $this->tagParser->importHtml($html)->xpathResults(
                '//script[@type="application/ld+json"]'
            );
        } catch (Exception) {
            $this->log->warning('TagParser->xpathResults NULL ' . $this->url);

            return [];
        }

        foreach ($results as $result) {
            $json = trim((string) $result);
            // skip empty script contents (todo?)
            if ($json === '') {
                continue;
            }

            $data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
            if (!is_array($data)) {
                continue;
            }

            // breadcrumb documents are ignored; keep looking for another one
            $type = $data['@type'] ?? null;
            if (is_string($type) && preg_match('#Breadcrumb#i', $type)) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extract <meta> tags as a map "name|property|http-equiv value" => "content value".
     *
     * The pattern expects the content attribute to come with a name, property
     * or http-equiv attribute in the same tag. When the same key appears
     * several times, the last occurrence wins (array_combine semantics).
     *
     * todo move? /refac/delete?
     *
     * @return array<string, string>
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
              ~<\s*meta\s
              # using lookahead to capture type to $1
                (?=[^>]*?
                \b(?:name|property|http-equiv)\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              )
              # capture content to $2
              [^>]*?\bcontent\s*=\s*
                (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
                ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
              [^>]*>
              ~ix';

        if (!preg_match_all($pattern, $str, $out)) {
            return [];
        }

        // $out[1] and $out[2] always have the same length, so array_combine cannot fail
        return array_combine($out[1], $out[2]);
    }

    /**
     * Human-friendly domain name for the page URL.
     *
     * test.com => test.com
     * bla.test.com => test.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     * bla.site.google.com => site.google.com (blog)
     *
     * @throws Exception propagated from getRegistrableSubDomain()
     */
    public function getPrettyDomainName(): string
    {
        // Parse custom exceptions (free.fr, gouv.fr, etc)
        $rawDomain = InternetDomainParser::extractSubdomainString($this->url); //only php parsing
        foreach (self::PRETTY_DOMAIN_EXCLUSION as $end) {
            if (TextUtil::str_ends_with($rawDomain, $end)) {
                return $this->sanitizeSubDomain($rawDomain);
            }
        }

        // Parse using InternetDomainParser library; fall back on the raw host when it yields null
        return $this->sanitizeSubDomain($this->getRegistrableSubDomain() ?? $rawDomain); // use lib and cached data
    }

    /**
     * "http://www.bla.co.uk/fubar" => "bla.co.uk"
     *
     * @return string|null null when no domain parser was injected
     *
     * @throws Exception when the URL is rejected or the domain parser fails;
     *                   the original exception is attached as "previous"
     */
    public function getRegistrableSubDomain(): ?string
    {
        try {
            if (!HttpUtil::isHttpURL($this->url)) {
                throw new Exception('string is not an URL ' . $this->url);
            }
            if (!$this->domainParser instanceof InternetDomainParserInterface) {
                $this->log->notice('InternetDomainParser is not set');

                return null;
            }

            return $this->domainParser->getRegistrableDomainFromURL($this->url);
        } catch (Exception $e) {
            // the logger is never null (typed LoggerInterface with a NullLogger default)
            $this->log->warning('InternetDomainParser->getRegistrableDomainFromURL NULL ' . $this->url);

            throw new Exception('InternetDomainParser->getRegistrableDomainFromURL NULL', $e->getCode(), $e);
        }
    }

    /**
     * Extract language from <html lang="en-us"> tag.
     *
     * Only a double-quoted lang attribute made of 2 to 15 letters/hyphens is recognized.
     */
    private function parseHtmlLang(string $html): ?string
    {
        if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Extract webpage title from HTML <title>
     * not foolproof : example <!-- <title>bla</title> -->
     */
    private function parseHtmlTitle(string $html): ?string
    {
        if (preg_match('#<title>([^<]+)</title>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Extract the text of the first <h1> from HTML.
     *
     * An <h1> containing nested tags is not matched by the pattern.
     */
    private function parseHtmlFirstH1(string $html): ?string
    {
        if (preg_match('#<h1[^>]*>([^<]+)</h1>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Remove the leading "www." label from a host name.
     *
     * Only a prefix is stripped: a "www." sequence located elsewhere in the
     * host (e.g. "mywww.free.fr") is part of the name and must be kept.
     *
     * TODO strip not unicode characters ?
     * TODO add initial capital letter ?
     */
    protected function sanitizeSubDomain(string $subDomain): string
    {
        if (str_starts_with($subDomain, 'www.')) {
            return substr($subDomain, 4);
        }

        return $subDomain;
    }

    /**
     * Extract robots meta tag content.
     * <meta name="robots" content="noindex,noarchive">
     *
     * Returns '' when no such tag is found. Only double-quoted attributes
     * with name placed before content are recognized.
     */
    private function getMetaRobotsContent(string $html): string
    {
        if (preg_match('#<meta[^>]+name="robots"[^>]+content="([^"]+)"#i', $html, $matches)) {
            return $matches[1];
        }

        return '';
    }
}
||
257 |