Dispositif /
Wikibot
| 1 | <?php |
||
| 2 | /* |
||
| 3 | * This file is part of dispositif/wikibot application (@github) |
||
| 4 | * 2019-2023 © Philippe M./Irønie <[email protected]> |
||
| 5 | * For the full copyright and MIT license information, view the license file. |
||
| 6 | */ |
||
| 7 | |||
| 8 | declare(strict_types=1); |
||
| 9 | |||
| 10 | |||
| 11 | namespace App\Domain\ExternLink; |
||
| 12 | |||
| 13 | use App\Application\Utils\HttpUtil; |
||
| 14 | use App\Domain\InfrastructurePorts\InternetDomainParserInterface; |
||
| 15 | use App\Domain\InfrastructurePorts\TagParserInterface; |
||
| 16 | use App\Domain\Utils\TextUtil; |
||
| 17 | use App\Infrastructure\InternetDomainParser; |
||
| 18 | use App\Infrastructure\Monitor\NullLogger; |
||
| 19 | use Exception; |
||
| 20 | use Psr\Log\LoggerInterface; |
||
| 21 | |||
/**
 * Represents the web page targeted by an external link (i.e. a page outside the wiki).
 *
 * Holds the page URL and its raw HTML, and extracts metadata from them:
 * JSON-LD, <meta> tags, <html lang>, <title>, first <h1>, robots directives
 * and a "pretty" domain name.
 */
class ExternPage
{
    /**
     * Host suffixes for which getPrettyDomainName() keeps the raw host
     * (only stripped of "www.") instead of asking the domain parser.
     */
    // todo move to config
    protected const PRETTY_DOMAIN_EXCLUSION
        = [
            '.中国',
            '.gov',
            '.free.fr',
            '.gouv.fr',
            '.com.cn',
            'site.google.com',
            'wordpress.com',
            'blogspot.com',
        ];

    private readonly string $url;

    /**
     * @param string                             $url          page URL; must be accepted by HttpUtil::isHttpURL()
     * @param string                             $html         raw HTML of the page
     * @param TagParserInterface|null            $tagParser    used for JSON-LD extraction; without it no JSON-LD is returned
     * @param InternetDomainParserInterface|null $domainParser used to compute the registrable domain
     * @param LoggerInterface                    $log          receives notices/warnings raised while parsing
     *
     * @throws Exception when $url is rejected by HttpUtil::isHttpURL()
     */
    public function __construct(
        string $url,
        private readonly string $html,
        private readonly ?TagParserInterface $tagParser = null,
        private readonly ?InternetDomainParserInterface $domainParser = null,
        private readonly LoggerInterface $log = new NullLogger()
    ) {
        if (!HttpUtil::isHttpURL($url)) {
            throw new Exception('string is not an URL ' . $url);
        }
        $this->url = $url;
    }

    public function getUrl(): string
    {
        return $this->url;
    }

    /**
     * Collect every piece of metadata this class can extract from the page.
     *
     * @return array{JSON-LD: array, meta: array} 'JSON-LD' is the first usable JSON-LD document
     *                                            (or []), 'meta' maps meta-tag names to their content
     *                                            plus the 'html-lang', 'html-title', 'html-h1',
     *                                            'html-url', 'prettyDomainName' and 'robots' entries
     *
     * @throws Exception propagated from parseLdJson() and getPrettyDomainName()
     */
    public function getData(): array
    {
        $ld = $this->parseLdJson($this->html);
        $meta = $this->parseMetaTags($this->html);

        $meta['html-lang'] = $this->parseHtmlLang($this->html); // <html lang="en">
        $meta['html-title'] = $this->parseHtmlTitle($this->html);
        $meta['html-h1'] = $this->parseHtmlFirstH1($this->html);
        $meta['html-url'] = $this->url;
        $meta['prettyDomainName'] = $this->getPrettyDomainName();
        $meta['robots'] = $this->getMetaRobotsContent($this->html);

        return ['JSON-LD' => $ld, 'meta' => $meta];
    }

    /**
     * Extract JSON-LD metadata from <script type="application/ld+json">.
     *
     * Returns the first script content that decodes to an array and whose
     * "@type" (when it is a string) does not contain "Breadcrumb".
     * Returns [] when no tag parser is set, when the parser fails (a warning
     * is logged) or when no script qualifies.
     *
     * @throws Exception a \JsonException is raised when a non-empty script is not valid JSON
     */
    private function parseLdJson(string $html): array
    {
        if (!$this->tagParser instanceof TagParserInterface) {
            return [];
        }

        try {
            $results = $this->tagParser->importHtml($html)->xpathResults(
                '//script[@type="application/ld+json"]'
            );
        } catch (Exception) {
            $this->log->warning('TagParser->xpathResults NULL ' . $this->url);

            return [];
        }

        foreach ($results as $result) {
            $json = trim((string) $result);
            // skip empty script content (todo?)
            if ($json === '') {
                continue;
            }
            $data = json_decode($json, true, 512, JSON_THROW_ON_ERROR);
            if (!is_array($data)
                || (isset($data['@type']) && is_string($data['@type']) && preg_match('#Breadcrumb#i', $data['@type']))
            ) {
                continue;
            }

            return $data;
        }

        return [];
    }

    /**
     * Extract <meta> tags as a map: name/property/http-equiv value => content value.
     *
     * Attribute values may be double-quoted, single-quoted or unquoted.
     * When the same key appears several times, the last occurrence wins
     * (array_combine() semantics).
     *
     * todo move? /refac/delete?
     */
    private function parseMetaTags(string $str): array
    {
        $pattern = '
            ~<\s*meta\s
            # using lookahead to capture type to $1
            (?=[^>]*?
            \b(?:name|property|http-equiv)\s*=\s*
            (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
            ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
            )
            # capture content to $2
            [^>]*?\bcontent\s*=\s*
            (?|"\s*([^"]*?)\s*"|\'\s*([^\']*?)\s*\'|
            ([^"\'>]*?)(?=\s*/?\s*>|\s\w+\s*=))
            [^>]*>
            ~ix';

        if (preg_match_all($pattern, $str, $out)) {
            // Both capture lists always have the same length, so array_combine() cannot fail here.
            return array_combine($out[1], $out[2]);
        }

        return [];
    }

    /**
     * test.com => test.com
     * bla.test.com => test.com
     * test.co.uk => test.co.uk (national commercial subdomain)
     * site.google.com => site.google.com (blog)
     * bla.site.google.com => site.google.com (blog)
     *
     * @throws Exception propagated from getRegistrableSubDomain()
     */
    public function getPrettyDomainName(): string
    {
        // Parse custom exceptions (free.fr, gouv.fr, etc)
        $rawDomain = InternetDomainParser::extractSubdomainString($this->url); //only php parsing
        foreach (self::PRETTY_DOMAIN_EXCLUSION as $end) {
            if (TextUtil::str_ends_with($rawDomain, $end)) {
                return $this->sanitizeSubDomain($rawDomain);
            }
        }

        // Parse using InternetDomainParser library; fall back on the raw host when it yields null
        return $this->sanitizeSubDomain($this->getRegistrableSubDomain() ?? $rawDomain); // use lib and cached data
    }

    /**
     * "http://www.bla.co.uk/fubar" => "bla.co.uk"
     *
     * @return string|null null when no domain parser was injected (a notice is logged)
     *
     * @throws Exception when the URL is invalid or the domain parser fails; the
     *                   original exception is attached as "previous"
     */
    public function getRegistrableSubDomain(): ?string
    {
        try {
            if (!HttpUtil::isHttpURL($this->url)) {
                throw new Exception('string is not an URL ' . $this->url);
            }
            if (!$this->domainParser instanceof InternetDomainParserInterface) {
                $this->log->notice('InternetDomainParser is not set');

                return null;
            }

            return $this->domainParser->getRegistrableDomainFromURL($this->url);
        } catch (Exception $e) {
            // $log is a non-nullable LoggerInterface: no null check needed.
            $this->log->warning('InternetDomainParser->getRegistrableDomainFromURL NULL ' . $this->url);

            // Some exception subclasses expose a non-int code; cast it to satisfy strict typing.
            throw new Exception('InternetDomainParser->getRegistrableDomainFromURL NULL', (int) $e->getCode(), $e);
        }
    }

    /**
     * Extract language from <html lang="en-us"> tag.
     *
     * Only a double-quoted lang attribute separated by single spaces is recognised.
     *
     * @return string|null the lang value as written in the page, or null when not found
     */
    private function parseHtmlLang(string $html): ?string
    {
        if (preg_match('#<html(?: [^>]+)? lang="([A-Z-]{2,15})"(?: [^>]+)?>#i', $html, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Extract webpage title from the first HTML <title> element (attributes allowed).
     * not foolproof : example <!-- <title>bla</title> -->
     *
     * @return string|null trimmed title text, or null when no non-empty <title> is found
     */
    private function parseHtmlTitle(string $html): ?string
    {
        // Accept attributes on the tag (e.g. <title data-react-helmet="true">), like the <h1> parser does.
        if (preg_match('#<title(?:\s[^>]*)?>([^<]+)</title>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Extract first <h1> from HTML.
     *
     * Only matches an <h1> whose content is plain text (no nested tag).
     *
     * @return string|null trimmed heading text, or null when none matches
     */
    private function parseHtmlFirstH1(string $html): ?string
    {
        if (preg_match('#<h1[^>]*>([^<]+)</h1>#i', $html, $matches)) {
            return trim(strip_tags($matches[1]));
        }

        return null;
    }

    /**
     * Sanitize a (sub)domain name by removing every "www" label.
     *
     * "www.test.com" => "test.com", "blog.www.test.com" => "blog.test.com",
     * but "awww.test.com" is left untouched: "www." is only removed at the
     * start of the string or right after a dot.
     *
     * TODO strip not unicode characters ?
     * TODO add initial capital letter ?
     */
    protected function sanitizeSubDomain(string $subDomain): string
    {
        // Lookbehind "(?<![^.])": previous char is either absent (start) or a dot.
        return preg_replace('#(?<![^.])www\.#', '', $subDomain) ?? $subDomain;
    }

    /**
     * Extract robots meta tag content.
     * <meta name="robots" content="noindex,noarchive">
     *
     * Only matches double-quoted attributes with name placed before content.
     *
     * @return string the content value, or '' when no such tag is found
     */
    private function getMetaRobotsContent(string $html): string
    {
        if (preg_match('#<meta[^>]+name="robots"[^>]+content="([^"]+)"#i', $html, $matches)) {
            return $matches[1];
        }

        return '';
    }
}
||
| 257 |