Total Complexity | 100 |
Total Lines | 551 |
Duplicated Lines | 0 % |
Changes | 0 |
Complex classes like ExternRefTransformer often do many different things. To break such a class down, first identify a cohesive component within it. A common way to find such a component is to look for fields and methods that share the same prefix or suffix.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use ExternRefTransformer and, based on these observations, to apply Extract Interface as well.
1 | <?php |
||
33 | class ExternRefTransformer implements ExternRefTransformerInterface |
||
34 | { |
||
35 | public const HTTP_REQUEST_LOOP_DELAY = 10; |
||
36 | public const LOG_REQUEST_ERROR = __DIR__ . '/../../Application/resources/external_request_error.log'; // todo move |
||
37 | public const SKIP_DOMAIN_FILENAME = __DIR__ . '/../resources/config_skip_domain.txt'; |
||
38 | public const REPLACE_404 = true; |
||
39 | public const CONFIG_PRESSE = __DIR__ . '/../resources/config_presse.yaml'; |
||
40 | public const CONFIG_NEWSPAPER_JSON = __DIR__ . '/../resources/data_newspapers.json'; |
||
41 | public const CONFIG_SCIENTIFIC_JSON = __DIR__ . '/../resources/data_scientific_domain.json'; |
||
42 | public const CONFIG_SCIENTIFIC_WIKI_JSON = __DIR__ . '/../resources/data_scientific_wiki.json'; |
||
43 | public const ROBOT_NOINDEX_WHITELIST = ['legifrance.gouv.fr']; |
||
44 | |||
45 | public $skipSiteBlacklisted = true; |
||
46 | public $skipRobotNoIndex = true; |
||
47 | /** |
||
48 | * @var array |
||
49 | */ |
||
50 | public $summaryLog = []; |
||
51 | /** |
||
52 | * @var LoggerInterface |
||
53 | */ |
||
54 | protected $log; |
||
55 | private $config; |
||
56 | /** |
||
57 | * @var string |
||
58 | */ |
||
59 | private $domain; |
||
60 | /** |
||
61 | * @var string |
||
62 | */ |
||
63 | private $url; |
||
64 | /** |
||
65 | * @var ExternMapper |
||
66 | */ |
||
67 | private $mapper; |
||
68 | /** |
||
69 | * @var array |
||
70 | */ |
||
71 | private $data = []; |
||
72 | /** |
||
73 | * @var array |
||
74 | */ |
||
75 | private $skip_domain; |
||
76 | /** |
||
77 | * @var ExternPage |
||
78 | */ |
||
79 | private $externalPage; |
||
80 | /** |
||
81 | * @var Summary|null |
||
82 | */ |
||
83 | private $summary; |
||
84 | /** |
||
85 | * @var ExternHttpClientInterface |
||
86 | */ |
||
87 | private $httpClient; |
||
88 | |||
/**
 * Stores the collaborators and loads the configuration and data files.
 *
 * @param ExternMapper              $externMapper mapper later used by process()
 * @param ExternHttpClientInterface $httpClient   HTTP client passed on when fetching external pages
 * @param LoggerInterface|null      $logger       logger to use; a NullLogger replaces a null value
 */
public function __construct(
    ExternMapper $externMapper,
    ExternHttpClientInterface $httpClient,
    ?LoggerInterface $logger
) {
    if ($logger === null) {
        $logger = new NullLogger();
    }
    // The logger is set first so it is already available while the files are loaded.
    $this->log = $logger;
    $this->importConfigAndData();

    $this->httpClient = $httpClient;
    $this->mapper = $externMapper;
}
||
96 | |||
/**
 * Turns a raw URL into a serialized wiki template, or returns the URL when that is not possible.
 *
 * Steps, in order: authorization check, page metadata extraction, robots "noindex" check,
 * mapping, tagging/logging, template selection, data replacement, serialization and
 * Unicode normalization.
 *
 * TODO Refac : chain of responsibility or composite pattern
 *
 * @param string  $url     URL to convert
 * @param Summary $summary summary object updated while processing
 *
 * @return string the serialized template, a replacement produced by manageHttpErrors(),
 *                or the URL itself as a fallback
 * @throws Exception
 */
public function process(string $url, Summary $summary): string
{
    if (!$this->isURLAuthorized($url)) {
        return $url;
    }

    try {
        $url = WikiTextUtil::normalizeUrlForTemplate($url);
        // Expected to hold the 'JSON-LD' and 'meta' entries.
        $rawData = $this->extractPageDataFromUrl($url);
    } catch (Exception $httpFailure) {
        return $this->manageHttpErrors($httpFailure, $url);
    }

    if ($this->emptyPageData($rawData, $url)) {
        return $url;
    }

    // isRobotNoIndex() logs, so it is always evaluated before the flag is consulted.
    $noIndex = $this->isRobotNoIndex($rawData, $url);
    if ($noIndex && $this->skipRobotNoIndex) {
        // TODO ? return {lien web| titre=Titre inconnu...
        // http://www.nydailynews.com/entertainment/jessica-barth-details-alleged-harvey-weinstein-encounter-article-1.3557986
        return $url;
    }

    // Only JSON-LD or only meta, after post-processing.
    $fields = $this->mapper->process($rawData);
    if ($this->emptyMapData($fields, $url)) {
        // TODO ? return {lien web| titre=Titre inconnu... site=prettydomain ...
        return $url;
    }
    $fields = $this->unsetAccesLibre($fields);

    $this->addSummaryLog($fields, $summary);
    $this->tagAndLog($fields);

    $template = $this->chooseTemplateByData($fields);
    $fields = $this->replaceSomeData($fields, $template);
    $wikiText = $this->optimizeAndSerialize($template, $fields);

    // Normalizer::normalize() sometimes returns a bool instead of a string.
    $normalized = Normalizer::normalize($wikiText);
    if (is_string($normalized) && !empty($normalized)) {
        return $normalized;
    }

    // Error fallback: the un-normalized text if there is one, else the URL.
    return empty($wikiText) ? $url : $wikiText;
}
||
145 | |||
/**
 * Tells whether the URL may be processed.
 *
 * Side effects: stores the URL in $this->url and, when it can be parsed, the registrable
 * domain in $this->domain.
 *
 * The former second isHttpURL() guard (which threw) was unreachable because the same check
 * had already returned false; it has been removed.
 *
 * @param string $url URL to check
 *
 * @return bool false when the string is not an HTTP URL, has a forbidden file extension,
 *              its domain cannot be extracted, or the domain configuration rejects it
 */
protected function isURLAuthorized(string $url): bool
{
    $this->url = $url;

    if (!ExternHttpClient::isHttpURL($url)) {
        $this->log->debug('Skip : not a valid URL : ' . $url);

        return false;
    }

    if ($this->hasForbiddenFilenameExtension($url)) {
        return false;
    }

    try {
        $this->domain = (new InternetDomainParser())->getRegistrableDomainFromURL($url);
    } catch (Exception $e) {
        // The domain parser rejected the URL: skip it rather than propagate.
        $this->log->warning('Skip : not a valid URL : ' . $url);

        return false;
    }

    return $this->validateConfigWebDomain();
}
||
169 | |||
/**
 * Logs the mapped data and records counters and flags on the current summary.
 *
 * Increments the summary citation counter, flags press and science sources, collects the
 * pretty domain name of the current external page (once per distinct name) and flags
 * non-free URL access.
 *
 * @param array $mapData mapped data of the current page
 *
 * @throws Exception
 */
private function tagAndLog(array $mapData)
{
    $this->log->debug('mapData', $mapData);

    if (!isset($this->summary->citationNumber)) {
        $this->summary->citationNumber = 0;
    }
    $this->summary->citationNumber++;

    if (!empty($mapData['DATA-ARTICLE'])) {
        $this->log->notice("Article OK");
    }

    $isNewspaper = isset($this->data['newspaper'][$this->domain]);
    if ($isNewspaper) {
        $this->log->notice('PRESSE');
        $this->summary->memo['presse'] = true;
    }

    if ($this->isScientificDomain()) {
        $this->log->notice('SCIENCE');
        $this->summary->memo['science'] = true;
    }

    // Append the site name only when it has not been recorded yet.
    $alreadyListed = isset($this->summary->memo['sites'])
        && in_array($this->externalPage->getPrettyDomainName(), $this->summary->memo['sites']);
    if (!$alreadyListed) {
        $this->summary->memo['sites'][] = $this->externalPage->getPrettyDomainName();
    }

    if (!isset($mapData['accès url'])) {
        return;
    }
    $this->log->notice('accès 🔒 ' . $mapData['accès url']);
    if ($mapData['accès url'] !== 'libre') {
        $this->summary->memo['accès url non libre'] = true;
    }
}
||
204 | |||
/**
 * Tells whether the current domain is considered scientific.
 *
 * A domain qualifies when it is a key of the 'scientific domain' data set, or when it is
 * "revues.org" or one of its sub-domains.
 *
 * The previous test called strpos() with haystack and needle swapped, so it searched for
 * the domain inside '.revues.org'. "revues.org" matched only by accident, and any domain
 * that happened to be a substring of '.revues.org' (e.g. "s.org") matched as well.
 *
 * @return bool
 */
private function isScientificDomain(): bool
{
    if (isset($this->data['scientific domain'][$this->domain])) {
        return true;
    }

    $domain = (string) $this->domain;
    $suffix = '.revues.org';

    // Exact registrable domain, or any host ending with ".revues.org".
    return $domain === 'revues.org'
        || substr($domain, -strlen($suffix)) === $suffix;
}
||
212 | |||
/**
 * Remembers the summary for later steps and appends a label for this source to the summary log.
 *
 * The label is the 'site' entry, else the 'périodique' entry, else '?'.
 *
 * @param array   $mapData mapped data of the current page
 * @param Summary $summary summary stored on the instance
 */
private function addSummaryLog(array $mapData, Summary $summary)
{
    $label = '?';
    if (isset($mapData['site'])) {
        $label = $mapData['site'];
    } elseif (isset($mapData['périodique'])) {
        $label = $mapData['périodique'];
    }

    $this->summaryLog[] = $label;
    $this->summary = $summary;
}
||
218 | |||
/**
 * Picks the wiki template ('article' or 'lien web') for the mapped data and creates it.
 *
 * 'article' is chosen when a DOI is present, when the domain configuration asks for it,
 * when the page is flagged as an article on an 'auto' or newspaper domain, or when the
 * domain is scientific. A 'lien web' configuration, or a missing date, forces 'lien web'.
 * Also increments the per-template counter in the summary memo.
 *
 * todo refac: make this readable
 *
 * @param array $mapData mapped data of the current page
 *
 * @return AbstractWikiTemplate
 * @throws Exception
 */
private function chooseTemplateByData(array $mapData): AbstractWikiTemplate
{
    // Make sure the 'template' entry exists for this domain (defaults to an empty array).
    $this->config[$this->domain]['template'] = $this->config[$this->domain]['template'] ?? [];
    $configured = $this->config[$this->domain]['template'];
    $isArticleData = $mapData['DATA-ARTICLE'] ?? false;

    $templateName = null;
    if (!empty($mapData['doi'])) {
        $templateName = 'article';
    }

    $wantsArticle = $configured === 'article'
        || ($configured === 'auto' && $isArticleData)
        || ($isArticleData && !empty($this->data['newspaper'][$this->domain]))
        || $this->isScientificDomain();
    if ($wantsArticle) {
        $templateName = 'article';
    }

    if ($templateName === null || $configured === 'lien web') {
        $templateName = 'lien web';
    }

    // A date is mandatory for {article}.
    if (!isset($mapData['date'])) {
        $templateName = 'lien web';
    }

    $template = WikiTemplateFactory::create($templateName);
    $template->userSeparator = " |";

    $counterKey = 'count ' . $templateName;
    $this->summary->memo[$counterKey] = 1 + ($this->summary->memo[$counterKey] ?? 0);

    return $template;
}
||
259 | |||
260 | /** |
||
261 | * Logique : remplacement titre périodique ou nom du site |
||
262 | * |
||
263 | * @param array $mapData |
||
264 | * @param $template |
||
265 | * |
||
266 | * @return array |
||
267 | */ |
||
268 | private function replaceSitenameByConfig(array $mapData, $template): array |
||
311 | } |
||
312 | |||
/**
 * Returns the mapped data with its 'url' entry set to the URL stored by isURLAuthorized().
 *
 * @param array $mapData mapped data of the current page
 *
 * @return array copy of the data carrying the original URL
 */
private function replaceURLbyOriginal(array $mapData): array
{
    $withOriginalUrl = $mapData;
    $withOriginalUrl['url'] = $this->url;

    return $withOriginalUrl;
}
||
319 | |||
/**
 * Builds a short display title from a URL.
 *
 * Removes 'https://', 'http://' and 'www.' wherever they occur, then truncates the text to
 * 30 characters followed by an ellipsis. Length and truncation are computed on UTF-8
 * characters, not bytes, so a multibyte character is never cut in half (the byte-wise
 * version could produce invalid UTF-8).
 *
 * Example: URL => "parismatch.com/People/bla…"
 *
 * todo move + prettyDomainName
 *
 * @param string $url URL to shorten
 *
 * @return string
 */
public function generateTitleFromURLText(string $url): string
{
    $text = str_replace(['https://', 'http://', 'www.'], '', $url);
    if (mb_strlen($text, 'UTF-8') > 30) {
        $text = mb_substr($text, 0, 30, 'UTF-8') . '…';
    }

    return $text;
}
||
333 | |||
/**
 * Tells whether the URL ends with a file extension that must be skipped (PDF, GIF, etc.).
 *
 * The match is case-insensitive and anchored at the very end of the URL string.
 * https://fr.wikipedia.org/wiki/Liste_d%27extensions_de_fichiers
 *
 * @param string $url URL to inspect
 *
 * @return bool
 */
private function hasForbiddenFilenameExtension(string $url): bool
{
    $pattern = '#\.(pdf|jpg|jpeg|gif|png|xls|xlsx|xlr|xml|xlt|txt|csv|js|docx|exe|gz|zip|ini|movie|mp3|mp4|ogg|raw|rss|tar|tgz|wma)$#i';

    // preg_match() yields 1 on a match, 0 or false otherwise.
    return preg_match($pattern, $url) === 1;
}
||
349 | |||
/**
 * Loads the YAML press configuration, the skipped-domain list and the three JSON data sets.
 *
 * The skipped-domain list falls back to an empty array when file() returns a falsy value.
 * JSON decoding uses JSON_THROW_ON_ERROR.
 *
 * todo extract Infra getcont form file + inject
 * todo REFAC DataObject[]
 */
protected function importConfigAndData(): void
{
    $this->config = Yaml::parseFile(self::CONFIG_PRESSE);

    $skipLines = file(self::SKIP_DOMAIN_FILENAME, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    $this->skip_domain = empty($skipLines) ? [] : $skipLines;

    // Data set key => JSON file, loaded in this order.
    $jsonSources = [
        'newspaper' => self::CONFIG_NEWSPAPER_JSON,
        'scientific domain' => self::CONFIG_SCIENTIFIC_JSON,
        'scientific wiki' => self::CONFIG_SCIENTIFIC_WIKI_JSON,
    ];
    foreach ($jsonSources as $dataKey => $jsonPath) {
        $this->data[$dataKey] = json_decode(file_get_contents($jsonPath), true, 512, JSON_THROW_ON_ERROR);
    }
}
||
375 | |||
/**
 * Waits HTTP_REQUEST_LOOP_DELAY seconds, builds the external page for the URL and returns its data.
 *
 * The created page is kept in $this->externalPage for later steps.
 *
 * @param string $url URL of the page to fetch
 *
 * @return array data returned by the external page
 * @throws Exception
 */
protected function extractPageDataFromUrl(string $url): array
{
    // Fixed delay before every request.
    sleep(self::HTTP_REQUEST_LOOP_DELAY);

    $page = ExternPageFactory::fromURL($url, $this->httpClient, $this->log);
    $this->externalPage = $page;

    $metadata = $page->getData();
    $this->log->debug('metaData', $metadata);

    return $metadata;
}
||
388 | |||
/**
 * Builds a {{Lien brisé}} template for the URL, dated today (day-month-year).
 *
 * @param string $url broken URL
 *
 * @return string wikitext of the template
 */
protected function formatLienBrise(string $url): string
{
    $title = $this->generateTitleFromURLText($url);
    $brokenOn = date('d-m-Y');

    return sprintf('{{Lien brisé |url= %s |titre=%s |brisé le=%s}}', $url, $title, $brokenOn);
}
||
398 | |||
/**
 * Reports a 403 response: warns through the logger (with the URL) and appends a line
 * with the current domain to the request-error log file.
 *
 * @param string $url URL that answered 403
 */
protected function log403(string $url): void
{
    $prefix = '403 Forbidden : ';

    $this->log->warning($prefix . $url);

    $fileLine = $prefix . $this->domain . "\n";
    file_put_contents(self::LOG_REQUEST_ERROR, $fileLine, FILE_APPEND);
}
||
404 | |||
/**
 * Converts an HTTP failure into the text that replaces the URL.
 *
 * The exception message is matched, case-insensitively and in this order, against
 * "410 Gone", "403 Forbidden", "404 Not Found" and "401 Unauthorized". 410 (and 404 when
 * REPLACE_404 is true) yield a {{Lien brisé}} template; every other case returns the URL.
 *
 * @param Exception $e   failure raised while fetching the page
 * @param string    $url URL being processed
 *
 * @return string
 */
protected function manageHttpErrors(Exception $e, string $url): string
{
    $message = $e->getMessage();

    // "410 gone" => {lien brisé}
    if (preg_match('#410 Gone#i', $message)) {
        $this->log->notice('410 Gone');

        return $this->formatLienBrise($url);
    }

    if (preg_match('#403 Forbidden#i', $message)) {
        $this->log403($url);

        return $url;
    }

    if (preg_match('#404 Not Found#i', $message)) {
        $this->log->notice('404 Not Found');

        return self::REPLACE_404 ? $this->formatLienBrise($url) : $url;
    }

    if (preg_match('#401 Unauthorized#i', $message)) {
        $this->log->notice('401 Unauthorized : skip ' . $url);

        return $url;
    }

    // Anything else: do not generate a {lien brisé}, it may be a temporary 404.
    $this->log->warning('erreur sur extractWebData ' . $message);

    return $url;
}
||
437 | |||
/**
 * Tells whether the page data carries no usable metadata, logging a notice when so.
 *
 * Data is usable as soon as its 'JSON-LD' or its 'meta' entry is non-empty.
 *
 * @param array  $pageData data extracted from the page
 * @param string $url      URL being processed (used in the log message)
 *
 * @return bool true when there is nothing to map
 */
private function emptyPageData(array $pageData, string $url): bool
{
    $hasMetadata = !empty($pageData['JSON-LD']) || !empty($pageData['meta']);
    if ($hasMetadata) {
        return false;
    }

    $this->log->notice('No metadata : ' . $url);

    return true;
}
||
450 | |||
/**
 * Detects a robots "noindex" (or "none") directive in the page meta data.
 * https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag?hl=fr
 *
 * When such a directive is found a notice is logged, and the result is true unless the
 * page's pretty domain name is whitelisted. A missing 'prettyDomainName' entry is passed
 * on as null instead of triggering an undefined-key warning.
 *
 * @param array  $pageData data extracted from the page
 * @param string $url      URL being processed (used in the log message)
 *
 * @return bool true when the page asks not to be indexed and its domain is not whitelisted
 */
private function isRobotNoIndex(array $pageData, string $url): bool
{
    $robots = $pageData['meta']['robots'] ?? null;
    if (empty($robots)) {
        return false;
    }

    // Lower-case once; both directives are matched as substrings.
    $directives = strtolower($robots);
    if (strpos($directives, 'noindex') === false && strpos($directives, 'none') === false) {
        return false;
    }

    $this->log->notice('robots NOINDEX : ' . $url);

    return !$this->isNoIndexDomainWhitelisted($pageData['meta']['prettyDomainName'] ?? null);
}
||
472 | |||
/**
 * Drops the 'accès url' entry when its value is 'libre'; any other value is kept.
 *
 * No 'accès url=libre' in the output (February 2021 discussion).
 *
 * @param array $mapData mapped data of the current page
 *
 * @return array
 */
protected function unsetAccesLibre(array $mapData): array
{
    $access = $mapData['accès url'] ?? null;
    if ($access === 'libre') {
        unset($mapData['accès url']);
    }

    return $mapData;
}
||
483 | |||
/**
 * Checks that the mapped data is usable: both 'url' and 'titre' must be non-empty.
 *
 * The domain is not skipped here, because this may be a 404 or an error on this URL only.
 *
 * @param array  $mapData mapped data of the current page
 * @param string $url     URL being processed (used in the log message)
 *
 * @return bool true when the mapping is incomplete (an info message is logged)
 */
private function emptyMapData(array $mapData, string $url): bool
{
    $isComplete = !empty($mapData['url']) && !empty($mapData['titre']);
    if ($isComplete) {
        return false;
    }

    $this->log->info('Mapping incomplet : ' . $url);

    return true;
}
||
497 | |||
498 | protected function replaceSomeData(array $mapData, AbstractWikiTemplate $template): array |
||
511 | } |
||
512 | |||
/**
 * Hydrates the template with the mapped data, runs the optimizer and serializes the result.
 *
 * @param AbstractWikiTemplate $template template to fill
 * @param array                $mapData  mapped data of the current page
 *
 * @return string serialized wikitext of the optimized template
 * @throws Exception
 */
protected function optimizeAndSerialize(AbstractWikiTemplate $template, array $mapData): string
{
    $template->hydrate($mapData);

    $optimizer = OptimizerFactory::fromTemplate($template);
    $optimizer->doTasks();

    $wikiText = $optimizer->getOptiTemplate()->serialize(true);
    $this->log->info('Serialized 444: ' . $wikiText . "\n");

    return $wikiText;
}
||
531 | |||
/**
 * Tells whether the current domain may be processed according to the blacklist and the
 * per-domain configuration.
 *
 * The configuration entry of the domain is normalised to an array as a side effect.
 * A scalar 'deactivated' entry is detected BEFORE that normalisation and kept as
 * ['deactivated' => true]; previously the entry was reset to [] first, so the string form
 * could never disable a domain.
 *
 * @return bool false when the site is blacklisted or the domain is deactivated
 */
protected function validateConfigWebDomain(): bool
{
    if ($this->isSiteBlackListed()) {
        return false;
    }
    $this->logDebugConfigWebDomain();

    $domainConfig = $this->config[$this->domain] ?? [];
    if ($domainConfig === 'deactivated') {
        // Keep the information in array form so later calls still see it.
        $domainConfig = ['deactivated' => true];
    } elseif (!is_array($domainConfig)) {
        $domainConfig = [];
    }
    $this->config[$this->domain] = $domainConfig;

    if (isset($domainConfig['deactivated'])) {
        $this->log->info("Domain " . $this->domain . " disabled\n");

        return false;
    }

    return true;
}
||
553 | |||
/**
 * Logs at debug level whether the current domain has a configuration entry.
 *
 * @return void
 */
protected function logDebugConfigWebDomain(): void
{
    $state = isset($this->config[$this->domain]) ? " configuré" : " non configuré";

    $this->log->debug("Domain " . $this->domain . $state);
}
||
565 | |||
566 | protected function isSiteBlackListed(): bool |
||
573 | } |
||
574 | |||
/**
 * Tells whether the domain is listed in ROBOT_NOINDEX_WHITELIST, logging a notice when it is.
 *
 * @param string|null $prettyDomain domain name to look up; null is treated as an empty string
 *
 * @return bool
 */
protected function isNoIndexDomainWhitelisted(?string $prettyDomain): bool
{
    $candidate = $prettyDomain ?? '';
    if (!in_array($candidate, self::ROBOT_NOINDEX_WHITELIST)) {
        return false;
    }

    $this->log->notice('ROBOT_NOINDEX_WHITELIST ' . $prettyDomain);

    return true;
}
||
585 | } |
||
586 |