Passed
Push — master ( 9538c8...6c70b1 )
by Dispositif
07:44
created

ExternRefTransformer::replaceURLbyOriginal()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nc 1
nop 1
dl 0
loc 5
rs 10
c 0
b 0
f 0
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\ExternLink;
11
12
use App\Application\Http\ExternHttpClient;
13
use App\Domain\Models\Summary;
14
use App\Domain\Models\Wiki\AbstractWikiTemplate;
15
use App\Domain\Models\Wiki\ArticleTemplate;
16
use App\Domain\Models\Wiki\LienWebTemplate;
17
use App\Domain\Models\Wiki\OuvrageTemplate;
18
use App\Domain\OptimizerFactory;
19
use App\Domain\Publisher\ExternMapper;
20
use App\Domain\Utils\WikiTextUtil;
21
use App\Domain\WikiTemplateFactory;
22
use App\Infrastructure\InternetDomainParser;
23
use Exception;
24
use Normalizer;
25
use Psr\Log\LoggerInterface;
26
use Psr\Log\NullLogger;
27
use Symfony\Component\Yaml\Yaml;
28
use Throwable;
29
30
/**
31
 * TODO refac too big (responsibility)
32
 */
33
class ExternRefTransformer implements ExternRefTransformerInterface
34
{
35
    public const HTTP_REQUEST_LOOP_DELAY = 10;
36
    public const LOG_REQUEST_ERROR = __DIR__ . '/../../Application/resources/external_request_error.log'; // todo move
37
    public const SKIP_DOMAIN_FILENAME = __DIR__ . '/../resources/config_skip_domain.txt';
38
    public const REPLACE_404 = true;
39
    public const CONFIG_PRESSE = __DIR__ . '/../resources/config_presse.yaml';
40
    public const CONFIG_NEWSPAPER_JSON = __DIR__ . '/../resources/data_newspapers.json';
41
    public const CONFIG_SCIENTIFIC_JSON = __DIR__ . '/../resources/data_scientific_domain.json';
42
    public const CONFIG_SCIENTIFIC_WIKI_JSON = __DIR__ . '/../resources/data_scientific_wiki.json';
43
    public const ROBOT_NOINDEX_WHITELIST = ['legifrance.gouv.fr'];
44
45
    public $skipSiteBlacklisted = true;
46
    public $skipRobotNoIndex = true;
47
    /**
48
     * @var array
49
     */
50
    public $summaryLog = [];
51
    /**
52
     * @var LoggerInterface
53
     */
54
    protected $log;
55
    private $config;
56
    /**
57
     * @var string
58
     */
59
    private $domain;
60
    /**
61
     * @var string
62
     */
63
    private $url;
64
    /**
65
     * @var ExternMapper
66
     */
67
    private $mapper;
68
    /**
69
     * @var array
70
     */
71
    private $data = [];
72
    /**
73
     * @var array
74
     */
75
    private $skip_domain;
76
    /**
77
     * @var ExternPage
78
     */
79
    private $externalPage;
80
    /**
81
     * @var Summary|null
82
     */
83
    private $summary;
84
    /**
85
     * @var ExternHttpClientInterface
86
     */
87
    private $httpClient;
88
89
    /**
     * Store the collaborators and load the configuration and data files.
     *
     * @param ExternMapper              $externMapper mapper applied to the extracted page data
     * @param ExternHttpClientInterface $httpClient   HTTP client passed on to the page factory
     * @param LoggerInterface|null      $logger       optional logger; a NullLogger is used when null
     */
    public function __construct(ExternMapper $externMapper, ExternHttpClientInterface $httpClient, ?LoggerInterface $logger)
    {
        // The logger is assigned first so that it is available while the configuration is loaded.
        if ($logger === null) {
            $logger = new NullLogger();
        }
        $this->log = $logger;

        $this->importConfigAndData();

        $this->mapper = $externMapper;
        $this->httpClient = $httpClient;
    }
96
97
    /**
     * Turn an external URL into a serialized wiki citation template.
     *
     * The URL is returned unchanged when it is not authorized, when the page has no
     * usable metadata, when the page is flagged noindex (and $skipRobotNoIndex is on),
     * when the mapping is incomplete, or when serialization produces nothing.
     * Exceptions raised while fetching the page are delegated to manageHttpErrors().
     *
     * TODO Refac : chain of responsibility or composite pattern
     *
     * @param string  $url     URL to transform
     * @param Summary $summary summary object updated while processing
     *
     * @return string serialized template, or the (possibly normalized) URL as fallback
     * @throws Exception
     */
    public function process(string $url, Summary $summary): string
    {
        if (!$this->isURLAuthorized($url)) {
            return $url;
        }

        try {
            $url = WikiTextUtil::normalizeUrlForTemplate($url);
            // The 'JSON-LD' and 'meta' sections of this array are the ones inspected below.
            $pageData = $this->extractPageDataFromUrl($url);
        } catch (Exception $httpError) {
            return $this->manageHttpErrors($httpError, $url);
        }

        // isRobotNoIndex() is evaluated before the flag, so its logging happens even when skipping is off.
        // TODO ? return {lien web| titre=Titre inconnu... for noindex pages
        $isUnusablePage = $this->emptyPageData($pageData, $url)
            || ($this->isRobotNoIndex($pageData, $url) && $this->skipRobotNoIndex);
        if ($isUnusablePage) {
            return $url;
        }

        // Only JSON-LD or only meta, after post-processing.
        $mappedData = $this->mapper->process($pageData);
        if ($this->emptyMapData($mappedData, $url)) {
            // TODO ? return {lien web| titre=Titre inconnu... site=prettydomain ...
            return $url;
        }
        $mappedData = $this->unsetAccesLibre($mappedData);

        // addSummaryLog() stores $summary on the instance; the next steps read it from there.
        $this->addSummaryLog($mappedData, $summary);
        $this->tagAndLog($mappedData);

        $template = $this->chooseTemplateByData($mappedData);
        $serialized = $this->optimizeAndSerialize(
            $template,
            $this->replaceSomeData($mappedData, $template)
        );

        // Normalizer::normalize() sometimes returns a bool instead of a string.
        $normalized = Normalizer::normalize($serialized);
        if (is_string($normalized) && !empty($normalized)) {
            return $normalized;
        }

        // Fall back to the raw serialization, then to the URL itself.
        return empty($serialized) ? $url : $serialized;
    }
145
146
    /**
     * Decide whether the given URL may be processed.
     *
     * Side effects: stores the URL in $this->url and, when it can be parsed, its
     * registrable domain in $this->domain; both are read by later steps.
     *
     * @param string $url URL to check
     *
     * @return bool false when ExternHttpClient::isHttpURL() rejects the URL, when it ends
     *              with a skipped file extension, when no registrable domain can be
     *              extracted, or when validateConfigWebDomain() rejects the domain
     */
    protected function isURLAuthorized(string $url): bool
    {
        $this->url = $url;

        if (!ExternHttpClient::isHttpURL($url)) {
            $this->log->debug('Skip : not a valid URL : ' . $url);

            return false;
        }

        if ($this->hasForbiddenFilenameExtension($url)) {
            return false;
        }

        // A second isHttpURL() check that threw an Exception used to live here; it could never
        // trigger because the identical guard above has already returned false.
        try {
            $this->domain = (new InternetDomainParser())->getRegistrableDomainFromURL($url);
        } catch (Exception $e) {
            $this->log->warning('Skip : not a valid URL : ' . $url);

            return false;
        }

        return $this->validateConfigWebDomain();
    }
169
170
    /**
     * Update the stored summary (citation counter, memo flags, list of sites) and log
     * what was detected for the current page.
     *
     * @param array $mapData mapped template data
     *
     * @throws Exception
     */
    private function tagAndLog(array $mapData)
    {
        $this->log->debug('mapData', $mapData);

        // Start at zero when the counter has not been initialised yet, then count this citation.
        $this->summary->citationNumber = ($this->summary->citationNumber ?? 0) + 1;

        if (!empty($mapData['DATA-ARTICLE'])) {
            $this->log->notice("Article OK");
        }

        if (isset($this->data['newspaper'][$this->domain])) {
            $this->log->notice('PRESSE');
            $this->summary->memo['presse'] = true;
        }

        if ($this->isScientificDomain()) {
            $this->log->notice('SCIENCE');
            $this->summary->memo['science'] = true;
        }

        // Remember each site name only once.
        $siteName = $this->externalPage->getPrettyDomainName();
        $knownSites = $this->summary->memo['sites'] ?? null;
        if ($knownSites === null || !in_array($siteName, $knownSites)) {
            $this->summary->memo['sites'][] = $siteName;
        }

        $urlAccess = $mapData['accès url'] ?? null;
        if ($urlAccess !== null) {
            $this->log->notice('accès 🔒 ' . $urlAccess);
            if ($urlAccess !== 'libre') {
                $this->summary->memo['accès url non libre'] = true;
            }
        }
    }
204
205
    /**
     * Tell whether the current domain is treated as a scientific publisher.
     *
     * A domain qualifies when it is a key of the 'scientific domain' data set, or when it
     * is revues.org or one of its sub-domains.
     *
     * The previous test, strpos('.revues.org', $this->domain) > 0, had haystack and needle
     * swapped: it only recognised "revues.org" because that string happens to sit at offset 1
     * of the literal, and it also accepted any other fragment of the literal (e.g. "org").
     */
    private function isScientificDomain(): bool
    {
        if (isset($this->data['scientific domain'][$this->domain])) {
            return true;
        }

        $domain = (string) $this->domain;
        $suffix = '.revues.org';

        // Exact registrable domain, or any host ending with ".revues.org".
        return $domain === 'revues.org'
            || substr($domain, -strlen($suffix)) === $suffix;
    }
212
213
    /**
     * Keep a reference to the summary and append a label for this citation to the summary log.
     *
     * The label is the 'site' value, else the 'périodique' value, else '?'.
     *
     * @param array   $mapData mapped template data
     * @param Summary $summary summary object stored on the instance for later steps
     */
    private function addSummaryLog(array $mapData, Summary $summary)
    {
        $label = '?';
        if (isset($mapData['site'])) {
            $label = $mapData['site'];
        } elseif (isset($mapData['périodique'])) {
            $label = $mapData['périodique'];
        }

        $this->summaryLog[] = $label;
        $this->summary = $summary;
    }
218
219
    /**
     * Pick the wiki template ('article' or 'lien web') for the mapped data and count it in the summary.
     *
     * 'article' is chosen only when at least one article signal is present (a DOI, domain
     * configured as 'article', domain configured as 'auto' with article data, article data
     * on a known newspaper domain, or a scientific domain), the domain is not forced to
     * 'lien web', and a date is available. Everything else becomes 'lien web'.
     *
     * todo refac for readability
     *
     * @param array $mapData mapped template data
     *
     * @return AbstractWikiTemplate
     * @throws Exception
     */
    private function chooseTemplateByData(array $mapData): AbstractWikiTemplate
    {
        // The configured template is written back so the key always exists afterwards.
        $configuredTemplate = $this->config[$this->domain]['template'] ?? [];
        $this->config[$this->domain]['template'] = $configuredTemplate;

        $hasArticleData = !empty($mapData['DATA-ARTICLE']);

        $looksLikeArticle = !empty($mapData['doi'])
            || $configuredTemplate === 'article'
            || ($configuredTemplate === 'auto' && $hasArticleData)
            || ($hasArticleData && !empty($this->data['newspaper'][$this->domain]))
            || $this->isScientificDomain();

        // A date is mandatory for {article}; a 'lien web' configuration always wins.
        $templateName = 'lien web';
        if ($looksLikeArticle && $configuredTemplate !== 'lien web' && isset($mapData['date'])) {
            $templateName = 'article';
        }

        $template = WikiTemplateFactory::create($templateName);
        $template->userSeparator = " |";

        $counterKey = 'count ' . $templateName;
        $this->summary->memo[$counterKey] = 1 + ($this->summary->memo[$counterKey] ?? 0);

        return $template;
    }
259
260
    /**
     * Replace the periodical title or the site name using, in this order: the newspaper
     * data set, the scientific-journal data set, the YAML configuration of the domain and
     * finally the pretty domain name of the fetched page.
     *
     * @param array $mapData  mapped template data
     * @param       $template template the data is going to be injected into
     *
     * @return array the updated data
     */
    private function replaceSitenameByConfig(array $mapData, $template): array
    {
        $isLienWeb = $template instanceof LienWebTemplate;

        // From the newspaper data set (Wikidata URLs of newspapers).
        $newspaper = $this->data['newspaper'][$this->domain] ?? null;
        if (!empty($newspaper)) {
            $frwiki = $newspaper['frwiki'];
            $label = $newspaper['fr'];
            if ($isLienWeb || isset($mapData['site'])) {
                $mapData['site'] = WikiTextUtil::wikilink($label, $frwiki);
            }
            if ($template instanceof ArticleTemplate || isset($mapData['périodique'])) {
                $mapData['périodique'] = WikiTextUtil::wikilink($label, $frwiki);
            }
        }

        // From the scientific-journal data set, keyed by periodical name.
        if (isset($mapData['périodique'])) {
            $periodical = $mapData['périodique'];
            if (isset($this->data['scientific wiki'][$periodical])) {
                $mapData['périodique'] = WikiTextUtil::wikilink(
                    $periodical,
                    $this->data['scientific wiki'][$periodical]
                );
            }
        }

        // From the YAML configuration of the domain.
        $configuredSite = $this->config[$this->domain]['site'] ?? null;
        if ($isLienWeb && !empty($configuredSite)) {
            $mapData['site'] = $configuredSite;
        }

        $configuredPeriodical = $this->config[$this->domain]['périodique'] ?? null;
        $acceptsPeriodical = !empty($mapData['périodique']) || $template instanceof OuvrageTemplate;
        if (!empty($configuredPeriodical) && $acceptsPeriodical) {
            $mapData['périodique'] = $configuredPeriodical;
        }

        // Last resort for {lien web}: the pretty domain name of the page.
        if ($isLienWeb && empty($mapData['site'])) {
            try {
                $mapData['site'] = $this->externalPage->getPrettyDomainName();
            } catch (Throwable $ignored) {
                // Best effort only: 'site' simply stays empty when the name cannot be computed.
            }
        }

        return $mapData;
    }
312
313
    /**
     * Force the 'url' entry to the URL stored on the instance by isURLAuthorized().
     *
     * @param array $mapData mapped template data
     *
     * @return array the same data with 'url' overwritten (or appended when absent)
     */
    private function replaceURLbyOriginal(array $mapData): array
    {
        // array_replace() keeps every existing key in place, numeric keys included.
        return array_replace($mapData, ['url' => $this->url]);
    }
319
320
    /**
     * Build a short display title from a URL.
     * Example: URL => "parismatch.com/People/bla…"
     *
     * The scheme prefixes and "www." are removed, then the text is cut to 30 characters
     * followed by an ellipsis. The cut is done on UTF-8 characters rather than on bytes,
     * so a multibyte character is never split in half (which would produce invalid UTF-8).
     * Text that is not valid UTF-8 falls back to the historical byte-wise cut.
     *
     * todo move + prettyDomainName
     *
     * @param string $url URL to shorten
     *
     * @return string shortened text
     */
    public function generateTitleFromURLText(string $url): string
    {
        $text = str_replace(['https://', 'http://', 'www.'], '', $url);

        // Matches only when a 31st character exists, i.e. when the text is longer than 30 characters.
        if (preg_match('#^(.{30}).#us', $text, $matches) === 1) {
            return $matches[1] . '…';
        }

        // preg_match() fails with the /u modifier on invalid UTF-8: keep the byte-wise behaviour then.
        if (preg_last_error() !== PREG_NO_ERROR && strlen($text) > 30) {
            return substr($text, 0, 30) . '…';
        }

        return $text;
    }
333
334
    /**
     * Tell whether the URL ends with the extension of a file type that is skipped (PDF, GIF, etc.).
     * https://fr.wikipedia.org/wiki/Liste_d%27extensions_de_fichiers
     *
     * The match is case-insensitive and anchored at the end of the whole URL string.
     *
     * @param string $url
     *
     * @return bool
     */
    private function hasForbiddenFilenameExtension(string $url): bool
    {
        $skippedExtensions = [
            'pdf', 'jpg', 'jpeg', 'gif', 'png', 'xls', 'xlsx', 'xlr', 'xml', 'xlt', 'txt', 'csv', 'js',
            'docx', 'exe', 'gz', 'zip', 'ini', 'movie', 'mp3', 'mp4', 'ogg', 'raw', 'rss', 'tar', 'tgz', 'wma',
        ];
        $pattern = '#\.(' . implode('|', $skippedExtensions) . ')$#i';

        return preg_match($pattern, $url) === 1;
    }
349
350
    /**
     * Load the YAML configuration, the list of skipped domains and the three JSON data sets.
     *
     * todo extract the file reading into Infrastructure + inject
     * todo REFAC DataObject[]
     */
    protected function importConfigAndData(): void
    {
        $this->config = Yaml::parseFile(self::CONFIG_PRESSE);

        // One domain per line; an unreadable or empty file yields an empty list.
        $skippedDomains = file(self::SKIP_DOMAIN_FILENAME, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        $this->skip_domain = is_array($skippedDomains) ? $skippedDomains : [];

        $jsonSources = [
            'newspaper' => self::CONFIG_NEWSPAPER_JSON,
            'scientific domain' => self::CONFIG_SCIENTIFIC_JSON,
            'scientific wiki' => self::CONFIG_SCIENTIFIC_WIKI_JSON,
        ];
        foreach ($jsonSources as $dataKey => $jsonPath) {
            // JSON_THROW_ON_ERROR: malformed JSON raises instead of silently yielding null.
            $this->data[$dataKey] = json_decode(file_get_contents($jsonPath), true, 512, JSON_THROW_ON_ERROR);
        }
    }
375
376
    /**
     * Fetch the page behind the URL and return its extracted data.
     *
     * Pauses for HTTP_REQUEST_LOOP_DELAY seconds before each request and keeps the page
     * object on the instance for later steps.
     *
     * @param string $url URL of the page to fetch
     *
     * @return array extracted page data
     * @throws Exception
     */
    protected function extractPageDataFromUrl(string $url): array
    {
        sleep(self::HTTP_REQUEST_LOOP_DELAY);

        $page = ExternPageFactory::fromURL($url, $this->httpClient, $this->log);
        $this->externalPage = $page;

        $extracted = $page->getData();
        $this->log->debug('metaData', $extracted);

        return $extracted;
    }
388
389
    /**
     * Build a {{Lien brisé}} template for a dead URL, dated today (day-month-year).
     *
     * @param string $url dead URL
     *
     * @return string wikitext of the template
     */
    protected function formatLienBrise(string $url): string
    {
        $title = $this->generateTitleFromURLText($url);
        $today = date('d-m-Y');

        return sprintf('{{Lien brisé |url= %s |titre=%s |brisé le=%s}}', $url, $title, $today);
    }
398
399
    /**
     * Report a "403 Forbidden" answer: warn through the logger (with the URL) and append
     * the current domain to the request-error log file.
     *
     * @param string $url URL that was refused
     */
    protected function log403(string $url): void
    {
        $label = '403 Forbidden : ';

        $this->log->warning($label . $url);
        file_put_contents(self::LOG_REQUEST_ERROR, $label . $this->domain . "\n", FILE_APPEND);
    }
404
405
    /**
     * Translate an exception raised while fetching a page into the text that replaces the URL.
     *
     * The status is recognised from the exception message:
     * - "410 Gone": {{Lien brisé}} template;
     * - "403 Forbidden": logged through log403(), URL kept;
     * - "404 Not Found": {{Lien brisé}} template when REPLACE_404 is on, URL kept otherwise;
     * - "401 Unauthorized": URL kept;
     * - anything else: URL kept, because the failure may be temporary.
     *
     * @param Exception $e   exception raised while fetching the page
     * @param string    $url URL being processed
     *
     * @return string the URL or a {{Lien brisé}} template
     */
    protected function manageHttpErrors(Exception $e, string $url): string
    {
        $message = $e->getMessage();

        if (preg_match('#410 Gone#i', $message)) {
            $this->log->notice('410 Gone');

            return $this->formatLienBrise($url);
        }

        if (preg_match('#403 Forbidden#i', $message)) {
            $this->log403($url);

            return $url;
        }

        if (preg_match('#404 Not Found#i', $message)) {
            $this->log->notice('404 Not Found');

            return self::REPLACE_404 ? $this->formatLienBrise($url) : $url;
        }

        if (preg_match('#401 Unauthorized#i', $message)) {
            $this->log->notice('401 Unauthorized : skip ' . $url);

            return $url;
        }

        // Other errors: do not generate {lien brisé}, it may be a temporary failure.
        $this->log->warning('erreur sur extractWebData ' . $message);

        return $url;
    }
437
438
    /**
     * Tell whether the page data carries neither a 'JSON-LD' nor a 'meta' section.
     * A notice is logged when nothing usable is found.
     *
     * @param array  $pageData extracted page data
     * @param string $url      URL, used for the log message only
     */
    private function emptyPageData(array $pageData, string $url): bool
    {
        // An empty array has neither section, so it needs no separate check.
        $hasMetadata = !empty($pageData['JSON-LD']) || !empty($pageData['meta']);
        if ($hasMetadata) {
            return false;
        }

        $this->log->notice('No metadata : ' . $url);

        return true;
    }
450
451
    /**
     * Detect a robots "noindex" (or "none") directive in the page meta data.
     * https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag?hl=fr
     *
     * Returns false for domains listed in ROBOT_NOINDEX_WHITELIST even when the directive
     * is present. A missing 'prettyDomainName' entry is treated as "not whitelisted"
     * instead of triggering an undefined-key warning, and a non-string 'robots' value is
     * ignored instead of raising a TypeError in strtolower().
     *
     * @param array  $pageData extracted page data; only the 'meta' section is read
     * @param string $url      URL, used for the log message only
     */
    private function isRobotNoIndex(array $pageData, string $url): bool
    {
        $robots = $pageData['meta']['robots'] ?? null;
        if (!is_string($robots) || $robots === '') {
            return false;
        }

        $directives = strtolower($robots);
        if (strpos($directives, 'noindex') === false && strpos($directives, 'none') === false) {
            return false;
        }

        $this->log->notice('robots NOINDEX : ' . $url);

        $prettyDomain = $pageData['meta']['prettyDomainName'] ?? null;

        return !$this->isNoIndexDomainWhitelisted(is_string($prettyDomain) ? $prettyDomain : null);
    }
472
473
    /**
     * Drop the 'accès url' entry when its value is 'libre'; any other value is kept.
     * (No 'accès url=libre' in citations, following the February 2021 discussion.)
     *
     * @param array $mapData mapped template data
     *
     * @return array the data, without 'accès url' when it was 'libre'
     */
    protected function unsetAccesLibre(array $mapData): array
    {
        if (($mapData['accès url'] ?? null) !== 'libre') {
            return $mapData;
        }

        unset($mapData['accès url']);

        return $mapData;
    }
483
484
    /**
     * Check that the mapped data is usable: both 'url' and 'titre' must be non-empty.
     * The domain is deliberately not skipped here, since the problem may be a 404 or an
     * error on this single URL.
     *
     * @param array  $mapData mapped template data
     * @param string $url     URL, used for the log message only
     *
     * @return bool true when the mapping is incomplete
     */
    private function emptyMapData(array $mapData, string $url): bool
    {
        // An empty array lacks both keys, so it needs no separate check.
        $isComplete = !empty($mapData['url']) && !empty($mapData['titre']);
        if ($isComplete) {
            return false;
        }

        $this->log->info('Mapping incomplet : ' . $url);

        return true;
    }
497
498
    /**
     * Final adjustments of the mapped data before hydration: site/periodical names, the
     * original URL, and removal of the entries that must not reach the template.
     *
     * @param array                $mapData  mapped template data
     * @param AbstractWikiTemplate $template template chosen for the data
     *
     * @return array the adjusted data
     */
    protected function replaceSomeData(array $mapData, AbstractWikiTemplate $template): array
    {
        $adjusted = $this->replaceURLbyOriginal(
            $this->replaceSitenameByConfig($mapData, $template)
        );

        // 'site' is removed for the article template.
        if ($template instanceof ArticleTemplate) {
            unset($adjusted['site']);
        }

        // Internal markers (ugly) and the raw access value are dropped.
        unset($adjusted['DATA-TYPE'], $adjusted['DATA-ARTICLE'], $adjusted['url-access']);

        return $adjusted;
    }
512
513
    /**
     * Hydrate the template with the data, run the optimizer built for it and serialize
     * the optimized template. The result is also written to the log.
     *
     * @param AbstractWikiTemplate $template template to fill
     * @param array                $mapData  data injected into the template
     *
     * @return string serialized template
     * @throws Exception
     */
    protected function optimizeAndSerialize(AbstractWikiTemplate $template, array $mapData): string
    {
        $template->hydrate($mapData);

        $optimizer = OptimizerFactory::fromTemplate($template);
        $optimizer->doTasks();

        $serialized = $optimizer->getOptiTemplate()->serialize(true);
        $this->log->info('Serialized 444: ' . $serialized . "\n");

        return $serialized;
    }
531
532
    /**
     * Check that the current domain is neither skipped nor deactivated, and normalise its
     * configuration entry to an array.
     *
     * A domain is deactivated when its configuration is the scalar 'deactivated' or an
     * array with a 'deactivated' key. The scalar form used to be overwritten with an empty
     * array before being tested, so it never disabled anything; it is now normalised to
     * ['deactivated' => true] so the domain stays disabled on later calls too.
     *
     * @return bool true when the domain may be processed
     */
    protected function validateConfigWebDomain(): bool
    {
        if ($this->isSiteBlackListed()) {
            return false;
        }
        $this->logDebugConfigWebDomain();

        $domainConfig = $this->config[$this->domain] ?? [];
        if ($domainConfig === 'deactivated') {
            // Keep the marker while converting to the array shape the rest of the class expects.
            $domainConfig = ['deactivated' => true];
        } elseif (!is_array($domainConfig)) {
            $domainConfig = [];
        }
        $this->config[$this->domain] = $domainConfig;

        if (isset($domainConfig['deactivated'])) {
            $this->log->info("Domain " . $this->domain . " disabled\n");

            return false;
        }

        return true;
    }
553
554
    /**
     * Write a debug line telling whether the current domain has a configuration entry.
     *
     * @return void
     */
    protected function logDebugConfigWebDomain(): void
    {
        $status = isset($this->config[$this->domain]) ? " configuré" : " non configuré";

        $this->log->debug("Domain " . $this->domain . $status);
    }
565
566
    /**
     * Tell whether the current domain is in the skip list.
     * Always false when $skipSiteBlacklisted is turned off. A notice is logged on a hit.
     */
    protected function isSiteBlackListed(): bool
    {
        $isListed = $this->skipSiteBlacklisted && in_array($this->domain, $this->skip_domain);
        if (!$isListed) {
            return false;
        }

        $this->log->notice("Skip web site " . $this->domain);

        return true;
    }
574
575
    /**
     * Tell whether the domain is listed in ROBOT_NOINDEX_WHITELIST, i.e. may be processed
     * despite a robots noindex directive. A notice is logged on a hit.
     *
     * @param string|null $prettyDomain domain name to look up; null is treated as an empty string
     */
    protected function isNoIndexDomainWhitelisted(?string $prettyDomain): bool
    {
        $lookup = $prettyDomain ?? '';
        if (!in_array($lookup, self::ROBOT_NOINDEX_WHITELIST)) {
            return false;
        }

        $this->log->notice('ROBOT_NOINDEX_WHITELIST ' . $prettyDomain);

        return true;
    }
585
}
586