Passed
Push — master ( 6b8380...9538c8 )
by Dispositif
02:38
created

hasForbiddenFilenameExtension()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 1
eloc 3
c 2
b 0
f 0
nc 1
nop 1
dl 0
loc 5
ccs 0
cts 0
cp 0
crap 2
rs 10
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Application;
11
12
use App\Application\Http\ExternHttpClient;
13
use App\Domain\ExternPage;
14
use App\Domain\ExternPageFactory;
15
use App\Domain\Models\Summary;
16
use App\Domain\Models\Wiki\AbstractWikiTemplate;
17
use App\Domain\Models\Wiki\ArticleTemplate;
18
use App\Domain\Models\Wiki\LienWebTemplate;
19
use App\Domain\Models\Wiki\OuvrageTemplate;
20
use App\Domain\OptimizerFactory;
21
use App\Domain\Publisher\ExternMapper;
22
use App\Domain\Utils\WikiTextUtil;
23
use App\Domain\WikiTemplateFactory;
24
use App\Infrastructure\InternetDomainParser;
25
use App\Infrastructure\Logger;
26
use Exception;
27
use Normalizer;
28
use Psr\Log\LoggerInterface;
29
use Symfony\Component\Yaml\Yaml;
30
use Throwable;
31
32
/**
33
 * todo move Domain ?
34
 */
35
class ExternRefTransformer implements TransformerInterface
36
{
37
    public const HTTP_REQUEST_LOOP_DELAY = 10;
38
    public const LOG_REQUEST_ERROR = __DIR__ . '/resources/external_request_error.log';
39
    public const SKIP_DOMAIN_FILENAME = __DIR__ . '/resources/config_skip_domain.txt';
40
    public const REPLACE_404 = true;
41
42
    public $skipUnauthorised = true;
43
    /**
44
     * @var array
45
     */
46
    public $summaryLog = [];
47
    /**
48
     * @var LoggerInterface
49
     */
50
    protected $log;
51
    private $config;
52
    /**
53
     * @var string
54
     */
55
    private $domain;
56
    /**
57
     * @var string
58
     */
59
    private $url;
60
    /**
61
     * @var ExternMapper
62
     */
63
    private $mapper;
64
    /**
65
     * @var array
66
     */
67
    private $data = [];
68
    /**
69
     * @var array
70
     */
71
    private $skip_domain;
72
    /**
73
     * @var ExternPage
74
     */
75
    private $externalPage;
76
    /**
77
     * @var Summary|null
78
     */
79
    private $summary;
80
81
    /**
     * Store the logger, load the configuration/data files, then build the mapper.
     *
     * @param LoggerInterface $log logger used by this transformer
     */
    public function __construct(LoggerInterface $log)
    {
        $this->log = $log;

        // Loads the YAML config, the skip list and the JSON data sets.
        $this->importConfigAndData();

        // The mapper is given its own Logger instance, not the injected one.
        $mapperLogger = new Logger();
        $this->mapper = new ExternMapper($mapperLogger);
    }
89
90
    /**
     * Transform an external URL into a serialized wiki template.
     *
     * The original (or normalized) URL is returned unchanged whenever the URL is
     * not authorized, the page has no usable metadata, the page is marked
     * "noindex", or the mapping is incomplete. HTTP failures are delegated to
     * manageHttpErrors().
     *
     * @param string  $url     URL found in the wiki reference
     * @param Summary $summary summary object updated with counters and memo flags
     *
     * @return string serialized template, a replacement built by manageHttpErrors(), or the URL
     * @throws Exception
     */
    public function process(string $url, Summary $summary): string
    {
        if (!$this->isURLAuthorized($url)) {
            return $url;
        }

        try {
            $url = WikiTextUtil::normalizeUrlForTemplate($url);
            $pageData = $this->extractPageDataFromUrl($url);
        } catch (Exception $httpException) {
            return $this->manageHttpErrors($httpException, $url);
        }

        if ($this->emptyPageData($pageData, $url)) {
            return $url;
        }
        if ($this->robotNoIndex($pageData, $url)) {
            return $url;
        }

        $mapData = $this->mapper->process($pageData);
        if ($this->emptyMapData($mapData, $url)) {
            return $url;
        }
        $mapData = $this->unsetAccesLibre($mapData);

        // addSummaryLog() stores the summary that tagAndLog() and chooseTemplateByData() update.
        $this->addSummaryLog($mapData, $summary);
        $this->tagAndLog($mapData);

        $template = $this->chooseTemplateByData($mapData);
        $mapData = $this->replaceSomeData($mapData, $template);

        $serialized = $this->optimizeAndSerialize($template, $mapData);
        // Normalizer::normalize() sometimes returns a bool instead of a string.
        $normalized = Normalizer::normalize($serialized);
        if (is_string($normalized) && !empty($normalized)) {
            return $normalized;
        }

        // Fall back to the raw serialization, then to the URL itself.
        return !empty($serialized) ? $serialized : $url;
    }
131
132
    /**
     * Remember the URL and decide whether it may be processed.
     *
     * Side effects: sets $this->url, and sets $this->domain when the registrable
     * domain can be extracted.
     *
     * @param string $url URL to check
     *
     * @return bool false when the string is not an HTTP URL, targets a forbidden
     *              file extension, has no parsable domain, or its domain is
     *              rejected by validateConfigWebDomain()
     */
    protected function isURLAuthorized(string $url): bool
    {
        $this->url = $url;
        if (!ExternHttpClient::isHttpURL($url)) {
            $this->log->debug('Skip : not a valid URL : ' . $url);

            return false;
        }

        if ($this->hasForbiddenFilenameExtension($url)) {
            return false;
        }

        // A second isHttpURL() check followed by a throw used to sit here; it was
        // unreachable because the same condition already returned false above.
        try {
            $this->domain = InternetDomainParser::getRegistrableDomainFromURL($url);
        } catch (Exception $e) {
            $this->log->warning('Skip : not a valid URL : ' . $url);

            return false;
        }

        return $this->validateConfigWebDomain();
    }
155
156
    /**
     * Log the mapped data and update the summary counters and memo flags.
     *
     * Reads $this->summary, $this->domain and $this->externalPage, which are set
     * by other methods of this class before this one is called in process().
     *
     * @param array $mapData data produced by the mapper
     *
     * @throws Exception
     */
    private function tagAndLog(array $mapData)
    {
        $this->log->debug('mapData', $mapData);

        $this->summary->citationNumber = $this->summary->citationNumber ?? 0;
        ++$this->summary->citationNumber;

        if (!empty($mapData['DATA-ARTICLE'])) {
            $this->log->notice("Article OK");
        }

        if (isset($this->data['newspaper'][$this->domain])) {
            $this->log->notice('PRESSE');
            $this->summary->memo['presse'] = true;
        }

        if ($this->isScientificDomain()) {
            $this->log->notice('SCIENCE');
            $this->summary->memo['science'] = true;
        }

        // Record each pretty domain name only once in the memo.
        $knownSites = $this->summary->memo['sites'] ?? null;
        if ($knownSites === null
            || !in_array($this->externalPage->getPrettyDomainName(), $knownSites)
        ) {
            $this->summary->memo['sites'][] = $this->externalPage->getPrettyDomainName();
        }

        $urlAccess = $mapData['accès url'] ?? null;
        if ($urlAccess !== null) {
            $this->log->notice('accès 🔒 ' . $urlAccess);
            if ($urlAccess !== 'libre') {
                $this->summary->memo['accès url non libre'] = true;
            }
        }
    }
190
191
    /**
     * Tell whether the current domain is a known scientific domain.
     *
     * A domain qualifies when it is a key of the 'scientific domain' data set,
     * or when it is "revues.org" or one of its sub-domains.
     *
     * The previous check called strpos() with haystack and needle swapped. It
     * matched "revues.org" only by accident and also matched any domain that is
     * a fragment of ".revues.org" (e.g. "es.org").
     *
     * @return bool
     */
    private function isScientificDomain(): bool
    {
        // No domain resolved yet: nothing can match (also avoids passing null to string functions).
        if (!is_string($this->domain) || $this->domain === '') {
            return false;
        }

        if (isset($this->data['scientific domain'][$this->domain])) {
            return true;
        }

        $suffix = '.revues.org';

        return $this->domain === 'revues.org'
            || substr($this->domain, -strlen($suffix)) === $suffix;
    }
198
199
    /**
     * Keep a reference to the summary and append a label for this citation to $summaryLog.
     *
     * The label is the 'site' value, else the 'périodique' value, else '?'.
     *
     * @param array   $mapData data produced by the mapper
     * @param Summary $summary summary object stored in $this->summary
     */
    private function addSummaryLog(array $mapData, Summary $summary)
    {
        $this->summary = $summary;

        if (isset($mapData['site'])) {
            $label = $mapData['site'];
        } elseif (isset($mapData['périodique'])) {
            $label = $mapData['périodique'];
        } else {
            $label = '?';
        }

        $this->summaryLog[] = $label;
    }
204
205
    /**
     * Choose between the 'article' and 'lien web' templates and create it.
     *
     * 'article' is used only when all of the following hold:
     *  - the domain config asks for 'article', or asks for 'auto' with article
     *    data, or the data is an article from a known newspaper domain, or the
     *    domain is scientific, or a DOI is present;
     *  - the domain config does not force 'lien web';
     *  - a 'date' is present (the article template requires one).
     * Otherwise 'lien web' is used. The per-template counter in the summary memo
     * is incremented.
     *
     * @param array $mapData data produced by the mapper
     *
     * @return AbstractWikiTemplate
     * @throws Exception
     */
    private function chooseTemplateByData(array $mapData): AbstractWikiTemplate
    {
        // Make sure the 'template' entry exists before it is compared below.
        $this->config[$this->domain]['template'] = $this->config[$this->domain]['template'] ?? [];
        $configuredTemplate = $this->config[$this->domain]['template'];
        $isArticleData = $mapData['DATA-ARTICLE'] ?? false;

        $articleHinted = $configuredTemplate === 'article'
            || ($configuredTemplate === 'auto' && $isArticleData)
            || ($isArticleData && !empty($this->data['newspaper'][$this->domain]))
            || $this->isScientificDomain()
            || !empty($mapData['doi']);

        $useArticle = $articleHinted
            && $configuredTemplate !== 'lien web'
            && isset($mapData['date']);
        $templateName = $useArticle ? 'article' : 'lien web';

        $template = WikiTemplateFactory::create($templateName);
        $template->userSeparator = " |";

        $counterKey = 'count ' . $templateName;
        $this->summary->memo[$counterKey] = 1 + ($this->summary->memo[$counterKey] ?? 0);

        return $template;
    }
245
246
    /**
     * Replace the periodical title or the site name using known data and config.
     *
     * Sources are applied in this order, later ones overriding earlier ones:
     * newspaper data set, scientific journal data set, YAML config, and finally
     * the pretty domain name when a "lien web" template still has no site.
     *
     * @param array $mapData  data produced by the mapper
     * @param mixed $template template chosen for this citation
     *
     * @return array the updated data
     */
    private function replaceSitenameByConfig(array $mapData, $template): array
    {
        $isLienWeb = $template instanceof LienWebTemplate;

        // From the newspaper data set (indexed by domain).
        $newspaper = $this->data['newspaper'][$this->domain] ?? null;
        if (!empty($newspaper)) {
            $wikiTitle = $newspaper['frwiki'];
            $label = $newspaper['fr'];
            if (isset($mapData['site']) || $isLienWeb) {
                $mapData['site'] = WikiTextUtil::wikilink($label, $wikiTitle);
            }
            if (isset($mapData['périodique']) || $template instanceof ArticleTemplate) {
                $mapData['périodique'] = WikiTextUtil::wikilink($label, $wikiTitle);
            }
        }

        // From the scientific journal data set (indexed by periodical name).
        if (isset($mapData['périodique'], $this->data['scientific wiki'][$mapData['périodique']])) {
            $journalName = $mapData['périodique'];
            $mapData['périodique'] = WikiTextUtil::wikilink(
                $journalName,
                $this->data['scientific wiki'][$journalName]
            );
        }

        // From the YAML config.
        $domainConfig = $this->config[$this->domain] ?? [];
        if ($isLienWeb && !empty($domainConfig['site'])) {
            $mapData['site'] = $domainConfig['site'];
        }
        // NOTE(review): this tests OuvrageTemplate while the newspaper branch above
        // tests ArticleTemplate — confirm which template is intended here.
        if (!empty($domainConfig['périodique'])
            && (!empty($mapData['périodique']) || $template instanceof OuvrageTemplate)
        ) {
            $mapData['périodique'] = $domainConfig['périodique'];
        }

        // Last resort for "lien web": use the pretty domain name.
        if ($isLienWeb && empty($mapData['site'])) {
            try {
                $mapData['site'] = $this->externalPage->getPrettyDomainName();
            } catch (Throwable $ignored) {
                // Best effort: the site name is simply left empty on failure.
            }
        }

        return $mapData;
    }
298
299
    /**
     * Force the 'url' entry to the URL stored in $this->url.
     *
     * @param array $mapData data produced by the mapper
     *
     * @return array the data with 'url' overwritten (or appended)
     */
    private function replaceURLbyOriginal(array $mapData): array
    {
        return array_replace($mapData, ['url' => $this->url]);
    }
305
306
    /**
     * Build a short display title from a URL.
     *
     * The scheme and "www." are removed and the result is truncated to 30
     * characters followed by an ellipsis, e.g. URL => "parismatch.com/People/bla…".
     *
     * Truncation is character-based (UTF-8), so a multibyte character is never
     * cut in half; the previous byte-based cut could emit invalid UTF-8.
     *
     * todo move + prettyDomainName
     *
     * @param string $url URL to shorten
     *
     * @return string
     */
    public function generateTitleFromURLText(string $url): string
    {
        $text = str_replace(['https://', 'http://', 'www.'], '', $url);
        if (mb_strlen($text, 'UTF-8') > 30) {
            $text = mb_substr($text, 0, 30, 'UTF-8') . '…';
        }

        return $text;
    }
319
320
    /**
     * Tell whether the URL points to a file type that must be skipped (PDF, GIF, etc.).
     * https://fr.wikipedia.org/wiki/Liste_d%27extensions_de_fichiers
     *
     * Only the path component is tested. A query string or fragment after the
     * file name ("doc.pdf?download=1") no longer hides the extension, and a host
     * name on a TLD such as ".zip" or ".movie" is no longer mistaken for a file.
     *
     * @param string $url
     *
     * @return bool
     */
    private function hasForbiddenFilenameExtension(string $url): bool
    {
        $path = parse_url($url, PHP_URL_PATH);
        if ($path === false) {
            // Seriously malformed URL: fall back to testing the whole string.
            $path = $url;
        } elseif (!is_string($path)) {
            // No path component at all, hence no file name.
            return false;
        }

        return (bool)preg_match(
            '#\.(pdf|jpg|jpeg|gif|png|xls|xlsx|xlr|xml|xlt|txt|csv|js|docx|exe|gz|zip|ini|movie|mp3|mp4|ogg|raw|rss|tar|tgz|wma)$#i',
            $path
        );
    }
335
336
    // todo inject
337
    /**
     * Load the YAML domain config, the skipped-domain list and the JSON data sets.
     *
     * Populates $this->config, $this->skip_domain and the 'newspaper',
     * 'scientific domain' and 'scientific wiki' entries of $this->data.
     * json_decode() is called with JSON_THROW_ON_ERROR, so invalid JSON throws.
     *
     * todo REFAC DataObject[]
     */
    protected function importConfigAndData(): void
    {
        $this->config = Yaml::parseFile(__DIR__ . '/resources/config_presse.yaml');

        $skippedDomains = file(self::SKIP_DOMAIN_FILENAME, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if (!$skippedDomains) {
            // Unreadable or empty file: no domain is skipped.
            $skippedDomains = [];
        }
        $this->skip_domain = $skippedDomains;

        $jsonSources = [
            'newspaper' => __DIR__ . '/resources/data_newspapers.json',
            'scientific domain' => __DIR__ . '/resources/data_scientific_domain.json',
            'scientific wiki' => __DIR__ . '/resources/data_scientific_wiki.json',
        ];
        foreach ($jsonSources as $dataKey => $jsonPath) {
            $this->data[$dataKey] = json_decode(file_get_contents($jsonPath), true, 512, JSON_THROW_ON_ERROR);
        }
    }
361
362
    /**
     * Wait HTTP_REQUEST_LOOP_DELAY seconds, then build the external page and return its data.
     *
     * The page object is stored in $this->externalPage before its data is read.
     *
     * @param string $url URL of the external page
     *
     * @return array data returned by the external page
     */
    protected function extractPageDataFromUrl(string $url): array
    {
        sleep(self::HTTP_REQUEST_LOOP_DELAY);

        $page = ExternPageFactory::fromURL($url, $this->log);
        $this->externalPage = $page;

        $metaData = $page->getData();
        $this->log->debug('metaData', $metaData);

        return $metaData;
    }
371
372
    /**
     * Build a {{Lien brisé}} template for the URL, dated today (d-m-Y).
     *
     * @param string $url URL of the broken link
     *
     * @return string wikitext of the template
     */
    protected function formatLienBrise(string $url): string
    {
        $title = $this->generateTitleFromURLText($url);
        $today = date('d-m-Y');

        return sprintf('{{Lien brisé |url= %s |titre=%s |brisé le=%s}}', $url, $title, $today);
    }
381
382
    /**
     * Log a "403 Forbidden" warning for the URL and append the current domain
     * to the LOG_REQUEST_ERROR file.
     *
     * @param string $url URL that was refused
     */
    protected function log403(string $url): void
    {
        $this->log->warning('403 Forbidden : ' . $url);

        $logLine = '403 Forbidden : ' . $this->domain . "\n";
        file_put_contents(self::LOG_REQUEST_ERROR, $logLine, FILE_APPEND);
    }
387
388
    /**
     * Decide what replaces the URL after an HTTP failure, based on the exception message.
     *
     *  - "410 Gone": a {{Lien brisé}} template;
     *  - "403 Forbidden": logged through log403(), URL unchanged;
     *  - "404 Not Found": a {{Lien brisé}} template when REPLACE_404 is true, else the URL;
     *  - anything else: URL unchanged.
     *
     * @param Exception $e   exception raised while fetching the page
     * @param string    $url URL being processed
     *
     * @return string
     */
    protected function manageHttpErrors(Exception $e, string $url): string
    {
        $message = $e->getMessage();

        if (preg_match('#410 Gone#i', $message)) {
            $this->log->notice('410 page disparue : ' . $url);

            return $this->formatLienBrise($url);
        }

        if (preg_match('#403 Forbidden#i', $message)) {
            $this->log403($url);

            return $url;
        }

        if (preg_match('#404 Not Found#i', $message)) {
            $this->log->notice('404 Not Found : ' . $url);

            return self::REPLACE_404 ? $this->formatLienBrise($url) : $url;
        }

        // Other errors: do not emit a broken-link template, the failure may be temporary.
        $this->log->warning('erreur sur extractWebData ' . $message);

        return $url;
    }
416
417
    /**
     * Tell whether the page data lacks both 'JSON-LD' and 'meta' content; logs a notice when it does.
     *
     * @param array  $pageData data extracted from the page
     * @param string $url      URL, used only in the log message
     *
     * @return bool true when there is no usable metadata
     */
    private function emptyPageData(array $pageData, string $url): bool
    {
        $hasMetadata = !empty($pageData)
            && (!empty($pageData['JSON-LD']) || !empty($pageData['meta']));
        if ($hasMetadata) {
            return false;
        }

        $this->log->notice('SKIP no metadata : ' . $url);

        return true;
    }
429
430
    /**
     * Tell whether the page's 'robots' entry contains "noindex"; logs a notice when it does.
     *
     * @param array  $pageData data extracted from the page
     * @param string $url      URL, used only in the log message
     *
     * @return bool
     */
    private function robotNoIndex(array $pageData, string $url): bool
    {
        $robots = $pageData['robots'] ?? null;
        if ($robots === null || strpos($robots, 'noindex') === false) {
            return false;
        }

        $this->log->notice('SKIP robots: noindex : ' . $url);

        return true;
    }
439
440
    /**
     * Drop the 'accès url' entry when its value is 'libre'
     * (no 'accès url=libre' in the output, per the February 2021 discussion).
     *
     * @param array $mapData data produced by the mapper
     *
     * @return array
     */
    protected function unsetAccesLibre(array $mapData): array
    {
        if (($mapData['accès url'] ?? null) === 'libre') {
            unset($mapData['accès url']);
        }

        return $mapData;
    }
450
451
    /**
     * Check that the mapped data is usable: non-empty, with a 'url' and a 'titre'.
     *
     * The domain is not added to the skip list here, because the problem may be
     * a 404 or an error specific to this one URL.
     *
     * @param array  $mapData data produced by the mapper
     * @param string $url     URL, used only in the log message
     *
     * @return bool true when the mapping is incomplete
     */
    private function emptyMapData(array $mapData, string $url): bool
    {
        $isComplete = $mapData !== []
            && !empty($mapData['url'])
            && !empty($mapData['titre']);
        if ($isComplete) {
            return false;
        }

        $this->log->info('Mapping incomplet : ' . $url);

        return true;
    }
464
465
    /**
     * Apply site-name and URL replacements, then remove entries that must not be serialized.
     *
     * 'DATA-TYPE', 'DATA-ARTICLE' and 'url-access' are always removed; 'site' is
     * removed as well for an article template.
     *
     * @param array                $mapData  data produced by the mapper
     * @param AbstractWikiTemplate $template template chosen for this citation
     *
     * @return array
     */
    protected function replaceSomeData(array $mapData, AbstractWikiTemplate $template): array
    {
        $mapData = $this->replaceURLbyOriginal(
            $this->replaceSitenameByConfig($mapData, $template)
        );

        $keysToDrop = ['DATA-TYPE', 'DATA-ARTICLE', 'url-access'];
        if ($template instanceof ArticleTemplate) {
            $keysToDrop[] = 'site';
        }
        foreach ($keysToDrop as $key) {
            unset($mapData[$key]);
        }

        return $mapData;
    }
479
480
    /**
     * Hydrate the template with the data, run the optimizer and serialize the result.
     *
     * @param AbstractWikiTemplate $template template to fill
     * @param array                $mapData  values used to hydrate the template
     *
     * @return string serialized template
     * @throws Exception
     */
    protected function optimizeAndSerialize(AbstractWikiTemplate $template, array $mapData): string
    {
        $template->hydrate($mapData);

        $optimizer = OptimizerFactory::fromTemplate($template);
        $optimizer->doTasks();

        $wikitext = $optimizer->getOptiTemplate()->serialize(true);
        $this->log->info('Serialized 444: ' . $wikitext . "\n");

        return $wikitext;
    }
498
499
    /**
     * Tell whether the current domain may be processed according to the skip list and the config.
     *
     * A domain is rejected when it is blacklisted, when its config entry is the
     * scalar 'deactivated', or when its config array has a 'deactivated' key.
     * On acceptance, the domain's config entry is guaranteed to be an array.
     *
     * The deactivation test runs before the entry is normalised to an array.
     * Previously the scalar 'deactivated' was replaced by [] first, so such
     * domains were never disabled.
     *
     * @return bool
     */
    protected function validateConfigWebDomain(): bool
    {
        if ($this->isSiteBlackListed()) {
            return false;
        }
        $this->logDebugConfigWebDomain();

        $domainConfig = $this->config[$this->domain] ?? [];

        $isDeactivated = $domainConfig === 'deactivated'
            || (is_array($domainConfig) && isset($domainConfig['deactivated']));
        if ($isDeactivated) {
            // The config entry is left untouched so later URLs of this domain are rejected too.
            $this->log->info("Domain " . $this->domain . " disabled\n");

            return false;
        }

        // Other methods index this entry as an array.
        $this->config[$this->domain] = is_array($domainConfig) ? $domainConfig : [];

        return true;
    }
520
521
    /**
     * Log at debug level whether the current domain has an entry in the config.
     *
     * @return void
     */
    protected function logDebugConfigWebDomain(): void
    {
        $status = isset($this->config[$this->domain]) ? " configuré" : " non configuré";

        $this->log->debug("Domain " . $this->domain . $status);
    }
532
533
    /**
     * Tell whether the current domain is in the skip list; logs a notice when it is.
     *
     * Always false when $skipUnauthorised is disabled.
     *
     * @return bool
     */
    protected function isSiteBlackListed(): bool
    {
        if (!$this->skipUnauthorised || !in_array($this->domain, $this->skip_domain)) {
            return false;
        }

        $this->log->notice("Skip web site " . $this->domain);

        return true;
    }
541
}
542