Passed
Push — master ( dff8a4...2556d0 )
by Dispositif
08:19
created

ExternRefTransformer::isURLAuthorized()   B

Complexity

Conditions 11
Paths 21

Size

Total Lines 46
Code Lines 26

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 132

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 11
eloc 26
c 2
b 0
f 0
nc 21
nop 1
dl 0
loc 46
ccs 0
cts 35
cp 0
crap 132
rs 7.3166

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method.

1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Application;
11
12
use App\Application\Http\ExternHttpClient;
13
use App\Domain\ExternPage;
14
use App\Domain\ExternPageFactory;
15
use App\Domain\Models\Summary;
16
use App\Domain\Models\Wiki\AbstractWikiTemplate;
17
use App\Domain\Models\Wiki\ArticleTemplate;
18
use App\Domain\Models\Wiki\LienWebTemplate;
19
use App\Domain\Models\Wiki\OuvrageTemplate;
20
use App\Domain\OptimizerFactory;
21
use App\Domain\Publisher\ExternMapper;
22
use App\Domain\Utils\WikiTextUtil;
23
use App\Domain\WikiTemplateFactory;
24
use App\Infrastructure\InternetDomainParser;
25
use App\Infrastructure\Logger;
26
use Exception;
27
use Normalizer;
28
use Psr\Log\LoggerInterface;
29
use Symfony\Component\Yaml\Yaml;
30
use Throwable;
31
32
/**
33
 * todo move Domain
34
 * Class ExternRefTransformer
35
 *
36
 * @package App\Application
37
 */
38
class ExternRefTransformer implements TransformerInterface
39
{
40
    public const HTTP_REQUEST_LOOP_DELAY = 10;
41
    public const LOG_REQUEST_ERROR       = __DIR__.'/resources/external_request_error.log';
42
    public const SKIP_DOMAIN_FILENAME    = __DIR__.'/resources/config_skip_domain.txt';
43
44
    public $skipUnauthorised = true;
45
    /**
46
     * @var array
47
     */
48
    public $summaryLog = [];
49
    /**
50
     * @var LoggerInterface
51
     */
52
    protected $log;
53
    private $config;
54
    /**
55
     * @var string
56
     */
57
    private $domain;
58
    /**
59
     * @var string
60
     */
61
    private $url;
62
    /**
63
     * @var ExternMapper
64
     */
65
    private $mapper;
66
    /**
67
     * @var array
68
     */
69
    private $data = [];
70
    /**
71
     * @var array
72
     */
73
    private $skip_domain;
74
    /**
75
     * @var ExternPage
76
     */
77
    private $externalPage;
78
    /**
79
     * @var Summary|null
80
     */
81
    private $summary;
82
83
    /**
     * Store the injected logger, load the configuration/data resources and
     * build the metadata mapper.
     *
     * Note: the mapper is given its own Logger instance, not the injected one.
     *
     * @param LoggerInterface $log logger used by this transformer
     */
    public function __construct(LoggerInterface $log)
    {
        $this->log = $log;

        // Loads the YAML config, the skip-domain list and the JSON data sets.
        $this->importConfigAndData();

        $mapperLogger = new Logger();
        $this->mapper = new ExternMapper($mapperLogger);
    }
91
92
    /**
     * Convert an external URL into a serialized wiki template when possible.
     *
     * The URL is returned as-is (normalized once it has been authorized) when it
     * is not authorized, when the request fails with 403, 404 or any other error,
     * when the page exposes no JSON-LD/meta data, when its robots value contains
     * "noindex", or when the mapping lacks an URL or a title.
     * A "410 Gone" error yields a {{Lien brisé}} template instead.
     *
     * @param string  $url     URL to transform
     * @param Summary $summary summary object updated with counters and memo flags
     *
     * @return string serialized template, {{Lien brisé}} wikitext or the URL
     * @throws Exception
     */
    public function process(string $url, Summary $summary): string
    {
        $this->summary = $summary;
        if (!$this->isURLAuthorized($url)) {
            return $url;
        }

        $url = WikiTextUtil::normalizeUrlForTemplate($url);
        $metaData = [];

        try {
            // Pause before every HTTP request.
            sleep(self::HTTP_REQUEST_LOOP_DELAY);
            $this->externalPage = ExternPageFactory::fromURL($url, $this->log);
            $metaData = $this->externalPage->getData();
            $this->log->debug('metaData', $metaData);
        } catch (Exception $requestError) {
            $errorMessage = $requestError->getMessage();

            // "410 Gone" => broken-link template
            if (preg_match('#410 Gone#i', $errorMessage)) {
                $this->log->notice('410 page disparue : '.$url);
                $today = date('d-m-Y');

                return sprintf(
                    '{{Lien brisé |url= %s |titre=%s |brisé le=%s}}',
                    $url,
                    $this->url2TextStyleTitle($url),
                    $today
                );
            }

            if (preg_match('#403 Forbidden#i', $errorMessage)) {
                // 403: record the domain, then fall through. $metaData is still
                // empty, so the "no metadata" guard below returns the URL.
                $this->log->warning('403 Forbidden : '.$url);
                file_put_contents(self::LOG_REQUEST_ERROR, '403 Forbidden : '.$this->domain."\n", FILE_APPEND);
            } elseif (preg_match('#404 Not Found#i', $errorMessage)) {
                $this->log->notice('404 Not Found : '.$url);

                return $url;
            } else {
                // Any other error: do not emit a broken-link template,
                // the failure may only be temporary.
                $this->log->warning('erreur sur extractWebData '.$errorMessage);

                return $url;
            }
        }

        $hasUsableMetaData = !empty($metaData['JSON-LD']) || !empty($metaData['meta']);
        if ($metaData === [] || !$hasUsableMetaData) {
            $this->log->notice('SKIP no metadata : '.$url);

            return $url;
        }

        $robotsFlaggedNoIndex = isset($metaData['robots'])
            && strpos($metaData['robots'], 'noindex') !== false;
        if ($robotsFlaggedNoIndex) {
            $this->log->notice('SKIP robots: noindex : '.$url);

            return $url;
        }

        $mapData = $this->mapper->process($metaData);

        // Validity check. The domain is not skipped here because the problem
        // may be a 404 or an error specific to this single URL.
        if ($mapData === [] || empty($mapData['url']) || empty($mapData['titre'])) {
            $this->log->info('Mapping incomplet : '.$url);

            return $url;
        }

        // Never output 'accès url=libre' (debate of February 2021).
        if (($mapData['accès url'] ?? null) === 'libre') {
            unset($mapData['accès url']);
        }

        $this->tagAndLog($mapData);
        $this->addSummaryLog($mapData);

        $template = $this->chooseTemplateByData($mapData);

        $mapData = $this->replaceSitenameByConfig($mapData, $template);
        $mapData = $this->replaceURLbyOriginal($mapData);

        if ($template instanceof ArticleTemplate) {
            unset($mapData['site']);
        }
        // Internal helper keys must not reach the template.
        unset($mapData['DATA-TYPE'], $mapData['DATA-ARTICLE'], $mapData['url-access']);

        $template->hydrate($mapData);

        $optimizer = OptimizerFactory::fromTemplate($template);
        $optimizer->doTasks();

        $serialized = $optimizer->getOptiTemplate()->serialize(true);
        $this->log->info('Serialized 444: '.$serialized."\n");

        // Normalizer::normalize() sometimes returns a bool: prefer the normalized
        // text, then the raw serialization, and finally fall back to the URL.
        foreach ([Normalizer::normalize($serialized), $serialized] as $candidate) {
            if (!empty($candidate) && is_string($candidate)) {
                return $candidate;
            }
        }

        return $url;
    }
203
204
    /**
     * Tell whether the given URL may be requested and transformed.
     *
     * Side effects: stores the URL in $this->url, its registrable domain in
     * $this->domain, and normalizes $this->config[$this->domain] to an array.
     *
     * @param string $url
     *
     * @return bool false for a non-HTTP URL, a forbidden file extension, an
     *              unparsable domain, a skipped domain (when $skipUnauthorised
     *              is true) or a domain deactivated in the configuration
     */
    protected function isURLAuthorized(string $url): bool
    {
        if (!ExternHttpClient::isHttpURL($url)) {
            return false;
        }

        if ($this->hasForbiddenFilenameExtension($url)) {
            return false;
        }

        $this->url = $url;

        try {
            $this->domain = InternetDomainParser::getRegistrableDomainFromURL($url);
        } catch (Exception $e) {
            $this->log->warning('Skip : not a valid URL : '.$url);

            return false;
        }

        if (in_array($this->domain, $this->skip_domain, true)) {
            $this->log->notice("Skip domain ".$this->domain);
            if ($this->skipUnauthorised) {
                return false;
            }
        }

        $domainConfig = $this->config[$this->domain] ?? null;
        if ($domainConfig === null) {
            $this->log->debug("Domain ".$this->domain." non configuré");
        } else {
            $this->log->debug("Domain ".$this->domain." configuré");
        }

        // Evaluate the deactivation BEFORE coercing the entry to an array,
        // otherwise the scalar value 'deactivated' is replaced by [] and the
        // domain is wrongly considered active.
        $isDeactivated = $domainConfig === 'deactivated'
            || (is_array($domainConfig) && isset($domainConfig['deactivated']));

        // Later code reads the domain config with array offsets only.
        $this->config[$this->domain] = is_array($domainConfig) ? $domainConfig : [];

        if ($isDeactivated) {
            $this->log->info("Domain ".$this->domain." disabled\n");

            return false;
        }

        return true;
    }
251
252
    /**
     * Log the mapped data and update the summary (citation counter and memo
     * flags for press, science, visited sites and restricted URL access).
     *
     * @param array $mapData mapped page data
     *
     * @throws Exception
     */
    private function tagAndLog(array $mapData)
    {
        $this->log->debug('mapData', $mapData);
        $this->summary->citationNumber++;

        if (!empty($mapData['DATA-ARTICLE'])) {
            $this->log->notice("Article OK");
        }

        $isNewspaperDomain = isset($this->data['newspaper'][$this->domain]);
        if ($isNewspaperDomain) {
            $this->log->notice('PRESSE');
            $this->summary->memo['presse'] = true;
        }

        if ($this->isScientificDomain()) {
            $this->log->notice('SCIENCE');
            $this->summary->memo['science'] = true;
        }

        // Remember each site name only once.
        $siteName = $this->externalPage->getPrettyDomainName();
        $knownSites = $this->summary->memo['sites'] ?? [];
        if (!in_array($siteName, $knownSites)) {
            $this->summary->memo['sites'][] = $siteName;
        }

        if (!isset($mapData['accès url'])) {
            return;
        }

        $this->log->notice('accès 🔒 '.$mapData['accès url']);
        if ($mapData['accès url'] !== 'libre') {
            $this->summary->memo['accès url non libre'] = true;
        }
    }
285
286
    /**
     * Tell whether the current domain is a scientific one: either listed in
     * the 'scientific domain' data set, or revues.org / a host below it.
     *
     * The former test, strpos('.revues.org', $this->domain) > 0, had haystack
     * and needle swapped: it matched 'revues.org' only by accident, also
     * matched fragments such as 'org' or 'es.org', and could never match a
     * host ending in '.revues.org'.
     *
     * @return bool
     */
    private function isScientificDomain(): bool
    {
        if (isset($this->data['scientific domain'][$this->domain])) {
            return true;
        }

        $domain = (string) $this->domain;
        $suffix = '.revues.org';

        return $domain === 'revues.org'
            || substr($domain, -strlen($suffix)) === $suffix;
    }
293
294
    /**
     * Append the site name (or, failing that, the periodical name, or '?')
     * of the mapped data to the public summary log.
     *
     * @param array $mapData mapped page data
     */
    private function addSummaryLog(array $mapData)
    {
        $label = '?';
        if (isset($mapData['site'])) {
            $label = $mapData['site'];
        } elseif (isset($mapData['périodique'])) {
            $label = $mapData['périodique'];
        }

        $this->summaryLog[] = $label;
    }
298
299
    /**
     * Choose the wiki template ({article} or {lien web}) for the mapped data
     * and count the choice in the summary memo.
     *
     * {article} is selected when the data has a DOI, when the domain is
     * configured as 'article', when it is configured as 'auto' and the data is
     * an article, when the data is an article from a known newspaper domain,
     * or when the domain is scientific. That choice is overridden by
     * {lien web} when the domain is configured as 'lien web' or when no date
     * is available (a date is mandatory for {article}).
     * In every other case the default {lien web} is used.
     *
     * @param array $mapData mapped page data
     *
     * @return AbstractWikiTemplate
     * @throws Exception
     */
    private function chooseTemplateByData(array $mapData): AbstractWikiTemplate
    {
        $this->config[$this->domain]['template'] = $this->config[$this->domain]['template'] ?? [];
        $configuredTemplate = $this->config[$this->domain]['template'];
        $isArticleData = $mapData['DATA-ARTICLE'] ?? false;

        $articleExpected = !empty($mapData['doi'])
            || $configuredTemplate === 'article'
            || ($configuredTemplate === 'auto' && $isArticleData)
            || ($isArticleData && !empty($this->data['newspaper'][$this->domain]))
            || $this->isScientificDomain();

        // A single explicit decision replaces the former chain of overrides,
        // whose final "null === $templateName" check could never be reached.
        $templateName = ($articleExpected
            && $configuredTemplate !== 'lien web'
            && isset($mapData['date']))
            ? 'article'
            : 'lien web';

        $template = WikiTemplateFactory::create($templateName);
        $template->userSeparator = " |";

        $counterKey = 'count '.$templateName;
        $this->summary->memo[$counterKey] = 1 + ($this->summary->memo[$counterKey] ?? 0);

        return $template;
    }
344
345
    /**
     * Replace the periodical title or the site name of the mapped data, using
     * in turn: the newspaper data set, the scientific journal data set, the
     * YAML configuration of the domain, and finally the pretty domain name of
     * the external page.
     *
     * @param array $mapData  mapped page data
     * @param       $template template chosen for these data
     *
     * @return array updated mapped data
     */
    private function replaceSitenameByConfig(array $mapData, $template): array
    {
        $isLienWeb = $template instanceof LienWebTemplate;
        $isArticle = $template instanceof ArticleTemplate;

        // From the newspaper data set (Wikidata URLs of newspapers).
        if (!empty($this->data['newspaper'][$this->domain])) {
            $newspaper = $this->data['newspaper'][$this->domain];
            $wikiTitle = $newspaper['frwiki'];
            $frenchLabel = $newspaper['fr'];

            if ($isLienWeb || isset($mapData['site'])) {
                $mapData['site'] = WikiTextUtil::wikilink($frenchLabel, $wikiTitle);
            }
            if ($isArticle || isset($mapData['périodique'])) {
                $mapData['périodique'] = WikiTextUtil::wikilink($frenchLabel, $wikiTitle);
            }
        }

        // From the scientific journal data set.
        if (isset($mapData['périodique'], $this->data['scientific wiki'][$mapData['périodique']])) {
            $journalName = $mapData['périodique'];
            $mapData['périodique'] = WikiTextUtil::wikilink(
                $journalName,
                $this->data['scientific wiki'][$journalName]
            );
        }

        // From the YAML configuration of the domain.
        if ($isLienWeb && !empty($this->config[$this->domain]['site'])) {
            $mapData['site'] = $this->config[$this->domain]['site'];
        }
        if (!empty($this->config[$this->domain]['périodique'])) {
            if (!empty($mapData['périodique']) || $template instanceof OuvrageTemplate) {
                $mapData['périodique'] = $this->config[$this->domain]['périodique'];
            }
        }

        // Last resort for {lien web}: the pretty domain name of the page.
        if ($isLienWeb && empty($mapData['site'])) {
            try {
                $mapData['site'] = $this->externalPage->getPrettyDomainName();
            } catch (Throwable $ignored) {
                // Best effort: 'site' simply stays empty.
            }
        }

        return $mapData;
    }
397
398
    /**
     * Put back the URL stored by isURLAuthorized() in place of the mapped one.
     *
     * @param array $mapData mapped page data
     *
     * @return array mapped data whose 'url' entry is $this->url
     */
    private function replaceURLbyOriginal(array $mapData): array
    {
        // array_replace() keeps every key and overwrites (or appends) 'url'.
        return array_replace($mapData, ['url' => $this->url]);
    }
404
405
    /**
     * Build a short text title from an URL, e.g. "parismatch.com/People/bla…".
     *
     * Every 'https://', 'http://' and 'www.' occurrence is removed, then the
     * text is cut to 30 characters followed by an ellipsis. Characters are
     * counted with the multibyte functions so that an URL holding non-ASCII
     * characters is never cut in the middle of an UTF-8 sequence.
     *
     * @param string $url
     *
     * @return string
     */
    public function url2TextStyleTitle(string $url): string
    {
        $maxLength = 30;
        $text = str_replace(['https://', 'http://', 'www.'], '', $url);

        if (mb_strlen($text, 'UTF-8') > $maxLength) {
            $text = mb_substr($text, 0, $maxLength, 'UTF-8').'…';
        }

        return $text;
    }
422
423
    /**
     * Tell whether the URL targets a file that must be skipped (PDF, GIF, etc.).
     * https://fr.wikipedia.org/wiki/Liste_d%27extensions_de_fichiers
     *
     * The extension is searched at the end of the whole URL and also at the
     * end of its path component, so that "file.pdf?download=1" or
     * "file.pdf#page=2" is detected too.
     *
     * @param string $url
     *
     * @return bool
     */
    private function hasForbiddenFilenameExtension(string $url): bool
    {
        $pattern = '#\.(pdf|jpg|jpeg|gif|png|xls|xlsx|xlr|xml|xlt|txt|csv|js|docx|exe|gz|zip|ini|movie|mp3|mp4|ogg|raw|rss|tar|tgz|wma)$#i';

        if (preg_match($pattern, $url) === 1) {
            return true;
        }

        // parse_url() returns null or false when no path can be extracted.
        $path = parse_url($url, PHP_URL_PATH);

        return is_string($path) && preg_match($pattern, $path) === 1;
    }
438
439
    /**
     * Load the resources used by the transformer: the YAML domain
     * configuration, the skip-domain list and the three JSON data sets.
     *
     * todo REFAC DataObject[]
     */
    protected function importConfigAndData(): void
    {
        $this->config = Yaml::parseFile(__DIR__.'/resources/config_presse.yaml');

        // One domain per line; an unreadable or empty file gives an empty list.
        $skipLines = file(self::SKIP_DOMAIN_FILENAME, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        $this->skip_domain = empty($skipLines) ? [] : $skipLines;

        $jsonResources = [
            'newspaper' => __DIR__.'/resources/data_newspapers.json',
            'scientific domain' => __DIR__.'/resources/data_scientific_domain.json',
            'scientific wiki' => __DIR__.'/resources/data_scientific_wiki.json',
        ];
        foreach ($jsonResources as $dataKey => $jsonPath) {
            $this->data[$dataKey] = json_decode(file_get_contents($jsonPath), true, 512, JSON_THROW_ON_ERROR);
        }
    }
463
464
}
465