Test Failed
Push — master ( 766a39...696a12 )
by Dispositif
09:33
created

ExternRefTransformer   F

Complexity

Total Complexity 80

Size/Duplication

Total Lines 430
Duplicated Lines 0 %

Test Coverage

Coverage 0%

Importance

Changes 6
Bugs 0 Features 0
Metric Value
eloc 196
c 6
b 0
f 0
dl 0
loc 430
ccs 0
cts 173
cp 0
rs 2
wmc 80

12 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 7 1
C replaceSitenameByConfig() 0 43 16
A url2TextStyleTitle() 0 8 2
C chooseTemplateByData() 0 36 12
A addSummaryLog() 0 3 1
A replaceURLbyOriginal() 0 5 1
D process() 0 107 21
C isURLAuthorized() 0 49 12
A importConfigAndData() 0 22 2
B tagAndLog() 0 25 9
A isScientificDomain() 0 6 2
A hasForbiddenFilenameExtension() 0 5 1

How to fix   Complexity   

Complex Class

Complex classes like ExternRefTransformer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use ExternRefTransformer, and based on these observations, apply Extract Interface, too.

1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Application;
11
12
use App\Application\Http\ExternHttpClient;
13
use App\Domain\ExternPage;
14
use App\Domain\ExternPageFactory;
15
use App\Domain\Models\Summary;
16
use App\Domain\Models\Wiki\AbstractWikiTemplate;
17
use App\Domain\Models\Wiki\ArticleTemplate;
18
use App\Domain\Models\Wiki\LienWebTemplate;
19
use App\Domain\Models\Wiki\OuvrageTemplate;
20
use App\Domain\OptimizerFactory;
21
use App\Domain\Publisher\ExternMapper;
22
use App\Domain\Utils\WikiTextUtil;
23
use App\Domain\WikiTemplateFactory;
24
use App\Infrastructure\InternetDomainParser;
25
use App\Infrastructure\Logger;
26
use Exception;
27
use Normalizer;
28
use Psr\Log\LoggerInterface;
29
use Symfony\Component\Yaml\Yaml;
30
use Throwable;
31
32
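/**
 * Class ExternRefTransformer
 *
 * Turns a bare external URL into a serialized wiki template built from the
 * metadata of the target page, or returns the URL unchanged when it cannot.
 *
 * todo move Domain
 *
 * @package App\Application
 */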
38
class ExternRefTransformer implements TransformerInterface
39
{
40
    /** Delay, in seconds, passed to sleep() before each external page request in process(). */
    public const HTTP_REQUEST_LOOP_DELAY = 10;
    /** File to which process() appends one line per "403 Forbidden" response. */
    public const LOG_REQUEST_ERROR       = __DIR__.'/resources/external_request_error.log';
    /** Text file read line by line by importConfigAndData() to fill $skip_domain. */
    public const SKIP_DOMAIN_FILENAME    = __DIR__.'/resources/config_skip_domain.txt';

    /**
     * When true, isURLAuthorized() rejects URLs whose domain is in the skip list
     * or has no entry in the YAML configuration.
     *
     * @var bool
     */
    public $skipUnauthorised = true;
    /**
     * One entry per converted citation, appended by addSummaryLog().
     *
     * @var array
     */
    public $summaryLog = [];
    /**
     * @var LoggerInterface
     */
    protected $log;
    /**
     * Parsed content of resources/config_presse.yaml, read by domain name.
     *
     * @var array
     */
    private $config;
    /**
     * Registrable domain of the URL being processed, set by isURLAuthorized().
     *
     * @var string
     */
    private $domain;
    /**
     * URL being processed, exactly as received by isURLAuthorized().
     *
     * @var string
     */
    private $url;
    /**
     * @var ExternMapper
     */
    private $mapper;
    /**
     * Decoded JSON resources, stored under the keys 'newspaper',
     * 'scientific domain' and 'scientific wiki'.
     *
     * @var array
     */
    private $data = [];
    /**
     * Lines of the skip-domain file.
     *
     * @var array
     */
    private $skip_domain;
    /**
     * @var ExternPage
     */
    private $externalPage;
    /**
     * @var Summary
     */
    private $summary;
82
83
    /**
     * Keeps the logger, loads the configuration and data resources, then builds the mapper.
     *
     * @param LoggerInterface $log logger used for every message emitted by this transformer
     */
    public function __construct(LoggerInterface $log)
    {
        $this->log = $log;
        $this->importConfigAndData();

        // The mapper receives its own Logger instance rather than $log.
        $mapperLogger = new Logger();
        $this->mapper = new ExternMapper($mapperLogger);
    }
91
92
    /**
     * Converts an external URL into a serialized wiki template.
     *
     * The input (or its normalized form, once normalization has happened) is
     * returned as-is when the URL is not authorized, the request fails, the page
     * exposes no usable metadata, the page is flagged "noindex", or the mapped
     * data lacks a URL or a title. A "410 Gone" response yields a
     * {{Lien brisé}} template instead.
     *
     * @param string  $url
     * @param Summary $summary collector updated with counters and flags for this citation
     *
     * @return string
     * @throws Exception
     */
    public function process(string $url, Summary $summary): string
    {
        $this->summary = $summary;
        if (!$this->isURLAuthorized($url)) {
            return $url;
        }

        $url = WikiTextUtil::normalizeUrlForTemplate($url);

        $pageData = [];
        try {
            sleep(self::HTTP_REQUEST_LOOP_DELAY);
            $this->externalPage = ExternPageFactory::fromURL($url, $this->log);
            $pageData = $this->externalPage->getData();
            $this->log->debug('metaData', $pageData);
        } catch (Exception $e) {
            $fallback = $this->handleRequestError($e, $url);
            if ($fallback !== null) {
                return $fallback;
            }
            // 403: fall through, the metadata check below decides what to return.
        }

        if ($pageData === []
            || (empty($pageData['JSON-LD']) && empty($pageData['meta']))
        ) {
            $this->log->notice('SKIP no metadata : '.$url);

            return $url;
        }

        // Robots directives are matched case-insensitively; a non-string value is ignored
        // instead of raising a TypeError under strict_types.
        $robots = $pageData['robots'] ?? null;
        if (is_string($robots) && stripos($robots, 'noindex') !== false) {
            $this->log->notice('SKIP robots: noindex : '.$url);

            return $url;
        }

        $mapData = $this->mapper->process($pageData);

        // Validity check. The domain is not skipped here: the failure may be
        // a 404 or an error specific to this single URL.
        if ($mapData === [] || empty($mapData['url']) || empty($mapData['titre'])) {
            $this->log->info('Mapping incomplet : '.$url);

            return $url;
        }

        // No 'accès url=libre' parameter (discussion of February 2021).
        if (isset($mapData['accès url']) && $mapData['accès url'] === 'libre') {
            unset($mapData['accès url']);
        }

        $this->tagAndLog($mapData);
        $this->addSummaryLog($mapData);

        $template = $this->chooseTemplateByData($mapData);

        $mapData = $this->replaceSitenameByConfig($mapData, $template);
        $mapData = $this->replaceURLbyOriginal($mapData);

        if ($template instanceof ArticleTemplate) {
            unset($mapData['site']);
        }
        // Internal keys that must not reach the template.
        unset($mapData['DATA-TYPE'], $mapData['DATA-ARTICLE'], $mapData['url-access']);

        $template->hydrate($mapData);

        $optimizer = OptimizerFactory::fromTemplate($template);
        $optimizer->doTasks();
        $templateOptimized = $optimizer->getOptiTemplate();

        $serialized = $templateOptimized->serialize(true);
        $this->log->info('Serialized 444: '.$serialized."\n");

        // Normalizer::normalize() may return false; fall back to the raw serialization.
        $normalized = Normalizer::normalize($serialized);
        if (!empty($normalized) && is_string($normalized)) {
            return $normalized;
        }
        if (!empty($serialized) && is_string($serialized)) {
            return $serialized;
        }

        return $url;
    }

    /**
     * Maps a failed page request to the string process() must return.
     *
     * @param Exception $e   exception raised while fetching or reading the page
     * @param string    $url normalized URL that was requested
     *
     * @return string|null replacement text to return immediately, or null when
     *                     processing should continue (403 Forbidden case)
     */
    private function handleRequestError(Exception $e, string $url): ?string
    {
        $message = $e->getMessage();

        // "410 Gone" => {{Lien brisé}} template.
        if (preg_match('#410 Gone#i', $message)) {
            $this->log->notice('410 page disparue : '.$url);

            return sprintf(
                '{{Lien brisé |url= %s |titre=%s |brisé le=%s}}',
                $url,
                $this->url2TextStyleTitle($url),
                date('d-m-Y')
            );
        }

        if (preg_match('#403 Forbidden#i', $message)) {
            $this->log->warning('403 Forbidden : '.$url);
            file_put_contents(self::LOG_REQUEST_ERROR, '403 Forbidden : '.$this->domain."\n", FILE_APPEND);

            return null;
        }

        if (preg_match('#404 Not Found#i', $message)) {
            $this->log->notice('404 Not Found : '.$url);

            return $url;
        }

        // Any other error: no {{Lien brisé}}, the failure may be temporary.
        $this->log->warning('erreur sur extractWebData '.$message);

        return $url;
    }
207
208
    /**
     * Tells whether $url may be fetched and converted.
     *
     * Side effects: stores the URL in $this->url, its registrable domain in
     * $this->domain, and guarantees that $this->config[$this->domain] is an array.
     *
     * @param string $url
     *
     * @return bool false for non-HTTP URLs, forbidden file extensions, unparsable
     *              domains, deactivated domains and, when $skipUnauthorised is true,
     *              skip-listed or unconfigured domains
     */
    protected function isURLAuthorized(string $url): bool
    {
        if (!ExternHttpClient::isHttpURL($url)) {
            return false;
        }

        if ($this->hasForbiddenFilenameExtension($url)) {
            return false;
        }

        $this->url = $url;
        try {
            $this->domain = InternetDomainParser::getRegistrableDomainFromURL($url);
        } catch (Exception $e) {
            $this->log->warning('Skip : not a valid URL : '.$url);

            return false;
        }

        if (in_array($this->domain, $this->skip_domain, true)) {
            $this->log->notice("Skip domain ".$this->domain);
            if ($this->skipUnauthorised) {
                return false;
            }
        }

        if (!isset($this->config[$this->domain])) {
            $this->log->debug("Domain ".$this->domain." non configuré");
            if ($this->skipUnauthorised) {
                return false;
            }
        } else {
            $this->log->debug("Domain ".$this->domain." configuré");
        }

        $domainConfig = $this->config[$this->domain] ?? [];

        // Evaluate the raw value BEFORE coercing it to an array, otherwise a domain
        // configured with the plain string 'deactivated' could never be detected.
        $isDeactivated = $domainConfig === 'deactivated'
            || (is_array($domainConfig) && isset($domainConfig['deactivated']));

        // Later methods index this entry as an array.
        $this->config[$this->domain] = is_array($domainConfig) ? $domainConfig : [];

        if ($isDeactivated) {
            $this->log->info("Domain ".$this->domain." disabled\n");

            return false;
        }

        return true;
    }
258
259
    /**
     * Logs what was detected for the current citation and records it in the Summary:
     * citation counter, 'presse' / 'science' flags, list of site names and
     * restricted-access flag.
     *
     * @param array $mapData data produced by the mapper
     *
     * @throws Exception kept from the original docblock; presumably raised by
     *                   getPrettyDomainName() (TODO confirm)
     */
    private function tagAndLog(array $mapData)
    {
        $this->log->debug('mapData', $mapData);
        $this->summary->citationNumber++;

        if (!empty($mapData['DATA-ARTICLE'])) {
            $this->log->notice("Article OK");
        }

        $isNewspaper = isset($this->data['newspaper'][$this->domain]);
        if ($isNewspaper) {
            $this->log->notice('PRESSE');
            $this->summary->memo['presse'] = true;
        }

        if ($this->isScientificDomain()) {
            $this->log->notice('SCIENCE');
            $this->summary->memo['science'] = true;
        }

        // Register the site name once per summary.
        $hasSiteList = isset($this->summary->memo['sites']);
        if (!$hasSiteList
            || !in_array($this->externalPage->getPrettyDomainName(), $this->summary->memo['sites'])
        ) {
            $this->summary->memo['sites'][] = $this->externalPage->getPrettyDomainName();
        }

        if (!isset($mapData['accès url'])) {
            return;
        }

        $access = $mapData['accès url'];
        $this->log->notice('accès 🔒 '.$access);
        if ($access !== 'libre') {
            $this->summary->memo['accès url non libre'] = true;
        }
    }
292
293
    /**
     * Tells whether the current domain is a known scientific publisher.
     *
     * A domain qualifies when it is a key of the 'scientific domain' data set,
     * or when it is "revues.org" or one of its sub-domains.
     *
     * @return bool
     */
    private function isScientificDomain(): bool
    {
        // $this->domain is only set by isURLAuthorized(); treat "not set" as an empty name.
        $domain = (string) $this->domain;

        if (isset($this->data['scientific domain'][$domain])) {
            return true;
        }

        // The previous strpos('.revues.org', $domain) call had haystack and needle swapped,
        // so any fragment of ".revues.org" (e.g. "org") was accepted.
        $suffix = '.revues.org';

        return $domain === 'revues.org'
            || substr($domain, -strlen($suffix)) === $suffix;
    }
300
301
    /**
     * Appends a label for the citation to $summaryLog: the 'site' value,
     * else the 'périodique' value, else '?'.
     *
     * @param array $mapData data produced by the mapper
     */
    private function addSummaryLog(array $mapData)
    {
        $label = '?';
        if (isset($mapData['site'])) {
            $label = $mapData['site'];
        } elseif (isset($mapData['périodique'])) {
            $label = $mapData['périodique'];
        }

        $this->summaryLog[] = $label;
    }
305
306
    /**
     * Picks the wiki template ('article' or 'lien web') for the mapped data and
     * increments the matching 'count <template>' entry of the Summary memo.
     *
     * 'article' is chosen only when all of the following hold:
     *  - the data has a DOI, or the domain is configured as 'article', or it is
     *    configured as 'auto' with article data, or it is a newspaper domain with
     *    article data, or it is a scientific domain;
     *  - the domain is not configured as 'lien web';
     *  - a 'date' is present (mandatory for {article}).
     * Otherwise 'lien web' is used.
     *
     * @param array $mapData data produced by the mapper
     *
     * @return AbstractWikiTemplate
     * @throws Exception
     */
    private function chooseTemplateByData(array $mapData): AbstractWikiTemplate
    {
        // The 'template' entry is materialized in the config, with [] as default.
        $configured = $this->config[$this->domain]['template'] ?? [];
        $this->config[$this->domain]['template'] = $configured;

        $hasArticleData = $mapData['DATA-ARTICLE'] ?? false;

        $articleCandidate = !empty($mapData['doi'])
            || $configured === 'article'
            || ($configured === 'auto' && $hasArticleData)
            || ($hasArticleData && !empty($this->data['newspaper'][$this->domain]))
            || $this->isScientificDomain();

        $templateName = 'lien web';
        if ($articleCandidate && $configured !== 'lien web' && isset($mapData['date'])) {
            $templateName = 'article';
        }

        $template = WikiTemplateFactory::create($templateName);
        $template->userSeparator = " |";

        $counterKey = 'count '.$templateName;
        $previousCount = $this->summary->memo[$counterKey] ?? 0;
        $this->summary->memo[$counterKey] = 1 + $previousCount;

        return $template;
    }
351
352
    /**
     * Replaces the site name ('site') or periodical title ('périodique') using,
     * in this order: newspaper data, scientific journal data, the YAML
     * configuration, then the pretty domain name as a last resort for {lien web}.
     *
     * @param array $mapData  data produced by the mapper
     * @param       $template template chosen for this citation
     *
     * @return array the updated data
     */
    private function replaceSitenameByConfig(array $mapData, $template): array
    {
        $isLienWeb = $template instanceof LienWebTemplate;
        $isArticle = $template instanceof ArticleTemplate;

        // 1. Newspaper data: wikilink built from the 'fr' label and the 'frwiki' title.
        if (!empty($this->data['newspaper'][$this->domain])) {
            $newspaper = $this->data['newspaper'][$this->domain];
            $frwiki = $newspaper['frwiki'];
            $label = $newspaper['fr'];

            if ($isLienWeb || isset($mapData['site'])) {
                $mapData['site'] = WikiTextUtil::wikilink($label, $frwiki);
            }
            if ($isArticle || isset($mapData['périodique'])) {
                $mapData['périodique'] = WikiTextUtil::wikilink($label, $frwiki);
            }
        }

        // 2. Scientific journal data, looked up by the current periodical title.
        if (isset($mapData['périodique'])) {
            $periodical = $mapData['périodique'];
            if (isset($this->data['scientific wiki'][$periodical])) {
                $mapData['périodique'] = WikiTextUtil::wikilink(
                    $periodical,
                    $this->data['scientific wiki'][$periodical]
                );
            }
        }

        // 3. YAML configuration overrides.
        if ($isLienWeb && !empty($this->config[$this->domain]['site'])) {
            $mapData['site'] = $this->config[$this->domain]['site'];
        }
        // NOTE(review): the check targets OuvrageTemplate while the rest of this method
        // deals with ArticleTemplate; confirm this is intended.
        if (!empty($this->config[$this->domain]['périodique'])) {
            if (!empty($mapData['périodique']) || $template instanceof OuvrageTemplate) {
                $mapData['périodique'] = $this->config[$this->domain]['périodique'];
            }
        }

        // 4. Fallback for {lien web}: best effort, failures are deliberately ignored.
        if ($isLienWeb && empty($mapData['site'])) {
            try {
                $mapData['site'] = $this->externalPage->getPrettyDomainName();
            } catch (Throwable $e) {
                unset($e);
            }
        }

        return $mapData;
    }
404
405
    /**
     * Forces the 'url' entry to the URL stored by isURLAuthorized(), discarding
     * whatever URL the mapper extracted from the page.
     *
     * @param array $mapData data produced by the mapper
     *
     * @return array same data with 'url' replaced (or appended when absent)
     */
    private function replaceURLbyOriginal(array $mapData): array
    {
        // array_replace() keeps every other key and the existing key order untouched.
        return array_replace($mapData, ['url' => $this->url]);
    }
411
412
    /**
     * Builds a short display title from a URL.
     * URL => "parismatch.com/People/bla…"
     *
     * The leading scheme (http:// or https://) and a leading "www." are removed,
     * then the text is cut to 30 characters followed by an ellipsis.
     *
     * todo move ?
     *
     * @param string $url
     *
     * @return string
     */
    public function url2TextStyleTitle(string $url): string
    {
        // Only strip the prefix: a "www." or scheme appearing later in the URL
        // (e.g. inside an archive link) is part of the address and must stay.
        $text = preg_replace('#^(?:https?://)?(?:www\.)?#i', '', $url) ?? $url;

        // Count and cut by characters, not bytes, so a multibyte character is never split.
        if (mb_strlen($text, 'UTF-8') > 30) {
            $text = mb_substr($text, 0, 30, 'UTF-8').'…';
        }

        return $text;
    }
429
430
    /**
     * Tells whether the URL points to a file type that must be skipped (PDF, GIF, etc).
     * https://fr.wikipedia.org/wiki/Liste_d%27extensions_de_fichiers
     *
     * The extension is looked for at the end of the whole URL and at the end of
     * its path component, so "file.pdf?download=1" and "file.pdf#page=2" are
     * detected as well.
     *
     * @param string $url
     *
     * @return bool
     */
    private function hasForbiddenFilenameExtension(string $url): bool
    {
        $pattern = '#\.(pdf|jpg|jpeg|gif|png|xls|xlsx|xlr|xml|xlt|txt|csv|js|docx|exe|gz|zip|ini|movie|mp3|mp4|ogg|raw|rss|tar|tgz|wma)$#i';

        if (preg_match($pattern, $url)) {
            return true;
        }

        // parse_url() yields null when there is no path and false on a malformed URL.
        $path = parse_url($url, PHP_URL_PATH);

        return is_string($path) && (bool) preg_match($pattern, $path);
    }
445
446
    /**
     * Loads every resource file used by the transformer:
     *  - the YAML configuration into $config,
     *  - the skip-domain list (one entry per non-empty line) into $skip_domain,
     *  - three JSON data sets into $data.
     *
     * todo REFAC DataObject[]
     */
    protected function importConfigAndData(): void
    {
        $resourceDir = __DIR__.'/resources/';

        $this->config = Yaml::parseFile($resourceDir.'config_presse.yaml');

        // file() returns false on failure: fall back to an empty list.
        $skipLines = file(self::SKIP_DOMAIN_FILENAME, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        $this->skip_domain = is_array($skipLines) ? $skipLines : [];

        $jsonResources = [
            'newspaper'         => 'data_newspapers.json',
            'scientific domain' => 'data_scientific_domain.json',
            'scientific wiki'   => 'data_scientific_wiki.json',
        ];
        foreach ($jsonResources as $dataKey => $filename) {
            $this->data[$dataKey] = json_decode(
                file_get_contents($resourceDir.$filename),
                true,
                512,
                JSON_THROW_ON_ERROR
            );
        }
    }
470
471
}
472