Passed
Push — master ( 1bcf8b...aa70d7 )
by Dispositif
05:36
created

ExternRefTransformer::addSummaryLog()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
ccs 0
cts 2
cp 0
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Application;
11
12
use App\Application\Http\ExternHttpClient;
13
use App\Domain\ExternDomains;
14
use App\Domain\ExternPage;
15
use App\Domain\ExternPageFactory;
16
use App\Domain\Models\Summary;
17
use App\Domain\Models\Wiki\AbstractWikiTemplate;
18
use App\Domain\Models\Wiki\ArticleTemplate;
19
use App\Domain\Models\Wiki\LienWebTemplate;
20
use App\Domain\Models\Wiki\OuvrageTemplate;
21
use App\Domain\OptimizerFactory;
22
use App\Domain\Publisher\ExternMapper;
23
use App\Domain\Utils\WikiTextUtil;
24
use App\Domain\WikiTemplateFactory;
25
use App\Infrastructure\Logger;
26
use Exception;
27
use Normalizer;
28
use Psr\Log\LoggerInterface;
29
use Symfony\Component\Yaml\Yaml;
30
use Throwable;
31
32
/**
33
 * todo move Domain
34
 * Class ExternRefTransformer
35
 *
36
 * @package App\Application
37
 */
38
class ExternRefTransformer implements TransformerInterface
39
{
40
    /** Number of seconds slept before each external page request. */
    public const HTTP_REQUEST_LOOP_DELAY = 10;
    /** File to which domains answering "403 Forbidden" are appended. */
    public const LOG_REQUEST_ERROR       = __DIR__.'/resources/external_request_error.log';
    /** Text file listing, one per line, the domains that must be skipped. */
    public const SKIP_DOMAIN_FILENAME    = __DIR__.'/resources/config_skip_domain.txt';

    /**
     * When true, URLs whose domain is absent from the YAML configuration are refused.
     *
     * @var bool
     */
    public $skipUnauthorised = true;

    /**
     * Site or periodical name collected for every successfully mapped URL.
     *
     * @var array
     */
    public $summaryLog = [];

    /**
     * @var LoggerInterface
     */
    protected $log;

    /**
     * Parsed content of config_presse.yaml, indexed by domain.
     *
     * @var array|null
     */
    private $config;

    /**
     * Sub-domain extracted from the URL currently being processed.
     *
     * @var string|string[]
     */
    private $domain;

    /**
     * URL currently being processed (set once the URL passed the first checks).
     *
     * @var string
     */
    private $url;

    /**
     * @var ExternMapper
     */
    private $mapper;

    /**
     * Decoded JSON data sets, under the keys 'newspaper', 'scientific domain' and 'scientific wiki'.
     *
     * @var array
     */
    private $data = [];

    /**
     * Domains read from SKIP_DOMAIN_FILENAME.
     *
     * @var array
     */
    private $skip_domain;

    /**
     * @var ExternPage
     */
    private $externalPage;

    /**
     * Summary object updated while processing (citation counter and memo entries).
     *
     * @var Summary
     */
    private $summary;
82
83
    /**
     * Stores the logger, loads the configuration/data files, then builds the mapper.
     *
     * @param LoggerInterface $log
     */
    public function __construct(LoggerInterface $log)
    {
        $this->log = $log;
        $this->importConfigAndData();

        // The mapper receives its own Logger instance, not the injected one.
        $mapperLogger = new Logger();
        $this->mapper = new ExternMapper($mapperLogger);
    }
91
92
    /**
     * Converts a raw external URL into a serialized wiki citation template.
     *
     * The URL is returned unchanged when it is not authorized, when the page
     * request fails (except "410 Gone", which yields a {{Lien brisé}} template),
     * when the page exposes no usable metadata, when it is flagged "noindex",
     * or when the mapped data lacks a URL or a title.
     *
     * @param string  $url
     * @param Summary $summary
     *
     * @return string
     * @throws Exception
     */
    public function process(string $url, Summary $summary): string
    {
        $this->summary = $summary;
        if (!$this->isURLAuthorized($url)) {
            return $url;
        }

        try {
            sleep(self::HTTP_REQUEST_LOOP_DELAY);
            $this->externalPage = ExternPageFactory::fromURL($url, $this->log);
            $pageData = $this->externalPage->getData();
            $this->log->debug('metaData', $pageData);
        } catch (Exception $e) {
            return $this->handleRequestException($e, $url);
        }

        if (empty($pageData)
            || (empty($pageData['JSON-LD']) && empty($pageData['meta']))
        ) {
            $this->log->notice('SKIP no metadata : '.$url);

            return $url;
        }

        // Robots directives are case-insensitive; a non-string value is ignored.
        if (isset($pageData['robots'])
            && is_string($pageData['robots'])
            && stripos($pageData['robots'], 'noindex') !== false
        ) {
            $this->log->notice('SKIP robots: noindex : '.$url);

            return $url;
        }

        $mapData = $this->mapper->process($pageData);

        // Validity check. The domain is not skipped here: the failure may be
        // a 404 or an error specific to this single URL.
        if (empty($mapData) || empty($mapData['url']) || empty($mapData['titre'])) {
            $this->log->info('Mapping incomplet : '.$url);

            return $url;
        }

        $this->tagAndLog($mapData);
        $this->addSummaryLog($mapData);

        $template = $this->chooseTemplateByData($mapData);

        $mapData = $this->replaceSitenameByConfig($mapData, $template);
        $mapData = $this->replaceURLbyOriginal($mapData);

        if ($template instanceof ArticleTemplate) {
            unset($mapData['site']);
        }
        // Internal keys that must not reach the template.
        unset($mapData['DATA-TYPE'], $mapData['DATA-ARTICLE'], $mapData['url-access']);

        $template->hydrate($mapData);

        $optimizer = OptimizerFactory::fromTemplate($template);
        $optimizer->doTasks();
        $templateOptimized = $optimizer->getOptiTemplate();

        $serialized = $templateOptimized->serialize(true);
        $this->log->info($serialized."\n");

        // Normalizer::normalize() returns false on failure; fall back to the
        // raw serialization instead of violating the string return type.
        $normalized = Normalizer::normalize($serialized);

        return is_string($normalized) ? $normalized : $serialized;
    }

    /**
     * Turns a failed page request into the text that replaces the URL.
     *
     * Only "410 Gone" produces a {{Lien brisé}} template; every other error
     * keeps the original URL, because the failure may be temporary.
     *
     * @param Exception $e
     * @param string    $url
     *
     * @return string
     */
    private function handleRequestException(Exception $e, string $url): string
    {
        $message = $e->getMessage();

        if (stripos($message, '410 Gone') !== false) {
            $this->log->notice('410 page disparue : '.$url);

            return sprintf(
                '{{Lien brisé |url= %s |titre=%s |brisé le=%s}}',
                $url,
                $this->url2TextStyleTitle($url),
                date('d-m-Y')
            );
        }

        if (stripos($message, '403 Forbidden') !== false) {
            $this->log->warning('403 Forbidden : '.$url);
            file_put_contents(self::LOG_REQUEST_ERROR, '403 Forbidden : '.$this->domain."\n", FILE_APPEND);

            return $url;
        }

        if (stripos($message, '404 Not Found') !== false) {
            $this->log->notice('404 Not Found : '.$url);

            return $url;
        }

        // Any other error: do not generate {{Lien brisé}}, it may be temporary.
        $this->log->warning('erreur sur extractWebData '.$message);

        return $url;
    }
190
191
    /**
     * Tells whether the URL may be fetched and converted.
     *
     * Checks, in order: the string is a web URL, its file extension is not
     * forbidden, its domain is not in the skip list, its domain is configured
     * (unless $skipUnauthorised is false), and the domain is not deactivated.
     * As a side effect it sets $this->url and $this->domain, and guarantees
     * that the configuration entry of an accepted domain is an array.
     *
     * @param string $url
     *
     * @return bool
     * @throws Exception
     */
    protected function isURLAuthorized(string $url): bool
    {
        if (!ExternHttpClient::isWebURL($url)) {
            $this->log->debug('Skip : not an URL : '.$url);

            return false;
        }

        if ($this->hasForbiddenFilenameExtension($url)) {
            return false;
        }

        $this->url = $url;
        $this->domain = ExternDomains::extractSubDomain($this->url);

        if (in_array($this->domain, $this->skip_domain, true)) {
            $this->log->notice("Skip domain ".$this->domain);

            return false;
        }

        $domainConfig = $this->config[$this->domain] ?? null;

        if ($domainConfig === null) {
            $this->log->debug("Domain ".$this->domain." non configuré");
            if ($this->skipUnauthorised) {
                return false;
            }
        } else {
            $this->log->debug("Domain ".$this->domain." configuré");
        }

        // The deactivation test must run BEFORE the entry is coerced to an
        // array, otherwise the scalar form "domain: deactivated" is lost.
        if ($domainConfig === 'deactivated'
            || (is_array($domainConfig) && isset($domainConfig['deactivated']))
        ) {
            $this->log->info("Domain ".$this->domain." desactivé\n");

            return false;
        }

        // Later code indexes the entry as an array.
        $this->config[$this->domain] = is_array($domainConfig) ? $domainConfig : [];

        return true;
    }
238
239
    /**
     * Logs the mapped data and records statistics in the current summary:
     * citation counter, 'presse' / 'science' flags and the list of site names.
     *
     * @param array $mapData
     */
    private function tagAndLog(array $mapData)
    {
        $this->log->debug('mapData', $mapData);
        ++$this->summary->citationNumber;

        if (!empty($mapData['DATA-ARTICLE'])) {
            $this->log->notice("Article OK");
        }

        $isNewspaper = isset($this->data['newspaper'][$this->domain]);
        if ($isNewspaper) {
            $this->log->notice('PRESSE');
            $this->summary->memo['presse'] = true;
        }

        if ($this->isScientificDomain()) {
            $this->log->notice('SCIENCE');
            $this->summary->memo['science'] = true;
        }

        // Register the pretty domain name once per summary.
        $alreadyListed = isset($this->summary->memo['sites'])
            && in_array($this->externalPage->getPrettyDomainName(), $this->summary->memo['sites']);
        if (!$alreadyListed) {
            $this->summary->memo['sites'][] = $this->externalPage->getPrettyDomainName();
        }
    }
261
262
    /**
     * Tells whether the current domain is a scientific one: either listed in
     * the 'scientific domain' data set or hosted under revues.org.
     *
     * @return bool
     */
    private function isScientificDomain(): bool
    {
        if (isset($this->data['scientific domain'][$this->domain])) {
            return true;
        }

        if (!is_string($this->domain)) {
            return false;
        }

        // The domain is the haystack and the suffix is the needle (the
        // previous strpos() call had them swapped). The bare "revues.org"
        // host is kept as a match, as it was accepted before.
        $suffix = '.revues.org';

        return $this->domain === 'revues.org'
            || substr($this->domain, -strlen($suffix)) === $suffix;
    }
273
274
    /**
     * Appends to the summary log the site name, else the periodical name,
     * else '?' when neither is present in the mapped data.
     *
     * @param array $mapData
     */
    private function addSummaryLog(array $mapData)
    {
        $label = $mapData['site'] ?? $mapData['périodique'] ?? '?';
        $this->summaryLog[] = $label;
    }
278
279
    /**
     * Picks the wiki template ('article' or 'lien web') for the mapped data
     * and increments the matching counter in the summary memo.
     *
     * 'article' is chosen when the data is an article candidate (DOI present,
     * domain configured as 'article', 'auto' configuration or known newspaper
     * with DATA-ARTICLE set, or scientific domain), unless the domain is
     * configured as 'lien web' or the data has no date.
     *
     * @param array $mapData
     *
     * @return AbstractWikiTemplate
     * @throws Exception
     */
    private function chooseTemplateByData(array $mapData): AbstractWikiTemplate
    {
        $configuredTemplate = $this->config[$this->domain]['template'] ?? [];
        $this->config[$this->domain]['template'] = $configuredTemplate;

        $isArticleData = $mapData['DATA-ARTICLE'] ?? false;
        $isKnownNewspaper = !empty($this->data['newspaper'][$this->domain]);

        $articleCandidate = !empty($mapData['doi'])
            || $configuredTemplate === 'article'
            || ($configuredTemplate === 'auto' && $isArticleData)
            || ($isArticleData && $isKnownNewspaper)
            || $this->isScientificDomain();

        // A date is mandatory for {article}; an explicit 'lien web'
        // configuration always wins.
        $templateName = 'lien web';
        if ($articleCandidate && $configuredTemplate !== 'lien web' && isset($mapData['date'])) {
            $templateName = 'article';
        }

        $template = WikiTemplateFactory::create($templateName);
        $template->userSeparator = " |";

        $counterKey = 'count '.$templateName;
        $this->summary->memo[$counterKey] = 1 + ($this->summary->memo[$counterKey] ?? 0);

        return $template;
    }
318
319
    /**
     * Replaces the site name and/or periodical title of the mapped data,
     * applying in turn: the newspaper data set, the scientific journal data
     * set, the YAML configuration, and finally the pretty domain name.
     *
     * @param array $mapData
     * @param       $template
     *
     * @return array
     */
    private function replaceSitenameByConfig(array $mapData, $template): array
    {
        $isLienWeb = $template instanceof LienWebTemplate;
        $isArticle = $template instanceof ArticleTemplate;

        // 1. Newspaper data set, indexed by domain.
        $newspaper = $this->data['newspaper'][$this->domain] ?? null;
        if (!empty($newspaper)) {
            $frwiki = $newspaper['frwiki'];
            $label = $newspaper['fr'];
            if (isset($mapData['site']) || $isLienWeb) {
                $mapData['site'] = WikiTextUtil::wikilink($label, $frwiki);
            }
            if (isset($mapData['périodique']) || $isArticle) {
                $mapData['périodique'] = WikiTextUtil::wikilink($label, $frwiki);
            }
        }

        // 2. Scientific journal data set, indexed by periodical title.
        if (isset($mapData['périodique']) && isset($this->data['scientific wiki'][$mapData['périodique']])) {
            $journalTitle = $mapData['périodique'];
            $mapData['périodique'] = WikiTextUtil::wikilink(
                $journalTitle,
                $this->data['scientific wiki'][$journalTitle]
            );
        }

        // 3. Values forced by the YAML configuration of the domain.
        $domainConfig = $this->config[$this->domain] ?? [];
        if (!empty($domainConfig['site']) && $isLienWeb) {
            $mapData['site'] = $domainConfig['site'];
        }
        $periodicalApplies = !empty($mapData['périodique']) || $template instanceof OuvrageTemplate;
        if (!empty($domainConfig['périodique']) && $periodicalApplies) {
            $mapData['périodique'] = $domainConfig['périodique'];
        }

        // 4. Fallback for {lien web}: the pretty domain name, best effort.
        if (empty($mapData['site']) && $isLienWeb) {
            try {
                $mapData['site'] = $this->externalPage->getPrettyDomainName();
            } catch (Throwable $e) {
                // Deliberately ignored: the site name simply stays empty.
            }
        }

        return $mapData;
    }
371
372
    /**
     * Returns the mapped data with its 'url' entry set to the URL being
     * processed.
     *
     * @param array $mapData
     *
     * @return array
     */
    private function replaceURLbyOriginal(array $mapData): array
    {
        return array_replace($mapData, ['url' => $this->url]);
    }
378
379
    /**
     * Builds a short, human-readable title from a URL, e.g.
     * "parismatch.com/People/bla…".
     *
     * The scheme and "www." are stripped and the text is cut after 30
     * characters. Length and cut are counted in UTF-8 characters, not bytes,
     * so a multibyte character is never split before the ellipsis.
     *
     * @param string $url
     *
     * @return string
     */
    public function url2TextStyleTitle(string $url): string
    {
        $maxLength = 30;
        $text = str_replace(['https://', 'http://', 'www.'], '', $url);

        if (mb_strlen($text, 'UTF-8') > $maxLength) {
            $text = mb_substr($text, 0, $maxLength, 'UTF-8').'…';
        }

        return $text;
    }
396
397
    /**
     * Tells whether the URL points to a non-HTML file (PDF, image, archive…)
     * that must not be fetched.
     * https://fr.wikipedia.org/wiki/Liste_d%27extensions_de_fichiers
     *
     * The extension is looked for at the end of the whole URL and also at the
     * end of its path component, so a query string or a fragment
     * ("file.pdf?download=1") does not hide it.
     *
     * @param string $url
     *
     * @return bool
     */
    private function hasForbiddenFilenameExtension(string $url): bool
    {
        $pattern = '#\.(pdf|jpg|jpeg|gif|png|xls|xlsx|xlr|xml|xlt|txt|csv|js|docx|exe|gz|zip|ini|movie|mp3|mp4|ogg|raw|rss|tar|tgz|wma)$#i';

        if (preg_match($pattern, $url) === 1) {
            return true;
        }

        // parse_url() yields null/false when there is no usable path.
        $path = parse_url($url, PHP_URL_PATH);

        return is_string($path) && preg_match($pattern, $path) === 1;
    }
417
418
    /**
     * Loads the YAML configuration, the list of skipped domains and the JSON
     * data sets from the resources directory.
     *
     * A missing or unreadable skip list or JSON file yields an empty array
     * instead of a PHP warning or a TypeError.
     *
     * todo REFAC DataObject[]
     */
    protected function importConfigAndData(): void
    {
        $resourceDir = __DIR__.'/resources';

        $config = Yaml::parseFile($resourceDir.'/config_presse.yaml');
        $this->config = is_array($config) ? $config : [];

        $skipLines = is_readable(self::SKIP_DOMAIN_FILENAME)
            ? file(self::SKIP_DOMAIN_FILENAME, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)
            : false;
        // Trim each line so stray spaces or "\r" do not defeat the comparison.
        $this->skip_domain = is_array($skipLines)
            ? array_values(
                array_filter(
                    array_map('trim', $skipLines),
                    static function (string $line): bool {
                        return $line !== '';
                    }
                )
            )
            : [];

        $this->data['newspaper'] = $this->loadJsonResource($resourceDir.'/data_newspapers.json');
        $this->data['scientific domain'] = $this->loadJsonResource($resourceDir.'/data_scientific_domain.json');
        $this->data['scientific wiki'] = $this->loadJsonResource($resourceDir.'/data_scientific_wiki.json');
    }

    /**
     * Reads a JSON file and returns its content as an associative array.
     * Returns an empty array, after logging a warning, when the file cannot
     * be read or does not decode to an array.
     *
     * @param string $path
     *
     * @return array
     */
    private function loadJsonResource(string $path): array
    {
        $raw = is_readable($path) ? file_get_contents($path) : false;
        if ($raw === false) {
            $this->log->warning('Unreadable data file : '.$path);

            return [];
        }

        $decoded = json_decode($raw, true);
        if (!is_array($decoded)) {
            $this->log->warning('Invalid JSON data file : '.$path);

            return [];
        }

        return $decoded;
    }
438
439
}
440