Passed
Push — master ( 95294c...e75f8e )
by Dispositif
09:41
created

ExternRefTransformer::url2TextStyleTitle()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
eloc 4
c 1
b 0
f 0
nc 2
nop 1
dl 0
loc 8
ccs 0
cts 0
cp 0
crap 6
rs 10
1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Application;
11
12
use App\Application\Http\ExternHttpClient;
13
use App\Domain\ExternDomains;
14
use App\Domain\ExternPage;
15
use App\Domain\ExternPageFactory;
16
use App\Domain\Models\Wiki\AbstractWikiTemplate;
17
use App\Domain\Models\Wiki\ArticleTemplate;
18
use App\Domain\Models\Wiki\LienWebTemplate;
19
use App\Domain\Models\Wiki\OuvrageTemplate;
20
use App\Domain\OptimizerFactory;
21
use App\Domain\Publisher\ExternMapper;
22
use App\Domain\Utils\WikiTextUtil;
23
use App\Domain\WikiTemplateFactory;
24
use App\Infrastructure\Logger;
25
use Codedungeon\PHPCliColors\Color;
26
use Exception;
27
use Normalizer;
28
use Psr\Log\LoggerInterface;
29
use Symfony\Component\Yaml\Yaml;
30
use Throwable;
31
32
/**
33
 * todo move Domain
34
 * Class ExternRefTransformer
35
 *
36
 * @package App\Application
37
 */
38
class ExternRefTransformer implements TransformerInterface
39
{
40
    /**
     * Pause, in seconds, passed to sleep() before each external page fetch.
     */
    const HTTP_REQUEST_LOOP_DELAY = 10;

    /**
     * File to which domains of failed requests are appended.
     */
    const LOG_REQUEST_ERROR = __DIR__.'/resources/external_request_error.log';
    /**
     * When true, URLs whose domain has no entry in the YAML configuration are rejected.
     */
    public $skipUnauthorised = true;
    /**
     * Site or periodical name recorded for each URL that reached the mapping stage.
     *
     * @var array
     */
    public $summaryLog = [];
    /**
     * @var LoggerInterface
     */
    protected $log;
    /**
     * Parsed content of config_presse.yaml, indexed by domain.
     */
    private $config;
    /**
     * Value returned by ExternDomains::extractSubDomain() for the current URL.
     *
     * @var string|string[]
     */
    private $domain;
    /**
     * URL currently being processed (set once it passed the first authorization checks).
     *
     * @var string
     */
    private $url;
    /**
     * @var ExternMapper
     */
    private $mapper;
    /**
     * Decoded JSON datasets, keyed by 'newspaper', 'scientific domain' and 'scientific wiki'.
     *
     * @var array
     */
    private $data = [];
    /**
     * Domains to skip, one per line of config_skip_domain.txt.
     *
     * @var array
     */
    private $skip_domain;
    /**
     * Page object built by ExternPageFactory for the current URL.
     *
     * @var ExternPage
     */
    private $externalPage;
77
78
    /**
     * Load the static resources (YAML configuration, skip list, JSON datasets)
     * shipped in the "resources" directory and create the mapper.
     *
     * @param LoggerInterface $log
     */
    public function __construct(LoggerInterface $log)
    {
        $resourceDir = __DIR__.'/resources';

        $this->log = $log;

        // todo REFAC DataObject[]
        $this->config = Yaml::parseFile($resourceDir.'/config_presse.yaml');

        // file() yields false on failure: fall back to an empty skip list
        $skipLines = file(
            $resourceDir.'/config_skip_domain.txt',
            FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES
        );
        $this->skip_domain = $skipLines ?: [];

        // dataset key => JSON file holding it
        $datasets = [
            'newspaper' => 'data_newspapers.json',
            'scientific domain' => 'data_scientific_domain.json',
            'scientific wiki' => 'data_scientific_wiki.json',
        ];
        foreach ($datasets as $key => $filename) {
            $this->data[$key] = json_decode(file_get_contents($resourceDir.'/'.$filename), true);
        }

        $this->mapper = new ExternMapper(new Logger());
    }
107
108
    /**
     * Turn a raw external URL into a serialized wiki citation template.
     *
     * The URL is returned unchanged when it is not authorized, when the page cannot
     * be fetched, when its metadata are missing or flagged "noindex", or when the
     * mapped data lack a URL or a title. A "410 Gone" error produces a
     * {{Lien brisé}} template instead.
     *
     * @param string $url
     *
     * @return string the serialized template, or the original URL when nothing could be done
     * @throws Exception
     */
    public function process(string $url): string
    {
        if (!$this->isURLAuthorized($url)) {
            return $url;
        }
        try {
            // Throttle: wait before every external request
            sleep(self::HTTP_REQUEST_LOOP_DELAY);
            $this->externalPage = ExternPageFactory::fromURL($url, $this->log);
            $pageData = $this->externalPage->getData();
            $this->log->debug('metaData', $this->externalPage->getData());
        } catch (Exception $e) {
            $message = $e->getMessage();

            // "410 Gone": the page is permanently gone => {{Lien brisé}}
            if (preg_match('#410 Gone#i', $message)) {
                $this->log->notice('410 page définitivement disparue : '.$url);

                return sprintf(
                    '{{Lien brisé |url= %s |titre=%s |brisé le=%s}}',
                    $url,
                    $this->url2TextStyleTitle($url),
                    date('d-m-Y')
                );
            }

            // 403: remember the domain in the request-error log
            if (preg_match('#403 Forbidden#i', $message)) {
                $this->log->warning('403 Forbidden : '.$url);
                file_put_contents(self::LOG_REQUEST_ERROR, '403 Forbidden : '.$this->domain."\n", FILE_APPEND);

                // Explicit return: no page data exists at this point, so the URL is kept as is
                return $url;
            }

            if (preg_match('#404 Not Found#i', $message)) {
                $this->log->notice('404 Not Found sur extractWebData');

                return $url;
            }

            // Anything else: do not emit {{Lien brisé}}, the failure may be temporary
            $this->log->warning('erreur sur extractWebData '.$message);
            file_put_contents(self::LOG_REQUEST_ERROR, $this->domain."\n", FILE_APPEND);

            return $url;
        }

        if (empty($pageData)
            || (empty($pageData['JSON-LD']) && empty($pageData['meta']))
        ) {
            // Site with unusable HTML: no structured metadata found
            return $url;
        }

        if (isset($pageData['robots']) && strpos($pageData['robots'], 'noindex') !== false) {
            $this->log->notice('SKIP robots: noindex');

            return $url;
        }

        $mapData = $this->mapper->process($pageData);

        // Validity check. The domain is not skipped here because the problem may
        // be a 404 or an error affecting this single URL only.
        if (empty($mapData) || empty($mapData['url']) || empty($mapData['titre'])) {
            $this->log->info('Mapping incomplet');

            return $url;
        }

        $this->tagAndLog($mapData);
        $this->addSummaryLog($mapData);

        $template = $this->chooseTemplateByData($mapData);

        $mapData = $this->replaceSitenameByConfig($mapData, $template);
        $mapData = $this->replaceURLbyOriginal($mapData);

        if ($template instanceof ArticleTemplate) {
            unset($mapData['site']);
        }
        // Internal markers, not template parameters
        unset($mapData['DATA-TYPE'], $mapData['DATA-ARTICLE']);

        $template->hydrate($mapData);

        $optimizer = OptimizerFactory::fromTemplate($template);
        $optimizer->doTasks();
        $templateOptimized = $optimizer->getOptiTemplate();

        $serialized = $templateOptimized->serialize(true);
        $this->log->info($serialized."\n");

        // Normalizer::normalize() returns false on failure, which would violate the
        // declared string return type: fall back to the un-normalized text.
        $normalized = Normalizer::normalize($serialized);

        return is_string($normalized) ? $normalized : $serialized;
    }
201
202
    /**
     * Decide whether a URL may be processed, and remember its URL/domain.
     *
     * A URL is rejected when it is not a web URL, points to a forbidden file
     * extension, belongs to a skipped domain, belongs to a domain without
     * configuration while $skipUnauthorised is true, or belongs to a domain
     * marked as deactivated in the configuration.
     *
     * Side effects: sets $this->url and $this->domain, and normalizes the
     * configuration entry of the domain to an array.
     *
     * @param string $url
     *
     * @return bool
     * @throws Exception
     */
    protected function isURLAuthorized(string $url): bool
    {
        if (!ExternHttpClient::isWebURL($url)) {
            $this->log->debug('Skip : not an URL : '.$url);

            return false;
        }

        if ($this->hasForbiddenFilenameExtension($url)) {
            return false;
        }

        $this->url = $url;
        $this->domain = ExternDomains::extractSubDomain($this->url);

        if (in_array($this->domain, $this->skip_domain, true)) {
            $this->log->notice("Skip domain ".$this->domain);

            return false;
        }

        // Raw configuration entry, read BEFORE it is normalized to an array
        $domainConfig = $this->config[$this->domain] ?? null;

        if ($domainConfig === null) {
            $this->log->info("Domain ".$this->domain." non configuré\n");
            if ($this->skipUnauthorised) {
                return false;
            }
        } else {
            echo "> Domaine ".Color::LIGHT_GREEN.$this->domain.Color::NORMAL." configuré\n";
        }

        // The scalar form ("domain: deactivated") must be tested on the raw value:
        // once the entry has been coerced to an array the comparison can never match.
        $isDeactivated = $domainConfig === 'deactivated'
            || (is_array($domainConfig) && isset($domainConfig['deactivated']));

        // Later code indexes the entry as an array
        $this->config[$this->domain] = is_array($domainConfig) ? $domainConfig : [];

        if ($isDeactivated) {
            $this->log->info("Domain ".$this->domain." desactivé\n");

            return false;
        }

        return true;
    }
249
250
    /**
     * Log the mapped data and the tags that apply to it
     * (article flag, press domain, scientific domain).
     *
     * @param array $mapData
     */
    private function tagAndLog(array $mapData)
    {
        $logger = $this->log;
        $logger->debug('mapData', $mapData);

        $flaggedAsArticle = !empty($mapData['DATA-ARTICLE']);
        if ($flaggedAsArticle) {
            $logger->notice("Article OK");
        }

        $isKnownNewspaper = isset($this->data['newspaper'][$this->domain]);
        if ($isKnownNewspaper) {
            $logger->notice('PRESSE');
        }

        if ($this->isScientificDomain()) {
            $logger->notice('SCIENCE');
        }
    }
264
265
    /**
     * Tell whether the current domain is a scientific one: either listed in the
     * 'scientific domain' dataset, or "revues.org" / one of its subdomains.
     *
     * @return bool
     */
    private function isScientificDomain(): bool
    {
        if (isset($this->data['scientific domain'][$this->domain])) {
            return true;
        }

        if (!is_string($this->domain)) {
            return false;
        }

        // The haystack is the domain and the needle is the suffix. With the two
        // swapped, real subdomains never matched while fragments such as "org" did.
        $suffix = '.revues.org';
        if ($this->domain === 'revues.org') {
            return true;
        }

        return strlen($this->domain) > strlen($suffix)
            && substr($this->domain, -strlen($suffix)) === $suffix;
    }
276
277
    /**
     * Append the site name (or, failing that, the periodical name, or '?')
     * of the mapped data to the public summary log.
     *
     * @param array $mapData
     */
    private function addSummaryLog(array $mapData)
    {
        if (isset($mapData['site'])) {
            $label = $mapData['site'];
        } elseif (isset($mapData['périodique'])) {
            $label = $mapData['périodique'];
        } else {
            $label = '?';
        }

        $this->summaryLog[] = $label;
    }
281
282
    /**
     * Pick the wiki template ('article' or 'lien web') suited to the mapped data.
     *
     * 'article' is selected when the data carry a DOI, when the domain is
     * configured as 'article', when it is configured as 'auto' and the data are
     * flagged as an article, when the data are flagged as an article on a known
     * newspaper domain, or when the domain is scientific. A 'lien web'
     * configuration or a missing date always forces 'lien web'.
     *
     * todo refac lisible
     *
     * @param array $mapData
     *
     * @return AbstractWikiTemplate
     * @throws Exception
     */
    private function chooseTemplateByData(array $mapData): AbstractWikiTemplate
    {
        // Make sure the 'template' entry exists for the current domain
        $this->config[$this->domain]['template'] = $this->config[$this->domain]['template'] ?? [];
        $configuredTemplate = $this->config[$this->domain]['template'];
        $isArticleData = $mapData['DATA-ARTICLE'] ?? false;

        $articleWanted = !empty($mapData['doi'])
            || $configuredTemplate === 'article'
            || ($configuredTemplate === 'auto' && $isArticleData)
            || ($isArticleData && !empty($this->data['newspaper'][$this->domain]))
            || $this->isScientificDomain();

        // A date is mandatory for {article}; an explicit 'lien web' setting also wins
        $forceWebLink = $configuredTemplate === 'lien web' || !isset($mapData['date']);

        $templateName = ($articleWanted && !$forceWebLink) ? 'article' : 'lien web';

        $template = WikiTemplateFactory::create($templateName);
        $template->userSeparator = " |";

        return $template;
    }
320
321
    /**
     * Replace the periodical title or the site name of the mapped data, using in
     * turn: the newspaper dataset, the scientific-journal dataset, the YAML
     * configuration of the domain, and finally the pretty domain name of the page.
     * Later sources override earlier ones.
     *
     * @param array $mapData
     * @param       $template
     *
     * @return array
     */
    private function replaceSitenameByConfig(array $mapData, $template): array
    {
        $isWebLink = $template instanceof LienWebTemplate;

        // 1. Newspapers (data coming from wikidata URLs)
        $newspaper = $this->data['newspaper'][$this->domain] ?? null;
        if (!empty($newspaper)) {
            $wikiTitle = $newspaper['frwiki'];
            $frenchLabel = $newspaper['fr'];
            if (isset($mapData['site']) || $isWebLink) {
                $mapData['site'] = WikiTextUtil::wikilink($frenchLabel, $wikiTitle);
            }
            if (isset($mapData['périodique']) || $template instanceof ArticleTemplate) {
                $mapData['périodique'] = WikiTextUtil::wikilink($frenchLabel, $wikiTitle);
            }
        }

        // 2. Scientific journals (data coming from wikidata)
        if (isset($mapData['périodique'], $this->data['scientific wiki'][$mapData['périodique']])) {
            $journal = $mapData['périodique'];
            $mapData['périodique'] = WikiTextUtil::wikilink($journal, $this->data['scientific wiki'][$journal]);
        }

        // 3. YAML configuration of the domain
        $domainConfig = $this->config[$this->domain] ?? [];
        if ($isWebLink && !empty($domainConfig['site'])) {
            $mapData['site'] = $domainConfig['site'];
        }
        $periodicalExpected = !empty($mapData['périodique']) || $template instanceof OuvrageTemplate;
        if (!empty($domainConfig['périodique']) && $periodicalExpected) {
            $mapData['périodique'] = $domainConfig['périodique'];
        }

        // 4. Last resort for web links: derive the site name from the page itself
        if ($isWebLink && empty($mapData['site'])) {
            try {
                $mapData['site'] = $this->externalPage->getPrettyDomainName();
            } catch (Throwable $e) {
                // Best effort: the site name simply stays empty
            }
        }

        return $mapData;
    }
373
374
    /**
     * Return the mapped data with its 'url' entry set to the URL originally
     * submitted for processing.
     *
     * @param array $mapData
     *
     * @return array
     */
    private function replaceURLbyOriginal(array $mapData): array
    {
        $withOriginalUrl = $mapData;
        $withOriginalUrl['url'] = $this->url;

        return $withOriginalUrl;
    }
380
381
    /**
     * Build a short, human-readable title from a URL.
     * URL => "parismatch.com/People/bla…"
     *
     * The leading scheme ("http://", "https://") and a leading "www." are
     * removed, then the text is cut to 30 characters followed by an ellipsis.
     *
     * todo move ?
     *
     * @param string $url
     *
     * @return string
     */
    public function url2TextStyleTitle(string $url): string
    {
        // Strip the prefix only: an "http://" or "www." located further in the URL
        // (e.g. inside a query string) belongs to the text and must be kept.
        $text = preg_replace('#^(?:https?://)?(?:www\.)?#i', '', $url) ?? $url;

        // Count and cut by characters, not bytes, so that a multi-byte UTF-8
        // character is never split in half.
        if (mb_strlen($text, 'UTF-8') > 30) {
            $text = mb_substr($text, 0, 30, 'UTF-8').'…';
        }

        return $text;
    }
398
399
    /**
     * Tell whether a URL points to a file type that must be skipped (PDF, GIF, etc.).
     * https://fr.wikipedia.org/wiki/Liste_d%27extensions_de_fichiers
     *
     * The extension is looked for at the end of the whole URL and at the end of
     * its path component, so that a query string or a fragment placed after the
     * file name ("file.pdf?download=1") does not hide it.
     *
     * @param string $url
     *
     * @return bool
     */
    private function hasForbiddenFilenameExtension(string $url): bool
    {
        $pattern = '#\.(pdf|jpg|jpeg|gif|png|xls|xlsx|xlr|xml|xlt|txt|csv|js|docx|exe|gz|zip|ini|movie|mp3|mp4|ogg|raw|rss|tar|tgz|wma)$#i';

        if (preg_match($pattern, $url) === 1) {
            return true;
        }

        // parse_url() yields null when there is no path and false on a malformed URL
        $path = parse_url($url, PHP_URL_PATH);

        return is_string($path) && preg_match($pattern, $path) === 1;
    }
419
420
}
421