Passed
Push — master ( e42360...d2e488 )
by Dispositif
06:33
created

ExternRefTransformer::isURLAutorized()   B

Complexity

Conditions 8
Paths 11

Size

Total Lines 36
Code Lines 20

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 72

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 20
c 1
b 0
f 0
nc 11
nop 1
dl 0
loc 36
ccs 0
cts 27
cp 0
crap 72
rs 8.4444
1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Application;
11
12
use App\Application\Http\ExternHttpClient;
13
use App\Domain\ExternDomains;
14
use App\Domain\ExternPageFactory;
15
use App\Domain\Models\Wiki\AbstractWikiTemplate;
16
use App\Domain\Models\Wiki\ArticleTemplate;
17
use App\Domain\Models\Wiki\LienWebTemplate;
18
use App\Domain\Models\Wiki\OuvrageTemplate;
19
use App\Domain\OptimizerFactory;
20
use App\Domain\Publisher\ExternMapper;
21
use App\Domain\Utils\WikiTextUtil;
22
use App\Domain\WikiTemplateFactory;
23
use App\Infrastructure\Logger;
24
use Codedungeon\PHPCliColors\Color;
25
use Normalizer;
26
use Psr\Log\LoggerInterface;
27
use Symfony\Component\Yaml\Yaml;
28
29
/**
30
 * todo move Domain
31
 * Class ExternRefTransformer
32
 *
33
 * @package App\Application
34
 */
35
class ExternRefTransformer implements TransformerInterface
36
{
37
    const HTTP_REQUEST_LOOP_DELAY = 10;
38
39
    const SKIPPED_FILE_LOG  = __DIR__.'/resources/external_skipped.log';
40
    const LOG_REQUEST_ERROR = __DIR__.'/resources/external_request_error.log';
41
    public $skipUnauthorised = true;
42
    /**
43
     * @var array
44
     */
45
    public $summaryLog = [];
46
    /**
47
     * @var LoggerInterface
48
     */
49
    protected $log;
50
    private $config;
51
    /**
52
     * @var string|string[]
53
     */
54
    private $domain;
55
    /**
56
     * @var string
57
     */
58
    private $url;
59
    /**
60
     * @var ExternMapper
61
     */
62
    private $mapper;
63
    /**
64
     * @var array
65
     */
66
    private $data = [];
67
    /**
68
     * @var array
69
     */
70
    private $skip_domain = [];
71
    /**
72
     * @var \App\Domain\ExternPage
73
     */
74
    private $externalPage;
75
76
    /**
     * ExternRefTransformer constructor.
     *
     * Stores the logger, then loads from the local "resources" directory:
     * the per-domain YAML configuration, the list of domains to skip and the
     * three JSON reference data sets. Finally creates the data mapper.
     *
     * @param LoggerInterface $log
     */
    public function __construct(LoggerInterface $log)
    {
        $this->log = $log;

        // todo REFAC DataObject[]
        $this->config = Yaml::parseFile(__DIR__.'/resources/config_presse.yaml');

        // file() yields false when the list cannot be read: use an empty list then.
        $skippedDomains = file(
            __DIR__.'/resources/config_skip_domain.txt',
            FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES
        );
        $this->skip_domain = $skippedDomains ?: [];

        // data key => JSON file decoded as an associative array
        $jsonSources = [
            'newspaper' => __DIR__.'/resources/data_newspapers.json',
            'scientific domain' => __DIR__.'/resources/data_scientific_domain.json',
            'scientific wiki' => __DIR__.'/resources/data_scientific_wiki.json',
        ];
        foreach ($jsonSources as $dataKey => $jsonPath) {
            $this->data[$dataKey] = json_decode(file_get_contents($jsonPath), true);
        }

        $this->mapper = new ExternMapper(new Logger());
    }
105
106
    /**
     * Try to convert a raw external URL into a serialized wiki template.
     *
     * The given URL is returned unchanged whenever the conversion is not
     * possible: URL not authorized, HTTP error, no usable metadata,
     * "noindex" robots directive or incomplete mapping. A "410 Gone" answer
     * produces a {{Lien brisé}} template instead.
     *
     * @param string $url
     *
     * @return string the serialized template, or the original URL
     * @throws \Exception
     */
    public function process(string $url): string
    {
        if (!$this->isURLAutorized($url)) {
            return $url;
        }

        try {
            // throttle the successive HTTP requests
            sleep(self::HTTP_REQUEST_LOOP_DELAY);
            $this->externalPage = ExternPageFactory::fromURL($url, $this->log);
            $pageData = $this->externalPage->getData();
            $this->log->debug('metaData', $this->externalPage->getData());
        } catch (\Exception $e) {
            // Every error case ends the processing: nothing below can run without page data.
            return $this->handleRequestError($e, $url);
        }

        if (empty($pageData)
            || (empty($pageData['JSON-LD']) && empty($pageData['meta']))
        ) {
            // website with unusable HTML
            return $url;
        }

        if (isset($pageData['robots']) && strpos($pageData['robots'], 'noindex') !== false) {
            $this->log->notice('SKIP robots: noindex');

            return $url;
        }

        $mapData = $this->mapper->process($pageData);

        // check that the mapped data is valid
        if (empty($mapData) || empty($mapData['url']) || empty($mapData['titre'])) {
            // do not request this domain again during this run
            $this->skip_domain[] = $this->domain;
            $this->log->info('Mapping incomplet');
            // Todo : temp data
            try {
                file_put_contents(self::SKIPPED_FILE_LOG, $this->domain.",".$this->url."\n", FILE_APPEND);
            } catch (\Throwable $e) {
                // best effort: a failure to write the skip log must not stop the processing
                unset($e);
            }

            return $url;
        }

        $this->tagAndLog($mapData);
        $this->addSummaryLog($mapData);

        $template = $this->chooseTemplateByData($mapData);

        $mapData = $this->replaceSitenameByConfig($mapData, $template);
        $mapData = $this->replaceURLbyOriginal($mapData);

        if ($template instanceof ArticleTemplate) {
            unset($mapData['site']);
        }
        unset($mapData['DATA-TYPE']); // ugly
        unset($mapData['DATA-ARTICLE']); // ugly

        $template->hydrate($mapData);

        $optimizer = OptimizerFactory::fromTemplate($template);
        $optimizer->doTasks();
        $templateOptimized = $optimizer->getOptiTemplate();

        $serialized = $templateOptimized->serialize(true);
        $this->log->info($serialized."\n");

        // Normalizer::normalize() returns false on failure, which would violate
        // the string return type: fall back to the non-normalized text.
        $normalized = Normalizer::normalize($serialized);

        return is_string($normalized) ? $normalized : $serialized;
    }

    /**
     * Map an exception raised while fetching the page to the text that
     * process() must return.
     *
     * @param \Exception $e   exception caught during the HTTP request/extraction
     * @param string     $url URL being processed
     *
     * @return string a {{Lien brisé}} template for "410 Gone", else the URL itself
     */
    private function handleRequestError(\Exception $e, string $url): string
    {
        $message = $e->getMessage();

        // "410 gone" => {lien brisé}
        if (preg_match('#410 Gone#i', $message)) {
            $this->log->notice('410 page définitivement disparue : '.$url);

            return sprintf(
                '{{Lien brisé |url= %s |titre=%s |brisé le=%s}}',
                $url,
                $this->url2TextStyleTitle($url),
                date('d-m-Y')
            );
        }

        // 403
        if (preg_match('#403 Forbidden#i', $message)) {
            $this->log->warning('403 Forbidden : '.$url);
            file_put_contents(self::LOG_REQUEST_ERROR, '403 Forbidden : '.$this->domain."\n", FILE_APPEND);

            return $url;
        }

        if (preg_match('#404 Not Found#i', $message)) {
            $this->log->notice('404 Not Found sur extractWebData');

            return $url;
        }

        // other error: do not generate {lien brisé}, it may be a temporary failure
        $this->log->warning('erreur sur extractWebData '.$message);
        file_put_contents(self::LOG_REQUEST_ERROR, $this->domain."\n", FILE_APPEND);

        return $url;
    }
205
206
    /**
     * Tell whether the URL may be processed, and remember it.
     *
     * Side effects: sets $this->url and $this->domain, and normalizes the
     * configuration entry of the domain to an array.
     *
     * A URL is refused when it is not a web URL, when its domain is in the
     * skip list, when the domain is not configured (only if
     * $skipUnauthorised is true) or when the domain is configured as
     * 'desactived' (either as scalar value or as a key of its settings).
     *
     * @param string $url
     *
     * @return bool
     * @throws \Exception
     */
    protected function isURLAutorized(string $url): bool
    {
        if (!ExternHttpClient::isWebURL($url)) {
            $this->log->debug('Skip : not an URL : '.$url);

            return false;
        }

        $this->url = $url;
        $this->domain = ExternDomains::extractSubDomain($this->url);

        if (in_array($this->domain, $this->skip_domain, true)) {
            $this->log->notice("Skip domain ".$this->domain);

            return false;
        }

        $domainConfig = $this->config[$this->domain] ?? null;

        if ($domainConfig === null) {
            $this->log->info("Domain ".$this->domain." non configuré\n");
            if ($this->skipUnauthorised) {
                return false;
            }
        } else {
            echo "> Domaine ".Color::LIGHT_GREEN.$this->domain.Color::NORMAL." configuré\n";
        }

        // The scalar form must be tested BEFORE the entry is normalized to an
        // array below, otherwise the 'desactived' string is lost and the
        // domain is wrongly considered as authorized.
        $deactivated = $domainConfig === 'desactived'
            || (is_array($domainConfig) && isset($domainConfig['desactived']));

        // the other methods expect an array of settings for the current domain
        $this->config[$this->domain] = is_array($domainConfig) ? $domainConfig : [];

        if ($deactivated) {
            $this->log->info("Domain ".$this->domain." desactivé\n");

            return false;
        }

        return true;
    }
249
250
    /**
     * Log the mapped data (debug level), then emit one notice per tag that
     * applies: article data present, newspaper domain, scientific domain.
     *
     * @param array $mapData
     */
    private function tagAndLog(array $mapData)
    {
        $this->log->debug('mapData', $mapData);

        $hasArticleData = !empty($mapData['DATA-ARTICLE']);
        if ($hasArticleData) {
            $this->log->notice("Article OK");
        }

        $isNewspaper = isset($this->data['newspaper'][$this->domain]);
        if ($isNewspaper) {
            $this->log->notice('PRESSE');
        }

        if ($this->isScientificDomain() === true) {
            $this->log->notice('SCIENCE');
        }
    }
264
265
    /**
     * Tell whether the current domain is a scientific one: either listed in
     * the 'scientific domain' data set, or containing ".revues.org".
     *
     * @return bool
     */
    private function isScientificDomain(): bool
    {
        if (isset($this->data['scientific domain'][$this->domain])) {
            return true;
        }

        // strpos() expects (haystack, needle): search the marker inside the
        // domain, not the domain inside the marker.
        return is_string($this->domain)
            && strpos($this->domain, '.revues.org') !== false;
    }
276
277
    /**
     * Append to the summary log the site name, or else the periodical name,
     * or else '?' when neither is set in the mapped data.
     *
     * @param array $mapData
     */
    private function addSummaryLog(array $mapData)
    {
        if (isset($mapData['site'])) {
            $label = $mapData['site'];
        } elseif (isset($mapData['périodique'])) {
            $label = $mapData['périodique'];
        } else {
            $label = '?';
        }

        $this->summaryLog[] = $label;
    }
281
282
    /**
     * Choose and create the wiki template ('article' or 'lien web') fitting
     * the mapped data and the configuration of the current domain.
     *
     * 'article' is chosen only when the data has a date, the domain is not
     * forced to 'lien web', and either a DOI is present or one of the article
     * rules matches (domain configured as 'article', article data on an
     * 'auto' or newspaper domain, scientific domain).
     *
     * todo: refactor for readability
     *
     * @param array $mapData
     *
     * @return AbstractWikiTemplate
     */
    private function chooseTemplateByData(array $mapData): AbstractWikiTemplate
    {
        // make sure the 'template' setting exists ([] when nothing is configured)
        $this->config[$this->domain]['template'] = $this->config[$this->domain]['template'] ?? [];
        $configured = $this->config[$this->domain]['template'];

        $hasArticleData = (bool) ($mapData['DATA-ARTICLE'] ?? false);
        $hasDoi = !empty($mapData['doi']);

        $matchesArticleRule = $configured === 'article'
            || ($hasArticleData && $configured === 'auto')
            || ($hasArticleData && !empty($this->data['newspaper'][$this->domain]))
            || $this->isScientificDomain();

        // 'lien web' is the default; the date is mandatory for {article}
        $templateName = 'lien web';
        if (($hasDoi || $matchesArticleRule)
            && $configured !== 'lien web'
            && isset($mapData['date'])
        ) {
            $templateName = 'article';
        }

        $template = WikiTemplateFactory::create($templateName);
        $template->userSeparator = " |";

        return $template;
    }
315
316
    /**
     * Replace the periodical title or the website name in the mapped data.
     *
     * Sources are applied in this order, a later one overriding an earlier
     * one: newspapers data set, scientific journals data set, YAML
     * configuration of the domain, and finally the prettified domain name
     * when a "lien web" template still has no site name.
     *
     * @param array $mapData
     * @param       $template
     *
     * @return array
     */
    private function replaceSitenameByConfig(array $mapData, $template): array
    {
        $isLienWeb = $template instanceof LienWebTemplate;

        // from wikidata URL of newspapers
        $newspaper = $this->data['newspaper'][$this->domain] ?? null;
        if (!empty($newspaper)) {
            $frwiki = $newspaper['frwiki'];
            $label = $newspaper['fr'];
            if ($isLienWeb || isset($mapData['site'])) {
                $mapData['site'] = WikiTextUtil::wikilink($label, $frwiki);
            }
            if ($template instanceof ArticleTemplate || isset($mapData['périodique'])) {
                $mapData['périodique'] = WikiTextUtil::wikilink($label, $frwiki);
            }
        }

        // from wikidata of scientific journals
        if (isset($mapData['périodique'])) {
            $journal = $mapData['périodique'];
            if (isset($this->data['scientific wiki'][$journal])) {
                $mapData['périodique'] = WikiTextUtil::wikilink(
                    $journal,
                    $this->data['scientific wiki'][$journal]
                );
            }
        }

        // from YAML config
        $domainConfig = $this->config[$this->domain] ?? [];
        if ($isLienWeb && !empty($domainConfig['site'])) {
            $mapData['site'] = $domainConfig['site'];
        }
        if (!empty($domainConfig['périodique'])) {
            // NOTE(review): OuvrageTemplate is tested here, not ArticleTemplate — confirm this is intended
            if (!empty($mapData['périodique']) || $template instanceof OuvrageTemplate) {
                $mapData['périodique'] = $domainConfig['périodique'];
            }
        }

        // from logic
        if ($isLienWeb && empty($mapData['site'])) {
            try {
                $mapData['site'] = $this->externalPage->getPrettyDomainName();
            } catch (\Throwable $e) {
                // best effort: leave the site name empty on failure
                unset($e);
            }
        }

        return $mapData;
    }
368
369
    /**
     * Return the mapped data with its 'url' entry set to the URL that was
     * originally given to the transformer.
     *
     * @param array $mapData
     *
     * @return array
     */
    private function replaceURLbyOriginal(array $mapData): array
    {
        // array_replace() overwrites the key in place, or appends it when missing
        return array_replace($mapData, ['url' => $this->url]);
    }
375
376
    /**
     * todo move ?
     * Build a short text title from a URL: the scheme and "www." are removed
     * and the text is cut to 30 characters followed by "…".
     * URL => "parismatch.com/People/bla…"
     *
     * The length is counted in UTF-8 characters (not bytes), so that a
     * multibyte character is never cut in half.
     *
     * @param string $url
     *
     * @return string
     */
    public function url2TextStyleTitle(string $url): string
    {
        $text = str_replace(['https://', 'http://', 'www.'], '', $url);
        if (mb_strlen($text, 'UTF-8') > 30) {
            $text = mb_substr($text, 0, 30, 'UTF-8').'…';
        }

        return $text;
    }
393
394
}
395