Passed
Push — master ( b3f7dd...feb615 )
by Dispositif
06:47
created

ExternRefTransformer   F

Complexity

Total Complexity 62

Size/Duplication

Total Lines 335
Duplicated Lines 0 %

Test Coverage

Coverage 0%

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 155
dl 0
loc 335
ccs 0
cts 173
cp 0
rs 3.44
c 1
b 0
f 0
wmc 62

9 Methods

Rating   Name   Duplication   Size   Complexity  
C replaceSitenameByConfig() 0 43 16
C process() 0 88 15
B chooseTemplateByData() 0 29 11
B isURLAutorized() 0 36 8
A tagAndLog() 0 12 5
A addSummaryLog() 0 3 1
A isScientificDomain() 0 10 3
A __construct() 0 23 2
A replaceURLbyOriginal() 0 5 1

How to fix   Complexity   

Complex Class

Complex classes like ExternRefTransformer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use ExternRefTransformer, and based on these observations, apply Extract Interface, too.

1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Application;
11
12
use App\Application\Http\ExternHttpClient;
13
use App\Domain\ExternDomains;
14
use App\Domain\ExternPageFactory;
15
use App\Domain\Models\Wiki\AbstractWikiTemplate;
16
use App\Domain\Models\Wiki\ArticleTemplate;
17
use App\Domain\Models\Wiki\LienWebTemplate;
18
use App\Domain\Models\Wiki\OuvrageTemplate;
19
use App\Domain\OptimizerFactory;
20
use App\Domain\Publisher\ExternMapper;
21
use App\Domain\Utils\WikiTextUtil;
22
use App\Domain\WikiTemplateFactory;
23
use App\Infrastructure\Logger;
24
use Codedungeon\PHPCliColors\Color;
25
use Normalizer;
26
use Psr\Log\LoggerInterface;
27
use Symfony\Component\Yaml\Yaml;
28
29
/**
30
 * todo move Domain
31
 * Class ExternRefTransformer
32
 *
33
 * @package App\Application
34
 */
35
class ExternRefTransformer implements TransformerInterface
36
{
37
    const HTTP_REQUEST_LOOP_DELAY = 20;
38
39
    const SKIPPED_FILE_LOG  = __DIR__.'/resources/external_skipped.log';
40
    const LOG_REQUEST_ERROR = __DIR__.'/resources/external_request_error.log';
41
    public $skipUnauthorised = true;
42
    /**
43
     * @var array
44
     */
45
    public $summaryLog = [];
46
    /**
47
     * @var LoggerInterface
48
     */
49
    protected $log;
50
    private $config;
51
    /**
52
     * @var string|string[]
53
     */
54
    private $domain;
55
    /**
56
     * @var string
57
     */
58
    private $url;
59
    /**
60
     * @var ExternMapper
61
     */
62
    private $mapper;
63
    /**
64
     * @var array
65
     */
66
    private $data = [];
67
    /**
68
     * @var array
69
     */
70
    private $skip_domain = [];
71
    /**
72
     * @var \App\Domain\ExternPage
73
     */
74
    private $externalPage;
75
76
    /**
     * ExternRefTransformer constructor.
     *
     * Loads, from the local "resources" directory:
     *  - the per-domain YAML configuration (config_presse.yaml),
     *  - the list of domains to skip (config_skip_domain.txt),
     *  - the JSON reference data stored under the 'newspaper',
     *    'scientific domain' and 'scientific wiki' keys of $this->data.
     *
     * @param LoggerInterface $log Logger used for the messages emitted by this class.
     */
    public function __construct(LoggerInterface $log)
    {
        $this->log = $log;

        // todo REFAC DataObject[]
        // An empty or scalar YAML document is normalised to an empty configuration,
        // because the rest of the class indexes $this->config as an array.
        $config = Yaml::parseFile(__DIR__.'/resources/config_presse.yaml');
        $this->config = is_array($config) ? $config : [];

        // file() returns false on failure: fall back to an empty skip list.
        $skipFromFile = file(
            __DIR__.'/resources/config_skip_domain.txt',
            FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES
        );
        $this->skip_domain = ($skipFromFile) ? $skipFromFile : [];

        $this->data['newspaper'] = $this->loadJsonResource(__DIR__.'/resources/data_newspapers.json');
        $this->data['scientific domain'] = $this->loadJsonResource(
            __DIR__.'/resources/data_scientific_domain.json'
        );
        $this->data['scientific wiki'] = $this->loadJsonResource(
            __DIR__.'/resources/data_scientific_wiki.json'
        );

        $this->mapper = new ExternMapper(new Logger());
    }

    /**
     * Decode a JSON resource file into an associative array.
     *
     * A missing/unreadable file or an invalid JSON document is logged and
     * yields an empty array, instead of passing `false` to json_decode()
     * (a TypeError under strict_types) or storing `null` as reference data.
     *
     * @param string $path Absolute path of the JSON file.
     *
     * @return array Decoded data, or an empty array on failure.
     */
    private function loadJsonResource(string $path): array
    {
        $content = is_readable($path) ? file_get_contents($path) : false;
        if ($content === false) {
            $this->log->warning('Resource file not readable : '.$path);

            return [];
        }

        $decoded = json_decode($content, true);
        if (!is_array($decoded)) {
            $this->log->warning('Invalid JSON in resource file : '.$path);

            return [];
        }

        return $decoded;
    }
105
106
    /**
     * Try to convert a bare URL into a serialized wiki template.
     *
     * The URL is returned unchanged whenever it is not authorized, the page
     * cannot be fetched (except "410 Gone"), the page exposes no usable
     * metadata, the page is flagged "noindex", or the mapped data lacks a
     * URL or a title. A "410 Gone" response yields a {{Lien brisé}} template.
     *
     * @param string $url
     *
     * @return string The serialized template, or the original URL when skipped.
     * @throws \Exception
     */
    public function process(string $url): string
    {
        if (!$this->isURLAutorized($url)) {
            return $url;
        }

        try {
            // Wait HTTP_REQUEST_LOOP_DELAY seconds before each external request.
            sleep(self::HTTP_REQUEST_LOOP_DELAY);
            $this->externalPage = ExternPageFactory::fromURL($url, $this->log);
            $pageData = $this->externalPage->getData();
            $this->log->debug('metaData', $pageData);
        } catch (\Exception $e) {
            // "410 gone" => {lien brisé}
            if (preg_match('#410 Gone#i', $e->getMessage())) {
                $this->log->notice('410 page définitivement disparue : '.$url);

                return sprintf(
                    '{{Lien brisé |url= %s |titre= %s |brisé le=%s}}',
                    $url,
                    'page définitivement disparue',
                    date('d-m-Y')
                );
            }

            if (preg_match('#403 Forbidden#i', $e->getMessage())) {
                $this->log->warning('403 Forbidden : '.$url);
                file_put_contents(self::LOG_REQUEST_ERROR, '403 Forbidden : '.$this->domain."\n", FILE_APPEND);

                // Explicit exit: previously this branch only returned the URL
                // because $pageData was left undefined and caught by empty() below.
                return $url;
            }

            // 404 or other: do not generate {lien brisé}, the error may be temporary.
            $this->log->notice('erreur sur extractWebData '.$e->getMessage());
            file_put_contents(self::LOG_REQUEST_ERROR, $this->domain."\n", FILE_APPEND);

            return $url;
        }

        if (empty($pageData)
            || (empty($pageData['JSON-LD']) && empty($pageData['meta']))
        ) {
            // Site with junk HTML: no exploitable metadata.
            return $url;
        }

        if (isset($pageData['robots']) && strpos($pageData['robots'], 'noindex') !== false) {
            $this->log->notice('SKIP robots: noindex');

            return $url;
        }

        $mapData = $this->mapper->process($pageData);

        // Check that the mapped data is usable (URL and title are mandatory).
        if (empty($mapData) || empty($mapData['url']) || empty($mapData['titre'])) {
            // Remember the domain so that its next URLs are skipped by isURLAutorized().
            $this->skip_domain[] = $this->domain;
            $this->log->info('Mapping incomplet');
            // Todo : temp data
            try {
                file_put_contents(self::SKIPPED_FILE_LOG, $this->domain.",".$this->url."\n", FILE_APPEND);
            } catch (\Throwable $e) {
                // Best effort only: a failing log write must not abort the processing.
                $this->log->warning('Unable to write skipped-domain log : '.$e->getMessage());
            }

            return $url;
        }

        $this->tagAndLog($mapData);
        $this->addSummaryLog($mapData);

        $template = $this->chooseTemplateByData($mapData);

        $mapData = $this->replaceSitenameByConfig($mapData, $template);
        $mapData = $this->replaceURLbyOriginal($mapData);

        if ($template instanceof ArticleTemplate) {
            unset($mapData['site']);
        }
        // Internal markers: removed before hydrating the template.
        unset($mapData['DATA-TYPE']); // ugly
        unset($mapData['DATA-ARTICLE']); // ugly

        $template->hydrate($mapData);

        $optimizer = OptimizerFactory::fromTemplate($template);
        $optimizer->doTasks();
        $templateOptimized = $optimizer->getOptiTemplate();

        $serialized = $templateOptimized->serialize(true);
        $this->log->info($serialized."\n");

        // Normalizer::normalize() returns false on error, which would violate the
        // declared string return type: fall back to the non-normalized text.
        $normalized = Normalizer::normalize($serialized);

        return is_string($normalized) ? $normalized : $serialized;
    }
201
202
    /**
     * Decide whether the given URL may be processed.
     *
     * Side effects: stores the URL and its extracted sub-domain in
     * $this->url / $this->domain, and normalises the configuration entry of
     * an accepted domain to an array.
     *
     * @param string $url
     *
     * @return bool False when the string is not a web URL, when the domain is in
     *              the skip list, when it is not configured (and
     *              $skipUnauthorised is true) or when it is flagged 'desactived'.
     * @throws \Exception
     */
    protected function isURLAutorized(string $url): bool
    {
        if (!ExternHttpClient::isWebURL($url)) {
            $this->log->debug('Skip : not an URL : '.$url);

            return false;
        }

        $this->url = $url;
        $this->domain = ExternDomains::extractSubDomain($this->url);

        if (in_array($this->domain, $this->skip_domain, true)) {
            $this->log->notice("Skip domain ".$this->domain);

            return false;
        }

        if (!isset($this->config[$this->domain])) {
            $this->log->info("Domain ".$this->domain." non configuré\n");
            if ($this->skipUnauthorised) {
                return false;
            }
        } else {
            echo "> Domaine ".Color::LIGHT_GREEN.$this->domain.Color::NORMAL." configuré\n";
        }

        $domainConfig = $this->config[$this->domain] ?? [];

        // The deactivation flag must be tested on the raw value: once the entry
        // is coerced to an array, a scalar 'desactived' setting would be lost.
        if ($domainConfig === 'desactived'
            || (is_array($domainConfig) && isset($domainConfig['desactived']))
        ) {
            $this->log->info("Domain ".$this->domain." desactivé\n");

            return false;
        }

        // Later steps index the domain configuration as an array.
        $this->config[$this->domain] = is_array($domainConfig) ? $domainConfig : [];

        return true;
    }
245
246
    /**
     * Log the mapped data (debug level) followed by notices describing the
     * kind of source: article data, newspaper domain, scientific domain.
     *
     * @param array $mapData
     */
    private function tagAndLog(array $mapData)
    {
        $this->log->debug('mapData', $mapData);

        $hasArticleData = !empty($mapData['DATA-ARTICLE']);
        if ($hasArticleData) {
            $this->log->notice("Article OK");
        }

        $isNewspaper = isset($this->data['newspaper'][$this->domain]);
        if ($isNewspaper) {
            $this->log->notice('PRESSE');
        }

        if ($this->isScientificDomain() === true) {
            $this->log->notice('SCIENCE');
        }
    }
260
261
    /**
     * Tell whether the current domain is a scientific one.
     *
     * True when the domain is a key of the 'scientific domain' reference data,
     * or when it contains '.revues.org'.
     *
     * @return bool
     */
    private function isScientificDomain(): bool
    {
        if (isset($this->data['scientific domain'][$this->domain])) {
            return true;
        }

        // strpos(haystack, needle): the domain is the haystack. The arguments
        // were previously swapped, so '*.revues.org' sub-domains never matched.
        return is_string($this->domain) && strpos($this->domain, '.revues.org') !== false;
    }
272
273
    /**
     * Append a label for the processed reference to the public summary log:
     * the 'site' value, else the 'périodique' value, else '?'.
     *
     * @param array $mapData
     */
    private function addSummaryLog(array $mapData)
    {
        $label = '?';
        if (isset($mapData['site'])) {
            $label = $mapData['site'];
        } elseif (isset($mapData['périodique'])) {
            $label = $mapData['périodique'];
        }

        $this->summaryLog[] = $label;
    }
277
278
    /**
     * Select the wiki template ('article' or 'lien web') for the mapped data.
     *
     * 'article' is chosen only when all of the following hold:
     *  - the data has a DOI, or the domain configuration / reference data
     *    designates an article (configured 'article', configured 'auto' with
     *    article data, article data on a newspaper domain, scientific domain);
     *  - the domain is not explicitly configured as 'lien web';
     *  - a 'date' is set (mandatory for {article}).
     * Otherwise 'lien web' is used.
     *
     * todo refac lisible
     *
     * @param array $mapData
     *
     * @return AbstractWikiTemplate
     */
    private function chooseTemplateByData(array $mapData): AbstractWikiTemplate
    {
        // Make sure the 'template' setting exists for the current domain.
        $this->config[$this->domain]['template'] = $this->config[$this->domain]['template'] ?? [];
        $configured = $this->config[$this->domain]['template'];

        $isArticleData = $mapData['DATA-ARTICLE'] ?? false;
        $hasDoi = !empty($mapData['doi']);

        $articleByRules = $configured === 'article'
            || ($configured === 'auto' && $isArticleData)
            || ($isArticleData && !empty($this->data['newspaper'][$this->domain]))
            || $this->isScientificDomain();

        $templateName = 'lien web';
        if (($hasDoi || $articleByRules)
            && $configured !== 'lien web'
            && isset($mapData['date'])
        ) {
            $templateName = 'article';
        }

        $template = WikiTemplateFactory::create($templateName);
        $template->userSeparator = " |";

        return $template;
    }
311
312
    /**
     * Replace the site name / periodical title of the mapped data, applying in
     * order: newspaper reference data, scientific journal reference data, the
     * YAML domain configuration, then the pretty domain name as a last resort.
     *
     * @param array $mapData
     * @param       $template Template chosen for the reference.
     *
     * @return array The updated mapped data.
     */
    private function replaceSitenameByConfig(array $mapData, $template): array
    {
        $isLienWeb = $template instanceof LienWebTemplate;
        $isArticle = $template instanceof ArticleTemplate;
        $isOuvrage = $template instanceof OuvrageTemplate;

        // 1. From the newspaper reference data (wikidata URL of newspapers).
        if (!empty($this->data['newspaper'][$this->domain])) {
            $newspaper = $this->data['newspaper'][$this->domain];
            $frwiki = $newspaper['frwiki'];
            $label = $newspaper['fr'];

            if ($isLienWeb || isset($mapData['site'])) {
                $mapData['site'] = WikiTextUtil::wikilink($label, $frwiki);
            }
            if ($isArticle || isset($mapData['périodique'])) {
                $mapData['périodique'] = WikiTextUtil::wikilink($label, $frwiki);
            }
        }

        // 2. From the scientific journal reference data.
        if (isset($mapData['périodique'])) {
            $journal = $mapData['périodique'];
            if (isset($this->data['scientific wiki'][$journal])) {
                $mapData['périodique'] = WikiTextUtil::wikilink(
                    $journal,
                    $this->data['scientific wiki'][$journal]
                );
            }
        }

        // 3. From the YAML configuration of the domain.
        $domainConfig = $this->config[$this->domain] ?? null;
        if ($isLienWeb && !empty($domainConfig['site'])) {
            $mapData['site'] = $domainConfig['site'];
        }
        if (!empty($domainConfig['périodique'])
            && ($isOuvrage || !empty($mapData['périodique']))
        ) {
            $mapData['périodique'] = $domainConfig['périodique'];
        }

        // 4. Fallback: name derived from the domain of the fetched page.
        if ($isLienWeb && empty($mapData['site'])) {
            try {
                $mapData['site'] = $this->externalPage->getPrettyDomainName();
            } catch (\Throwable $e) {
                // Best effort: keep the data without a site name.
                unset($e);
            }
        }

        return $mapData;
    }
364
365
    /**
     * Force the 'url' entry of the mapped data to the URL originally submitted
     * to the transformer.
     *
     * @param array $mapData
     *
     * @return array
     */
    private function replaceURLbyOriginal(array $mapData): array
    {
        return array_replace($mapData, ['url' => $this->url]);
    }
371
372
}
373