Passed
Branch master (7baf30)
by Dispositif
02:38
created

ExternRefTransformer   C

Complexity

Total Complexity 56

Size/Duplication

Total Lines 307
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 56
eloc 140
dl 0
loc 307
rs 5.5199
c 0
b 0
f 0

15 Methods

Rating   Name   Duplication   Size   Complexity  
A replaceSomeData() 0 16 2
A extractPageDataFromUrl() 0 8 1
A isSiteBlackListed() 0 7 3
A fallbackIfSitenameNull() 0 10 4
A emptyPageData() 0 11 4
A emptyMapData() 0 8 4
C process() 0 58 12
B chooseTemplateByData() 0 31 11
A __construct() 0 14 1
A unsetAccesLibre() 0 6 3
A validateConfigWebDomain() 0 15 4
A logDebugConfigWebDomain() 0 6 2
A replaceURLbyOriginal() 0 5 1
A optimizeAndSerialize() 0 10 1
A correctSiteViaWebarchiver() 0 7 3

How to fix   Complexity   

Complex Class

Complex classes like ExternRefTransformer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use ExternRefTransformer, and based on these observations, apply Extract Interface, too.

1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\ExternLink;
11
12
use App\Domain\ExternLink\Validators\RobotNoIndexValidator;
13
use App\Domain\InfrastructurePorts\DeadlinkArchiverInterface;
14
use App\Domain\InfrastructurePorts\ExternHttpClientInterface;
15
use App\Domain\InfrastructurePorts\InternetDomainParserInterface;
16
use App\Domain\Models\Summary;
17
use App\Domain\Models\Wiki\AbstractWikiTemplate;
18
use App\Domain\Models\Wiki\ArticleTemplate;
19
use App\Domain\Models\Wiki\LienWebTemplate;
20
use App\Domain\Publisher\ExternMapper;
21
use App\Domain\Utils\WikiTextUtil;
22
use App\Domain\WikiOptimizer\OptimizerFactory;
23
use App\Domain\WikiTemplateFactory;
24
use Exception;
25
use Normalizer;
26
use Psr\Log\LoggerInterface;
27
use Psr\Log\NullLogger;
28
use Throwable;
29
30
/**
31
 * TODO refac too big (responsibility)
32
 */
33
class ExternRefTransformer implements ExternRefTransformerInterface
34
{
35
    use SummaryExternTrait, PublisherLogicTrait;
0 ignored issues
show
Bug introduced by
The trait App\Domain\ExternLink\SummaryExternTrait requires the property $citationNumber which is not provided by App\Domain\ExternLink\ExternRefTransformer.
Loading history...
36
37
    public const HTTP_REQUEST_LOOP_DELAY = 10;
38
    public const SKIP_DOMAIN_FILENAME = __DIR__ . '/../resources/config_skip_domain.txt';
39
    public const REPLACE_404 = true;
40
    public const REPLACE_410 = true;
41
    public const CONFIG_PRESSE = __DIR__ . '/../resources/config_presse.yaml';
42
    public const CONFIG_NEWSPAPER_JSON = __DIR__ . '/../resources/data_newspapers.json';
43
    public const CONFIG_SCIENTIFIC_JSON = __DIR__ . '/../resources/data_scientific_domain.json';
44
    public const CONFIG_SCIENTIFIC_WIKI_JSON = __DIR__ . '/../resources/data_scientific_wiki.json';
45
46
    public bool $skipSiteBlacklisted = true;
47
    public bool $skipRobotNoIndex = true;
48
    public array $summaryLog = [];
49
50
    protected $config;
51
    protected string $registrableDomain;
52
    protected string $url;
53
    protected array $publisherData = [];
54
    protected array $skip_domain = [];
55
    protected ExternPage $externalPage;
56
    protected ?Summary $summary;
57
    protected ?string $originDomain;
58
    protected array $options = [];
59
    private readonly ExternHttpErrorLogic $externHttpErrorLogic;
60
    private readonly CheckURL $urlChecker;
61
62
    /**
     * Store the injected collaborators and build the two internal helpers.
     *
     * importConfigAndData() runs first; it is not defined in this class
     * (presumably provided by one of the used traits — TODO confirm).
     *
     * @param ExternMapper                   $mapper           used by process() to map extracted page data
     * @param ExternHttpClientInterface      $httpClient       passed to ExternPageFactory when fetching a page
     * @param InternetDomainParserInterface  $domainParser     shared with the URL checker and the dead-link transformer
     * @param LoggerInterface                $log              defaults to a NullLogger
     * @param DeadlinkArchiverInterface|null $deadlinkArchiver optional, forwarded to DeadLinkTransformer
     */
    public function __construct(
        protected ExternMapper $mapper,
        protected ExternHttpClientInterface $httpClient,
        protected InternetDomainParserInterface $domainParser,
        protected LoggerInterface $log = new NullLogger(),
        protected ?DeadlinkArchiverInterface $deadlinkArchiver = null,
    ) {
        $this->importConfigAndData();

        // Both readonly helpers are assigned exactly once, here.
        $deadLinkTransformer = new DeadLinkTransformer($deadlinkArchiver, $domainParser, null, $log);
        $this->externHttpErrorLogic = new ExternHttpErrorLogic($deadLinkTransformer, $log);
        $this->urlChecker = new CheckURL($domainParser, $log);
    }
77
78
    /**
     * Transform "http://bla" => "{lien web|...}}", "{article}" or "{lien brisé}".
     *
     * The URL itself is returned whenever a step rejects it: unauthorized URL,
     * blacklisted or deactivated domain, no metadata, noindex page, incomplete
     * mapping, or empty serialization. Note that once the URL has been normalized
     * (inside the try block) the value returned by later exits is the normalized
     * URL, not the raw input.
     *
     * TODO Refac : chain of responsibility or composite pattern
     * todo refac : return data DTO ? too much responsibility!
     *
     * @param string  $url     URL to transform
     * @param Summary $summary forwarded to addSummaryLog()
     * @param array   $options used only to pass RegistrableDomain of archived deadlink
     *
     * @return string serialized wiki template, or a URL as fallback
     *
     * @throws Exception
     */
    public function process(string $url, Summary $summary = new Summary(), array $options = []): string
    {
        $this->url = $url;
        $this->options = $options;

        if (!$this->urlChecker->isURLAuthorized($url)) {
            return $url;
        }

        $this->registrableDomain = $this->urlChecker->getRegistrableDomain($url); // hack
        if ($this->isSiteBlackListed() || !$this->validateConfigWebDomain($this->registrableDomain)) {
            return $url;
        }

        try {
            $url = WikiTextUtil::normalizeUrlForTemplate($url);
            $pageData = $this->extractPageDataFromUrl($url); // ['JSON-LD'] & ['meta'] !!
        } catch (Exception $httpError) {
            // The HTTP error logic decides what to return from the error message.
            return $this->externHttpErrorLogic->manageByHttpErrorMessage($httpError->getMessage(), $url);
        }

        if ($this->emptyPageData($pageData, $url)) {
            return $url;
        }

        $noIndexValidator = new RobotNoIndexValidator($pageData, $url, $this->log); // todo inject
        if ($noIndexValidator->validate() && $this->skipRobotNoIndex) {
            // TODO ? return {lien web| titre=Titre inconnu... |note=noindex }
            // http://www.nydailynews.com/entertainment/jessica-barth-details-alleged-harvey-weinstein-encounter-article-1.3557986
            return $url;
        }

        $mappedData = $this->mapper->process($pageData); // only json-ld or only meta, after postprocess
        if ($this->emptyMapData($mappedData, $url)) {
            // TODO ? return {lien web| titre=Titre inconnu... site=prettydomain ...
            return $url;
        }

        $mappedData = $this->unsetAccesLibre($mappedData);
        $this->addSummaryLog($mappedData, $summary);
        $this->tagAndLog($mappedData);

        $template = $this->chooseTemplateByData($this->registrableDomain, $mappedData);
        $templateData = $this->replaceSomeData($mappedData, $template); // template specif + data + url
        $serialized = $this->optimizeAndSerialize($template, $templateData);

        // Normalizer::normalize() sometimes returns a bool instead of a string.
        $normalized = Normalizer::normalize($serialized);
        if (is_string($normalized) && !empty($normalized)) {
            return $normalized;
        }

        // Fall back to the raw serialization, then to the URL as a last resort.
        return empty($serialized) ? $url : $serialized;
    }
145
146
    /**
     * Tell whether the current registrable domain must be skipped.
     *
     * Returns true only when $skipSiteBlacklisted is enabled and
     * $registrableDomain is listed in $skip_domain; a notice is logged in
     * that case.
     */
    protected function isSiteBlackListed(): bool
    {
        if (!$this->skipSiteBlacklisted) {
            return false;
        }

        // Strict comparison: loose in_array() would treat numeric-looking
        // strings such as "1e3" and "1000" as the same entry.
        if (!in_array($this->registrableDomain, $this->skip_domain, true)) {
            return false;
        }

        $this->log->notice("Skip web site " . $this->registrableDomain);

        return true;
    }
154
155
    /**
     * Check that the given domain is not deactivated in the configuration.
     *
     * A domain is deactivated when its config entry is the string 'deactivated'
     * or an array holding a set 'deactivated' key. As a side effect the config
     * entry of the domain is always normalized to an array (empty when it was
     * missing or not an array).
     *
     * todo move transformer
     *
     * @return bool false when the domain is deactivated, true otherwise
     */
    protected function validateConfigWebDomain(string $domain): bool
    {
        $this->logDebugConfigWebDomain($domain);

        $domainConfig = $this->config[$domain] ?? [];

        // Evaluate the flag BEFORE normalizing: once the entry has been coerced
        // to an array, the string form 'deactivated' can no longer be detected.
        $isDeactivated = $domainConfig === 'deactivated'
            || (is_array($domainConfig) && isset($domainConfig['deactivated']));

        // todo move to config
        $this->config[$domain] = is_array($domainConfig) ? $domainConfig : [];

        if ($isDeactivated) {
            $this->log->info("Domain " . $domain . " disabled\n");

            return false;
        }

        return true;
    }
174
175
    /**
     * Log at debug level whether the domain has an entry in the configuration.
     */
    protected function logDebugConfigWebDomain(string $domain): void
    {
        $status = isset($this->config[$domain]) ? " configuré" : " non configuré";

        $this->log->debug("Domain " . $domain . $status);
    }
183
184
    /**
     * Fetch the page behind $url and return its extracted data.
     *
     * Sleeps HTTP_REQUEST_LOOP_DELAY seconds before building the page, keeps the
     * page object in $externalPage for later use, and logs the data at debug level.
     *
     * Original note: "Stay".
     *
     * @throws Exception
     */
    protected function extractPageDataFromUrl(string $url): array
    {
        sleep(self::HTTP_REQUEST_LOOP_DELAY);

        $this->externalPage = ExternPageFactory::fromURL(
            $url,
            $this->domainParser,
            $this->httpClient,
            $this->log
        );

        $metadata = $this->externalPage->getData();
        $this->log->debug('metaData', $metadata);

        return $metadata;
    }
197
198
    // stay
199
200
    /**
     * Tell whether the page data holds neither 'JSON-LD' nor 'meta' content.
     *
     * An empty array falls in the same case, since both keys are then missing.
     * A notice mentioning the URL is logged when no metadata is found.
     */
    protected function emptyPageData(array $pageData, string $url): bool
    {
        $hasMetadata = !empty($pageData['JSON-LD']) || !empty($pageData['meta']);
        if ($hasMetadata) {
            return false;
        }

        $this->log->notice('No metadata : ' . $url);

        return true;
    }
212
213
    /**
     * Check that the mapped data is usable: both 'url' and 'titre' must be non-empty.
     *
     * The domain is not skipped here, because the problem may be a 404 or an
     * error specific to this single URL.
     *
     * @return bool true when the mapping is incomplete (an info message is logged)
     */
    protected function emptyMapData(array $mapData, string $url): bool
    {
        $isComplete = !empty($mapData['url']) && !empty($mapData['titre']);
        if ($isComplete) {
            return false;
        }

        $this->log->info('Mapping incomplet : ' . $url);

        return true;
    }
226
227
    // stay
228
229
    /**
     * Drop the 'accès url' entry when its value is 'libre'.
     *
     * No 'accès url=libre' parameter is emitted (debate of February 2021).
     * Any other value of 'accès url' is left untouched.
     */
    protected function unsetAccesLibre(array $mapData): array
    {
        $urlAccess = $mapData['accès url'] ?? null;
        if ($urlAccess === 'libre') {
            unset($mapData['accès url']);
        }

        return $mapData;
    }
239
240
    /**
     * Choose and instantiate the wiki template ('article' or 'lien web') for the data.
     *
     * Rules, applied in order (later rules override earlier ones):
     *  - 'article' when the data has a non-empty 'doi';
     *  - 'article' when the domain config template is 'article', or is 'auto' with
     *    DATA-ARTICLE set, or DATA-ARTICLE is set and the domain is a known
     *    newspaper, or isScientificDomain() is true;
     *  - 'lien web' when nothing was chosen or the config template is 'lien web';
     *  - 'lien web' when the data has no 'date' (a date is mandatory for {article}).
     *
     * The per-template counter in the summary memo is incremented when a summary
     * is available.
     *
     * todo Stay ?
     * todo refac lisible
     *
     * @throws Exception when the factory cannot build the chosen template
     */
    protected function chooseTemplateByData(string $domain, array $mapData): AbstractWikiTemplate
    {
        // Template selection logic
        $this->config[$domain]['template'] ??= [];
        $mapData['DATA-ARTICLE'] ??= false;
        $configuredTemplate = $this->config[$domain]['template'];

        if (!empty($mapData['doi'])) {
            $templateName = 'article';
        }

        if ($configuredTemplate === 'article'
            || ($configuredTemplate === 'auto' && $mapData['DATA-ARTICLE'])
            || ($mapData['DATA-ARTICLE'] && !empty($this->publisherData['newspaper'][$domain]))
            || $this->isScientificDomain()
        ) {
            $templateName = 'article';
        }
        if (!isset($templateName) || $configuredTemplate === 'lien web') {
            $templateName = 'lien web';
        }

        // a date is mandatory for {article}
        if (!isset($mapData['date'])) {
            $templateName = 'lien web';
        }

        $template = WikiTemplateFactory::create($templateName);
        // The factory may return null; fail with the documented Exception rather
        // than an engine Error on the property write or on the return type.
        if (!($template instanceof AbstractWikiTemplate)) {
            throw new Exception('No wiki template available for "' . $templateName . '"');
        }
        $template->userSeparator = " |";

        // $summary is nullable (and may be unset): only count when it is available.
        if (isset($this->summary)) {
            $counterKey = 'count ' . $templateName;
            $this->summary->memo[$counterKey] = 1 + ($this->summary->memo[$counterKey] ?? 0);
        }

        return $template;
    }
277
278
    /**
     * Post-process the mapped data before it is hydrated into the template.
     *
     * Applies, in order: site name replacement from config, site name fallback,
     * the "via" site correction, and restoration of the original URL. The 'site'
     * entry is then removed for article templates, and the internal keys
     * 'DATA-TYPE', 'DATA-ARTICLE' and 'url-access' are always removed.
     */
    protected function replaceSomeData(array $mapData, AbstractWikiTemplate $template): array
    {
        $mapData = $this->replaceSitenameByConfig($mapData, $template);
        $mapData = $this->fallbackIfSitenameNull($mapData, $template);
        $mapData = $this->correctSiteViaWebarchiver($mapData);
        $mapData = $this->replaceURLbyOriginal($mapData);

        if ($template instanceof ArticleTemplate) {
            unset($mapData['site']);
        }

        // ugly: strip the internal keys that must not reach the template
        foreach (['DATA-TYPE', 'DATA-ARTICLE', 'url-access'] as $internalKey) {
            unset($mapData[$internalKey]);
        }

        return $mapData;
    }
295
296
    // postprocess data
297
298
    /**
     * Fill an empty 'site' entry with the pretty domain name of the fetched page.
     *
     * Only applies to LienWebTemplate. The fallback is best-effort: any failure
     * leaves the data unchanged and is reported at debug level.
     */
    protected function fallbackIfSitenameNull(array $mapData, AbstractWikiTemplate $template): array
    {
        if (!empty($mapData['site']) || !($template instanceof LienWebTemplate)) {
            return $mapData;
        }

        try {
            $mapData['site'] = $this->externalPage->getPrettyDomainName();
        } catch (Throwable $e) {
            // Deliberately non-fatal, but no longer silent.
            $this->log->debug('Site name fallback failed: ' . $e->getMessage());
        }

        return $mapData;
    }
309
310
    /**
     * Put the URL originally passed to process() back into the 'url' entry.
     *
     * An existing 'url' key keeps its position; a missing one is appended.
     */
    protected function replaceURLbyOriginal(array $mapData): array
    {
        return array_replace($mapData, ['url' => $this->url]);
    }
316
317
    /**
     * Hydrate the template with the data, run its optimizer and serialize the result.
     *
     * The serialized text is logged at info level before being returned.
     *
     * @throws Exception
     */
    protected function optimizeAndSerialize(AbstractWikiTemplate $template, array $mapData): string
    {
        $template->hydrate($mapData);

        $optimizer = OptimizerFactory::fromTemplate($template);
        $optimizer->doTasks();
        $optimizedTemplate = $optimizer->getOptiTemplate();

        $wikiText = $optimizedTemplate->serialize(true);
        $this->log->info('Serialized 444: ' . $wikiText . "\n");

        return $wikiText;
    }
332
333
    /**
     * Prefix the 'site' entry with the original registrable domain, as
     * "<original domain> via <site>".
     *
     * Applies only when the 'originalRegistrableDomain' option is non-empty and
     * the data has a non-empty 'site'; otherwise the data is returned unchanged.
     */
    protected function correctSiteViaWebarchiver(array $mapData): array
    {
        $originalDomain = $this->options['originalRegistrableDomain'] ?? null;

        // empty() also covers a missing 'site' key, which previously triggered
        // an "Undefined array key" warning.
        if (!empty($originalDomain) && !empty($mapData['site'])) {
            $mapData['site'] = $originalDomain . ' via ' . $mapData['site'];
        }

        return $mapData;
    }
341
}
342