Passed
Branch master (f55b85)
by Dispositif
03:58 queued 01:26
created

ExternRefTransformer::optimizeAndSerialize()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 10
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 7
nc 1
nop 2
dl 0
loc 10
rs 10
c 0
b 0
f 0
1
<?php
2
/*
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
5
 * For the full copyright and MIT license information, view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\ExternLink;
11
12
use App\Domain\Models\Summary;
13
use App\Domain\Models\Wiki\AbstractWikiTemplate;
14
use App\Domain\Models\Wiki\ArticleTemplate;
15
use App\Domain\Models\Wiki\LienWebTemplate;
16
use App\Domain\OptimizerFactory;
17
use App\Domain\Publisher\ExternMapper;
18
use App\Domain\Utils\WikiTextUtil;
19
use App\Domain\WikiTemplateFactory;
20
use Exception;
21
use Normalizer;
22
use Psr\Log\LoggerInterface;
23
use Psr\Log\NullLogger;
24
use Throwable;
25
26
/**
27
 * TODO: refactor — this class is too big (it carries too many responsibilities).
28
 */
29
class ExternRefTransformer implements ExternRefTransformerInterface
30
{
31
    use SummaryExternTrait, RobotsRulesTrait, PublisherLogicTrait;
32
33
    public const HTTP_REQUEST_LOOP_DELAY = 10;
34
    public const SKIP_DOMAIN_FILENAME = __DIR__ . '/../resources/config_skip_domain.txt';
35
    public const REPLACE_404 = true;
36
    public const CONFIG_PRESSE = __DIR__ . '/../resources/config_presse.yaml';
37
    public const CONFIG_NEWSPAPER_JSON = __DIR__ . '/../resources/data_newspapers.json';
38
    public const CONFIG_SCIENTIFIC_JSON = __DIR__ . '/../resources/data_scientific_domain.json';
39
    public const CONFIG_SCIENTIFIC_WIKI_JSON = __DIR__ . '/../resources/data_scientific_wiki.json';
40
41
    public $skipSiteBlacklisted = true;
42
    public $skipRobotNoIndex = true;
43
    /**
44
     * @var array
45
     */
46
    public $summaryLog = [];
47
    /**
48
     * @var LoggerInterface
49
     */
50
    protected $log;
51
    protected $config;
52
    /**
53
     * @var string
54
     */
55
    protected $registrableDomain;
56
    /**
57
     * @var string
58
     */
59
    protected $url;
60
    /**
61
     * @var ExternMapper
62
     */
63
    protected $mapper;
64
    /**
65
     * @var array
66
     */
67
    protected $publisherData = [];
68
    /**
69
     * @var array
70
     */
71
    protected $skip_domain;
72
    /**
73
     * @var ExternPage
74
     */
75
    protected $externalPage;
76
    /**
77
     * @var Summary|null
78
     */
79
    protected $summary;
80
    /**
81
     * @var ExternHttpClientInterface
82
     */
83
    protected $httpClient;
84
    private $externHttpErrorLogic;
85
    /**
86
     * @var CheckURL
87
     */
88
    private $urlChecker;
89
90
    /**
     * Store the collaborators, resolve the logger and load configuration/data.
     *
     * @param ExternMapper              $externMapper mapper later used by process() to map page data
     * @param ExternHttpClientInterface $httpClient   HTTP client later handed to ExternPageFactory
     * @param LoggerInterface|null      $logger       optional logger; a NullLogger is used when null
     */
    public function __construct(ExternMapper $externMapper, ExternHttpClientInterface $httpClient, ?LoggerInterface $logger)
    {
        $this->log = $logger ?? new NullLogger();
        // importConfigAndData() is not defined in this class body (presumably provided by a used trait).
        $this->importConfigAndData();
        $this->mapper = $externMapper;
        $this->httpClient = $httpClient;
        $this->externHttpErrorLogic = new ExternHttpErrorLogic($this->log);
        // Hand over the resolved (never null) logger, consistently with ExternHttpErrorLogic above.
        $this->urlChecker = new CheckURL($this->log);
    }
99
100
    /**
     * Try to convert a raw external URL into a serialized wiki template.
     *
     * The original URL is returned unchanged whenever a step decides to skip it:
     * unauthorized URL, no registrable domain, blacklisted or deactivated domain,
     * no usable page data, robots "noindex" (when $skipRobotNoIndex is true),
     * incomplete mapping, or an empty serialization. HTTP/extraction exceptions
     * are delegated to ExternHttpErrorLogic::manageHttpErrors().
     *
     * TODO Refac : chain of responsibility or composite pattern
     *
     * @param string  $url     raw external URL
     * @param Summary $summary summary object passed to addSummaryLog()
     *
     * @return string the serialized template, or the URL itself when skipped
     * @throws Exception
     */
    public function process(string $url, Summary $summary): string
    {
        $this->url = $url;
        if (!$this->urlChecker->isURLAuthorized($url)) {
            return $url;
        }

        $this->registrableDomain = $this->urlChecker->getRegistrableDomain($url); // hack
        // The domain can be null; the string-typed helpers below would raise a
        // TypeError under strict_types, so treat it like any other skipped URL.
        if ($this->registrableDomain === null) {
            $this->log->debug('No registrable domain : ' . $url);

            return $url;
        }
        if ($this->isSiteBlackListed()) {
            return $url;
        }

        if (!$this->validateConfigWebDomain($this->registrableDomain)) {
            return $url;
        }

        try {
            $url = WikiTextUtil::normalizeUrlForTemplate($url);
            $pageData = $this->extractPageDataFromUrl($url); // ['JSON-LD'] & ['meta'] !!
        } catch (Exception $exception) {
            return $this->externHttpErrorLogic->manageHttpErrors($exception->getMessage(), $url);
        }
        if ($this->emptyPageData($pageData, $url)) {
            return $url;
        }
        if ($this->isRobotNoIndex($pageData, $url) && $this->skipRobotNoIndex) {
            // TODO ? return {lien web| titre=Titre inconnu...
            // http://www.nydailynews.com/entertainment/jessica-barth-details-alleged-harvey-weinstein-encounter-article-1.3557986
            return $url;
        }

        $mappedData = $this->mapper->process($pageData); // only json-ld or only meta, after postprocess
        if ($this->emptyMapData($mappedData, $url)) {
            // TODO ? return {lien web| titre=Titre inconnu... site=prettydomain ...
            return $url;
        }
        $mappedData = $this->unsetAccesLibre($mappedData);

        $this->addSummaryLog($mappedData, $summary);
        $this->tagAndLog($mappedData);

        $template = $this->chooseTemplateByData($this->registrableDomain, $mappedData);

        $mappedData = $this->replaceSomeData($mappedData, $template); // template specif + data + url
        $serialized = $this->optimizeAndSerialize($template, $mappedData);

        // Normalizer::normalize() may return false; fall back to the raw serialization.
        $normalized = Normalizer::normalize($serialized);
        if (!empty($normalized) && is_string($normalized)) {
            return $normalized;
        }
        if (!empty($serialized)) {
            return $serialized;
        }

        return $url; // error fallback
    }
159
160
    /**
     * Tell whether the current registrable domain is in the skip list.
     *
     * Always false when $skipSiteBlacklisted is disabled. Logs a notice when
     * the domain is skipped.
     */
    protected function isSiteBlackListed(): bool
    {
        if (!$this->skipSiteBlacklisted) {
            return false;
        }

        // Strict comparison: a null/empty domain must never loosely match a list entry.
        // The list defaults to empty when it has not been loaded.
        if (!in_array($this->registrableDomain, $this->skip_domain ?? [], true)) {
            return false;
        }

        $this->log->notice("Skip web site " . $this->registrableDomain);

        return true;
    }
168
169
    /**
     * Check that the domain is not deactivated in the configuration.
     *
     * A domain is considered deactivated when its config entry is the string
     * 'deactivated' or an array containing a 'deactivated' key. For every other
     * domain the config entry is normalised to an array (empty when missing or
     * not an array) so later code can index it safely.
     *
     * todo move transformer
     *
     * @param string $domain registrable domain used as config key
     *
     * @return bool false when the domain is deactivated, true otherwise
     */
    protected function validateConfigWebDomain(string $domain): bool
    {
        $this->logDebugConfigWebDomain($domain);

        // todo move to config
        // Read the raw entry BEFORE normalising it: normalising first would turn
        // the string 'deactivated' into [] and make the check below unreachable.
        $domainConfig = $this->config[$domain] ?? [];

        if ($domainConfig === 'deactivated'
            || (is_array($domainConfig) && isset($domainConfig['deactivated']))
        ) {
            $this->log->info("Domain " . $domain . " disabled\n");

            return false;
        }

        $this->config[$domain] = is_array($domainConfig) ? $domainConfig : [];

        return true;
    }
188
189
    /**
     * Log, at debug level, whether a config entry exists for the domain.
     *
     * @param string $domain registrable domain used as config key
     */
    protected function logDebugConfigWebDomain(string $domain): void
    {
        $status = isset($this->config[$domain]) ? " configuré" : " non configuré";

        $this->log->debug("Domain " . $domain . $status);
    }
200
201
    /**
     * Fetch the external page and return its extracted data.
     *
     * Sleeps HTTP_REQUEST_LOOP_DELAY seconds before building the page through
     * ExternPageFactory, keeps the page in $this->externalPage, and logs the
     * extracted data at debug level.
     *
     * @param string $url URL handed to ExternPageFactory::fromURL()
     *
     * @return array data returned by the external page's getData()
     * @throws Exception
     */
    protected function extractPageDataFromUrl(string $url): array
    {
        sleep(self::HTTP_REQUEST_LOOP_DELAY);

        $page = ExternPageFactory::fromURL($url, $this->httpClient, $this->log);
        $this->externalPage = $page;

        $extracted = $page->getData();
        $this->log->debug('metaData', $extracted);

        return $extracted;
    }
214
215
    // stay
216
217
    /**
     * Tell whether the page data carries neither 'JSON-LD' nor 'meta' content.
     *
     * Logs a notice with the URL when nothing usable is found.
     *
     * @param array  $pageData data extracted from the page
     * @param string $url      URL used in the log message
     */
    protected function emptyPageData(array $pageData, string $url): bool
    {
        // An empty array has neither key, so it is covered by this test too.
        $hasMetadata = !empty($pageData['JSON-LD']) || !empty($pageData['meta']);
        if ($hasMetadata) {
            return false;
        }

        $this->log->notice('No metadata : ' . $url);

        return true;
    }
229
230
    /**
     * Check that the mapped data is usable: both 'url' and 'titre' must be non-empty.
     *
     * The domain is not skipped here, since the problem may be a 404 or an
     * error affecting only this URL.
     *
     * @param array  $mapData mapped data
     * @param string $url     URL used in the log message
     *
     * @return bool true when the mapping is incomplete
     */
    protected function emptyMapData(array $mapData, string $url): bool
    {
        // An empty array lacks both keys, so it is reported as incomplete as well.
        $isComplete = !empty($mapData['url']) && !empty($mapData['titre']);
        if ($isComplete) {
            return false;
        }

        $this->log->info('Mapping incomplet : ' . $url);

        return true;
    }
243
244
    // stay
245
246
    /**
     * Drop the 'accès url' entry when its value is 'libre'
     * (no 'accès url=libre' parameter, per the February 2021 debate).
     *
     * @param array $mapData mapped data
     *
     * @return array the data without an 'accès url' => 'libre' entry
     */
    protected function unsetAccesLibre(array $mapData): array
    {
        $access = $mapData['accès url'] ?? null;
        if ($access === 'libre') {
            unset($mapData['accès url']);
        }

        return $mapData;
    }
256
257
    /**
     * Choose and create the wiki template ('article' or 'lien web') for the mapped data.
     *
     * 'article' is selected when the data has a DOI, when the domain config asks
     * for it ('article', or 'auto' with article data), when article data comes
     * from a known newspaper domain, or when isScientificDomain() is true.
     * 'lien web' wins whenever the domain config forces it or when no 'date'
     * is set (a date is mandatory for {article}).
     *
     * Side effects: initialises the domain's 'template' config entry and, when
     * a summary is available, increments its 'count <template name>' memo.
     *
     * todo Stay ?
     *
     * @param string $domain  registrable domain used as config/publisher key
     * @param array  $mapData mapped data
     *
     * @return AbstractWikiTemplate
     * @throws Exception
     */
    protected function chooseTemplateByData(string $domain, array $mapData): AbstractWikiTemplate
    {
        $this->config[$domain]['template'] = $this->config[$domain]['template'] ?? [];
        $configuredTemplate = $this->config[$domain]['template'];
        $hasArticleData = $mapData['DATA-ARTICLE'] ?? false;

        $isArticle = !empty($mapData['doi']);

        // Kept as a separate test (not merged with the DOI check) so the
        // conditions are evaluated in the same order and cases as before.
        if ($configuredTemplate === 'article'
            || ($configuredTemplate === 'auto' && $hasArticleData)
            || ($hasArticleData && !empty($this->publisherData['newspaper'][$domain]))
            || $this->isScientificDomain()
        ) {
            $isArticle = true;
        }

        // Config override, and a date is mandatory for {article}.
        if ($configuredTemplate === 'lien web' || !isset($mapData['date'])) {
            $isArticle = false;
        }

        $templateName = $isArticle ? 'article' : 'lien web';

        $template = WikiTemplateFactory::create($templateName);
        $template->userSeparator = " |";

        // The summary is nullable: never let statistics break the transformation.
        if ($this->summary !== null) {
            $counterKey = 'count ' . $templateName;
            $this->summary->memo[$counterKey] = 1 + ($this->summary->memo[$counterKey] ?? 0);
        }

        return $template;
    }
298
299
    /**
     * Post-process the mapped data for the chosen template.
     *
     * Applies, in order, the site-name replacement from config, the site-name
     * fallback and the original-URL replacement, then removes the technical
     * keys ('DATA-TYPE', 'DATA-ARTICLE', 'url-access') and, for an
     * ArticleTemplate, the 'site' key.
     *
     * @param array                $mapData  mapped data
     * @param AbstractWikiTemplate $template template the data is intended for
     *
     * @return array the cleaned data
     */
    protected function replaceSomeData(array $mapData, AbstractWikiTemplate $template): array
    {
        $data = $this->replaceURLbyOriginal(
            $this->fallbackIfSitenameNull(
                $this->replaceSitenameByConfig($mapData, $template),
                $template
            )
        );

        $keysToDrop = ['DATA-TYPE', 'DATA-ARTICLE', 'url-access']; // ugly
        if ($template instanceof ArticleTemplate) {
            $keysToDrop[] = 'site';
        }
        foreach ($keysToDrop as $key) {
            unset($data[$key]);
        }

        return $data;
    }
315
316
    // postprocess data
317
318
    /**
     * Fill an empty 'site' value with the page's pretty domain name.
     *
     * Only applies to a LienWebTemplate. This is best effort: when the pretty
     * domain name cannot be obtained, the data is returned untouched.
     *
     * @param array                $mapData  mapped data
     * @param AbstractWikiTemplate $template template the data is intended for
     *
     * @return array the data, with 'site' filled in when possible
     */
    protected function fallbackIfSitenameNull(array $mapData, AbstractWikiTemplate $template): array
    {
        if (!empty($mapData['site']) || !($template instanceof LienWebTemplate)) {
            return $mapData;
        }

        try {
            $mapData['site'] = $this->externalPage->getPrettyDomainName();
        } catch (Throwable $e) {
            // Still non-fatal, but leave a trace instead of swallowing silently.
            $this->log->debug('No pretty domain name : ' . $e->getMessage());
        }

        return $mapData;
    }
329
330
    /**
     * Set the 'url' entry to the URL stored in $this->url by process().
     *
     * @param array $mapData mapped data
     *
     * @return array the data with 'url' overwritten (or appended when absent)
     */
    protected function replaceURLbyOriginal(array $mapData): array
    {
        // array_replace() keeps every existing key and position, like a direct assignment.
        return array_replace($mapData, ['url' => $this->url]);
    }
336
337
    /**
     * Hydrate the template with the data, run its optimizer and serialize the result.
     *
     * The serialized string is also logged at info level.
     *
     * @param AbstractWikiTemplate $template template to hydrate
     * @param array                $mapData  data used to hydrate the template
     *
     * @return string serialized optimized template
     * @throws Exception
     */
    protected function optimizeAndSerialize(AbstractWikiTemplate $template, array $mapData): string
    {
        $template->hydrate($mapData);

        $templateOptimizer = OptimizerFactory::fromTemplate($template);
        $templateOptimizer->doTasks();

        $output = $templateOptimizer->getOptiTemplate()->serialize(true);
        $this->log->info('Serialized 444: ' . $output . "\n");

        return $output;
    }
355
}
356