Passed
Push — dev ( b7aeac...61ab03 )
by Dispositif
03:20
created

ExternRefTransformer::isScientificDomain()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 10
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 5
nc 3
nop 0
dl 0
loc 10
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Application;
11
12
use App\Domain\ExternDomains;
13
use App\Domain\ExternPageFactory;
14
use App\Domain\Models\Wiki\AbstractWikiTemplate;
15
use App\Domain\Models\Wiki\ArticleTemplate;
16
use App\Domain\Models\Wiki\LienWebTemplate;
17
use App\Domain\Models\Wiki\OuvrageTemplate;
18
use App\Domain\Publisher\ExternMapper;
19
use App\Domain\Utils\WikiTextUtil;
20
use App\Domain\WikiTemplateFactory;
21
use App\Infrastructure\Logger;
22
use Codedungeon\PHPCliColors\Color;
23
use Normalizer;
24
use Psr\Log\LoggerInterface;
25
use Symfony\Component\Yaml\Yaml;
26
27
/**
28
 * todo move Domain
29
 * Class ExternRefTransformer
30
 *
31
 * @package App\Application
32
 */
33
class ExternRefTransformer implements TransformerInterface
34
{
35
36
    /** File to which process() appends a "domain,url" line when the mapped data is incomplete. */
    const SKIPPED_FILE_LOG  = __DIR__.'/resources/external_skipped.log';
37
    /** File to which process() writes the current domain when fetching the page throws. */
    const LOG_REQUEST_ERROR = __DIR__.'/resources/external_request_error.log';
38
    /** When true, isURLAutorized() rejects every domain that has no entry in the YAML config. */
    public $skipUnauthorised = true;
39
    /**
40
     * One label per converted reference, appended by addSummaryLog().
     *
     * @var array
41
     */
42
    public $summaryLog = [];
43
    /**
44
     * Logger injected through the constructor.
     *
     * @var LoggerInterface
45
     */
46
    protected $log;
47
    /** Content parsed from resources/config_presse.yaml; read with the domain as key. */
    private $config;
48
    /**
49
     * Result of ExternDomains::extractSubDomain() for the current URL; set by isURLAutorized().
     *
     * @var string|string[]
50
     */
51
    private $domain;
52
    /**
53
     * URL currently being processed; set by isURLAutorized().
     *
     * @var string
54
     */
55
    private $url;
56
    /**
57
     * Mapper whose process() receives the page data extracted by process().
     *
     * @var ExternMapper
58
     */
59
    private $mapper;
60
    /**
61
     * Decoded JSON datasets, stored under the keys 'newspaper', 'scientific domain' and 'scientific wiki'.
     *
     * @var array
62
     */
63
    private $data = [];
64
    /**
65
     * Domains to skip: loaded from resources/config_skip_domain.txt, extended by process()
     * whenever a mapping turns out incomplete.
     *
     * @var array
66
     */
67
    private $skip_domain = [];
68
    /**
69
     * Page built by ExternPageFactory::fromURL() during the current process() call.
     *
     * @var \App\Domain\ExternPage
70
     */
71
    private $externalPage;
72
73
    /**
     * ExternalRefTransformer constructor.
     *
     * Stores the logger, then loads every resource the transformer relies on:
     * the per-domain YAML configuration, the list of domains to skip, and the
     * three JSON datasets (newspapers, scientific domains, scientific wiki titles).
     *
     * @param LoggerInterface $log
     */
    public function __construct(LoggerInterface $log)
    {
        $this->log = $log;

        // todo REFAC DataObject[]
        $this->config = Yaml::parseFile(__DIR__.'/resources/config_presse.yaml');

        // file() keeps the trailing newline of each line unless FILE_IGNORE_NEW_LINES
        // is given; with the newline kept, no entry can ever equal the domain that
        // isURLAutorized() looks up with in_array(), so the skip list had no effect.
        $skipFromFile = file(
            __DIR__.'/resources/config_skip_domain.txt',
            FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES
        );
        $this->skip_domain = [];
        foreach (($skipFromFile ?: []) as $line) {
            // also drop stray spaces and "\r" left by files saved with CRLF endings
            $line = trim($line);
            if ($line !== '') {
                $this->skip_domain[] = $line;
            }
        }

        $this->data['newspaper'] = json_decode(file_get_contents(__DIR__.'/resources/data_newspapers.json'), true);
        $this->data['scientific domain'] = json_decode(
            file_get_contents(__DIR__.'/resources/data_scientific_domain.json'),
            true
        );
        $this->data['scientific wiki'] = json_decode(
            file_get_contents(__DIR__.'/resources/data_scientific_wiki.json'),
            true
        );

        // NOTE(review): the mapper gets its own Logger instance, not the injected $log — confirm intended.
        $this->mapper = new ExternMapper(new Logger());
    }
99
100
    /**
     * Converts a bare external URL into a serialized wiki template.
     *
     * The input string is returned unchanged whenever the URL is not authorised,
     * the page cannot be fetched, its extracted data is empty, the page is flagged
     * "noindex", or the mapped data lacks a URL or a title.
     *
     * @param string $string
     *
     * @return string
     * @throws \Exception
     */
    public function process(string $string): string
    {
        if (!$this->isURLAutorized($string)) {
            return $string;
        }

        // Defined up front so the checks below never read an unset variable
        // when the fetch throws.
        $pageData = [];
        try {
            sleep(5);
            $this->externalPage = ExternPageFactory::fromURL($string, $this->log);
            $pageData = $this->externalPage->getData();
            // the logger context must be an array
            $this->log->debug('metaData', is_array($pageData) ? $pageData : []);
        } catch (\Exception $e) {
            // do not generate a {lien brisé}: this may be a temporary 404
            $this->log->notice('erreur sur extractWebData '.$e->getMessage());
            // Appended (one domain per line) so earlier failures are kept; the
            // previous plain write replaced the whole file on every error.
            $this->appendToLogFile(self::LOG_REQUEST_ERROR, (string) $this->domain);
        }

        if (empty($pageData)
            || (empty($pageData['JSON-LD']) && empty($pageData['meta']))
        ) {
            // site with unusable HTML (or failed request)
            return $string;
        }

        // Case-insensitive match; a non-string value is ignored instead of
        // raising a TypeError under strict_types.
        if (isset($pageData['robots'])
            && is_string($pageData['robots'])
            && stripos($pageData['robots'], 'noindex') !== false
        ) {
            $this->log->notice('SKIP robots: noindex');

            return $string;
        }

        $mapData = $this->mapper->process($pageData);

        // the mapping must provide at least a URL and a title
        if (empty($mapData) || empty($mapData['url']) || empty($mapData['titre'])) {
            $this->skip_domain[] = $this->domain;
            $this->log->info('Mapping incomplet');
            // Todo : temp data
            $this->appendToLogFile(self::SKIPPED_FILE_LOG, $this->domain.",".$this->url);

            return $string;
        }

        $this->tagAndLog($mapData);
        $this->addSummaryLog($mapData);

        $template = $this->chooseTemplateByData($mapData);

        $mapData = $this->replaceSitenameByConfig($mapData, $template);
        $mapData = $this->replaceURLbyOriginal($mapData);

        $template->hydrate($mapData);

        $serialized = $template->serialize(true);
        $this->log->info($serialized."\n");

        // Normalizer::normalize() returns false on failure, which would violate
        // the declared string return type; fall back to the raw serialization.
        $normalized = Normalizer::normalize($serialized);

        return is_string($normalized) ? $normalized : $serialized;
    }

    /**
     * Appends one line to a diagnostic file, ignoring any failure.
     *
     * @param string $file path of the file to append to
     * @param string $line text to write, without trailing newline
     */
    private function appendToLogFile(string $file, string $line)
    {
        try {
            file_put_contents($file, $line."\n", FILE_APPEND);
        } catch (\Throwable $e) {
            // deliberate best-effort: a failed diagnostic write must not abort the conversion
            unset($e);
        }
    }
166
167
    /**
     * Tells whether the given string is a URL this transformer may process.
     *
     * On a syntactically valid URL, stores it in $this->url and its extracted
     * domain in $this->domain, then rejects the URL when the domain is in the
     * skip list, is not configured (while $skipUnauthorised is true), or is
     * marked 'desactived' in the configuration. For an accepted domain, the
     * configuration entry is normalised to an array.
     *
     * @param string $string
     *
     * @return bool
     * @throws \Exception
     */
    protected function isURLAutorized(string $string): bool
    {
        // "https?" makes the "s" optional. The former "http?s" made the "p"
        // optional instead, rejecting every http:// URL and accepting "htts://".
        if (!preg_match('#^https?://[^ ]+$#i', $string)) {
            return false;
        }

        $this->url = $string;
        $this->domain = ExternDomains::extractSubDomain($this->url);

        if (in_array($this->domain, $this->skip_domain, true)) {
            return false;
        }

        $domainConfig = $this->config[$this->domain] ?? null;

        if ($domainConfig === null) {
            $this->log->info("Domain ".$this->domain." non configuré\n");
            if ($this->skipUnauthorised) {
                return false;
            }
        } else {
            echo "> Domaine ".Color::LIGHT_GREEN.$this->domain.Color::NORMAL." configuré\n";
        }

        // Checked on the raw configuration value: once the entry has been
        // coerced to an array, the scalar form 'desactived' can no longer be seen.
        if ($domainConfig === 'desactived'
            || (is_array($domainConfig) && isset($domainConfig['desactived']))
        ) {
            $this->log->info("Domain ".$this->domain." desactivé\n");

            return false;
        }

        // Later steps index the entry like an array, so store one even for
        // unconfigured domains or scalar entries.
        $this->config[$this->domain] = is_array($domainConfig) ? $domainConfig : [];

        return true;
    }
206
207
    /**
     * Sends the mapped data to the debug log, then emits one notice for each
     * tag that applies: article data present, domain listed as a newspaper,
     * domain considered scientific.
     *
     * @param array $mapData
     */
    private function tagAndLog(array $mapData)
    {
        $this->log->debug('mapData', $mapData);

        // notice text => whether it applies (iterated in declaration order)
        $tags = [
            'Article OK' => !empty($mapData['DATA-ARTICLE']),
            'PRESSE' => isset($this->data['newspaper'][$this->domain]),
            'SCIENCE' => $this->isScientificDomain(),
        ];

        foreach ($tags as $notice => $applies) {
            if ($applies) {
                $this->log->notice($notice);
            }
        }
    }
221
222
    /**
     * Tells whether the current domain is considered a scientific source:
     * either it is a key of the 'scientific domain' dataset, or it is
     * revues.org or one of its subdomains.
     *
     * @return bool
     */
    private function isScientificDomain(): bool
    {
        $domain = $this->domain;

        // No usable domain yet; also keeps the string functions below from
        // receiving a non-string under strict_types.
        if (!is_string($domain) || $domain === '') {
            return false;
        }

        if (isset($this->data['scientific domain'][$domain])) {
            return true;
        }

        // The former strpos('.revues.org', $this->domain) had haystack and needle
        // swapped, so "journals.revues.org" was never recognised. Only the bare
        // "revues.org" matched; that case is kept explicitly.
        $suffix = '.revues.org';

        return $domain === 'revues.org'
            || substr($domain, -strlen($suffix)) === $suffix;
    }
233
234
    /**
     * Appends a label for the converted reference to the summary log:
     * the 'site' value when set, else the 'périodique' value, else '?'.
     *
     * @param array $mapData
     */
    private function addSummaryLog(array $mapData)
    {
        if (isset($mapData['site'])) {
            $label = $mapData['site'];
        } elseif (isset($mapData['périodique'])) {
            $label = $mapData['périodique'];
        } else {
            $label = '?';
        }

        $this->summaryLog[] = $label;
    }
238
239
    /**
     * Picks the wiki template ('article' or 'lien web') for the current domain.
     *
     * 'article' is chosen when the domain config asks for it, when article data
     * is present and the config says 'auto' or the domain is a known newspaper,
     * or when the domain is scientific. A config value of 'lien web' always wins.
     * Everything else falls back to 'lien web'.
     *
     * todo: refactor for readability
     *
     * @param array $mapData
     *
     * @return AbstractWikiTemplate
     */
    private function chooseTemplateByData(array $mapData): AbstractWikiTemplate
    {
        // make sure the 'template' entry exists for this domain
        if (!isset($this->config[$this->domain]['template'])) {
            $this->config[$this->domain]['template'] = [];
        }
        $configured = $this->config[$this->domain]['template'];

        $hasArticleData = $mapData['DATA-ARTICLE'] ?? false;
        $isNewspaper = !empty($this->data['newspaper'][$this->domain]);

        $articleWanted = $configured === 'article'
            || ($hasArticleData && ($configured === 'auto' || $isNewspaper))
            || $this->isScientificDomain();

        // an explicit 'lien web' in the config overrides every other rule
        $templateName = ($articleWanted && $configured !== 'lien web') ? 'article' : 'lien web';

        $wikiTemplate = WikiTemplateFactory::create($templateName);
        $wikiTemplate->userSeparator = " |";

        return $wikiTemplate;
    }
263
264
    /**
     * Replaces the periodical title or the site name in the mapped data.
     *
     * Sources are applied in this order, later ones overriding earlier ones:
     * the newspaper dataset, the scientific-journal dataset, the YAML config of
     * the domain, and finally the pretty domain name of the fetched page as a
     * fallback for an empty 'site' on a LienWebTemplate.
     *
     * @param array $mapData
     * @param       $template
     *
     * @return array
     */
    private function replaceSitenameByConfig(array $mapData, $template): array
    {
        $isLienWeb = $template instanceof LienWebTemplate;

        // 1. newspaper dataset, keyed by domain
        if (!empty($this->data['newspaper'][$this->domain])) {
            $newspaper = $this->data['newspaper'][$this->domain];
            $frwiki = $newspaper['frwiki'];
            $label = $newspaper['fr'];

            if ($isLienWeb || isset($mapData['site'])) {
                $mapData['site'] = WikiTextUtil::wikilink($label, $frwiki);
            }
            if ($template instanceof ArticleTemplate || isset($mapData['périodique'])) {
                $mapData['périodique'] = WikiTextUtil::wikilink($label, $frwiki);
            }
        }

        // 2. scientific-journal dataset, keyed by periodical title
        $journal = $mapData['périodique'] ?? null;
        if ($journal !== null && isset($this->data['scientific wiki'][$journal])) {
            $mapData['périodique'] = WikiTextUtil::wikilink(
                $journal,
                $this->data['scientific wiki'][$journal]
            );
        }

        // 3. YAML config of the domain
        $configuredSite = $this->config[$this->domain]['site'] ?? null;
        if ($isLienWeb && !empty($configuredSite)) {
            $mapData['site'] = $configuredSite;
        }

        $configuredPeriodical = $this->config[$this->domain]['périodique'] ?? null;
        if (!empty($configuredPeriodical)) {
            if (!empty($mapData['périodique']) || $template instanceof OuvrageTemplate) {
                $mapData['périodique'] = $configuredPeriodical;
            }
        }

        // 4. fallback: pretty domain name of the fetched page
        if ($isLienWeb && empty($mapData['site'])) {
            $mapData['site'] = $this->externalPage->getPrettyDomainName();
        }

        return $mapData;
    }
312
313
    /**
     * Returns the mapped data with its 'url' entry set to the URL stored in
     * $this->url, overriding whatever URL the mapping produced.
     *
     * @param array $mapData
     *
     * @return array
     */
    private function replaceURLbyOriginal(array $mapData): array
    {
        // array_replace overwrites 'url' in place (or appends it) and leaves all other keys untouched
        return array_replace($mapData, ['url' => $this->url]);
    }
318
319
}
320