1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* This file is part of dispositif/wikibot application |
4
|
|
|
* 2019 © Philippe M. <[email protected]> |
5
|
|
|
* For the full copyright and MIT license information, please view the LICENSE file. |
6
|
|
|
*/ |
7
|
|
|
|
8
|
|
|
declare(strict_types=1); |
9
|
|
|
|
10
|
|
|
namespace App\Domain\Publisher; |
11
|
|
|
|
12
|
|
|
use App\Domain\Enums\Language; |
13
|
|
|
use App\Domain\Utils\ArrayProcessTrait; |
14
|
|
|
use DateTime; |
15
|
|
|
use Exception; |
16
|
|
|
use Psr\Log\LoggerInterface; |
17
|
|
|
|
18
|
|
|
/** |
19
|
|
|
* Generic mapper for press/revue article on web. |
20
|
|
|
* Using JSON-LD and meta tags to obtain {article} data. |
21
|
|
|
* Generic mapper for web pages URL to wiki-template references. |
22
|
|
|
* Converting to {article}, {lien web} or {lien brisé} |
23
|
|
|
* Using JSON-LD, Open Graph and Dublin Core meta extracted from HTML. |
24
|
|
|
* Class ExternMapper |
25
|
|
|
* |
26
|
|
|
* @package App\Domain\Publisher |
27
|
|
|
*/ |
28
|
|
|
class ExternMapper implements MapperInterface |
29
|
|
|
{ |
30
|
|
|
use ArrayProcessTrait, ExternOGMapperTrait, ExternLDMapperTrait; |
31
|
|
|
|
32
|
|
|
/** |
33
|
|
|
* @var LoggerInterface |
34
|
|
|
*/ |
35
|
|
|
private $log; |
36
|
|
|
|
37
|
|
|
/**
 * @param LoggerInterface $log logger kept on the instance as $this->log
 */
public function __construct(LoggerInterface $log)
{
    $this->log = $log;
}
41
|
|
|
|
42
|
|
|
/**
 * Maps the extracted page data, then post-processes any non-empty result.
 *
 * @param mixed $data extracted data, forwarded as is to processMapping()
 *
 * @return array the post-processed mapping, or [] when nothing was mapped
 */
public function process($data): array
{
    $mapped = $this->processMapping($data);
    if (empty($mapped)) {
        return [];
    }

    return $this->postProcess($mapped);
}
48
|
|
|
|
49
|
|
|
/**
 * Builds the mapping from the 'JSON-LD' and 'meta' entries of $data.
 *
 * A non-empty JSON-LD mapping takes precedence; it only borrows 'langue' from the
 * meta mapping when it has none of its own (and is then tagged 'JSON-LD+META').
 * Otherwise the meta mapping is returned, which may be empty.
 *
 * @param mixed $data extracted data, read through its 'JSON-LD' and 'meta' keys
 *
 * @return array
 */
protected function processMapping($data): array
{
    $fromJsonLd = empty($data['JSON-LD']) ? [] : $this->processJsonMapping($data['JSON-LD']);
    $fromMeta = empty($data['meta']) ? [] : $this->mapLienwebFromMeta($data['meta']);

    if (empty($fromJsonLd)) {
        return $fromMeta;
    }

    // Language missing from JSON-LD: copy that single key, a full array_merge risks duplicates.
    $borrowLangue = !isset($fromJsonLd['langue']) && isset($fromMeta['langue']);
    if ($borrowLangue) {
        $fromJsonLd['langue'] = $fromMeta['langue'];
        $fromJsonLd['DATA-TYPE'] = 'JSON-LD+META';
    }

    return $fromJsonLd;
}
72
|
|
|
|
73
|
|
|
/**
 * todo move to mapper ?
 *
 * Maps the first JSON-LD object accepted by checkJSONLD(): $LDdata itself, else its
 * array children in order (pages exposing several objects, e.g. Figaro).
 *
 * @param array $LDdata
 *
 * @return array the mapped article data, or [] when no object qualifies
 */
private function processJsonMapping(array $LDdata): array
{
    // The root object is tried first, then every nested array in its original order.
    $candidates = [$LDdata];
    foreach ($LDdata as $child) {
        if (is_array($child)) {
            $candidates[] = $child;
        }
    }

    foreach ($candidates as $candidate) {
        if ($this->checkJSONLD($candidate)) {
            return $this->mapArticleDataFromJSONLD($candidate);
        }
    }

    return [];
}
94
|
|
|
|
95
|
|
|
/**
 * Tells whether a JSON-LD object carries both a 'headline' and an '@type' entry.
 *
 * @param array $jsonLD
 *
 * @return bool
 */
protected function checkJSONLD(array $jsonLD): bool
{
    return isset($jsonLD['headline'], $jsonLD['@type']);
}
99
|
|
|
|
100
|
|
|
/**
 * todo Refac/move domain special mapping
 * todo Config parameter for post-process
 *
 * Final clean-up of a mapping: empty values are removed through deleteEmptyValueArray(),
 * a 'langue' equal to 'fr' is dropped, and 'format' is dropped for the 'Gallica' site.
 *
 * @param array $dat
 *
 * @return array
 */
protected function postProcess(array $dat): array
{
    $cleaned = $this->deleteEmptyValueArray($dat);

    if ('fr' === ($cleaned['langue'] ?? null)) {
        unset($cleaned['langue']);
    }

    // Annoying: Gallica declares "vidéo" for a digitized book, so its format is discarded.
    if ('Gallica' === ($cleaned['site'] ?? null)) {
        unset($cleaned['format']);
    }

    return $cleaned;
}
122
|
|
|
|
123
|
|
|
/**
 * Tells whether a type label designates an article ('article' or 'journalArticle').
 *
 * @param string|null $str
 *
 * @return bool
 */
protected function isAnArticle(?string $str): bool
{
    return in_array($str, ['article', 'journalArticle'], true);
}
131
|
|
|
|
132
|
|
|
/**
 * Derives the URL access level ('ouvert' or 'limité') from the first available hint:
 * 'isAccessibleForFree', then 'DC.rights', then 'og:article:content_tier'.
 *
 * @param mixed $data extracted data, read through the three keys above
 *
 * @return string|null 'ouvert', 'limité', or null when no hint is present
 */
protected function convertURLaccess($data): ?string
{
    // NYT, Figaro
    if (isset($data['isAccessibleForFree'])) {
        $free = $data['isAccessibleForFree'];
        if (is_string($free)) {
            // The flag may arrive as text ("False", "false", "no"...): a plain truthiness
            // test would read any such non-empty string as "free".
            $parsed = filter_var($free, FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE);
            // Unrecognised text keeps the previous truthiness-based reading.
            $free = $parsed ?? (bool) $free;
        }

        return $free ? 'ouvert' : 'limité';
    }
    if (isset($data['DC.rights'])) {
        // Strict comparison: a non-string value (0, true...) must not match by type juggling.
        return in_array($data['DC.rights'], ['free', 'public domain'], true) ? 'ouvert' : 'limité';
    }
    if (isset($data['og:article:content_tier'])) {
        return ('free' === $data['og:article:content_tier']) ? 'ouvert' : 'limité';
    }

    return null;
}
147
|
|
|
|
148
|
|
|
/**
 * Shortens an author list naming four or more authors: only the first two are kept.
 * With $modeEtAl=true the method instead answers whether "et al.=oui" applies.
 * TODO : wikifyPressAgency()
 *
 * @param string|null $authors authors separated by ';' or, failing that, by ','
 * @param bool        $modeEtAl when true, return 'oui' (list was too long) or null
 *
 * @return string|null null for an empty input; otherwise see $modeEtAl
 */
protected function authorsEtAl(?string $authors, $modeEtAl = false): ?string
{
    if (empty($authors)) {
        return null;
    }
    // keeps just the leading authors TODO : refactor
    // Bob, Martin ; Yul, Bar ; ... ; ...
    if (preg_match('#([^;]+;[^;]+);[^;]+;.+#', $authors, $matches)) {
        return ($modeEtAl) ? 'oui' : $matches[1];
    }
    // Bob Martin, Yul Bar, ..., ...,...
    // Only when ';' is not the separator: in "Last, First ; Last, First ; Last, First" the
    // commas sit inside each name, and counting them would cut an author in half.
    if (false === strpos($authors, ';')
        && preg_match('#([^,]+,[^,]+),[^,]+,.+#', $authors, $matches)
    ) {
        return ($modeEtAl) ? 'oui' : $matches[1];
    }

    return ($modeEtAl) ? null : $authors;
}
175
|
|
|
|
176
|
|
|
/**
 * Builds the page reference from 'citation_firstpage' and, when present,
 * 'citation_lastpage' (joined with an en dash).
 *
 * @param array $meta
 *
 * @return string|null null when 'citation_firstpage' is not set
 */
protected function convertDCpage(array $meta): ?string
{
    if (!isset($meta['citation_firstpage'])) {
        return null;
    }

    $pages = isset($meta['citation_lastpage'])
        ? $meta['citation_firstpage'].'–'.$meta['citation_lastpage']
        : $meta['citation_firstpage'];

    return (string) $pages;
}
189
|
|
|
|
190
|
|
|
// TODO encoding + normalizer
/**
 * Cleans a scraped text value: apostrophe entities become "'", line feeds are removed,
 * "&nbsp;" becomes a plain space, "|" becomes "/", then remaining HTML entities are decoded.
 *
 * @param string|null $str
 *
 * @return string|null null when $str is null
 */
public function clean(?string $str = null): ?string
{
    if ($str === null) {
        return null;
    }

    // NOTE(review): the search literals below are reconstructed. In the reviewed copy the
    // HTML entities had been decoded into bare characters, which left unbalanced quotes.
    // The apostrophe, non-breaking-space and e-acute spellings are assumed: confirm
    // against version control.
    $str = str_replace(
        ['&#39;', '&#039;', '&#x27;', '&apos;', "\n", '&nbsp;', '|', '&eacute;'],
        ["'", "'", "'", "'", '', ' ', '/', 'é'],
        $str
    );

    return html_entity_decode($str);
}
201
|
|
|
|
202
|
|
|
/**
 * Converts an Open Graph type into a wiki "format" value.
 * og:type = default: website / video.movie / video.tv_show video.other / article, book, profile
 *
 * @param string|null $ogType
 *
 * @return string|null 'vidéo' or 'livre', null for any other (or empty) type
 */
protected function convertOGtype2format(?string $ogType): ?string
{
    if (empty($ogType)) {
        return null;
    }

    // Checked in this order: the first fragment found in the type wins.
    $formats = ['video' => 'vidéo', 'book' => 'livre'];
    foreach ($formats as $fragment => $format) {
        if (false !== strpos($ogType, $fragment)) {
            return $format;
        }
    }

    return null;
}
217
|
|
|
|
218
|
|
|
/**
 * Converts a language value to its wiki code: the two-letter prefix for an "xx_YY"
 * locale, otherwise whatever Language::all2wiki() returns.
 * https://developers.facebook.com/docs/internationalization#locales
 *
 * @param string|null $lang
 *
 * @return string|null null for an empty input
 */
protected function convertLangue(?string $lang = null): ?string
{
    if (empty($lang)) {
        return null;
    }

    // en_GB
    $isLocale = (bool) preg_match('#^([a-z]{2})_[A-Z]{2}$#', $lang, $parts);

    return $isLocale ? $parts[1] : Language::all2wiki($lang);
}
233
|
|
|
|
234
|
|
|
/**
 * Extracts the author at position $indice (0 = first author) from the 'author' entry.
 *
 * Handled shapes:
 *  - author = 'Bob'                                (single author, index 0 only)
 *  - author = ['name' => 'Bob', '@type' => 'Person'] (single author, index 0 only)
 *  - author = [0 => ['name' => 'Bob'], 1 => ...]    (list, one entry per index)
 * An entry whose '@type' is set to anything other than 'Person' is skipped.
 * A 'name' given as a list yields its first element.
 *
 * @param mixed $data   extracted data, read through its 'author' key
 * @param int   $indice zero-based author position
 *
 * @return string|null the entity-decoded name, or null when there is no usable author
 */
protected function convertAuteur($data, $indice)
{
    if (!isset($data['author'])) {
        return null;
    }
    $author = $data['author'];

    // author=Bob : one plain string is the first author. Letting it reach the array
    // lookups below would read single characters through string offsets.
    if (is_string($author)) {
        return (0 === $indice) ? html_entity_decode($author) : null;
    }
    if (!is_array($author)) {
        return null;
    }

    $candidates = [];
    // author ['name'=>'Bob','@type'=>'Person']
    if (0 === $indice && isset($author['name'])) {
        $candidates[] = $author;
    }
    // author [ 0 => ['name'=>'Bob'], 1=> ...]
    if (isset($author[$indice])) {
        $candidates[] = $author[$indice];
    }

    foreach ($candidates as $entry) {
        // author [ 0 => 'Bob', 1 => ... ]
        if (is_string($entry)) {
            return html_entity_decode($entry);
        }
        if (!is_array($entry)) {
            continue;
        }
        if (isset($entry['@type']) && 'Person' !== $entry['@type']) {
            continue;
        }

        $name = $entry['name'] ?? null;
        // "author" => [ "@type" => "Person", "name" => [] ]
        if (is_array($name)) {
            $name = $name[0] ?? null;
        }
        // A missing or empty name must not reach html_entity_decode(): null is a
        // TypeError under strict_types.
        if (is_string($name)) {
            return html_entity_decode($name);
        }
    }

    return null;
}
270
|
|
|
|
271
|
|
|
/**
 * Returns the name of the first listed author when that author is explicitly typed
 * as something other than a 'Person' (institutional author).
 *
 * @param mixed $data extracted data, read through $data['author'][0]
 *
 * @return string|null the entity-decoded name, or null when the first author is a
 *                     person, is untyped, or has no usable name
 */
protected function convertInstitutionnel($data)
{
    $first = $data['author'][0] ?? null;
    if (!is_array($first) || !isset($first['@type']) || 'Person' === $first['@type']) {
        return null;
    }

    $name = $first['name'] ?? null;
    if (is_array($name)) {
        $name = $name[0] ?? null;
    }

    // A missing name must not reach html_entity_decode(): null is a TypeError under strict_types.
    return is_string($name) ? html_entity_decode($name) : null;
}
281
|
|
|
|
282
|
|
|
/**
 * Normalises a date string to the "d-m-Y" format.
 * A bare four-digit year starting with 1 or 2 is returned as is; a value DateTime
 * cannot parse is logged and returned unchanged.
 *
 * @param string|null $str
 *
 * @return string|null null for an empty input
 */
protected function convertDate(?string $str): ?string
{
    if (empty($str)) {
        return null;
    }

    // "2012"
    if (preg_match('#^[12][0-9]{3}$#', $str)) {
        return $str;
    }

    try {
        $date = new DateTime($str);
    } catch (Exception $e) {
        // Reported through the injected logger: the dump() debug helper is not a PHP
        // built-in, so calling it is fatal wherever it is not installed.
        $this->log->warning('EXCEPTION DATE', ['date' => $str, 'message' => $e->getMessage()]);

        return $str;
    }

    return $date->format('d-m-Y');
}
308
|
|
|
|
309
|
|
|
/**
 * Wraps press-agency names and acronyms in wikilinks.
 * A string already containing "[" is returned untouched, so existing wikilinks are
 * never re-linked.
 *
 * @param string|null $str
 *
 * @return string|null null for an empty input
 */
protected function wikifyPressAgency(?string $str): ?string
{
    if (empty($str)) {
        return null;
    }
    // skip potential wikilinks
    if (false !== strpos($str, '[')) {
        return $str;
    }

    // Ordered rules: [is regex?, search, replacement]. Each rule runs on the output of
    // the previous one.
    $rules = [
        [true, '#\b(AFP)\b#i', '[[Agence France-Presse|AFP]]'],
        [false, 'Reuters', '[[Reuters]]'],
        [false, 'Associated Press', '[[Associated Press]]'],
        [true, '#\b(PA)\b#', '[[Press Association|PA]]'],
        [true, '#\b(AP)\b#', '[[Associated Press|AP]]'],
        [false, 'Xinhua', '[[Xinhua]]'],
        [true, '#\b(ATS)\b#', '[[Agence télégraphique suisse|ATS]]'],
        [true, '#\b(PC|CP)\b#', '[[La Presse canadienne|PC]]'],
    ];

    foreach ($rules as [$isRegex, $search, $link]) {
        $str = $isRegex
            ? preg_replace($search, $link, $str)
            : str_replace($search, $link, $str);
    }

    return $str;
}
336
|
|
|
} |
337
|
|
|
|