|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* This file is part of dispositif/wikibot application (@github) |
|
4
|
|
|
* 2019/2020 © Philippe M. <[email protected]> |
|
5
|
|
|
* For the full copyright and MIT license information, please view the license file. |
|
6
|
|
|
*/ |
|
7
|
|
|
|
|
8
|
|
|
declare(strict_types=1); |
|
9
|
|
|
|
|
10
|
|
|
namespace App\Application; |
|
11
|
|
|
|
|
12
|
|
|
use App\Domain\Models\Wiki\OuvrageTemplate; |
|
13
|
|
|
use App\Domain\OuvrageComplete; |
|
14
|
|
|
use App\Domain\OuvrageFactory; |
|
15
|
|
|
use App\Domain\OuvrageOptimize; |
|
16
|
|
|
use App\Domain\Publisher\Wikidata2Ouvrage; |
|
17
|
|
|
use App\Domain\Utils\TemplateParser; |
|
18
|
|
|
use App\Infrastructure\Logger; |
|
19
|
|
|
use App\Infrastructure\Memory; |
|
20
|
|
|
use App\Infrastructure\WikidataAdapter; |
|
21
|
|
|
use Exception; |
|
22
|
|
|
use GuzzleHttp\Client; |
|
23
|
|
|
use Normalizer; |
|
24
|
|
|
use Psr\Log\LoggerInterface; |
|
25
|
|
|
use Psr\Log\NullLogger; |
|
26
|
|
|
use Throwable; |
|
27
|
|
|
|
|
28
|
|
|
/**
 * Worker that takes raw {{ouvrage}} citations from a queue adapter, optimizes
 * them, completes them with data looked up online by ISBN (BnF, Wikidata,
 * Google Books, OpenLibrary) and sends the result back through the adapter.
 *
 * @package App\Application
 */
class OuvrageCompleteWorker
{
    /**
     * ISBNs excluded from the BnF/Google/etc. online requests.
     * Format: EAN or ISBN-10, without hyphens.
     */
    public const ISBN_EAN_SKIP
        = [
            '9782918758440', // Profils de lignes du réseau ferré français vol.2
            '9782918758341', // Profils de lignes du réseau ferré français vol.1
        ];

    /**
     * @var QueueInterface Source of rows to complete and sink for completed data.
     */
    private $queueAdapter;

    /**
     * @var string Raw text of the citation currently being processed.
     */
    private $raw = '';

    /**
     * @var string|null Article title of the row currently being processed.
     */
    private $page;

    /**
     * @var array Summary entries accumulated from the optimizer and completer.
     */
    private $summaryLog = [];

    /**
     * @var bool True once the optimizer or completer reported a non-cosmetic change.
     */
    private $notCosmetic = false;

    /**
     * @var bool True once the completer flagged its result as major.
     */
    private $major = false;

    /**
     * @var OuvrageTemplate|null Working template for the current row.
     */
    private $ouvrage;

    /**
     * @var LoggerInterface|NullLogger
     */
    private $log;

    /**
     * @param QueueInterface       $queueAdapter
     * @param LoggerInterface|null $log Defaults to a NullLogger when omitted.
     */
    public function __construct(QueueInterface $queueAdapter, ?LoggerInterface $log = null)
    {
        $this->queueAdapter = $queueAdapter;
        $this->log = $log ?? new NullLogger();
    }

    /**
     * Process up to $limit rows from the queue, one per iteration.
     *
     * Note: a null $limit makes the loop condition false, so nothing is processed.
     *
     * @param int|null $limit Maximum number of rows to process.
     *
     * @return bool Always true when the loop ends normally.
     * @throws Exception When the queue adapter has no more row to process.
     * @throws Throwable When the Google lookup fails for a reason other than
     *                   an unresolved host (see onlineIsbnSearch()).
     */
    public function run(?int $limit = 10000)
    {
        $memory = new Memory();
        while ($limit > 0) {
            $limit--;
            sleep(1);
            $row = $this->getNewRow2Complete();
            $this->raw = $row['raw'];
            // Only 'raw' is validated by getNewRow2Complete(); 'page' may be absent.
            $this->page = $row['page'] ?? null;

            echo sprintf(
                "-------------------------------\n%s [%s]\n%s\n%s\n",
                date("Y-m-d H:i:s"),
                WikiBotConfig::getGitVersion() ?? '',
                $this->page,
                $this->raw
            );

            $this->log->info($memory->getMemory(true));

            // Reset the per-row state.
            $this->summaryLog = [];
            $this->ouvrage = null;
            $this->notCosmetic = false;
            $this->major = false;

            $origin = null;
            try {
                $parse = TemplateParser::parseAllTemplateByName('ouvrage', $this->raw);
                $origin = $parse['ouvrage'][0]['model'] ?? null;
            } catch (Throwable $e) {
                // Keep the failure reason; the row is reported and skipped below.
                $this->log->warning($e->getMessage());
            }

            if (!$origin instanceof OuvrageTemplate) {
                echo sprintf("*** ERREUR impossible de transformer en modèle %s \n", $this->raw);
                continue;
            }

            // Final optimizing (with online predictions)
            $optimizer = new OuvrageOptimize($origin, $this->page, new Logger());
            $optimizer->doTasks();
            $this->ouvrage = $optimizer->getOuvrage();
            $this->summaryLog = array_merge($this->summaryLog, $optimizer->getSummaryLog());
            $this->notCosmetic = ($optimizer->notCosmetic || $this->notCosmetic);

            // Online search. The ISBN is read from the original template,
            // i.e. before the EAN>ISBN formatting.
            $isbn = $origin->getParam('isbn');
            $isbn10 = $origin->getParam('isbn2') ?? $origin->getParam('isbn10');
            if (!empty($isbn)
                && !$origin->hasParamValue('isbn invalide')
                && !$origin->hasParamValue('isbn erroné')
            ) {
                $this->onlineIsbnSearch($isbn, $isbn10);
            }

            $this->sendCompleted();
            unset($optimizer);
            unset($parse);
            unset($origin);
        } // END WHILE

        return true;
    }

    /**
     * Get array (title+raw strings) to complete from AMQP queue, SQL Select or file reading.
     *
     * @return array Row with at least a non-empty 'raw' entry.
     * @throws Exception When the adapter returns no row or a row without 'raw'.
     */
    private function getNewRow2Complete(): array
    {
        $row = $this->queueAdapter->getNewRaw();
        if (empty($row) || empty($row['raw'])) {
            echo "STOP: no more queue to process \n";
            throw new Exception('no more queue to process');
        }

        return $row;
    }

    /**
     * Tell whether one of the given ISBNs is listed in ISBN_EAN_SKIP.
     * Hyphens are removed before the (strict) comparison.
     *
     * @param string      $isbn
     * @param string|null $isbn10
     *
     * @return bool
     */
    private function isIsbnSkipped(string $isbn, ?string $isbn10 = null): bool
    {
        if (in_array(str_replace('-', '', $isbn), self::ISBN_EAN_SKIP, true)) {
            return true;
        }

        return $isbn10 !== null
            && in_array(str_replace('-', '', $isbn10), self::ISBN_EAN_SKIP, true);
    }

    /**
     * Look the book up online and merge every result into the working template.
     *
     * Order: BnF (ISBN-10 first when available, then ISBN), Wikidata when the
     * BnF result carries infos, Google unless skipGoogle() says otherwise, and
     * OpenLibrary only when neither BnF nor Google produced a result.
     *
     * @param string      $isbn
     * @param string|null $isbn10
     *
     * @throws Throwable Google errors are rethrown unless the message contains
     *                   'Could not resolve host: www.googleapis.com'.
     */
    private function onlineIsbnSearch(string $isbn, ?string $isbn10 = null): void
    {
        if ($this->isIsbnSkipped($isbn, $isbn10)) {
            echo "*** SKIP THAT ISBN ***\n";

            // TODO: check the logic of this early return.
            return;
        }

        $this->log->info("sleep 10...\n");
        sleep(10);

        $bnfOuvrage = null;
        $googleOuvrage = null;

        try {
            $this->log->info('BIBLIO NAT FRANCE...');
            // BnF cannot find an old (ISBN-10) book from its ISBN-13: try the ISBN-10 first.
            if ($isbn10) {
                $bnfOuvrage = OuvrageFactory::BnfFromIsbn($isbn10);
                sleep(2);
            }
            if (!$isbn10 || empty($bnfOuvrage) || empty($bnfOuvrage->getParam('titre'))) {
                $bnfOuvrage = OuvrageFactory::BnfFromIsbn($isbn);
            }
            if ($bnfOuvrage instanceof OuvrageTemplate) {
                $this->completeOuvrage($bnfOuvrage);

                // Wikidata requests from $infos (ISBN/ISNI)
                if (!empty($bnfOuvrage->getInfos())) {
                    $this->log->info('WIKIDATA...');

                    // TODO move to factory
                    $wikidataAdapter = new WikidataAdapter(
                        new Client(['timeout' => 30, 'headers' => ['User-Agent' => getenv('USER_AGENT')]])
                    );
                    $wdComplete = new Wikidata2Ouvrage($wikidataAdapter, clone $bnfOuvrage, $this->page);
                    $this->completeOuvrage($wdComplete->getOuvrage());
                }
            }
        } catch (Throwable $e) {
            echo sprintf(
                "*** ERREUR BnF Isbn Search %s %s %s \n",
                $e->getMessage(),
                $e->getFile(),
                $e->getLine()
            );
        }

        if ($bnfOuvrage === null || !$this->skipGoogle($bnfOuvrage)) {
            try {
                $this->log->info('GOOGLE...');

                $googleOuvrage = OuvrageFactory::GoogleFromIsbn($isbn);
                $this->completeOuvrage($googleOuvrage);
            } catch (Throwable $e) {
                echo "*** ERREUR GOOGLE Isbn Search ***".$e->getMessage()."\n";
                // Only an unresolved Google host is tolerated; anything else aborts.
                if (strpos($e->getMessage(), 'Could not resolve host: www.googleapis.com') === false) {
                    throw $e;
                }
            }
        }

        if ($bnfOuvrage === null && $googleOuvrage === null) {
            try {
                $this->log->info('OpenLibrary...');
                $openLibraryOuvrage = OuvrageFactory::OpenLibraryFromIsbn($isbn);
                if (!empty($openLibraryOuvrage)) {
                    $this->completeOuvrage($openLibraryOuvrage);
                }
            } catch (Throwable $e) {
                echo '**** ERREUR OpenLibrary Isbn Search '.$e->getMessage()."\n";
            }
        }
    }

    /**
     * Optimize the online template, merge it into the working template,
     * re-optimize the merged result and accumulate the completer's flags
     * and summary log.
     *
     * @param OuvrageTemplate $onlineOuvrage Template built from an online source.
     */
    private function completeOuvrage(OuvrageTemplate $onlineOuvrage): void
    {
        $this->log->info($onlineOuvrage->serialize(true));
        $optimizer = new OuvrageOptimize($onlineOuvrage, $this->page, new Logger());
        $onlineOptimized = $optimizer->doTasks()->getOuvrage();

        $completer = new OuvrageComplete($this->ouvrage, $onlineOptimized, new Logger());
        $this->ouvrage = $completer->getResult();

        // todo move that optimizing in OuvrageComplete ?
        $optimizer = new OuvrageOptimize($this->ouvrage, $this->page, new Logger());
        $this->ouvrage = $optimizer->doTasks()->getOuvrage();

        $this->log->info('Summary', $completer->getSummaryLog());

        if ($completer->major) {
            $this->major = true;
        }
        $this->notCosmetic = ($completer->notCosmetic || $this->notCosmetic);
        $this->summaryLog = array_merge($this->summaryLog, $completer->getSummaryLog());
        unset($optimizer);
        unset($completer);
    }

    /**
     * Build the final data row for the current citation and hand it to the
     * queue adapter, then log whether the adapter reported success.
     */
    private function sendCompleted(): void
    {
        // The template may have no 'isbn' parameter: cast so that a null value
        // does not raise a TypeError under strict_types.
        $isbn13 = (string) ($this->ouvrage->getParam('isbn') ?? '');

        $finalData = [
            //    'page' =>
            'raw' => $this->raw,
            'opti' => $this->serializeFinalOpti(),
            'optidate' => date("Y-m-d H:i:s"),
            'modifs' => mb_substr(implode(',', $this->summaryLog), 0, 250),
            'notcosmetic' => ($this->notCosmetic) ? 1 : 0,
            'major' => ($this->major) ? 1 : 0,
            'isbn' => mb_substr($isbn13, 0, 20),
            'version' => WikiBotConfig::getGitVersion(),
        ];
        $this->log->info('finalData', $finalData);
        // Json ?
        $result = $this->queueAdapter->sendCompletedData($finalData);

        $this->log->notice($result ? 'OK DB' : 'erreur sendCompletedData()');
    }

    /**
     * Final serialization of the completed OuvrageTemplate.
     *
     * The serialized text is Unicode-normalized; when normalization fails
     * (Normalizer::normalize() returns false) the un-normalized text is
     * returned instead of violating the string return type.
     *
     * @return string
     */
    private function serializeFinalOpti(): string
    {
        $finalOpti = $this->ouvrage->serialize(true);
        $normalized = Normalizer::normalize($finalOpti);

        return is_string($normalized) ? $normalized : $finalOpti;
    }

    /**
     * Tell whether the Google lookup can be skipped: the BnF result has a
     * 'titre' value and the working template already has a 'lire en ligne'
     * or 'présentation en ligne' value.
     *
     * @param OuvrageTemplate|mixed $bnfOuvrage Result of the BnF lookup.
     *
     * @return bool
     */
    private function skipGoogle($bnfOuvrage): bool
    {
        return $bnfOuvrage instanceof OuvrageTemplate
            && $bnfOuvrage->hasParamValue('titre')
            && ($this->ouvrage->hasParamValue('lire en ligne')
                || $this->ouvrage->hasParamValue('présentation en ligne'));
    }
}
|
351
|
|
|
|
This function has been deprecated. The supplier of the function has supplied an explanatory message.
The explanatory message should give you some clue as to whether and when the function will be removed and what other function to use instead.