Passed
Branch dev (b7aeac)
by Dispositif
03:10
created

GoogleTransformer::extractGoogleExternal()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 14
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 7
nc 2
nop 1
dl 0
loc 14
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain;
11
12
use App\Domain\Models\Wiki\GoogleLivresTemplate;
13
use App\Domain\Models\Wiki\OuvrageTemplate;
14
use App\Domain\Publisher\GoogleBookMapper;
15
use App\Domain\Utils\NumberUtil;
16
use App\Domain\Utils\WikiTextUtil;
17
use App\Infrastructure\GoogleApiQuota;
18
use App\Infrastructure\GoogleBooksAdapter;
19
use DomainException;
20
use Exception;
21
use Throwable;
22
23
/**
 * Replace bare Google Books URLs of an article wikitext by {{Ouvrage}} citations.
 *
 * Two kinds of occurrences are converted:
 *  - references : <ref>https://books.google...</ref> or {{ref|https://books.google...}}
 *  - list items : "* https://books.google..."
 *
 * TODO REFAC : extract methods in trait or in RefBotWorker + ExternBotWorker
 *
 * @package App\Domain
 */
class GoogleTransformer
{
    const SLEEP_GOOGLE_API_INTERVAL = 5;

    /**
     * OuvrageTemplate instances already generated, indexed by Google Books ID.
     *
     * @var OuvrageTemplate[]
     */
    private $cacheOuvrageTemplate = [];

    /**
     * @var GoogleApiQuota
     */
    private $quota;

    /**
     * GoogleTransformer constructor.
     * todo dependency injection
     */
    public function __construct()
    {
        $this->quota = new GoogleApiQuota();
    }

    /**
     * Process a page wikitext : convert the Google Books references first,
     * then the Google Books URLs alone on a list item line.
     *
     * @param string $text Page wikitext
     *
     * @return string New wikitext
     * @throws DomainException When the quota object reports the quota as reached
     * @throws Throwable
     */
    public function process(string $text): string
    {
        $this->assertQuotaNotReached();

        $refsData = $this->extractAllGoogleRefs($text);
        if (!empty($refsData)) {
            $text = $this->processRef($text, $refsData);
        }

        // Extraction is done on the text already modified by the reference pass.
        $links = $this->extractGoogleExternal($text);
        if (!empty($links)) {
            $text = $this->processExternLinks($text, $links);
        }

        return $text;
    }

    /**
     * Extract the lines made of a list bullet followed only by a Google Books URL.
     * todo move
     *
     * @param string $text Page wikitext
     *
     * @return array preg_match_all() result in PREG_SET_ORDER :
     *               [0 => ['* http...', 'http...'], 1 => ...], or [] when nothing matches
     */
    public function extractGoogleExternal(string $text): array
    {
        // match "* https://books.google.fr/..."
        if (preg_match_all(
            '#^\* *('.GoogleLivresTemplate::GOOGLEBOOKS_START_URL_PATTERN.'[^ <{\]}\n\r]+) *$#im',
            $text,
            $matches,
            PREG_SET_ORDER
        )
        ) {
            return $matches;
        }

        return [];
    }

    /**
     * Convert GoogleBooks URL to wiki-template {ouvrage} citation.
     * TODO : extract
     *
     * When the Google Books lookup fails with the "volume ID could not be found"
     * message, a {{lien brisé}} template is returned instead of an {{ouvrage}}.
     *
     * @param string $url GoogleBooks URL
     *
     * @return string {{ouvrage}} (or {{lien brisé}}) wikitext
     * @throws DomainException URL not recognized, ID missing or book without title
     * @throws Exception
     * @throws Throwable Any other failure of the Google data retrieval is rethrown
     */
    public function convertGBurl2OuvrageCitation(string $url): string
    {
        if (!GoogleLivresTemplate::isGoogleBookURL($url)) {
            throw new DomainException('Pas de URL Google Books');
        }

        $gooDat = GoogleLivresTemplate::parseGoogleBookQuery($url);
        if (empty($gooDat['id'])) {
            throw new DomainException('Pas de ID Google Books');
        }

        try {
            $ouvrage = $this->generateOuvrageFromGoogleData($gooDat['id']);
        } catch (Throwable $e) {
            // The ID does not exist on Google Books.
            // Strict comparison : strpos() returns 0 (falsy) when the needle starts the message.
            if (false !== strpos($e->getMessage(), '"message": "The volume ID could n')) {
                return sprintf(
                    '{{lien brisé |url= %s |titre= %s |brisé le=%s}}',
                    $url,
                    'Ouvrage inexistant sur Google Books',
                    date('d-m-Y')
                );
            }
            throw $e;
        }

        $cleanUrl = GoogleLivresTemplate::simplifyGoogleUrl($url);
        $ouvrage->unsetParam('présentation en ligne');
        $ouvrage->setParam('lire en ligne', $cleanUrl);
        $ouvrage->userSeparator = ' |';

        // Missing title
        if (!$ouvrage->hasParamValue('titre')) {
            throw new DomainException("Ouvrage sans titre (data Google?)");
        }

        // Google page => 'passage'
        if (!empty($gooDat['pg'])) {
            $page = null;

            if (preg_match('#(?:PA|PT)([0-9]+)$#', $gooDat['pg'], $matches)) {
                // Skip page=1, page=2 (default view on Google Books)
                if (intval($matches[1]) >= 3) {
                    $page = $matches[1];
                }
            }
            // "PR" page number : converted to Roman numerals
            if (preg_match('#PR([0-9]+)$#', $gooDat['pg'], $matches)) {
                // Skip page=1, page=2 (default view on Google Books)
                if (intval($matches[1]) >= 3) {
                    $page = NumberUtil::arab2roman(intval($matches[1]), true);
                }
            }

            if (!empty($page)) {
                $ouvrage->setParam('passage', $page);
                // add a '<!-- utile? -->' comment ?
            }
        }

        $optimizer = new OuvrageOptimize($ouvrage);
        $optimizer->doTasks();
        $ouvrage2 = $optimizer->getOuvrage();

        return $ouvrage2->serialize();
    }

    /**
     * Throw when the quota object reports the Google quota as reached.
     *
     * @throws DomainException
     */
    private function assertQuotaNotReached(): void
    {
        if ($this->quota->isQuotaReached()) {
            throw new DomainException('Quota Google atteint');
        }
    }

    /**
     * Replace the list-item Google Books URLs by their citation.
     *
     * @param string $text  Page wikitext
     * @param array  $links Matches from extractGoogleExternal()
     *
     * @return string New wikitext
     * @throws Throwable
     */
    private function processExternLinks(string $text, array $links): string
    {
        return $this->replaceUrlsByCitations($text, $links);
    }

    /**
     * Replace the Google Books URLs inside references by their citation.
     *
     * @param string $text     Page wikitext
     * @param array  $refsData Matches from extractAllGoogleRefs()
     *
     * @return string New wikitext
     * @throws Throwable
     */
    private function processRef(string $text, array $refsData): string
    {
        // Historical behavior : references wait 2 extra seconds after each successful conversion.
        return $this->replaceUrlsByCitations($text, $refsData, 2);
    }

    /**
     * Shared loop of processRef() and processExternLinks().
     *
     * For each match, the captured URL ($match[1]) is converted to a citation,
     * a final point is appended, and every occurrence of the full match ($match[0])
     * is replaced in the text. A match whose conversion throws an Exception
     * is reported on standard output and left unchanged.
     *
     * @param string $text       Page wikitext
     * @param array  $matches    PREG_SET_ORDER matches : [0 => full match, 1 => URL]
     * @param int    $extraSleep Seconds to wait right after a successful conversion
     *
     * @return string New wikitext
     * @throws DomainException When the quota object reports the quota as reached
     * @throws Throwable
     */
    private function replaceUrlsByCitations(string $text, array $matches, int $extraSleep = 0): string
    {
        foreach ($matches as $match) {
            // Kept outside the try block : a reached quota must abort the whole page.
            $this->assertQuotaNotReached();

            try {
                $citation = $this->convertGBurl2OuvrageCitation(WikiTextUtil::stripFinalPoint($match[1]));
                if ($extraSleep > 0) {
                    sleep($extraSleep);
                }
            } catch (Exception $e) {
                echo "Exception ".$e->getMessage();
                continue;
            }

            // Final point added after the citation
            $citation .= '.';

            $replacement = str_replace($match[1], $citation, $match[0]);
            echo $replacement."\n";

            $text = str_replace($match[0], $replacement, $text);

            echo "sleep ".self::SLEEP_GOOGLE_API_INTERVAL."\n";
            sleep(self::SLEEP_GOOGLE_API_INTERVAL);
        }

        return $text;
    }

    /**
     * Generate wiki-template {ouvrage} from GoogleBook ID.
     * todo: move (injection) to other class.
     *
     * The generated template is cached by ID ; clones are stored and returned
     * so that changes made by the caller do not alter the cached instance.
     *
     * @param string $id GoogleBooks ID
     *
     * @return OuvrageTemplate
     * @throws Exception
     */
    private function generateOuvrageFromGoogleData(string $id): OuvrageTemplate
    {
        // return cached OuvrageTemplate
        if (isset($this->cacheOuvrageTemplate[$id])) {
            return clone $this->cacheOuvrageTemplate[$id];
        }

        // Get Google data by ID ZvhBAAAAcAAJ
        $adapter = new GoogleBooksAdapter();
        $volume = $adapter->getDataByGoogleId($id);

        $mapper = new GoogleBookMapper();
        $mapper->mapLanguageData(true);
        $data = $mapper->process($volume);

        // Generate wiki-template {ouvrage}
        $ouvrage = WikiTemplateFactory::create('ouvrage');
        $ouvrage->hydrate($data);

        // cache
        $this->cacheOuvrageTemplate[$id] = clone $ouvrage;

        return $ouvrage;
    }

    /**
     * Extract all <ref>/{ref} with only GoogleBooks URL.
     * Todo : strip the final point of the URL
     *
     * @param string $text Page wikitext
     *
     * @return array [0 => ['<ref>http...</ref>', 'http://'], 1 => ...]
     */
    private function extractAllGoogleRefs(string $text): array
    {
        // <ref>...</ref> or {{ref|...}}
        // GoogleLivresTemplate::GOOGLEBOOK_URL_PATTERN
        if (preg_match_all(
            '#(?:<ref[^>]*>|{{ref\|) ?('.GoogleLivresTemplate::GOOGLEBOOKS_START_URL_PATTERN
            .'[^>\]} \n]+) ?(?:</ref>|}})#i',
            $text,
            $matches,
            PREG_SET_ORDER
        )
        ) {
            return $matches;
        }

        return [];
    }
}
306