Passed
Push — master ( 5b891d...3a1d1d )
by Dispositif
03:12
created

GoogleBooksUtil::parseGoogleDomain()   A

Complexity

Conditions 3
Paths 2

Size

Total Lines 9
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 3

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
eloc 4
c 1
b 0
f 0
nc 2
nop 1
dl 0
loc 9
ccs 5
cts 5
cp 1
crap 3
rs 10
1
<?php
2
/*
 * This file is part of dispositif/wikibot application (@github)
 * 2019-2023 © Philippe M./Irønie  <[email protected]>
 * For the full copyright and MIT license information, view the license file.
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Publisher;
11
12
use App\Domain\Utils\ArrayProcessTrait;
13
use DomainException;
14
use Exception;
15
16
/**
 * Static helpers for recognising, parsing and simplifying Google Books URLs.
 *
 * Two URL families are handled:
 *  - the old format, e.g. https://books.google.fr/books?id=U4NmPwAACAAJ&pg=PA12
 *  - the new format (nov 2019), e.g.
 *    https://www.google.fr/books/edition/_/43cIAQAAMAAJ?gbpv=1&dq=orgues+basilique+saint+quentin
 */
abstract class GoogleBooksUtil
{
    use ArrayProcessTrait;

    final public const DEFAULT_GOOGLEBOOKS_URL = 'https://books.google.com/books';

    /**
     * Regex fragment (no delimiters) matching the beginning of a Google Books URL: either an old-format URL
     * up to and including "id=" / "isbn=", or the new-format ".../books/edition/<title>/" prefix.
     *
     * todo refac regex with end of URL
     */
    final public const GOOGLEBOOKS_START_URL_PATTERN = '(?:https?://(?:books|play)\.google\.[a-z\.]{2,6}/(?:books)?(?:books/[^\?]+\.html)?(?:/reader)?\?(?:[a-zA-Z=&]+&)?(?:[&=A-Z0-9-_%\+]+&)?(?:id|isbn)=|https://www\.google\.[a-z\.]{2,6}/books/edition/[^/]+/)';

    /**
     * Regex fragment (no delimiters) matching the prefix of a new-format URL, up to the book ID.
     */
    final public const GOOGLEBOOKS_NEW_START_URL_PATTERN = 'https://www\.google\.[a-z.]{2,6}/books/edition/[^/]+/';

    /**
     * Regex fragment (no delimiters, no anchors) describing a 12-character Google Books ID.
     */
    final public const GOOGLEBOOKS_ID_REGEX = '[0-9A-Za-z_\-]{12}';

    /**
     * Query parameters preserved by simplifyGoogleUrl(), in output order.
     *
     * todo : add frontcover ?
     * q : keywords search (may be empty) / dq : quoted phrase search
     */
    final public const GOOGLEBOOKS_KEEP_PARAMETERS = ['id', 'isbn', 'pg', 'printsec', 'q', 'dq', 'gbpv'];

    /**
     * Query parameter names treated as tracking data by isTrackingUrl().
     */
    final public const TRACKING_PARAMETERS = [
        'xtor',
        'ved',
        'ots',
        'sig',
        'source',
        'utm_source',
        'utm_medium',
        'utm_campaign',
        'utm_term',
        'utm_content',
    ];

    /**
     * Check if URL contains tracking parameters.
     *
     * @return bool true when at least one TRACKING_PARAMETERS name appears in the ?query or #fragment
     */
    public static function isTrackingUrl(string $url): bool
    {
        $trackingKeys = array_flip(self::TRACKING_PARAMETERS);

        return array_intersect_key($trackingKeys, self::parseGoogleBookQuery($url)) !== [];
    }

    /**
     * Parse URL arguments from ?query and #fragment into one array with lower-cased keys.
     * Do not remove empty values.
     *
     * @return array parameter name => value, as produced by parse_str()
     */
    public static function parseGoogleBookQuery(string $url): array
    {
        // parse_url() gives null for a missing component and false for a seriously malformed URL;
        // both become an empty string here.
        $fragmentData = (string) parse_url($url, PHP_URL_FRAGMENT); // after #
        $queryData = (string) parse_url($url, PHP_URL_QUERY); // after ?

        // Fragment first, query last: parse_str() lets the last duplicate win,
        // so queryData takes precedence over fragmentData.
        parse_str($fragmentData . '&' . $queryData, $urlData);

        return self::arrayKeysToLower($urlData);
    }

    /**
     * TODO refac (responsibility).
     *
     * Clean an old-format Google Books URL: delete tracking and optional user parameters,
     * and also redundant search query parameters.
     * The process is skipped for the new (2019) URL format, which is returned unchanged.
     *
     * @throws Exception       when $url is not recognised as a Google Books URL
     * @throws DomainException when the old-format URL has no 'id'/'isbn', or a malformed 'id'
     */
    public static function simplifyGoogleUrl(string $url): string
    {
        if (!self::isGoogleBookURL($url)) {
            // not DomainException for live testing with OuvrageOptimize
            throw new Exception('not a Google Book URL');
        }

        // isNewGoogleBookUrl() only succeeds when a well-formed ID is present in the path,
        // so no further ID check is needed before returning the URL untouched.
        if (self::isNewGoogleBookUrl($url)) {
            return $url;
        }

        $gooDat = self::parseGoogleBookQuery($url);

        if (empty($gooDat['id']) && empty($gooDat['isbn'])) {
            throw new DomainException("no GoogleBook 'id' or 'isbn' in URL");
        }
        if (isset($gooDat['id'])) {
            // parse_str() produces an array for "id[]=..." : reject it here instead of letting
            // strict_types raise a TypeError that callers catching Exception would not see.
            $idIsValid = is_string($gooDat['id']) && self::validateGoogleBooksId($gooDat['id']);
            if (!$idIsValid) {
                throw new DomainException("GoogleBook 'id' malformed");
            }
        }

        $dat = self::parseAndCleanParams($gooDat);
        $googleURL = self::modifyGoogleDomainURL($url);

        // todo verify http_build_query() enc_type parameter
        // todo http_build_query() process an urlencode, but a not encoded q= value ("fu+bar") is beautiful
        return $googleURL . '?' . http_build_query($dat);
    }

    /**
     * Check google URL pattern: the whole text must be a Google Books URL (case-insensitive match).
     */
    public static function isGoogleBookURL(string $text): bool
    {
        return preg_match('#^' . self::GOOGLEBOOKS_START_URL_PATTERN . '[^>\]} \n]+$#i', $text) > 0;
    }

    /**
     * Extract the domain suffix from a google URL.
     *
     * @return string|null '.fr', '.com', '.co.uk', '.co.ma'… or null when the host is not a google host
     */
    private static function extractGoogleDomain(string $url): ?string
    {
        $host = parse_url($url, PHP_URL_HOST); // "books.google.fr"
        if (!is_string($host) || $host === '') {
            return null;
        }

        // isGoogleBookURL() accepts hosts in any letter case, so normalise before matching.
        if (preg_match('#google((?:\.[a-z]{2,3})?\.[a-z]{2,3})$#', strtolower($host), $matches) > 0) {
            return $matches[1]; // .fr
        }

        return null;
    }

    /**
     * Google style url_encode(). No UTF-8 encoding.
     * The string is URL-decoded and trimmed, then spaces are written as '+'.
     */
    public static function googleUrlEncode(string $str): string
    {
        return str_replace(' ', '+', trim(urldecode($str)));
    }

    /**
     * New Google Books format (nov 2019).
     * Example : https://www.google.fr/books/edition/_/U4NmPwAACAAJ?hl=en
     */
    private static function isNewGoogleBookUrl(string $url): bool
    {
        return self::getIDFromNewGBurl($url) !== null;
    }

    /**
     * Keep only the GOOGLEBOOKS_KEEP_PARAMETERS entries and resolve the q/dq redundancy.
     *
     * @param string[] $gooDat parameters as returned by parseGoogleBookQuery()
     *
     * @return string[] kept parameters, in GOOGLEBOOKS_KEEP_PARAMETERS order
     */
    protected static function parseAndCleanParams(array $gooDat): array
    {
        $dat = [];
        // keep only a few parameters
        // q : keywords search / dq : quoted phrase search
        foreach (self::GOOGLEBOOKS_KEEP_PARAMETERS as $keep) {
            if (isset($gooDat[$keep])) {
                $dat[$keep] = $gooDat[$keep];
            }
        }

        // q can be empty (typical "#v=onepage&q&f=false" fragment). An empty q or dq carries no search
        // terms: drop it BEFORE the comparison below, otherwise an empty q would evict a meaningful dq.
        foreach (['q', 'dq'] as $searchKey) {
            if (empty($dat[$searchKey])) {
                unset($dat[$searchKey]);
            }
        }

        // Example : https://fr.wikipedia.org/w/index.php?title=Foudre_de_Catatumbo&diff=next&oldid=168721836&diffmode=source
        //
        // 1. If q == dq, the final URL only needs dq= (a URL with only q= is displayed differently).
        //
        // 2. If q != dq (the search form was changed afterwards), q= prevails for the final result,
        //    so the final URL gets q=.
        //
        // 3. Global search on http://books.google.fr => pg= dq= (#q= with q == dq):
        //    in that case a final URL with only dq= gives the right result.
        //
        // 4. If you use a url without &redir_esc=y#v=onepage for a book with "Preview" available,
        //    usually &dq shows the highlighted text in full page view whereas &q shows the snippet view
        //    (so you have to click on the snippet to see the full page).
        //    &dq allows highlighting in books where there is "Preview" available and &pg=PTx is in the URL.
        //
        // #v=onepage or #v=snippet
        if (isset($dat['q'], $dat['dq'])) {
            if ($dat['q'] === $dat['dq']) {
                unset($dat['q']);
            } else {
                unset($dat['dq']);
            }
        }

        return $dat;
    }

    /**
     * Build the base Google Books URL for the domain of the given URL.
     * Naive replacement of the '.com' of DEFAULT_GOOGLEBOOKS_URL by the domain suffix found in $url
     * ('.fr', '.co.uk'…); the default URL is returned as is when no suffix is found.
     */
    protected static function modifyGoogleDomainURL(string $url): string
    {
        $gooDomain = self::extractGoogleDomain($url); // '.fr', '.co.uk'…

        if ($gooDomain) {
            return str_replace('.com', $gooDomain, self::DEFAULT_GOOGLEBOOKS_URL);
        }

        return self::DEFAULT_GOOGLEBOOKS_URL;
    }

    /**
     * Extract ID from new Google Books URL.
     * https://www.google.fr/books/edition/_/U4NmPwAACAAJ?hl=en => U4NmPwAACAAJ
     *
     * @return string|null the 12-character ID, or null when $url is not a new-format URL
     */
    private static function getIDFromNewGBurl(string $url): ?string
    {
        $pattern = '#^' . self::GOOGLEBOOKS_NEW_START_URL_PATTERN . '(' . self::GOOGLEBOOKS_ID_REGEX . ')(?:&.+)?#';

        if (preg_match($pattern, $url, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Check that $id is exactly a Google Books ID (the whole string, not just a part of it).
     */
    protected static function validateGoogleBooksId(string $id): bool
    {
        return preg_match('#^' . self::GOOGLEBOOKS_ID_REGEX . '\z#', $id) > 0;
    }
}
251