Passed
Push — master ( 03a489...acb598 )
by Dispositif
07:31
created

GoogleBooksUtil::googleUrlEncode()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
c 1
b 0
f 0
nc 1
nop 1
dl 0
loc 3
ccs 0
cts 0
cp 0
crap 2
rs 10
1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Publisher;
11
12
use App\Domain\Utils\ArrayProcessTrait;
13
use DomainException;
14
use Exception;
15
16
/**
17
 * Static methods for Google Books URL parsing.
18
 * Class GoogleBooksUtil
19
 *
20
 * @package App\Domain\Publisher
21
 */
22
abstract class GoogleBooksUtil
23
{
24
    use ArrayProcessTrait;
25
26
    /**
     * Base URL used when rebuilding a simplified Google Books link.
     */
    const DEFAULT_GOOGLEBOOKS_URL = 'https://books.google.com/books';

    /**
     * Regex fragment (without delimiters) matching the beginning of a Google Books URL:
     * either the classic form, up to and including "id=" or "isbn=",
     * or the new form ending with "/books/edition/_/".
     *
     * todo refac regex with end of URL
     */
    const GOOGLEBOOKS_START_URL_PATTERN = '(?:https?://(?:books|play)\.google\.[a-z\.]{2,6}/(?:books)?(?:books/[^\?]+\.html)?(?:/reader)?\?(?:[a-zA-Z=&]+&)?(?:[&=A-Z0-9-_%\+]+&)?(?:id|isbn)=|https://www\.google\.[a-z\.]{2,6}/books/edition/_/)';

    /**
     * Regex fragment (without delimiters) for a Google Books identifier:
     * 12 characters among digits, ASCII letters, "_" and "-".
     */
    const GOOGLEBOOKS_ID_REGEX = '[0-9A-Za-z_\-]{12}';
33
34
    /**
     * Names of the URL parameters considered as tracking data (see isTrackingUrl()).
     */
    const TRACKING_PARAMETERS = [
        'xtor',
        'ved',
        'ots',
        'sig',
        'source',
        'utm_source',
        'utm_medium',
        'utm_campaign',
        'utm_term',
        'utm_content',
    ];
47
48 2
    /**
     * Tell whether the URL carries at least one parameter listed in TRACKING_PARAMETERS.
     * Parameters are read through parseGoogleBookQuery().
     *
     * @param string $url
     *
     * @return bool
     */
    public static function isTrackingUrl(string $url): bool
    {
        // Only the parameter names matter here, values are ignored.
        $paramNames = array_keys(self::parseGoogleBookQuery($url));

        foreach ($paramNames as $paramName) {
            // Strict comparison: a numeric parameter name can end up as an integer key,
            // and a loose in_array(0, [...non-numeric strings]) is true before PHP 8.
            if (in_array($paramName, self::TRACKING_PARAMETERS, true)) {
                return true;
            }
        }

        return false;
    }
59
60
    /**
     * Extract the URL parameters found in both the ?query and the #fragment parts.
     * When a parameter appears in both, the value from the query is kept.
     * The resulting array goes through arrayKeysToLower() before being returned.
     *
     * @param string $url
     *
     * @return array
     */
    public static function parseGoogleBookQuery(string $url): array
    {
        $components = parse_url($url);
        if (!is_array($components)) {
            // seriously malformed URL : nothing to read
            $components = [];
        }

        // Note : Google also stores data after the '#' (URL fragment)
        $afterHash = $components['fragment'] ?? '';
        $afterQuestionMark = $components['query'] ?? '';

        // Fragment first, query last : parse_str() keeps the last value of a duplicated name,
        // which gives precedence to the query data.
        $parameters = [];
        parse_str($afterHash.'&'.$afterQuestionMark, $parameters);

        return self::arrayKeysToLower($parameters);
    }
77
78
    /**
     * Clean the google book URL from optional&tracking data.
     * Only the parameters id, isbn, pg, printsec, q and dq are kept (in that order),
     * and the URL is rebuilt on DEFAULT_GOOGLEBOOKS_URL with the Google domain of the input.
     *
     * @param string $url
     *
     * @return string URL
     * @throws DomainException when no 'id'/'isbn' is found or when the 'id' is malformed
     * @throws Exception when $url is not recognized as a Google Books URL
     */
    public static function simplifyGoogleUrl(string $url): string
    {
        if (!self::isGoogleBookURL($url)) {
            // not DomainException for live testing with OuvrageOptimize
            throw new Exception('not a Google Book URL');
        }

        $gooDat = self::parseGoogleBookQuery($url);

        // New format https://www.google.fr/books/edition/_/U4NmPwAACAAJ?hl=en
        // A non-null result already implies the URL has the new format.
        $newFormatId = self::getIDFromNewGBurl($url);
        if (null !== $newFormatId) {
            $gooDat['id'] = $newFormatId;
        }

        if (empty($gooDat['id']) && empty($gooDat['isbn'])) {
            throw new DomainException("no GoogleBook 'id' or 'isbn' in URL");
        }
        // is_string() : an array value (id%5B%5D=...) must be reported as malformed,
        // not crash preg_match() with a TypeError.
        if (isset($gooDat['id'])
            && (!is_string($gooDat['id']) || !preg_match('#'.self::GOOGLEBOOKS_ID_REGEX.'#', $gooDat['id']))
        ) {
            throw new DomainException("GoogleBook 'id' malformed");
        }

        $dat = [];
        // keep only a few parameters (+'q' ?)
        // q : keywords search / dq : quoted phrase search
        // q can be empty !!!!
        $keeps = ['id', 'isbn', 'pg', 'printsec', 'q', 'dq'];
        foreach ($keeps as $keep) {
            if (isset($gooDat[$keep])) {
                $dat[$keep] = $gooDat[$keep];
            }
        }

        // 1. example : https://fr.wikipedia.org/w/index.php?title=Foudre_de_Catatumbo&diff=next&oldid=168721836&diffmode=source
        // 1. put &dq= in the final URL
        //
        // 2. if q != dq (search form changed afterwards) then q= prevails for the final result
        // 2. put &q= in the final URL
        //
        // 3. Global search on http://books.google.fr => pg= dq= (#q= with q == dq)
        // 3. in that case (q == dq), a final URL with only dq= gives a correct result
        //
        // 4 . if you use a url without &redir_esc=y#v=onepage for a book with "Preview" available,
        // usually &dq shows the highlighted text in full page view whereas &q shows the snippet view (so you have to
        // click on the snippet to see the full page).
        // &dq allows highlighting in books where there is "Preview" available and &pg=PTx is in the URL
        //
        // #v=onepage or #v=snippet
        if (isset($dat['q'], $dat['dq'])) {
            if ($dat['q'] === $dat['dq']) {
                // q == dq : dq prevails for display (a URL with only q= displays differently)
                unset($dat['q']);
            } else {
                // q != dq (example : new keywords typed in the search form) : q= prevails for the final result
                unset($dat['dq']);
            }
        }

        // Drop search parameters left empty. Compared with '' rather than empty()
        // so that the legitimate search term "0" is not discarded.
        foreach (['q', 'dq'] as $searchParam) {
            if (isset($dat[$searchParam]) && '' === $dat[$searchParam]) {
                unset($dat[$searchParam]);
            }
        }

        $googleURL = self::DEFAULT_GOOGLEBOOKS_URL;

        // domain .com .fr
        $gooDomain = self::parseGoogleDomain($url);
        if ($gooDomain) {
            $googleURL = str_replace('.com', $gooDomain, $googleURL);
        }

        // todo verify http_build_query() enc_type parameter
        // todo http_build_query() process an urlencode, but a not encoded q= value ("fu+bar") is beautiful
        return $googleURL.'?'.http_build_query($dat);
    }
162 24
163 23
    /**
     * Tell whether the whole text is a Google Books URL : a start matching
     * GOOGLEBOOKS_START_URL_PATTERN followed by at least one character,
     * none of them being '>', ']', '}', a space or a newline. Case-insensitive.
     *
     * @param string $text
     *
     * @return bool
     */
    public static function isGoogleBookURL(string $text): bool
    {
        $pattern = '#^'.self::GOOGLEBOOKS_START_URL_PATTERN.'[^>\]} \n]+$#i';

        return 1 === preg_match($pattern, $text);
    }
178 13
179 13
    /**
     * Return the Google domain suffix of the URL host : '.fr', '.com', '.co.uk', '.com.br'...
     *
     * @param string $url
     *
     * @return string|null null when the URL has no host or no recognizable suffix
     */
    private static function parseGoogleDomain(string $url): ?string
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (empty($host)) {
            return null;
        }

        // Two-label suffix right after "google" (google.com.br, google.co.jp, google.co.uk...) :
        // keep it whole, otherwise the rebuilt URL would point to a non-existent domain (google.br).
        if (preg_match('#(?:^|\.)google((?:\.[a-z]{2,3}){2})$#', $host, $suffix) > 0) {
            return $suffix[1];
        }

        if (preg_match('#\.[a-z]{2,3}$#', $host, $matches) > 0) {
            // Morocco : google.co.ma (second-level domain!!)
            return str_replace(['.ma', '.uk', '.au'], ['.co.ma', '.co.uk', '.com.au'], $matches[0]); // .fr
        }

        return null;
    }
196 1
197
    /**
     * Used instead of urlencode() : the string is URL-decoded, trimmed,
     * then its spaces are replaced by '+'. Other characters are left as they are
     * (no percent-encoding of UTF-8 characters).
     *
     * @param string $str
     *
     * @return string
     */
    public static function googleUrlEncode(string $str): string
    {
        $decoded = urldecode($str);
        $trimmed = trim($decoded);

        return strtr($trimmed, ' ', '+');
    }
208
209
    /**
     * Detect the new Google Books URL format (nov 2019) : an https://www.google.* URL
     * whose path is /books/edition/_/ followed by a book identifier.
     * Example : https://www.google.fr/books/edition/_/U4NmPwAACAAJ?hl=en
     *
     * @param string $url
     *
     * @return bool
     */
    private static function isNewGoogleBookUrl(string $url): bool
    {
        $pattern = '#^https://www\.google\.[a-z.]{2,6}/books/edition/_/'.self::GOOGLEBOOKS_ID_REGEX.'(?:&.+)?#';

        return 1 === preg_match($pattern, $url);
    }
229
230
    /**
     * Extract the book identifier from a new-format Google Books URL
     * (https://www.google.*​/books/edition/_/{id}).
     *
     * @param string $url
     *
     * @return string|null the identifier, or null when the URL does not have the new format
     */
    private static function getIDFromNewGBurl(string $url): ?string
    {
        $pattern = '#^https://www\.google\.[a-z.]{2,6}/books/edition/_/('.self::GOOGLEBOOKS_ID_REGEX.')(?:&.+)?#';
        $found = preg_match($pattern, $url, $matches);

        return $found ? $matches[1] : null;
    }
243
}
244