Test Failed
Push — master ( 61ab03...c8a89b )
by Dispositif
02:49
created

GoogleBooksUtil::isTrackingUrl()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 10
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 5
c 0
b 0
f 0
nc 3
nop 1
dl 0
loc 10
rs 10
1
<?php
2
/**
3
 * This file is part of dispositif/wikibot application (@github)
4
 * 2019/2020 © Philippe M. <[email protected]>
5
 * For the full copyright and MIT license information, please view the license file.
6
 */
7
8
declare(strict_types=1);
9
10
namespace App\Domain\Publisher;
11
12
use App\Domain\Utils\ArrayProcessTrait;
13
use DomainException;
14
use Exception;
15
16
/**
 * Static helpers to recognise, parse and simplify Google Books URLs.
 *
 * @package App\Domain\Publisher
 */
abstract class GoogleBooksUtil
{
    use ArrayProcessTrait;

    /**
     * Canonical base URL used when rebuilding a simplified link.
     */
    public const DEFAULT_GOOGLEBOOKS_URL = 'https://books.google.com/books';

    /**
     * Regex fragment (without delimiters) matching the beginning of a Google Books URL,
     * up to and including the "id=" parameter name.
     *
     * todo refac regex with end of URL
     */
    public const GOOGLEBOOKS_START_URL_PATTERN = 'https?://(?:books|play)\.google\.[a-z\.]{2,6}/(?:books)?(?:books/[^\?]+\.html)?(?:/reader)?\?(?:[a-zA-Z=&]+&)?id=';

    /**
     * Query/fragment parameter names (lower case) regarded as tracking data.
     */
    public const TRACKING_PARAMETERS
        = [
            'xtor',
            'ved',
            'ots',
            'sig',
            'source',
            'utm_source',
            'utm_medium',
            'utm_campaign',
            'utm_term',
            'utm_content',
        ];

    /**
     * Single-label suffixes historically rewritten to the second-level country domain
     * (e.g. Morocco is served from google.co.ma).
     */
    private const COUNTRY_DOMAIN_FIXES
        = [
            '.ma' => '.co.ma',
            '.uk' => '.co.uk',
            '.au' => '.com.au',
        ];

    /**
     * Tell whether the URL carries at least one parameter listed in TRACKING_PARAMETERS.
     *
     * @param string $url
     *
     * @return bool
     */
    public static function isTrackingUrl(string $url): bool
    {
        foreach (array_keys(self::parseGoogleBookQuery($url)) as $param) {
            // parse_str() stores numeric parameter names as int keys. With a loose
            // in_array() on PHP < 8, the int 0 equals every non-numeric string, so
            // "?0=x" was wrongly flagged as tracking. Cast + strict comparison avoids it.
            if (in_array((string) $param, self::TRACKING_PARAMETERS, true)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Parse URL arguments from both the ?query and the #fragment.
     *
     * When a parameter is present in both parts, the value from the query wins.
     * Keys are passed through arrayKeysToLower() before being returned.
     *
     * @param string $url
     *
     * @return array
     */
    public static function parseGoogleBookQuery(string $url): array
    {
        // Note: Google Books also stores data after the '#' (URL fragment).
        $fragmentData = parse_url($url, PHP_URL_FRAGMENT); // after #
        $queryData = parse_url($url, PHP_URL_QUERY); // after ?

        // parse_url() yields null/false for a missing part or a malformed URL: skip those.
        // The fragment goes first so that query values overwrite it in parse_str().
        $parts = array_filter([$fragmentData, $queryData], 'is_string');
        parse_str(implode('&', $parts), $val);

        return self::arrayKeysToLower($val);
    }

    /**
     * Clean a Google Books URL from optional and tracking data.
     *
     * Only the parameters id, pg, printsec, q and dq are kept, and the URL is rebuilt
     * on DEFAULT_GOOGLEBOOKS_URL with the Google domain suffix of the input URL.
     *
     * @param string $url
     *
     * @return string URL
     * @throws DomainException when the 'id' parameter is missing or malformed
     * @throws Exception when $url is not recognised as a Google Books URL
     */
    public static function simplifyGoogleUrl(string $url): string
    {
        if (!self::isGoogleBookURL($url)) {
            // not DomainException for live testing with OuvrageOptimize
            throw new Exception('not a Google Book URL');
        }

        $gooDat = self::parseGoogleBookQuery($url);
        if (empty($gooDat['id'])) {
            throw new DomainException("no GoogleBook 'id' in URL");
        }
        // An "id[]=..." parameter is parsed as an array: report it as malformed
        // rather than letting preg_match() fail on a non-string subject.
        if (!is_string($gooDat['id']) || !preg_match('#[0-9A-Za-z_\-]{12}#', $gooDat['id'])) {
            throw new DomainException("GoogleBook 'id' malformed");
        }

        // Keep only a few parameters.
        // q : keywords search / dq : quoted phrase search. q can be empty!
        $dat = [];
        foreach (['id', 'pg', 'printsec', 'q', 'dq'] as $keep) {
            if (isset($gooDat[$keep])) {
                $dat[$keep] = $gooDat[$keep];
            }
        }

        // Example: https://fr.wikipedia.org/w/index.php?title=Foudre_de_Catatumbo&diff=next&oldid=168721836&diffmode=source
        // 1. Keep &dq= in the final URL.
        // 2. If q != dq (the search form was changed afterwards), q= prevails for the final
        //    result: keep &q in the final URL.
        // 3. A global search on http://books.google.fr gives pg= dq= (#q= with q == dq).
        //    In that case (q == dq), a final URL with only dq= gives a correct result.
        // 4. If you use a url without &redir_esc=y#v=onepage for a book with "Preview" available,
        //    usually &dq shows the highlighted text in full page view whereas &q shows the snippet
        //    view (so you have to click on the snippet to see the full page).
        //    &dq allows highlighting in books where there is "Preview" available and &pg=PTx is in the URL.
        //
        // #v=onepage or #v=snippet
        if (isset($dat['q'], $dat['dq'])) {
            if ($dat['q'] === $dat['dq']) {
                // Same search: dq gives the better display, drop q.
                unset($dat['q']);
            } else {
                // Different search (e.g. new keywords typed in the form): q prevails.
                unset($dat['dq']);
            }
        }

        // Drop search parameters without content. Compared against ''/[] instead of
        // empty() so that a genuine search for "0" is not discarded.
        foreach (['q', 'dq'] as $searchKey) {
            if (isset($dat[$searchKey]) && ('' === $dat[$searchKey] || [] === $dat[$searchKey])) {
                unset($dat[$searchKey]);
            }
        }

        $googleURL = self::DEFAULT_GOOGLEBOOKS_URL;

        // domain .com .fr
        $gooDomain = self::parseGoogleDomain($url);
        if (null !== $gooDomain && '' !== $gooDomain) {
            $googleURL = str_replace('.com', $gooDomain, $googleURL);
        }

        // todo http_build_query() process an urlencode, but a not encoded q= value ("fu+bar") is beautiful
        return $googleURL.'?'.http_build_query($dat);
    }

    /**
     * Check whether the whole text matches the Google Books URL pattern.
     *
     * The match is case-insensitive; the text must start with
     * GOOGLEBOOKS_START_URL_PATTERN and must not contain '>', ']', '}', space or newline
     * after the "id=" part.
     *
     * @param string $text
     *
     * @return bool
     */
    public static function isGoogleBookURL(string $text): bool
    {
        return 1 === preg_match('#^'.self::GOOGLEBOOKS_START_URL_PATTERN.'[^>\]} \n]+$#i', $text);
    }

    /**
     * Return the domain suffix of the Google host, e.g. '.fr', '.com', '.co.uk' or '.co.jp'.
     *
     * @param string $url
     *
     * @return string|null null when no host or no suffix can be extracted
     */
    private static function parseGoogleDomain(string $url): ?string
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || '' === $host) {
            return null;
        }
        // isGoogleBookURL() is case-insensitive, so the host may be upper case.
        $host = strtolower($host);

        // Capture everything after "google" so that two-label suffixes (.co.jp, .com.br,
        // .co.uk...) are preserved instead of being reduced to their last label.
        if (1 === preg_match('#(?:^|\.)google((?:\.[a-z]{2,3}){1,2})$#', $host, $matches)) {
            $suffix = $matches[1];

            // Legacy mapping kept for single-label suffixes (Morocco: google.co.ma).
            return self::COUNTRY_DOMAIN_FIXES[$suffix] ?? $suffix;
        }

        // Host without a "google" label: fall back to the last label only.
        if (1 === preg_match('#\.[a-z]{2,3}$#', $host, $matches)) {
            return strtr($matches[0], self::COUNTRY_DOMAIN_FIXES);
        }

        return null;
    }

    /**
     * Instead of url_encode(). No UTF-8 encoding.
     *
     * The string is URL-decoded and trimmed, then each space is replaced by '+'.
     *
     * @param string $str
     *
     * @return string
     */
    public static function googleUrlEncode(string $str): string
    {
        $decoded = trim(urldecode($str));

        return str_replace(' ', '+', $decoded);
    }
}
199