QueryRegexFinder::loadRules()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 3
nc 1
nop 1
dl 0
loc 5
rs 10
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace WebThumbnailer\Finder;
6
7
use WebThumbnailer\Application\ConfigManager;
8
use WebThumbnailer\Application\WebAccess\WebAccess;
9
use WebThumbnailer\Application\WebAccess\WebAccessCUrl;
10
use WebThumbnailer\Application\WebAccess\WebAccessFactory;
11
use WebThumbnailer\Exception\BadRulesException;
12
use WebThumbnailer\Utils\FinderUtils;
13
14
/**
 * Generic Finder using regex rules on remote web content.
 * It uses regex rules to resolve a thumbnail in a web page.
 *
 * Mandatory rules:
 *   - image_regex
 *   - thumbnail_url
 *
 * Example:
 *   1. `http://domain.tld/page` content will be downloaded.
 *   2. `image_regex` will be applied on the content.
 *   3. Matches will be used to generate `thumbnail_url`.
 */
class QueryRegexFinder extends FinderCommon
{
    /** @var WebAccess instance. */
    protected $webAccess;

    /** @var string thumbnail_url rule. */
    protected $thumbnailUrlFormat;

    /** @var string Regex (built from the image_regex rule) applied on the downloaded content. */
    protected $urlRegex;

    /**
     * @inheritdoc
     * @param mixed[]|null $rules   All existing rules loaded from JSON files.
     * @param mixed[]|null $options Options provided by the user to retrieve a thumbnail.
     *
     * @throws BadRulesException
     */
    public function __construct(string $domain, string $url, ?array $rules, ?array $options)
    {
        $this->webAccess = WebAccessFactory::getWebAccess($url);
        $this->url = $url;
        $this->domain = $domain;
        $this->loadRules($rules);
        $this->finderOptions = $options;
    }

    /**
     * This finder downloads target URL page, and applies the regex given in rules on its content
     * to extract the thumbnail image.
     * The thumb URL must include ${number} to be replaced from the regex match.
     * Also replaces eventual URL options.
     *
     * @inheritdoc
     *
     * @throws BadRulesException
     */
    public function find()
    {
        $thumbnail = $content = null;
        // Only the cURL implementation gets a write callback; it fills $content and $thumbnail by reference.
        $callback = $this->webAccess instanceof WebAccessCUrl
            ? $this->getCurlCallback($content, $thumbnail)
            : null;
        [$headers, $content] = $this->webAccess->getContent(
            $this->url,
            (int) ConfigManager::get('settings.default.timeout', 30),
            (int) ConfigManager::get('settings.default.max_img_dl', 16777216),
            $callback,
            $content
        );

        // A non-200 status line is tolerated only when the callback already extracted a thumbnail.
        if (
            empty($content)
            || empty($headers)
            || (empty($thumbnail) && strpos($headers[0], '200') === false)
        ) {
            return false;
        }

        // With curl, the thumb is extracted during the download
        if ($this->webAccess instanceof WebAccessCUrl && ! empty($thumbnail)) {
            return $thumbnail;
        }

        return $this->extractThumbContent($content);
    }

    /**
     * Get a callback for curl write function.
     *
     * @param string|null $content   A variable reference in which the downloaded content should be stored.
     * @param string|null $thumbnail A variable reference in which extracted thumb URL should be stored.
     *
     * @return callable CURLOPT_WRITEFUNCTION callback
     */
    protected function getCurlCallback(?string &$content, ?string &$thumbnail): callable
    {
        $isRedirected = false;

        /**
         * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
         *
         * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'.
         * Then we try to extract the thumbnail from each chunk and stop the download as soon as it is found.
         *
         * Note that when using CURLOPT_WRITEFUNCTION, we have to manually handle the content retrieved,
         * hence the $content reference variable.
         *
         * @param resource $ch   cURL resource
         * @param string   $data chunk of data being downloaded
         *
         * @return int|false length of $data or false if we need to stop the download
         */
        return function ($ch, $data) use (&$content, &$thumbnail, &$isRedirected) {
            $content .= $data;
            $responseCode = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);
            // Explicitly unset until we decide the reported content type can be trusted (see below).
            $contentType = null;

            if (!empty($responseCode) && in_array($responseCode, [301, 302], true)) {
                $isRedirected = true;
                return strlen($data);
            }
            if (!empty($responseCode) && $responseCode !== 200) {
                return false;
            }
            // After a redirection, the content type will keep the previous request value
            // until it finds the next content-type header.
            if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
                $contentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
            }
            if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
                return false;
            }
            if (empty($thumbnail)) {
                // Only the current chunk is searched here; find() retries on the full content if nothing is found.
                $thumbnail = $this->extractThumbContent($data);
            }
            // We got everything we want, stop the download.
            if (!empty($responseCode) && !empty($contentType) && !empty($thumbnail)) {
                return false;
            }

            return strlen($data);
        };
    }

    /**
     * Apply the rules regex on the given content and build the thumbnail URL from its matches.
     *
     * Every `${N}` placeholder of the thumbnail_url rule is replaced by the Nth captured group,
     * then every remaining `${name}` placeholder is handed over to replaceOption().
     *
     * @param string $content to extract thumb from
     *
     * @return string|false Thumbnail URL or false if not found
     *
     * @throws BadRulesException
     */
    public function extractThumbContent(string $content)
    {
        // preg_match() returns 1 on match, 0 on no match and false on error: only an actual match is usable.
        // Treating an error as a match would return the raw template with its placeholders left in.
        if (preg_match($this->urlRegex, $content, $matches) !== 1) {
            return false;
        }

        $thumbnailUrl = $this->thumbnailUrlFormat;
        $total = count($matches);
        // Index 0 is the full match; captured groups start at 1.
        for ($i = 1; $i < $total; $i++) {
            $thumbnailUrl = str_replace('${' . $i . '}', $matches[$i], $thumbnailUrl);
        }

        // Match only options (not ${number})
        if (preg_match_all('/\${((?!\d)\w+?)}/', $thumbnailUrl, $optionsMatch, PREG_PATTERN_ORDER)) {
            foreach ($optionsMatch[1] as $value) {
                $thumbnailUrl = $this->replaceOption($thumbnailUrl, $value);
            }
        }

        return $thumbnailUrl;
    }

    /**
     * Note: an empty or null rule set is accepted here; mandatory rules are only verified
     * when at least one rule is provided.
     *
     * @inheritdoc
     */
    public function checkRules(?array $rules): bool
    {
        if (!empty($rules) && !FinderUtils::checkMandatoryRules($rules, ['image_regex', 'thumbnail_url'])) {
            throw new BadRulesException();
        }

        return true;
    }

    /**
     * @inheritdoc
     *
     * @throws BadRulesException If the mandatory rules are invalid or missing.
     */
    public function loadRules(?array $rules): void
    {
        $this->checkRules($rules);

        // checkRules() lets an empty/null rule set through, but this finder cannot work without
        // both rules: fail with the documented exception instead of reading undefined offsets.
        if (!isset($rules['image_regex'], $rules['thumbnail_url'])) {
            throw new BadRulesException();
        }

        $this->urlRegex = FinderUtils::buildRegex($rules['image_regex'], 'im');
        $this->thumbnailUrlFormat = $rules['thumbnail_url'];
    }

    /** @inheritdoc */
    public function getName(): string
    {
        return 'Query Regex';
    }
}
206