Completed
Push — master ( e15538...7938c9 )
by Arthur
02:07
created

QueryRegexFinder::getCurlCallback()   C

Complexity

Conditions 13
Paths 1

Size

Total Lines 46
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 13
eloc 18
nc 1
nop 2
dl 0
loc 46
rs 5.1118
c 0
b 0
f 0

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace WebThumbnailer\Finder;
4
5
use WebThumbnailer\Application\ConfigManager;
6
use WebThumbnailer\Application\WebAccess\WebAccess;
7
use WebThumbnailer\Application\WebAccess\WebAccessCUrl;
8
use WebThumbnailer\Application\WebAccess\WebAccessFactory;
9
use WebThumbnailer\Exception\BadRulesException;
10
use WebThumbnailer\Utils\FinderUtils;
11
12
/**
 * Class QueryRegexFinder
 *
 * Generic Finder using regex rules on remote web content.
 * It uses regex rules to resolve a thumbnail in a web page.
 *
 * Mandatory rules:
 *   - image_regex
 *   - thumbnail_url
 *
 * Example:
 *   1. `http://domain.tld/page` content will be downloaded.
 *   2. `image_regex` will be applied on the content.
 *   3. Matches will be used to generate `thumbnail_url`.
 *
 * @package WebThumbnailer\Finder
 */
class QueryRegexFinder extends FinderCommon
{
    /**
     * @var WebAccess instance.
     */
    protected $webAccess;

    /**
     * @var string thumbnail_url rule.
     */
    protected $thumbnailUrlFormat;

    /**
     * @var string Regex built from the image_regex rule, applied on the downloaded content.
     */
    protected $urlRegex;

    /**
     * @inheritdoc
     *
     * @throws BadRulesException if a mandatory rule is missing (raised through loadRules()).
     */
    public function __construct($domain, $url, $rules, $options)
    {
        $this->webAccess = WebAccessFactory::getWebAccess($url);
        $this->url = $url;
        $this->domain = $domain;
        $this->loadRules($rules);
        $this->finderOptions = $options;
    }

    /**
     * This finder downloads the target URL page, and applies the regex given in rules on its content
     * to extract the thumbnail image.
     * The thumb URL must include ${number} to be replaced from the regex match.
     * Also replaces eventual URL options.
     *
     * @inheritdoc
     *
     * @throws BadRulesException
     */
    public function find()
    {
        $thumbnail = $content = null;
        $isCurl = $this->webAccess instanceof WebAccessCUrl;

        // With curl, the callback fills $content and $thumbnail (by reference) while downloading.
        $callback = $isCurl ? $this->getCurlCallback($content, $thumbnail) : null;

        list($headers, $content) = $this->webAccess->getContent(
            $this->url,
            (int) ConfigManager::get('settings.default.timeout', 30),
            (int) ConfigManager::get('settings.default.max_img_dl', 16777216),
            $callback,
            $content
        );

        if (empty($content) || empty($headers)) {
            return false;
        }

        // Without an already extracted thumbnail, only a response whose first header mentions 200 is usable.
        if (empty($thumbnail) && strpos($headers[0], '200') === false) {
            return false;
        }

        // With curl, the thumb is extracted during the download
        if ($isCurl && ! empty($thumbnail)) {
            return $thumbnail;
        }

        return $this->extractThumbContent($content);
    }

    /**
     * Get a callback for curl write function.
     *
     * @param string $content   A variable reference in which the downloaded content should be stored.
     * @param string $thumbnail A variable reference in which extracted thumb URL should be stored.
     *
     * @return \Closure CURLOPT_WRITEFUNCTION callback
     */
    protected function getCurlCallback(&$content, &$thumbnail)
    {
        $isRedirected = false;
        // HTTP statuses treated as "redirection in progress": keep downloading instead of aborting.
        $redirectCodes = [301, 302, 303, 307, 308];

        /**
         * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
         *
         * While downloading the remote page, we check that the HTTP code is 200 and content type is 'text/html'.
         * Then we try to extract the thumbnail from each chunk and stop the download as soon as it is found.
         *
         * Note that when using CURLOPT_WRITEFUNCTION, we have to manually handle the content retrieved,
         * hence the $content reference variable.
         *
         * The cURL handle is never reassigned here, so it is received by value.
         *
         * @param resource $ch   cURL resource
         * @param string   $data chunk of data being downloaded
         *
         * @return int|bool length of $data or false if we need to stop the download
         */
        return function ($ch, $data) use (&$content, &$thumbnail, &$isRedirected, $redirectCodes) {
            $content .= $data;
            // Cast so that a failed lookup (false) becomes 0 and the strict comparisons below are reliable.
            $responseCode = (int) curl_getinfo($ch, CURLINFO_RESPONSE_CODE);

            if (in_array($responseCode, $redirectCodes, true)) {
                $isRedirected = true;
                return strlen($data);
            }
            // 0 means the response code is not known yet: keep going.
            if ($responseCode !== 0 && $responseCode !== 200) {
                return false;
            }

            // After a redirection, the content type will keep the previous request value
            // until it finds the next content-type header.
            $contentType = null;
            if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
                $contentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
            }
            if (! empty($contentType) && strpos($contentType, 'text/html') === false) {
                return false;
            }

            if (empty($thumbnail)) {
                $thumbnail = $this->extractThumbContent($data);
            }

            // We got everything we want, stop the download.
            if ($responseCode !== 0 && ! empty($contentType) && ! empty($thumbnail)) {
                return false;
            }

            return strlen($data);
        };
    }

    /**
     * Apply the image regex on the given content and build the thumbnail URL from the match.
     *
     * Every `${number}` placeholder of the thumbnail_url rule is replaced by the capture group
     * with the same index, then every remaining `${name}` placeholder (not starting with a digit)
     * is handed to replaceOption().
     *
     * @param string $content Content (full page or downloaded chunk) to search in.
     *
     * @return string|bool The generated thumbnail URL, or false if the regex does not match
     *                     or could not be executed.
     *
     * @throws BadRulesException
     */
    public function extractThumbContent($content)
    {
        // preg_match() returns 1 on a match, 0 without match and false on error:
        // anything other than 1 means there is nothing to build a URL from.
        if (preg_match($this->urlRegex, $content, $matches) !== 1) {
            return false;
        }

        $thumbnailUrl = $this->thumbnailUrlFormat;
        $total = count($matches);
        // Index 0 is the full match, capture groups start at 1.
        for ($i = 1; $i < $total; $i++) {
            $thumbnailUrl = str_replace('${' . $i . '}', $matches[$i], $thumbnailUrl);
        }

        // Match only options (not ${number})
        if (preg_match_all('/\${((?!\d)\w+?)}/', $thumbnailUrl, $optionsMatch, PREG_PATTERN_ORDER)) {
            foreach ($optionsMatch[1] as $value) {
                $thumbnailUrl = $this->replaceOption($thumbnailUrl, $value);
            }
        }

        return $thumbnailUrl;
    }

    /**
     * @inheritdoc
     *
     * @throws BadRulesException if `image_regex` or `thumbnail_url` is not accepted
     *                           by FinderUtils::checkMandatoryRules().
     */
    public function checkRules($rules)
    {
        $mandatoryRules = [
            'image_regex',
            'thumbnail_url',
        ];

        if (! FinderUtils::checkMandatoryRules($rules, $mandatoryRules)) {
            throw new BadRulesException();
        }
    }

    /**
     * @inheritdoc
     *
     * Validates the rules, then stores the regex built from `image_regex` (with the 'im' flags)
     * and the raw `thumbnail_url` format.
     *
     * @throws BadRulesException
     */
    public function loadRules($rules)
    {
        $this->checkRules($rules);
        $this->urlRegex = FinderUtils::buildRegex($rules['image_regex'], 'im');
        $this->thumbnailUrlFormat = $rules['thumbnail_url'];
    }

    /**
     * @inheritdoc
     */
    public function getName()
    {
        return 'Query Regex';
    }
}
213