<?php

namespace WebThumbnailer\Finder;

use WebThumbnailer\Application\ConfigManager;
use WebThumbnailer\Application\WebAccess\WebAccess;
use WebThumbnailer\Application\WebAccess\WebAccessCUrl;
use WebThumbnailer\Application\WebAccess\WebAccessFactory;
use WebThumbnailer\Exception\BadRulesException;
use WebThumbnailer\Utils\FinderUtils;

/**
 * Class QueryRegexFinder
 *
 * Generic Finder using regex rules on remote web content.
 * It will use regex rules to resolve a thumbnail in a web page.
 *
 * Mandatory rules:
 *   - image_regex
 *   - thumbnail_url
 *
 * Example:
 *   1. `http://domain.tld/page` content will be downloaded.
 *   2. `image_regex` will be applied on the content.
 *   3. Matches will be used to generate `thumbnail_url`.
 *
 * @package WebThumbnailer\Finder
 */
class QueryRegexFinder extends FinderCommon
{
    /**
     * @var WebAccess instance.
     */
    protected $webAccess;

    /**
     * @var string thumbnail_url rule.
     */
    protected $thumbnailUrlFormat;

    /**
     * @var string Regex built from the image_regex rule, applied on the downloaded content.
     */
    protected $urlRegex;

    /**
     * @inheritdoc
     *
     * @throws BadRulesException
     */
    public function __construct($domain, $url, $rules, $options)
    {
        $this->webAccess = WebAccessFactory::getWebAccess($url);
        $this->url = $url;
        $this->domain = $domain;
        $this->loadRules($rules);
        $this->finderOptions = $options;
    }

    /**
     * This finder downloads target URL page, and applies the regex given in rules on its content
     * to extract the thumbnail image.
     * The thumb URL must include ${number} to be replaced from the regex match.
     * Also replace eventual URL options.
     *
     * @inheritdoc
     *
     * @throws BadRulesException
     */
    public function find()
    {
        $thumbnail = $content = null;
        $usesCurl = $this->webAccess instanceof WebAccessCUrl;

        // Only the cURL implementation gets a write callback: it fills $content and $thumbnail
        // by reference while the page is being downloaded.
        $callback = $usesCurl ? $this->getCurlCallback($content, $thumbnail) : null;

        list($headers, $content) = $this->webAccess->getContent(
            $this->url,
            (int) ConfigManager::get('settings.default.timeout', 30),
            (int) ConfigManager::get('settings.default.max_img_dl', 16777216),
            $callback,
            $content
        );

        // Nothing usable: no body, no headers, or a non-200 status without an already extracted thumb.
        if (empty($content)
            || empty($headers)
            || (empty($thumbnail) && strpos($headers[0], '200') === false)
        ) {
            return false;
        }

        // With curl, the thumb is extracted during the download
        if ($usesCurl && ! empty($thumbnail)) {
            return $thumbnail;
        }

        return $this->extractThumbContent($content);
    }

    /**
     * Get a callback for curl write function.
     *
     * @param string $content   A variable reference in which the downloaded content should be stored.
     * @param string $thumbnail A variable reference in which extracted thumb URL should be stored.
     *
     * @return \Closure CURLOPT_WRITEFUNCTION callback
     */
    protected function getCurlCallback(&$content, &$thumbnail)
    {
        $isRedirected = false;

        /**
         * cURL callback function for CURLOPT_WRITEFUNCTION (called during the download).
         *
         * While downloading the remote page, we check that the HTTP code is 200 and that the
         * content type contains 'text/html'.
         * Then we try to extract the thumbnail URL from each received chunk, and stop the download
         * as soon as it has been found.
         *
         * Note that when using CURLOPT_WRITEFUNCTION, we have to manually handle the content retrieved,
         * hence the $content reference variable.
         *
         * The cURL handle is received by value: it is only read here, and declaring it by reference
         * makes PHP raise a warning every time cURL invokes the callback.
         *
         * @param resource $ch   cURL resource
         * @param string   $data chunk of data being downloaded
         *
         * @return int|bool length of $data or false if we need to stop the download
         */
        return function ($ch, $data) use (&$content, &$thumbnail, &$isRedirected) {
            $content .= $data;
            $responseCode = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);

            // Keep downloading through a redirection, and remember that it happened.
            if (!empty($responseCode) && in_array($responseCode, [301, 302], true)) {
                $isRedirected = true;
                return strlen($data);
            }
            if (!empty($responseCode) && $responseCode !== 200) {
                return false;
            }

            // After a redirection, the content type will keep the previous request value
            // until it finds the next content-type header.
            $contentType = null;
            if (! $isRedirected || strpos(strtolower($data), 'content-type') !== false) {
                $contentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
            }
            if (!empty($contentType) && strpos($contentType, 'text/html') === false) {
                return false;
            }

            // Only the current chunk is searched here; find() falls back on the whole content
            // when nothing was extracted during the download.
            if (empty($thumbnail)) {
                $thumbnail = $this->extractThumbContent($data);
            }

            // We got everything we want, stop the download.
            if (!empty($responseCode) && !empty($contentType) && !empty($thumbnail)) {
                return false;
            }

            return strlen($data);
        };
    }

    /**
     * Apply the image regex on the given content and build the thumbnail URL from its matches.
     *
     * Every `${number}` placeholder of the `thumbnail_url` rule is replaced by the matching
     * capture group, then every remaining `${option}` placeholder is resolved with replaceOption().
     *
     * @param string $content Content (whole page or downloaded chunk) to search in.
     *
     * @return string|bool The thumbnail URL, or false if the regex did not match (or failed).
     *
     * @throws BadRulesException
     */
    public function extractThumbContent($content)
    {
        // preg_match() returns 1 on match, 0 on no match and false on error.
        // Anything but a real match must not produce a URL, otherwise the raw
        // `thumbnail_url` template would be returned with its placeholders untouched.
        if (preg_match($this->urlRegex, $content, $matches) !== 1) {
            return false;
        }

        $thumbnailUrl = $this->thumbnailUrlFormat;

        // $matches[0] is the full match: placeholders start at the first capture group.
        $total = count($matches);
        for ($i = 1; $i < $total; $i++) {
            $thumbnailUrl = str_replace('${' . $i . '}', $matches[$i], $thumbnailUrl);
        }

        // Match only options (not ${number})
        if (preg_match_all('/\${((?!\d)\w+?)}/', $thumbnailUrl, $optionsMatch, PREG_PATTERN_ORDER)) {
            foreach ($optionsMatch[1] as $value) {
                $thumbnailUrl = $this->replaceOption($thumbnailUrl, $value);
            }
        }

        return $thumbnailUrl;
    }

    /**
     * @inheritdoc
     *
     * @throws BadRulesException if `image_regex` or `thumbnail_url` is missing from the rules.
     */
    public function checkRules($rules)
    {
        $mandatoryRules = [
            'image_regex',
            'thumbnail_url'
        ];

        if (! FinderUtils::checkMandatoryRules($rules, $mandatoryRules)) {
            throw new BadRulesException();
        }
    }

    /**
     * @inheritdoc
     *
     * @throws BadRulesException
     */
    public function loadRules($rules)
    {
        $this->checkRules($rules);
        $this->urlRegex = FinderUtils::buildRegex($rules['image_regex'], 'im');
        $this->thumbnailUrlFormat = $rules['thumbnail_url'];
    }

    /**
     * @inheritdoc
     */
    public function getName()
    {
        return 'Query Regex';
    }
}