1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Wallabag\CoreBundle\Helper; |
4
|
|
|
|
5
|
|
|
use GuzzleHttp\Psr7\Uri; |
6
|
|
|
use GuzzleHttp\Psr7\UriResolver; |
7
|
|
|
use Http\Client\Common\HttpMethodsClient; |
8
|
|
|
use Http\Client\Common\Plugin\ErrorPlugin; |
9
|
|
|
use Http\Client\Common\PluginClient; |
10
|
|
|
use Http\Client\HttpClient; |
11
|
|
|
use Http\Discovery\MessageFactoryDiscovery; |
12
|
|
|
use Http\Message\MessageFactory; |
13
|
|
|
use Psr\Http\Message\ResponseInterface; |
14
|
|
|
use Psr\Log\LoggerInterface; |
15
|
|
|
use Symfony\Component\DomCrawler\Crawler; |
16
|
|
|
use Symfony\Component\Finder\Finder; |
17
|
|
|
use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; |
18
|
|
|
|
19
|
|
|
/**
 * Downloads the images referenced by an entry's HTML, re-encodes them locally
 * and rewrites the HTML so that it points at the local copies.
 *
 * Files are stored under "<baseFolder>/<h0>/<h1>/<hash>/" where <hash> is the
 * crc32 of the entry id and <h0>/<h1> are its first two characters. They are
 * exposed to the browser as "<wallabagUrl>/assets/images/<h0>/<h1>/<hash>/<file>".
 */
class DownloadImages
{
    const REGENERATE_PICTURES_QUALITY = 80;

    private $client;
    private $baseFolder;
    private $logger;
    private $mimeGuesser;
    private $wallabagUrl;

    /**
     * @param HttpClient          $client         Client used to fetch the images (wrapped so HTTP errors throw)
     * @param string              $baseFolder     Folder where all images are stored (created when missing)
     * @param string              $wallabagUrl    Public base url of the instance (trailing slashes are removed)
     * @param LoggerInterface     $logger
     * @param MessageFactory|null $messageFactory Discovered automatically when omitted
     */
    public function __construct(HttpClient $client, $baseFolder, $wallabagUrl, LoggerInterface $logger, MessageFactory $messageFactory = null)
    {
        $this->client = new HttpMethodsClient(new PluginClient($client, [new ErrorPlugin()]), $messageFactory ?: MessageFactoryDiscovery::find());
        $this->baseFolder = $baseFolder;
        $this->wallabagUrl = rtrim($wallabagUrl, '/');
        $this->logger = $logger;
        $this->mimeGuesser = new MimeTypeExtensionGuesser();

        $this->setFolder();
    }

    /**
     * @return string Folder where all images are stored
     */
    public function getBaseFolder()
    {
        return $this->baseFolder;
    }

    /**
     * Process the html and extract images URLs from it
     * (both the `src` and the `srcset` attributes of every `<img>`).
     *
     * @param string $html
     *
     * @return string[]
     */
    public static function extractImagesUrlsFromHtml($html)
    {
        $crawler = new Crawler($html);
        $imagesCrawler = $crawler->filterXpath('//img');
        $imagesUrls = $imagesCrawler->extract(['src']);
        $imagesSrcsetUrls = self::getSrcsetUrls($imagesCrawler);

        return array_unique(array_merge($imagesUrls, $imagesSrcsetUrls));
    }

    /**
     * Process the html and extract image from it, save them to local and return the updated html.
     *
     * Images that can not be downloaded or re-encoded keep their original url.
     *
     * @param int    $entryId ID of the entry
     * @param string $html
     * @param string $url     Used as a base path for relative image and folder
     *
     * @return string
     */
    public function processHtml($entryId, $html, $url)
    {
        $imagesUrls = self::extractImagesUrlsFromHtml($html);

        $relativePath = $this->getRelativePath($entryId);

        // download and save the image to the folder
        foreach ($imagesUrls as $image) {
            $imagePath = $this->processSingleImage($entryId, $image, $url, $relativePath);

            if (false === $imagePath) {
                continue;
            }

            // the url was extracted from the parsed DOM, so entities are decoded:
            // if it contains "&" and we can't find it in the raw html, it is most
            // likely written there with the "&amp;" entity
            if (false !== stripos($image, '&') && false === stripos($html, $image)) {
                $image = str_replace('&', '&amp;', $image);
            }

            $html = str_replace($image, $imagePath, $html);
        }

        return $html;
    }

    /**
     * Process a single image:
     *     - retrieve it
     *     - re-saved it (for security reason)
     *     - return the new local path.
     *
     * @param int         $entryId      ID of the entry
     * @param string|null $imagePath    Path to the image to retrieve
     * @param string      $url          Url from where the image were found
     * @param string|null $relativePath Relative local path to saved the image (computed from the entry id when null)
     *
     * @return string|false Url to access the image from the web, or false when the image
     *                      could not be resolved, downloaded, validated or written
     */
    public function processSingleImage($entryId, $imagePath, $url, $relativePath = null)
    {
        // an empty src would resolve to the page itself, which is never an image
        if (null === $imagePath || '' === $imagePath) {
            return false;
        }

        if (null === $relativePath) {
            $relativePath = $this->getRelativePath($entryId);
        }

        $this->logger->debug('DownloadImages: working on image: ' . $imagePath);

        $folderPath = $this->baseFolder . '/' . $relativePath;

        // build image path
        $absolutePath = $this->getAbsoluteLink($url, $imagePath);
        if (false === $absolutePath) {
            $this->logger->error('DownloadImages: Can not determine the absolute path for that image, skipping.');

            return false;
        }

        try {
            $res = $this->client->get($absolutePath);
        } catch (\Exception $e) {
            $this->logger->error('DownloadImages: Can not retrieve image, skipping.', ['exception' => $e]);

            return false;
        }

        // the extension is the validation result: false means "not an allowed image type"
        $ext = $this->getExtensionFromResponse($res, $imagePath);
        if (false === $ext) {
            return false;
        }

        $hashImage = hash('crc32', $absolutePath);
        $localPath = $folderPath . '/' . $hashImage . '.' . $ext;
        $body = (string) $res->getBody();

        try {
            $im = imagecreatefromstring($body);
        } catch (\Exception $e) {
            $im = false;
        }

        if (false === $im) {
            $this->logger->error('DownloadImages: Error while regenerating image', ['path' => $localPath]);

            return false;
        }

        $written = $this->writeImage($im, $body, $ext, $localPath);

        imagedestroy($im);

        if (!$written) {
            // do not hand back an url pointing to a file that does not exist
            $this->logger->error('DownloadImages: Error while writing image', ['path' => $localPath]);

            return false;
        }

        return $this->wallabagUrl . '/assets/images/' . $relativePath . '/' . $hashImage . '.' . $ext;
    }

    /**
     * Remove all images for the given entry id.
     *
     * @param int $entryId ID of the entry
     */
    public function removeImages($entryId)
    {
        // do not create the folder just to delete it right after
        $relativePath = $this->getRelativePath($entryId, false);
        $folderPath = $this->baseFolder . '/' . $relativePath;

        // nothing was stored for that entry (and Finder::in() throws on a missing directory)
        if (!is_dir($folderPath)) {
            return;
        }

        $finder = new Finder();
        $finder
            ->files()
            ->ignoreDotFiles(true)
            ->in($folderPath);

        // best effort: a file that can not be removed must not abort the cleanup
        foreach ($finder as $file) {
            @unlink($file->getRealPath());
        }

        @rmdir($folderPath);
    }

    /**
     * Generate the folder where we are going to save images based on the entry id.
     *
     * @param int  $entryId      ID of the entry
     * @param bool $createFolder Should we create the folder for the given id?
     *
     * @return string Path relative to the base folder, without leading or trailing slash
     */
    public function getRelativePath($entryId, $createFolder = true)
    {
        $hashId = hash('crc32', (string) $entryId);
        // two levels of one-character folders spread the entries over many directories
        $relativePath = $hashId[0] . '/' . $hashId[1] . '/' . $hashId;
        $folderPath = $this->baseFolder . '/' . $relativePath;

        if ($createFolder && !file_exists($folderPath)) {
            mkdir($folderPath, 0777, true);
        }

        $this->logger->debug('DownloadImages: Folder used for that Entry id', ['folder' => $folderPath, 'entryId' => $entryId]);

        return $relativePath;
    }

    /**
     * Re-encode the given GD image to $localPath using the format matching $ext.
     *
     * @param resource|object $im        GD image created from the downloaded body
     * @param string          $body      Raw downloaded body (used by Imagick for GIF)
     * @param string          $ext       One of: gif, jpeg, jpg, png
     * @param string          $localPath Absolute path of the file to write
     *
     * @return bool Whether the file was written
     */
    private function writeImage($im, $body, $ext, $localPath)
    {
        switch ($ext) {
            case 'gif':
                $written = $this->writeGif($im, $body, $localPath);
                $this->logger->debug('DownloadImages: Re-creating gif');

                return $written;
            case 'jpeg':
            case 'jpg':
                $written = imagejpeg($im, $localPath, self::REGENERATE_PICTURES_QUALITY);
                $this->logger->debug('DownloadImages: Re-creating jpg');

                return $written;
            case 'png':
                // keep the alpha channel instead of flattening it
                imagealphablending($im, false);
                imagesavealpha($im, true);
                // the 0-100 quality is scaled to the 0-9 range imagepng() expects
                $written = imagepng($im, $localPath, (int) ceil(self::REGENERATE_PICTURES_QUALITY / 100 * 9));
                $this->logger->debug('DownloadImages: Re-creating png');

                return $written;
        }

        return false;
    }

    /**
     * Write a GIF, using Imagick if available to keep GIF animation and GD otherwise.
     *
     * @param resource|object $im        GD image used by the fallback
     * @param string          $body      Raw downloaded body
     * @param string          $localPath Absolute path of the file to write
     *
     * @return bool Whether the file was written
     */
    private function writeGif($im, $body, $localPath)
    {
        if (class_exists('\\Imagick')) {
            try {
                $imagick = new \Imagick();
                $imagick->readImageBlob($body);
                $imagick->setImageFormat('gif');

                if ($imagick->writeImages($localPath, true)) {
                    return true;
                }
            } catch (\Exception $e) {
                // if Imagick fail, fallback to the default solution
            }
        }

        return imagegif($im, $localPath);
    }

    /**
     * Get images urls from the srcset image attribute.
     *
     * @param Crawler $imagesCrawler Crawler positioned on the `<img>` nodes
     *
     * @return array An array of urls
     */
    private static function getSrcsetUrls(Crawler $imagesCrawler)
    {
        $urls = [];

        foreach ($imagesCrawler as $image) {
            $srcsetAttribute = $image->getAttribute('srcset');

            if ('' === $srcsetAttribute) {
                continue;
            }

            // Couldn't start with " OR ' OR a white space
            // Could be one or more white space
            // Must be one or more digits followed by w OR x
            $pattern = "/(?:[^\"'\s]+\s*(?:\d+[wx])+)/";
            preg_match_all($pattern, $srcsetAttribute, $matches);

            // each match is "<url> <descriptor>": keep what precedes the first
            // whitespace, whatever its kind (space, tab, line break)
            $srcsetUrls = array_map(static function ($src) {
                return preg_split('/\s+/', $src, 2)[0];
            }, $matches[0]);

            $urls = array_merge($srcsetUrls, $urls);
        }

        return $urls;
    }

    /**
     * Setup base folder where all images are going to be saved.
     */
    private function setFolder()
    {
        // if folder doesn't exist, attempt to create one
        if (!file_exists($this->baseFolder)) {
            mkdir($this->baseFolder, 0755, true);
        }
    }

    /**
     * Make an $url absolute based on the $base.
     *
     * @see Graby->makeAbsoluteStr
     *
     * @param string $base Base url
     * @param string $url  Url to make it absolute
     *
     * @return false|string False when $base has no scheme or host, or when one of the urls can not be parsed
     */
    private function getAbsoluteLink($base, $url)
    {
        if (preg_match('!^https?://!i', $url)) {
            // already absolute
            return $url;
        }

        try {
            $baseUri = new Uri($base);
            $relativeUri = new Uri($url);
        } catch (\InvalidArgumentException $e) {
            // Uri refuses malformed urls: skip that image instead of aborting the whole entry
            $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]);

            return false;
        }

        // in case the url has no scheme & host
        if ('' === $baseUri->getAuthority() || '' === $baseUri->getScheme()) {
            $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $baseUri, 'url' => $url]);

            return false;
        }

        return (string) UriResolver::resolve($baseUri, $relativeUri);
    }

    /**
     * Retrieve and validate the extension from the response of the url of the image.
     *
     * The content-type header is tried first; when it gives nothing, the first
     * bytes of the body are compared with the jpeg, gif and png signatures.
     *
     * @param ResponseInterface $res       Http Response
     * @param string            $imagePath Path from the src image from the content (used for log only)
     *
     * @return string|false Extension name or false if validation failed
     */
    private function getExtensionFromResponse(ResponseInterface $res, $imagePath)
    {
        $ext = $this->mimeGuesser->guess(current($res->getHeader('content-type')));
        $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $res->getHeader('content-type')]);

        // ok header doesn't have the extension, try a different way
        if (empty($ext)) {
            $types = [
                'jpeg' => "\xFF\xD8\xFF",
                'gif' => 'GIF',
                'png' => "\x89\x50\x4e\x47\x0d\x0a",
            ];
            $bytes = substr((string) $res->getBody(), 0, 8);

            foreach ($types as $type => $header) {
                if (0 === strpos($bytes, $header)) {
                    $ext = $type;
                    break;
                }
            }

            $this->logger->debug('DownloadImages: Checking extension (alternative)', ['ext' => $ext]);
        }

        if (!\in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) {
            $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: ' . $imagePath);

            return false;
        }

        return $ext;
    }
}
355
|
|
|
|