Completed
Push — master ( cc47d7...1bf541 )
by Jeremy
34s queued 13s
created

DownloadImages::getBaseFolder()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
c 1
b 0
f 0
nc 1
nop 0
dl 0
loc 3
rs 10
1
<?php
2
3
namespace Wallabag\CoreBundle\Helper;
4
5
use GuzzleHttp\Psr7\Uri;
6
use GuzzleHttp\Psr7\UriResolver;
7
use Http\Client\Common\HttpMethodsClient;
8
use Http\Client\Common\Plugin\ErrorPlugin;
9
use Http\Client\Common\PluginClient;
10
use Http\Client\HttpClient;
11
use Http\Discovery\MessageFactoryDiscovery;
12
use Http\Message\MessageFactory;
13
use Psr\Http\Message\ResponseInterface;
14
use Psr\Log\LoggerInterface;
15
use Symfony\Component\DomCrawler\Crawler;
16
use Symfony\Component\Finder\Finder;
17
use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser;
18
19
class DownloadImages
20
{
21
    const REGENERATE_PICTURES_QUALITY = 80;
22
23
    private $client;
24
    private $baseFolder;
25
    private $logger;
26
    private $mimeGuesser;
27
    private $wallabagUrl;
28
29
    /**
     * Store the configuration, build the decorated HTTP client and make sure the base folder exists.
     *
     * @param HttpClient          $client         Underlying HTTP client used to fetch images
     * @param string              $baseFolder     Folder under which all images are saved
     * @param string              $wallabagUrl    Public url of the instance (a trailing "/" is stripped)
     * @param LoggerInterface     $logger
     * @param MessageFactory|null $messageFactory Looked up through MessageFactoryDiscovery when omitted
     */
    public function __construct(HttpClient $client, $baseFolder, $wallabagUrl, LoggerInterface $logger, MessageFactory $messageFactory = null)
    {
        $this->logger = $logger;
        $this->baseFolder = $baseFolder;
        $this->wallabagUrl = rtrim($wallabagUrl, '/');
        $this->mimeGuesser = new MimeTypeExtensionGuesser();

        // NOTE(review): the ErrorPlugin presumably makes failed HTTP responses surface as exceptions - confirm against the HTTPlug documentation
        $pluginClient = new PluginClient($client, [new ErrorPlugin()]);

        if (null === $messageFactory) {
            $messageFactory = MessageFactoryDiscovery::find();
        }

        $this->client = new HttpMethodsClient($pluginClient, $messageFactory);

        $this->setFolder();
    }
39
40
    /**
     * Get the base folder given to the constructor, under which images are saved.
     *
     * @return string
     */
    public function getBaseFolder()
    {
        return $this->baseFolder;
    }
44
45
    /**
     * Collect every image url referenced by the <img> tags of the given html,
     * looking at both the "src" and the "srcset" attributes.
     *
     * @param string $html
     *
     * @return string[] Urls without duplicates (keys are not re-indexed)
     */
    public static function extractImagesUrlsFromHtml($html)
    {
        $images = (new Crawler($html))->filterXpath('//img');

        $urls = array_merge(
            $images->extract(['src']),
            self::getSrcsetUrls($images)
        );

        return array_unique($urls);
    }
61
62
    /**
     * Download every image found in the html, store it locally and rewrite
     * the html so that it points to the local copies.
     *
     * Images that can not be processed are left untouched in the html.
     *
     * @param int    $entryId ID of the entry
     * @param string $html
     * @param string $url     Used as a base path for relative image and folder
     *
     * @return string The html with the image urls replaced
     */
    public function processHtml($entryId, $html, $url)
    {
        $remoteUrls = self::extractImagesUrlsFromHtml($html);
        $relativePath = $this->getRelativePath($entryId);

        foreach ($remoteUrls as $remoteUrl) {
            // download the image and save it to the folder
            $localUrl = $this->processSingleImage($entryId, $remoteUrl, $url, $relativePath);

            if (false !== $localUrl) {
                $needle = $remoteUrl;

                // an url containing "&" that can't be found in the html is probably written with "&amp;" there
                if (false !== stripos($remoteUrl, '&') && false === stripos($html, $remoteUrl)) {
                    $needle = str_replace('&', '&amp;', $remoteUrl);
                }

                $html = str_replace($needle, $localUrl, $html);
            }
        }

        return $html;
    }
95
96
    /**
     * Process a single image:
     *     - retrieve it
     *     - re-save it (for security reasons)
     *     - return the url of the local copy.
     *
     * @param int         $entryId      ID of the entry
     * @param string      $imagePath    Path to the image to retrieve
     * @param string      $url          Url from where the image was found
     * @param string|null $relativePath Relative local path where the image is saved (computed from the entry id when null)
     *
     * @return string|false Url of the saved image (prefixed with the wallabag url), or false when the image was skipped
     */
    public function processSingleImage($entryId, $imagePath, $url, $relativePath = null)
    {
        if (null === $imagePath) {
            return false;
        }

        if (null === $relativePath) {
            $relativePath = $this->getRelativePath($entryId);
        }

        $this->logger->debug('DownloadImages: working on image: ' . $imagePath);

        $folderPath = $this->baseFolder . '/' . $relativePath;

        // build image path
        $absolutePath = $this->getAbsoluteLink($url, $imagePath);
        if (false === $absolutePath) {
            $this->logger->error('DownloadImages: Can not determine the absolute path for that image, skipping.');

            return false;
        }

        try {
            $res = $this->client->get($absolutePath);
        } catch (\Exception $e) {
            $this->logger->error('DownloadImages: Can not retrieve image, skipping.', ['exception' => $e]);

            return false;
        }

        // the extension is false when the image type is not allowed: stop here
        // instead of returning an url to a file that would never be written
        $ext = $this->getExtensionFromResponse($res, $imagePath);
        if (false === $ext) {
            return false;
        }

        $hashImage = hash('crc32', $absolutePath);
        $localPath = $folderPath . '/' . $hashImage . '.' . $ext;

        // read the body once, both GD and Imagick need it as a string
        $body = (string) $res->getBody();

        try {
            $im = imagecreatefromstring($body);
        } catch (\Exception $e) {
            $im = false;
        }

        if (false === $im) {
            $this->logger->error('DownloadImages: Error while regenerating image', ['path' => $localPath]);

            return false;
        }

        switch ($ext) {
            case 'gif':
                // use Imagick if available to keep GIF animation
                if (class_exists('\\Imagick')) {
                    try {
                        $imagick = new \Imagick();
                        $imagick->readImageBlob($body);
                        $imagick->setImageFormat('gif');
                        $imagick->writeImages($localPath, true);
                    } catch (\Exception $e) {
                        // if Imagick fail, fallback to the default solution
                        imagegif($im, $localPath);
                    }
                } else {
                    imagegif($im, $localPath);
                }

                $this->logger->debug('DownloadImages: Re-creating gif');
                break;
            case 'jpeg':
            case 'jpg':
                imagejpeg($im, $localPath, self::REGENERATE_PICTURES_QUALITY);
                $this->logger->debug('DownloadImages: Re-creating jpg');
                break;
            case 'png':
                imagealphablending($im, false);
                imagesavealpha($im, true);
                // imagepng expects an integer between 0 and 9, ceil() returns a float
                imagepng($im, $localPath, (int) ceil(self::REGENERATE_PICTURES_QUALITY / 100 * 9));
                $this->logger->debug('DownloadImages: Re-creating png');
                break;
        }

        imagedestroy($im);

        return $this->wallabagUrl . '/assets/images/' . $relativePath . '/' . $hashImage . '.' . $ext;
    }
194
195
    /**
     * Remove all images for the given entry id, then the folder itself.
     *
     * The cleanup is best effort: a file or folder that can not be removed is
     * logged and does not interrupt the process.
     *
     * @param int $entryId ID of the entry
     */
    public function removeImages($entryId)
    {
        $relativePath = $this->getRelativePath($entryId);
        $folderPath = $this->baseFolder . '/' . $relativePath;

        $finder = new Finder();
        $finder
            ->files()
            ->ignoreDotFiles(true)
            ->in($folderPath);

        foreach ($finder as $file) {
            $filePath = $file->getRealPath();

            // "@" only silences the PHP warning, the failure itself is reported through the logger
            if (false === $filePath || false === @unlink($filePath)) {
                $this->logger->warning('DownloadImages: Can not remove image', ['path' => $file->getPathname()]);
            }
        }

        if (false === @rmdir($folderPath)) {
            $this->logger->warning('DownloadImages: Can not remove folder', ['folder' => $folderPath]);
        }
    }
217
218
    /**
     * Build the relative path of the folder holding the images of an entry.
     *
     * The path is derived from the crc32 hash of the entry id: the first two
     * characters of the hash are used as nested folders, followed by the full hash.
     *
     * @param int  $entryId      ID of the entry
     * @param bool $createFolder Should we create the folder for the given id?
     *
     * @return string Path relative to the base folder
     */
    public function getRelativePath($entryId, $createFolder = true)
    {
        $hash = hash('crc32', $entryId);
        $relativePath = implode('/', [$hash[0], $hash[1], $hash]);
        $folderPath = $this->baseFolder . '/' . $relativePath;

        if ($createFolder && !file_exists($folderPath)) {
            mkdir($folderPath, 0777, true);
        }

        $this->logger->debug(
            'DownloadImages: Folder used for that Entry id',
            ['folder' => $folderPath, 'entryId' => $entryId]
        );

        return $relativePath;
    }
240
241
    /**
     * Get images urls from the srcset attribute of the given images.
     *
     * Each srcset candidate is expected to be an url followed by a width
     * ("480w") or a pixel density ("2x", "1.5x") descriptor. Only the url
     * part of each candidate is returned.
     *
     * @param Crawler $imagesCrawler Crawler holding the <img> nodes to inspect
     *
     * @return array An array of urls
     */
    private static function getSrcsetUrls(Crawler $imagesCrawler)
    {
        // Url: can't contain " OR ' OR a white space
        // Then zero or more white spaces
        // Then one or more digits (with an optional decimal part, e.g. "1.5x") followed by w OR x
        $pattern = '/(?:[^"\'\s]+\s*(?:\d+(?:\.\d+)?[wx])+)/';
        $urls = [];

        foreach ($imagesCrawler as $image) {
            $srcsetAttribute = $image->getAttribute('srcset');

            if ('' === $srcsetAttribute) {
                continue;
            }

            // the pattern has no capturing group, so every candidate ends up in $matches[0]
            preg_match_all($pattern, $srcsetAttribute, $matches);

            $srcsetUrls = array_map(function ($candidate) {
                // keep the url only: split on any white space (space, tab, new line), not just on a space
                return preg_split('/\s+/', $candidate, 2)[0];
            }, $matches[0]);

            $urls = array_merge($srcsetUrls, $urls);
        }

        return $urls;
    }
273
274
    /**
     * Make sure the base folder where all images are going to be saved exists.
     */
    private function setFolder()
    {
        if (file_exists($this->baseFolder)) {
            return;
        }

        // the folder doesn't exist yet: attempt to create it (and its parents)
        mkdir($this->baseFolder, 0755, true);
    }
284
285
    /**
     * Make an $url absolute based on the $base.
     *
     * An url that can not be parsed is reported through the logger and
     * results in false, like a base without scheme or host does.
     *
     * @see Graby->makeAbsoluteStr
     *
     * @param string $base Base url
     * @param string $url  Url to make it absolute
     *
     * @return false|string
     */
    private function getAbsoluteLink($base, $url)
    {
        if (preg_match('!^https?://!i', $url)) {
            // already absolute
            return $url;
        }

        try {
            $baseUri = new Uri($base);

            // in case the url has no scheme & host
            if ('' === $baseUri->getAuthority() || '' === $baseUri->getScheme()) {
                $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url]);

                return false;
            }

            return (string) UriResolver::resolve($baseUri, new Uri($url));
        } catch (\InvalidArgumentException $e) {
            // Uri refuses malformed input: skip that image instead of aborting the whole process
            $this->logger->error('DownloadImages: Can not make an absolute link', ['base' => $base, 'url' => $url, 'exception' => $e]);

            return false;
        }
    }
313
314
    /**
     * Find out the extension of the downloaded image and check that it is an allowed one.
     *
     * The extension is first guessed from the content-type header. When that
     * gives nothing, the first bytes of the body are compared with the known
     * jpeg, gif and png signatures.
     *
     * @param ResponseInterface $res       Http Response
     * @param string            $imagePath Path from the src image from the content (used for log only)
     *
     * @return string|false One of "jpeg", "jpg", "gif" or "png", false otherwise
     */
    private function getExtensionFromResponse(ResponseInterface $res, $imagePath)
    {
        $contentType = $res->getHeader('content-type');
        $ext = $this->mimeGuesser->guess(current($contentType));
        $this->logger->debug('DownloadImages: Checking extension', ['ext' => $ext, 'header' => $contentType]);

        // the header didn't give an extension, look at the first bytes of the body instead
        if (empty($ext)) {
            $signatures = [
                'jpeg' => "\xFF\xD8\xFF",
                'gif' => 'GIF',
                'png' => "\x89\x50\x4e\x47\x0d\x0a",
            ];
            $firstBytes = substr((string) $res->getBody(), 0, 8);

            foreach ($signatures as $candidate => $signature) {
                if (0 === strncmp($firstBytes, $signature, \strlen($signature))) {
                    $ext = $candidate;
                    break;
                }
            }

            $this->logger->debug('DownloadImages: Checking extension (alternative)', ['ext' => $ext]);
        }

        if (\in_array($ext, ['jpeg', 'jpg', 'gif', 'png'], true)) {
            return $ext;
        }

        $this->logger->error('DownloadImages: Processed image with not allowed extension. Skipping: ' . $imagePath);

        return false;
    }
354
}
355