ImageExtractor::buildImagePath()   B
last analyzed

Complexity

Conditions 9
Paths 8

Size

Total Lines 26
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 1
Metric Value
eloc 17
c 2
b 0
f 1
dl 0
loc 26
rs 8.0555
cc 9
nc 8
nop 1
1
<?php declare(strict_types=1);
2
3
namespace Goose\Modules\Extractors;
4
5
use Goose\Article;
6
use Goose\Traits\ArticleMutatorTrait;
7
use Goose\Images\{Image, ImageUtils, LocallyStoredImage};
8
use Goose\Modules\{AbstractModule, ModuleInterface};
9
use DOMWrap\{Element, NodeList};
10
11
/**
12
 * Image Extractor
13
 *
14
 * @package Goose\Modules\Extractors
15
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
16
 */
17
class ImageExtractor extends AbstractModule implements ModuleInterface {
18
    use ArticleMutatorTrait;
19
20
    /**
     * Regular-expression fragments; isOkImageFileName() joins them with '|'
     * and rejects any image whose src matches (case-insensitively).
     *
     * @var string[]
     */
    private $badFileNames = [
        '\.html',
        '\.gif',
        '\.ico',
        'button',
        'twitter\.jpg',
        'facebook\.jpg',
        'ap_buy_photo',
        'digg\.jpg',
        'digg\.png',
        'delicious\.png',
        'facebook\.png',
        'reddit\.jpg',
        'doubleclick',
        'diggthis',
        'diggThis',
        'adserver',
        '\/ads\/',
        'ec\.atdmt\.com',
        'mediaplex\.com',
        'adsatt',
        'view\.atdmt',
    ];

    /**
     * Names that checkForKnownElements() looks up first as an element id
     * and then as a class name.
     *
     * @var string[]
     */
    private static $KNOWN_IMG_DOM_NAMES = [
        'yn-story-related-media',
        'cnn_strylccimg300cntr',
        'big_photo',
        'ap-smallphoto-a'
    ];

    /**
     * getDepthLevel() gives up once the parent depth exceeds this value.
     *
     * @var int
     */
    private static $MAX_PARENT_DEPTH = 2;

    /**
     * Cache populated by customSiteMapping(); keyed by the text before '^'
     * on each line of the mapping file, valued by the text after it.
     *
     * @var string[]
     */
    private static $CUSTOM_SITE_MAPPING = [];
42
43
    /**
     * Stores the article, then sets its top image and, when also enabled and
     * a top node exists, the list of all images.
     *
     * Nothing is extracted unless the 'image_fetch_best' option is truthy.
     *
     * @inheritdoc
     */
    public function run(Article $article): self {
        $this->article($article);

        if (!$this->config()->get('image_fetch_best')) {
            return $this;
        }

        $article->setTopImage($this->getBestImage());

        $fetchAll = $this->config()->get('image_fetch_all');

        if ($fetchAll && $article->getTopNode() instanceof Element) {
            $article->setAllImages($this->getAllImages());
        }

        return $this;
    }
58
59
    /**
     * Tries each strategy in turn and returns the first image found:
     * known page elements, then meta tags, then large images around the
     * article's top node (only when that node is an Element).
     *
     * @return Image|null
     */
    private function getBestImage(): ?Image {
        $fromMarkup = $this->checkForKnownElements() ?? $this->checkForMetaTag();

        if ($fromMarkup !== null) {
            return $fromMarkup;
        }

        $topNode = $this->article()->getTopNode();

        if (!($topNode instanceof Element)) {
            return null;
        }

        return $this->checkForLargeImages($topNode, 0, 0);
    }
85
86
    /**
     * Prefer Twitter images (as they tend to have the right size for us), then Open Graph images
     * (which seem to be smaller), and finally linked images.
     *
     * Each later lookup only runs when the previous one returned null.
     *
     * @return Image|null
     */
    private function checkForMetaTag(): ?Image {
        return $this->checkForTwitterTag()
            ?? $this->checkForOpenGraphTag()
            ?? $this->checkForLinkTag();
    }
113
114
    /**
     * Finds the best-scoring image inside a node, widening the search to
     * neighbouring nodes when the node itself yields no candidates.
     *
     * Although slow, the best way to determine the best image is to download them and check the
     * actual dimensions of the image when on disk, so we go through a phased approach:
     * 1. get a list of ALL images from the node
     * 2. filter out any bad image names that we know of (gifs, ads, etc..)
     * 3. keep only the images whose byte size is within the configured limits
     * 4. score what is left (see scoreLocalImages()) and take the highest score
     *
     * When no candidate survives, getDepthLevel() picks the next node to inspect
     * and the search recurses into it.
     *
     * @param Element $node
     * @param int $parentDepthLevel
     * @param int $siblingDepthLevel
     *
     * @return Image|null
     */
    private function checkForLargeImages(Element $node, int $parentDepthLevel, int $siblingDepthLevel): ?Image {
        $scoredLocalImages = $this->scoreLocalImages($this->getImageCandidates($node));

        if (empty($scoredLocalImages)) {
            $depthObj = $this->getDepthLevel($node, $parentDepthLevel, $siblingDepthLevel);

            if ($depthObj && null !== $depthObj->node) {
                return $this->checkForLargeImages($depthObj->node, $depthObj->parentDepth, $depthObj->siblingDepth);
            }

            return null;
        }

        // scoreLocalImages() returns its entries best-first, so the winner is at index 0.
        $best = $scoredLocalImages[0];
        $bestLocalImage = $best['image'];

        $mainImage = new Image();
        $mainImage->setImageSrc($bestLocalImage->getImgSrc());
        $mainImage->setImageExtractionType('bigimage');
        $mainImage->setConfidenceScore(100 / count($scoredLocalImages));
        $mainImage->setImageScore($best['score']);
        $mainImage->setBytes($bestLocalImage->getBytes());
        $mainImage->setHeight($bestLocalImage->getHeight());
        $mainImage->setWidth($bestLocalImage->getWidth());

        return $mainImage;
    }

    /**
     * Chooses the next node for checkForLargeImages() to inspect.
     *
     * The previous sibling element is preferred; when there is none the parent
     * is used instead, its depth counter is incremented and the sibling counter reset.
     * Returns null when the node has no Element parent or when the parent depth
     * already exceeds MAX_PARENT_DEPTH.
     *
     * @param Element $node
     * @param int $parentDepth
     * @param int $siblingDepth
     *
     * @return \stdClass|null Object with the properties node, parentDepth and siblingDepth
     */
    private function getDepthLevel(Element $node, int $parentDepth, int $siblingDepth): ?\stdClass {
        if (!($node->parent() instanceof Element)) {
            return null;
        }

        if ($parentDepth > self::$MAX_PARENT_DEPTH) {
            return null;
        }

        // Find previous sibling element node
        $siblingNode = $node->preceding(function($node) {
            return $node instanceof Element;
        });

        if (is_null($siblingNode)) {
            return (object)[
                'node' => $node->parent(),
                'parentDepth' => $parentDepth + 1,
                'siblingDepth' => 0,
            ];
        }

        return (object)[
            'node' => $siblingNode,
            'parentDepth' => $parentDepth,
            'siblingDepth' => $siblingDepth + 1,
        ];
    }

    /**
     * Scores locally downloaded images and returns them best-first.
     *
     * We score the images in the order in which they appear, so images higher up have more importance.
     * The first image with a non-zero area becomes the baseline and scores 1; the baseline area is
     * inflated by 1.48 so later images must be clearly larger to win. Every later image scores
     * (1 / position) * (its area / baseline area), so the 3rd image found in the DOM scores
     * 1 / 3 = .33 times its area ratio to the baseline.
     *
     * Scores are kept as floats inside each entry rather than used as array keys,
     * because PHP truncates float keys to integers and equal keys overwrite each other.
     *
     * @param LocallyStoredImage[] $locallyStoredImages
     *
     * @return array[] List of ['score' => float, 'position' => int, 'image' => LocallyStoredImage],
     *                 sorted by descending score; ties keep document order
     */
    private function scoreLocalImages($locallyStoredImages): array {
        $scored = [];
        $position = 1;
        $initialArea = 0.0;

        // Limit to the first 30 images
        $locallyStoredImages = array_slice($locallyStoredImages, 0, 30);

        foreach ($locallyStoredImages as $locallyStoredImage) {
            $area = (float)($locallyStoredImage->getWidth() * $locallyStoredImage->getHeight());

            if ($initialArea === 0.0) {
                // No usable baseline yet: this image becomes the baseline
                $initialArea = $area * 1.48;
                $totalScore = 1.0;
            } else {
                $sequenceScore = 1 / $position;
                $areaDifference = $area / $initialArea;
                $totalScore = $sequenceScore * $areaDifference;
            }

            $scored[] = [
                'score' => $totalScore,
                'position' => $position,
                'image' => $locallyStoredImage,
            ];

            $position++;
        }

        // Highest score first; the explicit position tie-break keeps the order deterministic
        usort($scored, function(array $a, array $b): int {
            $byScore = $b['score'] <=> $a['score'];

            return $byScore !== 0 ? $byScore : $a['position'] <=> $b['position'];
        });

        return $scored;
    }
235
236
    /**
     * Builds an Image for every <img> under the article's top node that could
     * be stored locally.
     *
     * <img> elements without a usable src are skipped: resolving an empty src
     * would yield a URL of the page itself rather than of an image.
     *
     * @return Image[]
     */
    private function getAllImages(): array {
        $imageUrls = [];

        foreach ($this->article()->getTopNode()->find('img') as $imageNode) {
            $src = (string)$imageNode->attr('src');

            if (trim($src) === '') {
                continue;
            }

            // Generate a complete URL for each image
            $imageUrls[] = $this->buildImagePath($src);
        }

        $results = [];

        foreach ($this->getLocallyStoredImages($imageUrls) as $localImage) {
            $image = new Image();
            $image->setImageSrc($localImage->getImgSrc());
            $image->setBytes($localImage->getBytes());
            $image->setHeight($localImage->getHeight());
            $image->setWidth($localImage->getWidth());
            $image->setImageExtractionType('all');
            $image->setConfidenceScore(0);

            $results[] = $image;
        }

        return $results;
    }
265
266
    /**
     * Keeps the image elements whose file name passes isOkImageFileName()
     * and destroys the others.
     *
     * @param \DOMWrap\NodeList $images
     *
     * @return Element[]
     */
    private function filterBadNames(NodeList $images): array {
        $kept = [];

        foreach ($images as $candidate) {
            if (!$this->isOkImageFileName($candidate)) {
                $candidate->destroy();

                continue;
            }

            $kept[] = $candidate;
        }

        return $kept;
    }
286
287
    /**
     * Tells whether an image's src is acceptable: it must be non-empty and must
     * not match any entry of $badFileNames (buttons, ad servers, etc...).
     *
     * @param Element $imageNode
     *
     * @return bool
     */
    private function isOkImageFileName(Element $imageNode): bool {
        $src = $imageNode->attr('src');

        if (empty($src)) {
            return false;
        }

        $pattern = '@' . implode('|', $this->badFileNames) . '@i';

        return !preg_match($pattern, $src);
    }
309
310
    /**
     * Collects the <img> elements under a node, drops those with bad file names
     * and returns the stored images that pass the byte-size test.
     *
     * @param Element $node
     *
     * @return LocallyStoredImage[]
     */
    private function getImageCandidates(Element $node): array {
        $wellNamed = $this->filterBadNames($node->find('img'));

        return $this->findImagesThatPassByteSizeTest($wellNamed);
    }
322
323
    /**
     * Stores the first 30 images locally and keeps those whose byte size lies
     * within the configured 'image_min_bytes' / 'image_max_bytes' limits.
     *
     * An image is rejected when it is larger than the maximum, or smaller than the
     * minimum while reporting a non-zero size. The DOM element of a rejected
     * image is destroyed.
     *
     * @todo Re-factor how the LocallyStoredImage => Image relation works ?
     *
     * @param Element[] $images
     *
     * @return LocallyStoredImage[]
     */
    private function findImagesThatPassByteSizeTest(array $images): array {
        // Limit to the first 30 images
        $images = array_slice($images, 0, 30);

        // Generate a complete URL for each image
        $imageUrls = array_map(function($image) {
            return $this->buildImagePath($image->attr('src'));
        }, $images);

        $localImages = $this->getLocallyStoredImages($imageUrls, true);

        $minBytes = $this->config()->get('image_min_bytes');
        $maxBytes = $this->config()->get('image_max_bytes');

        // Stored images are paired with their DOM elements by position.
        // NOTE(review): assumes the store returns one entry per URL, in order, when $returnAll is true.
        $position = 0;

        // $position is captured by reference so the increment survives between callback calls;
        // captured by value it would stay at 0 and always point at the first element.
        return array_filter($localImages, function($localImage) use ($images, $minBytes, $maxBytes, &$position) {
            $image = $images[$position] ?? null;
            $position++;

            $bytes = $localImage->getBytes();

            if (($bytes < $minBytes && $bytes != 0) || $bytes > $maxBytes) {
                if ($image instanceof Element) {
                    $image->destroy();
                }

                return false;
            }

            return true;
        });
    }
359
360
    /**
     * Looks for a feature image declared through a <link rel="image_src"> tag,
     * reading its href attribute.
     *
     * @return Image|null
     */
    private function checkForLinkTag(): ?Image {
        $selector = 'link[rel="image_src"]';

        return $this->checkForTag($selector, 'href', 'linktag');
    }
368
369
    /**
     * Looks for an Open Graph image meta tag (declared via either the property
     * or the name attribute), reading its content attribute.
     *
     * @return Image|null
     */
    private function checkForOpenGraphTag(): ?Image {
        $selector = implode(',', [
            'meta[property="og:image"]',
            'meta[name="og:image"]',
        ]);

        return $this->checkForTag($selector, 'content', 'opengraph');
    }
377
378
    /**
     * Looks for a Twitter image meta tag (twitter:image or twitter:image:src,
     * declared via either the property or the name attribute), reading its
     * content attribute.
     *
     * @return Image|null
     */
    private function checkForTwitterTag(): ?Image {
        $selector = implode(',', [
            'meta[property="twitter:image"]',
            'meta[name="twitter:image"]',
            'meta[property="twitter:image:src"]',
            'meta[name="twitter:image:src"]',
        ]);

        return $this->checkForTag($selector, 'content', 'twitter');
    }
386
387
    /**
     * Builds an Image from the first element of the raw document matching a selector.
     *
     * Returns null when there is no raw document, no matching element, the
     * attribute is missing or empty, or the resulting image is smaller than
     * the configured minimum size.
     *
     * @param string $selector CSS selector of the tag to look for
     * @param string $attr Attribute holding the image URL
     * @param string $type Extraction type recorded on the image
     *
     * @return Image|null
     */
    private function checkForTag(string $selector, string $attr, string $type): ?Image {
        $rawDoc = $this->article()->getRawDoc();

        // Same guard as checkForKnownElements(): without a document there is nothing to search
        if (!$rawDoc) {
            return null;
        }

        $meta = $rawDoc->find($selector);

        if (!$meta->count()) {
            return null;
        }

        $node = $meta->first();

        if (!($node instanceof Element)) {
            return null;
        }

        if (!$node->hasAttribute($attr) || !$node->attr($attr)) {
            return null;
        }

        $imagePath = $this->buildImagePath($node->attr($attr));
        $mainImage = new Image();
        $mainImage->setImageSrc($imagePath);
        $mainImage->setImageExtractionType($type);
        $mainImage->setConfidenceScore(100);

        $locallyStoredImage = $this->getLocallyStoredImage($mainImage->getImageSrc());

        // Dimensions are only known when the image could be stored locally
        if (!empty($locallyStoredImage)) {
            $mainImage->setBytes($locallyStoredImage->getBytes());
            $mainImage->setHeight($locallyStoredImage->getHeight());
            $mainImage->setWidth($locallyStoredImage->getWidth());
        }

        return $this->ensureMinimumImageSize($mainImage);
    }
427
428
    /**
     * Returns the image when its width and height both reach the configured
     * 'image_min_width' and 'image_min_height', null otherwise.
     *
     * @param Image $mainImage
     *
     * @return Image|null
     */
    private function ensureMinimumImageSize(Image $mainImage): ?Image {
        $wideEnough = $mainImage->getWidth() >= $this->config()->get('image_min_width');

        // The height is only examined once the width has passed
        $bigEnough = $wideEnough
            && $mainImage->getHeight() >= $this->config()->get('image_min_height');

        return $bigEnough ? $mainImage : null;
    }
441
442
    /**
     * Stores a single image locally and returns the first stored result,
     * or null when nothing came back.
     *
     * @param string $imageSrc
     * @param bool $returnAll
     *
     * @return LocallyStoredImage|null
     */
    private function getLocallyStoredImage(string $imageSrc, bool $returnAll = false): ?LocallyStoredImage {
        $sources = [$imageSrc];
        $stored = ImageUtils::storeImagesToLocalFile($sources, $returnAll, $this->config());

        return array_shift($stored);
    }
453
454
    /**
     * Stores several images locally through ImageUtils, using this module's
     * configuration.
     *
     * @param string[] $imageSrcs
     * @param bool $returnAll
     *
     * @return LocallyStoredImage[]
     */
    private function getLocallyStoredImages($imageSrcs, bool $returnAll = false): array {
        $config = $this->config();

        return ImageUtils::storeImagesToLocalFile($imageSrcs, $returnAll, $config);
    }
463
464
    /**
     * Returns the last two dot-separated labels of the article's domain
     * (e.g. "www.example.com" gives "example.com").
     *
     * An article without a domain yields an empty string instead of a
     * TypeError from explode() under strict types.
     *
     * @return string
     */
    private function getCleanDomain(): string {
        $domain = (string)$this->article()->getDomain();

        $labels = explode('.', $domain);

        return implode('.', array_slice($labels, -2, 2));
    }
470
471
    /**
     * In here we check for known image containers from sites we've checked out like yahoo, techcrunch, etc... that have
     * known places to look for good images.
     *
     * The built-in names are extended with the '|'-separated names mapped to the
     * article's clean domain, if any. Each name is tried as an element id, then as
     * a class name; the first <img> inside the matched element is the candidate.
     * When several names match, the last one wins. Empty names and images without
     * a src are ignored.
     *
     * @todo enable this to use a series of settings files so people can define what the image ids/classes are on specific sites
     *
     * @return Image|null
     */
    private function checkForKnownElements(): ?Image {
        $rawDoc = $this->article()->getRawDoc();

        if (!$rawDoc) {
            return null;
        }

        $knownImgDomNames = self::$KNOWN_IMG_DOM_NAMES;

        $domain = $this->getCleanDomain();

        $customSiteMapping = $this->customSiteMapping();

        if (isset($customSiteMapping[$domain])) {
            $knownImgDomNames = array_merge($knownImgDomNames, explode('|', $customSiteMapping[$domain]));
        }

        $knownImage = null;

        foreach ($knownImgDomNames as $knownName) {
            // An empty name would produce the invalid selectors '#' and '.'
            if ($knownName === '') {
                continue;
            }

            $known = $rawDoc->find('#' . $knownName);

            if (!$known->count()) {
                $known = $rawDoc->find('.' . $knownName);
            }

            if (!$known->count()) {
                continue;
            }

            $candidates = $known->first()->find('img');

            if (!$candidates->count()) {
                continue;
            }

            $candidate = $candidates->first();

            // Without a src there is no image to resolve or download
            if (trim((string)$candidate->attr('src')) !== '') {
                $knownImage = $candidate;
            }
        }

        if (is_null($knownImage)) {
            return null;
        }

        $mainImage = new Image();
        $mainImage->setImageSrc($this->buildImagePath($knownImage->attr('src')));
        $mainImage->setImageExtractionType('known');
        $mainImage->setConfidenceScore(90);

        $locallyStoredImage = $this->getLocallyStoredImage($mainImage->getImageSrc());

        // Dimensions are only known when the image could be stored locally
        if (!empty($locallyStoredImage)) {
            $mainImage->setBytes($locallyStoredImage->getBytes());
            $mainImage->setHeight($locallyStoredImage->getHeight());
            $mainImage->setWidth($locallyStoredImage->getWidth());
        }

        return $this->ensureMinimumImageSize($mainImage);
    }
535
536
    /**
     * This method will take an image path and build out the absolute path to that image
     * using the final url of the article, so we can find a link to the image if they use
     * relative urls like ../myimage.jpg
     *
     * Missing pieces are inherited from the article URL following the reference
     * resolution rules of RFC 3986 section 5.2:
     * - a src with its own scheme is left untouched;
     * - a src with a host but no scheme ("//cdn.example.com/a.jpg") only inherits the scheme;
     * - otherwise scheme, host and port are inherited, and
     *   - an empty path inherits the article path (and its query if the src has none),
     *   - a path not starting with "/" is appended to the directory of the article path.
     * The article's port and query are never added to a src that names its own host or path.
     * Dot segments ("./", "../") are left in the path as written.
     *
     * @param string $imageSrc
     *
     * @return string
     */
    private function buildImagePath(string $imageSrc): string {
        // parse_url() returns false on seriously malformed input; treat that as "no parts"
        $imageUrlParts = parse_url(trim($imageSrc)) ?: [];
        $articleUrlParts = parse_url((string)$this->article()->getFinalUrl()) ?: [];

        if (!isset($imageUrlParts['scheme'])) {
            if (isset($articleUrlParts['scheme'])) {
                $imageUrlParts['scheme'] = $articleUrlParts['scheme'];
            }

            if (!isset($imageUrlParts['host'])) {
                foreach (['host', 'port'] as $part) {
                    if (isset($articleUrlParts[$part])) {
                        $imageUrlParts[$part] = $articleUrlParts[$part];
                    }
                }

                $imagePath = $imageUrlParts['path'] ?? '';
                $articlePath = $articleUrlParts['path'] ?? '';

                if ($imagePath === '') {
                    // Reference to the article's own path
                    if ($articlePath !== '') {
                        $imageUrlParts['path'] = $articlePath;
                    }

                    if (!isset($imageUrlParts['query']) && isset($articleUrlParts['query'])) {
                        $imageUrlParts['query'] = $articleUrlParts['query'];
                    }
                } elseif ($imagePath[0] !== '/') {
                    // Everything up to and including the last "/" is the article's directory;
                    // "/a/b/" stays "/a/b/" and "/post" (or no path at all) becomes "/"
                    $lastSlash = strrpos($articlePath, '/');
                    $baseDir = $lastSlash === false ? '/' : substr($articlePath, 0, $lastSlash + 1);

                    $imageUrlParts['path'] = $baseDir . $imagePath;
                }
            }
        }

        return http_build_url($imageUrlParts, []);
    }
571
572
    /**
     * Returns the site-specific image element names, loading them on first use
     * from resources/images/known-image-css.txt.
     *
     * Each line of the file has the form "domain^names". Lines without a "^" or
     * with an empty domain are skipped. When the file cannot be read the mapping
     * stays empty.
     *
     * @return string[] Names string keyed by domain
     */
    private function customSiteMapping(): array {
        if (!empty(self::$CUSTOM_SITE_MAPPING)) {
            return self::$CUSTOM_SITE_MAPPING;
        }

        $file = __DIR__ . '/../../../resources/images/known-image-css.txt';

        $contents = is_readable($file) ? file_get_contents($file) : false;

        if ($contents === false) {
            return self::$CUSTOM_SITE_MAPPING;
        }

        // Normalise Windows and old Mac line endings before splitting
        $lines = explode("\n", str_replace(["\r\n", "\r"], "\n", $contents));

        foreach ($lines as $line) {
            $fields = explode('^', $line);

            if (count($fields) < 2 || $fields[0] === '') {
                continue;
            }

            self::$CUSTOM_SITE_MAPPING[$fields[0]] = $fields[1];
        }

        return self::$CUSTOM_SITE_MAPPING;
    }
592
593
}
594