Completed
Branch master (803fe5)
by Andrew
05:47 queued 03:48
created

ImageExtractor::getAllImages()   B

Complexity

Conditions 2
Paths 2

Size

Total Lines 25
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 25
c 0
b 0
f 0
rs 8.8571
cc 2
eloc 16
nc 2
nop 0
1
<?php declare(strict_types=1);
2
3
namespace Goose\Modules\Extractors;
4
5
use Goose\Article;
6
use Goose\Traits\ArticleMutatorTrait;
7
use Goose\Images\{Image, ImageUtils, LocallyStoredImage};
8
use Goose\Modules\{AbstractModule, ModuleInterface};
9
use DOMWrap\{Element, NodeList};
10
11
/**
12
 * Image Extractor
13
 *
14
 * @package Goose\Modules\Extractors
15
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
16
 */
17
class ImageExtractor extends AbstractModule implements ModuleInterface {
18
    use ArticleMutatorTrait;
19
20
    /**
     * Regular-expression fragments tested (case-insensitively) against an
     * image's src by isOkImageFileName(); a match rejects the image.
     *
     * @var string[]
     */
    private $badFileNames = [
        '\.html',
        '\.gif',
        '\.ico',
        'button',
        'twitter\.jpg',
        'facebook\.jpg',
        'ap_buy_photo',
        'digg\.jpg',
        'digg\.png',
        'delicious\.png',
        'facebook\.png',
        'reddit\.jpg',
        'doubleclick',
        'diggthis',
        'diggThis',
        'adserver',
        '\/ads\/',
        'ec\.atdmt\.com',
        'mediaplex\.com',
        'adsatt',
        'view\.atdmt',
    ];

    /**
     * Names looked up first as an element id and then as a class name by
     * checkForKnownElements() when searching for a site's main image.
     *
     * @var string[]
     */
    private static $KNOWN_IMG_DOM_NAMES = [
        'yn-story-related-media',
        'cnn_strylccimg300cntr',
        'big_photo',
        'ap-smallphoto-a',
    ];

    /**
     * getDepthLevel() stops widening the search once the parent depth exceeds this value.
     *
     * @var int
     */
    private static $MAX_PARENT_DEPTH = 2;

    /**
     * Cache filled by customSiteMapping(): domain => '|'-separated id/class names.
     *
     * @var string[]
     */
    private static $CUSTOM_SITE_MAPPING = [];
42
43
    /**
     * Attach the best image (and optionally every image) to the article.
     *
     * Nothing is extracted unless the 'image_fetch_best' option is enabled. The
     * full image list additionally requires 'image_fetch_all' and a top node
     * that is an Element.
     *
     * @inheritdoc
     */
    public function run(Article $article): self {
        $this->article($article);

        // Guard clause: image extraction is switched off entirely.
        if (!$this->config()->get('image_fetch_best')) {
            return $this;
        }

        $article->setTopImage($this->getBestImage());

        $fetchAll = $this->config()->get('image_fetch_all');

        if ($fetchAll && $article->getTopNode() instanceof Element) {
            $article->setAllImages($this->getAllImages());
        }

        return $this;
    }
58
59
    /**
     * Pick the article's top image.
     *
     * Sources are tried in order: known site-specific elements, meta/link
     * tags, then the large-image search starting at the article's top node.
     *
     * @return Image|null Null when no source yields an image
     */
    private function getBestImage(): ?Image {
        // Each checker returns an Image or null, so ?? falls through on "not found".
        $declaredImage = $this->checkForKnownElements() ?? $this->checkForMetaTag();

        if ($declaredImage !== null) {
            return $declaredImage;
        }

        $topNode = $this->article()->getTopNode();

        if (!($topNode instanceof Element)) {
            return null;
        }

        return $this->checkForLargeImages($topNode, 0, 0);
    }
85
86
    /**
     * Look for an image declared in the page's metadata.
     *
     * Twitter images are preferred (they tend to have the right size for us),
     * then Open Graph images (which seem to be smaller), and finally linked images.
     *
     * @return Image|null The first image found in that order, or null
     */
    private function checkForMetaTag(): ?Image {
        return $this->checkForTwitterTag()
            ?? $this->checkForOpenGraphTag()
            ?? $this->checkForLinkTag();
    }
113
114
    /**
     * Find the highest-scoring image under $node, widening the search if there is none.
     *
     * Although slow, the best way to determine the best image is to download
     * the candidates and check their actual dimensions, so the search is phased:
     * 1. get a list of ALL images below the node
     * 2. filter out any bad image names that we know of (gifs, ads, etc..)
     * 3. drop images whose byte size is outside the configured bounds
     * 4. score what is left (see scoreLocalImages()) and return the best one
     *
     * When the node yields no candidates the search moves on to the node
     * chosen by getDepthLevel() (a preceding sibling, or the parent).
     *
     * @param Element $node
     * @param int $parentDepthLevel
     * @param int $siblingDepthLevel
     *
     * @return Image|null
     */
    private function checkForLargeImages(Element $node, int $parentDepthLevel, int $siblingDepthLevel): ?Image {
        $goodLocalImages = $this->getImageCandidates($node);

        $scoredLocalImages = $this->scoreLocalImages($goodLocalImages, $parentDepthLevel);

        if (empty($scoredLocalImages)) {
            $depthObj = $this->getDepthLevel($node, $parentDepthLevel, $siblingDepthLevel);

            if ($depthObj !== null && $depthObj->node instanceof Element) {
                return $this->checkForLargeImages($depthObj->node, $depthObj->parentDepth, $depthObj->siblingDepth);
            }

            return null;
        }

        // Select the highest score. The strict ">" keeps the earliest image on
        // a tie, so images higher up in the document win when scores are equal.
        $best = null;

        foreach ($scoredLocalImages as $candidate) {
            if ($best === null || $candidate['score'] > $best['score']) {
                $best = $candidate;
            }
        }

        $bestLocalImage = $best['image'];

        $mainImage = new Image();
        $mainImage->setImageSrc($bestLocalImage->getImgSrc());
        $mainImage->setImageExtractionType('bigimage');
        $mainImage->setConfidenceScore(100 / count($scoredLocalImages));
        // NOTE(review): the score is now a float; confirm setImageScore() accepts one.
        $mainImage->setImageScore($best['score']);
        $mainImage->setBytes($bestLocalImage->getBytes());
        $mainImage->setHeight($bestLocalImage->getHeight());
        $mainImage->setWidth($bestLocalImage->getWidth());

        return $mainImage;
    }

    /**
     * Choose the next node to search when the current one has no usable image.
     *
     * The result is an object with the properties `node`, `parentDepth` and
     * `siblingDepth`: the preceding sibling element when one exists (sibling
     * depth + 1), otherwise the parent (parent depth + 1, sibling depth reset to 0).
     *
     * @param Element $node
     * @param int $parentDepth
     * @param int $siblingDepth
     *
     * @return object|null Null when the node has no Element parent or the
     *                     parent depth already exceeds MAX_PARENT_DEPTH
     */
    private function getDepthLevel(Element $node, int $parentDepth, int $siblingDepth): ?object {
        // $node is typed Element and therefore never null; only the parent needs checking.
        if (!($node->parent() instanceof Element)) {
            return null;
        }

        if ($parentDepth > self::$MAX_PARENT_DEPTH) {
            return null;
        }

        // Find previous sibling element node
        // (presumably preceding() returns the closest match, or null when there is none)
        $siblingNode = $node->preceding(function($node) {
            return $node instanceof Element;
        });

        if (is_null($siblingNode)) {
            return (object)[
                'node' => $node->parent(),
                'parentDepth' => $parentDepth + 1,
                'siblingDepth' => 0,
            ];
        }

        return (object)[
            'node' => $siblingNode,
            'parentDepth' => $parentDepth,
            'siblingDepth' => $siblingDepth + 1,
        ];
    }

    /**
     * Score locally downloaded images
     *
     * Images are scored in the order in which they appear, so images higher up
     * have more importance. The first image with a non-zero area is the
     * baseline and scores 1; its area, enlarged by a factor of 1.48, is what
     * every later image is measured against. An image found 3rd in the DOM
     * therefore scores (1 / 3) * (its area / baseline area).
     *
     * Images seen before any baseline exists that report a zero area score 0,
     * so they can only win when nothing better is available.
     *
     * The result is a list in document order. Scores are NOT used as array
     * keys: PHP truncates float keys to integers, which would make images with
     * similar scores overwrite each other.
     *
     * @param LocallyStoredImage[] $locallyStoredImages
     * @param int $depthLevel Currently unused; kept so existing calls stay valid
     *
     * @return array[] List of ['score' => float, 'image' => LocallyStoredImage]
     */
    private function scoreLocalImages($locallyStoredImages, int $depthLevel): array {
        $results = [];
        $position = 0;
        $baselineArea = 0.0;

        // Limit to the first 30 images
        foreach (array_slice($locallyStoredImages, 0, 30) as $locallyStoredImage) {
            $position++;
            $area = (float)($locallyStoredImage->getWidth() * $locallyStoredImage->getHeight());

            if ($baselineArea > 0) {
                // Sequence score (1 / position) times the area relative to the baseline.
                $totalScore = (1 / $position) * ($area / $baselineArea);
            } elseif ($area > 0) {
                $baselineArea = $area * 1.48;
                $totalScore = 1.0;
            } else {
                $totalScore = 0.0;
            }

            $results[] = [
                'score' => $totalScore,
                'image' => $locallyStoredImage,
            ];
        }

        return $results;
    }
236
237
    /**
     * Decide whether a downloaded image is good enough to be considered.
     *
     * An image is rejected when it is not larger than the configured minimum
     * width/height, when its file extension is 'NA', when $depthLevel is 1 or
     * more, when it is narrower than 300, or when it has banner-like proportions.
     *
     * Note: no other method of this class calls this one.
     *
     * @param LocallyStoredImage $locallyStoredImage
     * @param int $depthLevel
     *
     * @return bool
     */
    private function isWorthyImage($locallyStoredImage, int $depthLevel): bool {
        if ($locallyStoredImage->getWidth() <= $this->config()->get('image_min_width')) {
            return false;
        }

        if ($locallyStoredImage->getHeight() <= $this->config()->get('image_min_height')) {
            return false;
        }

        if ($locallyStoredImage->getFileExtension() == 'NA') {
            return false;
        }

        // Same truth table as "(depth < 1 && width < 300) || depth >= 1".
        if ($depthLevel >= 1 || $locallyStoredImage->getWidth() < 300) {
            return false;
        }

        return !$this->isBannerDimensions($locallyStoredImage->getWidth(), $locallyStoredImage->getHeight());
    }
254
255
    /**
     * Collect every image found below the article's top node.
     *
     * Each <img> src is resolved to a complete URL and handed to
     * getLocallyStoredImages(); every entry that comes back becomes an Image
     * with extraction type 'all' and a confidence score of 0.
     *
     * <img> elements without a usable src are skipped: an empty src would
     * otherwise resolve to the article's own URL and be fetched as if it were
     * an image.
     *
     * The caller must ensure the article has a top node.
     *
     * @return Image[]
     */
    private function getAllImages(): array {
        $imageUrls = [];

        // Generate a complete URL for each image
        foreach ($this->article()->getTopNode()->find('img') as $imageNode) {
            $src = trim((string)$imageNode->attr('src'));

            if ($src === '') {
                continue;
            }

            $imageUrls[] = $this->buildImagePath($src);
        }

        if (empty($imageUrls)) {
            return [];
        }

        $results = [];

        foreach ($this->getLocallyStoredImages($imageUrls) as $localImage) {
            $image = new Image();
            $image->setImageSrc($localImage->getImgSrc());
            $image->setBytes($localImage->getBytes());
            $image->setHeight($localImage->getHeight());
            $image->setWidth($localImage->getWidth());
            $image->setImageExtractionType('all');
            $image->setConfidenceScore(0);

            $results[] = $image;
        }

        return $results;
    }
284
285
    /**
     * Returns true if we think this is kind of a bannery dimension,
     * i.e. the longer side is more than 5 times the shorter side
     * (600 / 100 = 6 may be a fishy dimension for a good image).
     *
     * Equal sides are never a banner. If the shorter side is zero or negative
     * the ratio cannot be computed; such an image is reported as banner-like
     * instead of dividing by zero (which throws DivisionByZeroError on PHP 8).
     *
     * @param int $width
     * @param int $height
     *
     * @return bool
     */
    private function isBannerDimensions(int $width, int $height): bool {
        if ($width === $height) {
            return false;
        }

        $shorterSide = min($width, $height);

        if ($shorterSide <= 0) {
            return true;
        }

        return (max($width, $height) / $shorterSide) > 5;
    }
315
316
    /**
     * Keep the image elements whose src passes isOkImageFileName().
     *
     * Every rejected element additionally has remove() called on it
     * (presumably detaching it from the document — confirm against DOMWrap).
     *
     * @param \DOMWrap\NodeList $images
     *
     * @return Element[] The accepted elements, in their original order
     */
    private function filterBadNames(NodeList $images): array {
        $accepted = [];

        foreach ($images as $candidate) {
            if (!$this->isOkImageFileName($candidate)) {
                $candidate->remove();

                continue;
            }

            $accepted[] = $candidate;
        }

        return $accepted;
    }
336
337
    /**
     * Check an image's src against the list of bad file names we know of
     * (buttons, ad servers, social icons, ...).
     *
     * @param Element $imageNode
     *
     * @return bool False when the src is empty or matches one of the
     *              $badFileNames patterns (case-insensitive), true otherwise
     */
    private function isOkImageFileName(Element $imageNode): bool {
        $src = $imageNode->attr('src');

        if (empty($src)) {
            return false;
        }

        // One alternation of all fragments, '@'-delimited, case-insensitive.
        $pattern = sprintf('@%s@i', implode('|', $this->badFileNames));

        return !preg_match($pattern, $src);
    }
359
360
    /**
     * Gather the candidate images below a node: every <img> is first filtered
     * by file name and then by byte size.
     *
     * @param Element $node
     *
     * @return LocallyStoredImage[]
     */
    private function getImageCandidates(Element $node): array {
        return $this->findImagesThatPassByteSizeTest(
            $this->filterBadNames($node->find('img'))
        );
    }
372
373
    /**
     * Download the given images and keep those whose byte size is acceptable.
     *
     * Only the first 30 elements are considered. An image is rejected when its
     * size is non-zero but below 'image_min_bytes', or above 'image_max_bytes';
     * the <img> element belonging to a rejected image also has remove() called on it.
     *
     * Stored images are paired with their elements by position.
     * NOTE(review): this assumes getLocallyStoredImages(..., true) returns one
     * entry per URL, in the same order as the URLs — confirm.
     *
     * @todo Re-factor how the LocallyStoredImage => Image relation works ?
     *
     * @param Element[] $images
     *
     * @return LocallyStoredImage[] Accepted images; the original array keys are preserved
     */
    private function findImagesThatPassByteSizeTest(array $images): array {
        // Limit to the first 30 images (array_slice() also re-indexes them from 0)
        $images = array_slice($images, 0, 30);

        // Generate a complete URL for each image
        $imageUrls = array_map(function($image) {
            return $this->buildImagePath($image->attr('src'));
        }, $images);

        $localImages = $this->getLocallyStoredImages($imageUrls, true);

        $minBytes = $this->config()->get('image_min_bytes');
        $maxBytes = $this->config()->get('image_max_bytes');

        // The counter must be captured by reference: captured by value it
        // restarts at 0 on every invocation, and the first element would be
        // the one removed for every rejected image.
        $position = 0;

        return array_filter($localImages, function($localImage) use ($images, $minBytes, $maxBytes, &$position) {
            $imageNode = $images[$position++] ?? null;

            $bytes = $localImage->getBytes();

            $tooSmall = $bytes != 0 && $bytes < $minBytes;
            $tooLarge = $bytes > $maxBytes;

            if (!$tooSmall && !$tooLarge) {
                return true;
            }

            if ($imageNode !== null) {
                $imageNode->remove();
            }

            return false;
        });
    }
409
410
    /**
     * Look for a feature image announced through <link rel="image_src">.
     *
     * @return Image|null
     */
    private function checkForLinkTag(): ?Image {
        $selector = 'link[rel="image_src"]';

        return $this->checkForTag($selector, 'href', 'linktag');
    }
418
419
    /**
     * Look for an Open Graph image (og:image given as either a `property` or
     * a `name` meta attribute).
     *
     * @return Image|null
     */
    private function checkForOpenGraphTag(): ?Image {
        $selectors = [
            'meta[property="og:image"]',
            'meta[name="og:image"]',
        ];

        return $this->checkForTag(implode(',', $selectors), 'content', 'opengraph');
    }
427
428
    /**
     * Look for a Twitter card image (twitter:image or twitter:image:src, each
     * given as either a `property` or a `name` meta attribute).
     *
     * @return Image|null
     */
    private function checkForTwitterTag(): ?Image {
        $selectors = [
            'meta[property="twitter:image"]',
            'meta[name="twitter:image"]',
            'meta[property="twitter:image:src"]',
            'meta[name="twitter:image:src"]',
        ];

        return $this->checkForTag(implode(',', $selectors), 'content', 'twitter');
    }
436
437
    /**
     * Build an Image from the first element of the raw document matching a selector.
     *
     * The image URL is read from the given attribute, resolved to a complete
     * URL and downloaded so its bytes and dimensions can be recorded. The
     * image is then subject to ensureMinimumImageSize().
     *
     * @param string $selector CSS selector evaluated against the raw document
     * @param string $attr Attribute that holds the image URL
     * @param string $type Extraction type stored on the resulting Image
     *
     * @return Image|null Null when there is no raw document, no matching
     *                    element, no (non-empty) attribute value, or the image
     *                    is smaller than the configured minimum
     */
    private function checkForTag(string $selector, string $attr, string $type): ?Image {
        $rawDoc = $this->article()->getRawDoc();

        // Without a raw document there is nothing to query (find() on null would be fatal).
        if (!$rawDoc) {
            return null;
        }

        $meta = $rawDoc->find($selector);

        if (!$meta->count()) {
            return null;
        }

        $node = $meta->first();

        if (!($node instanceof Element) || !$node->hasAttribute($attr)) {
            return null;
        }

        $attrValue = trim((string)$node->attr($attr));

        // An empty value would resolve to the article's own URL, which is not an image.
        if ($attrValue === '') {
            return null;
        }

        $mainImage = new Image();
        $mainImage->setImageSrc($this->buildImagePath($attrValue));
        $mainImage->setImageExtractionType($type);
        $mainImage->setConfidenceScore(100);

        $locallyStoredImage = $this->getLocallyStoredImage($mainImage->getImageSrc());

        if (!empty($locallyStoredImage)) {
            $mainImage->setBytes($locallyStoredImage->getBytes());
            $mainImage->setHeight($locallyStoredImage->getHeight());
            $mainImage->setWidth($locallyStoredImage->getWidth());
        }

        return $this->ensureMinimumImageSize($mainImage);
    }
477
478
    /**
     * Pass an image through only when it reaches the configured
     * 'image_min_width' and 'image_min_height'.
     *
     * @param Image $mainImage
     *
     * @return Image|null The same image, or null when it is too small
     */
    private function ensureMinimumImageSize(Image $mainImage): ?Image {
        $wideEnough = $mainImage->getWidth() >= $this->config()->get('image_min_width');

        if (!$wideEnough) {
            return null;
        }

        $tallEnough = $mainImage->getHeight() >= $this->config()->get('image_min_height');

        return $tallEnough ? $mainImage : null;
    }
491
492
    /**
     * Store a single image locally and return its first result.
     *
     * @param string $imageSrc
     * @param bool $returnAll Passed through to ImageUtils::storeImagesToLocalFile()
     *
     * @return LocallyStoredImage|null Null when nothing was stored
     */
    private function getLocallyStoredImage(string $imageSrc, bool $returnAll = false): ?LocallyStoredImage {
        $stored = $this->getLocallyStoredImages([$imageSrc], $returnAll);

        // array_shift() yields null for an empty array.
        return array_shift($stored);
    }
503
504
    /**
     * Store the given images locally using this module's configuration.
     *
     * @param string[] $imageSrcs
     * @param bool $returnAll Passed through to ImageUtils::storeImagesToLocalFile()
     *
     * @return LocallyStoredImage[]
     */
    private function getLocallyStoredImages($imageSrcs, bool $returnAll = false): array {
        $config = $this->config();

        return ImageUtils::storeImagesToLocalFile($imageSrcs, $returnAll, $config);
    }
513
514
    /**
     * Reduce the article's domain to its last two dot-separated labels,
     * e.g. "www.example.com" becomes "example.com".
     *
     * @return string
     */
    private function getCleanDomain(): string {
        $labels = explode('.', $this->article()->getDomain());
        $lastTwoLabels = array_slice($labels, -2);

        return implode('.', $lastTwoLabels);
    }
520
521
    /**
     * In here we check for known image containers from sites we've checked out
     * like yahoo, techcrunch, etc... that have known places to look for good images.
     *
     * Each known name is looked up in the raw document as an element id and,
     * failing that, as a class name; the first <img> inside the first match is
     * the candidate. When several names match, the last one wins. Names
     * configured for the article's domain (see customSiteMapping()) are
     * appended to the built-in list.
     *
     * @todo enable this to use a series of settings files so people can define what the image ids/classes are on specific sites
     *
     * @return Image|null Null when there is no raw document, no known element
     *                    with an <img> that has a src, or the image is smaller
     *                    than the configured minimum
     */
    private function checkForKnownElements(): ?Image {
        $rawDoc = $this->article()->getRawDoc();

        if (!$rawDoc) {
            return null;
        }

        $knownImgDomNames = self::$KNOWN_IMG_DOM_NAMES;

        $domain = $this->getCleanDomain();

        $customSiteMapping = $this->customSiteMapping();

        if (isset($customSiteMapping[$domain])) {
            $knownImgDomNames = array_merge($knownImgDomNames, explode('|', $customSiteMapping[$domain]));
        }

        $knownImage = null;

        foreach ($knownImgDomNames as $knownName) {
            $knownName = trim((string)$knownName);

            // A blank name would produce the invalid selectors "#" and ".".
            if ($knownName === '') {
                continue;
            }

            $known = $rawDoc->find('#' . $knownName);

            if (!$known->count()) {
                $known = $rawDoc->find('.' . $knownName);
            }

            if (!$known->count()) {
                continue;
            }

            $candidates = $known->first()->find('img');

            if ($candidates->count()) {
                $knownImage = $candidates->first();
            }
        }

        if (is_null($knownImage)) {
            return null;
        }

        $knownImgSrc = trim((string)$knownImage->attr('src'));

        // An empty src would resolve to the article's own URL, which is not an image.
        if ($knownImgSrc === '') {
            return null;
        }

        $mainImage = new Image();
        $mainImage->setImageSrc($this->buildImagePath($knownImgSrc));
        $mainImage->setImageExtractionType('known');
        $mainImage->setConfidenceScore(90);

        $locallyStoredImage = $this->getLocallyStoredImage($mainImage->getImageSrc());

        if (!empty($locallyStoredImage)) {
            $mainImage->setBytes($locallyStoredImage->getBytes());
            $mainImage->setHeight($locallyStoredImage->getHeight());
            $mainImage->setWidth($locallyStoredImage->getWidth());
        }

        return $this->ensureMinimumImageSize($mainImage);
    }
585
586
    /**
     * This method will take an image path and build out the absolute URL to that image
     * using the final URL of the article, so images referenced with relative
     * URLs such as "images/photo.jpg" can be fetched.
     *
     * Components are inherited from the article URL in the order scheme, host,
     * port, path, query, and only until the first component the image URL
     * defines itself. This follows the reference resolution rules of RFC 3986:
     * an absolute image URL must not pick up the article's port or query, and
     * a relative path must not pick up the article's query.
     *
     * A relative path is appended to the article path up to and including its
     * last "/". Dot segments ("./", "../") are not normalised here.
     *
     * @param string $imageSrc
     *
     * @return string
     */
    private function buildImagePath(string $imageSrc): string {
        $inheritableParts = ['scheme', 'host', 'port', 'path', 'query'];

        // parse_url() returns false on seriously malformed URLs; treat that as "no components".
        $imageUrlParts = parse_url($imageSrc) ?: [];
        $articleUrlParts = parse_url($this->article()->getFinalUrl()) ?: [];

        // An empty path counts as "not given", so the article's path can be inherited below.
        if (($imageUrlParts['path'] ?? '') === '') {
            unset($imageUrlParts['path']);
        }

        // Only scheme-less, host-less references are relative to the article.
        // This keeps e.g. "data:" URIs from being merged with the article's directory.
        $isRelativeReference = !isset($imageUrlParts['scheme']) && !isset($imageUrlParts['host']);

        if ($isRelativeReference && isset($imageUrlParts['path']) && $imageUrlParts['path'][0] !== '/') {
            $articlePath = $articleUrlParts['path'] ?? '/';
            $lastSlash = strrpos($articlePath, '/');
            $baseDir = $lastSlash === false ? '/' : substr($articlePath, 0, $lastSlash + 1);

            $imageUrlParts['path'] = $baseDir . $imageUrlParts['path'];
        }

        foreach ($inheritableParts as $part) {
            // From the first component the image defines, nothing else is inherited.
            if (isset($imageUrlParts[$part])) {
                break;
            }

            if (isset($articleUrlParts[$part])) {
                $imageUrlParts[$part] = $articleUrlParts[$part];
            }
        }

        return http_build_url($imageUrlParts, []);
    }
621
622
    /**
     * Load (once) the site-specific list of known image containers.
     *
     * The resource file holds one "domain^names" entry per line, where names
     * is a '|'-separated list of element ids/classes. The parsed result is
     * cached in self::$CUSTOM_SITE_MAPPING.
     *
     * Blank lines and lines without a '^' separator are skipped. If the file
     * cannot be read the mapping simply stays empty.
     *
     * @return string[] Map of domain => '|'-separated id/class names
     */
    private function customSiteMapping(): array {
        if (!empty(self::$CUSTOM_SITE_MAPPING)) {
            return self::$CUSTOM_SITE_MAPPING;
        }

        $file = __DIR__ . '/../../../resources/images/known-image-css.txt';

        $contents = is_readable($file) ? file_get_contents($file) : false;

        if ($contents === false) {
            return self::$CUSTOM_SITE_MAPPING;
        }

        // Normalise Windows / old Mac line endings before splitting.
        $lines = explode("\n", str_replace(["\r\n", "\r"], "\n", $contents));

        foreach ($lines as $line) {
            $fields = explode('^', trim($line));

            // Skips blank lines (e.g. a trailing newline) and malformed entries.
            if (count($fields) < 2 || $fields[0] === '') {
                continue;
            }

            [$domain, $css] = $fields;

            self::$CUSTOM_SITE_MAPPING[$domain] = $css;
        }

        return self::$CUSTOM_SITE_MAPPING;
    }
642
643
}
644