Passed
Push — master ( 803fe5...201ac0 )
by Andrew
02:01
created

ImageExtractor::isWorthyImage()   B

Complexity

Conditions 8
Paths 2

Size

Total Lines 10
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 10
c 0
b 0
f 0
rs 7.7777
cc 8
eloc 7
nc 2
nop 2
1
<?php declare(strict_types=1);
2
3
namespace Goose\Modules\Extractors;
4
5
use Goose\Article;
6
use Goose\Traits\ArticleMutatorTrait;
7
use Goose\Images\{Image, ImageUtils, LocallyStoredImage};
8
use Goose\Modules\{AbstractModule, ModuleInterface};
9
use DOMWrap\{Element, NodeList};
10
11
/**
12
 * Image Extractor
13
 *
14
 * @package Goose\Modules\Extractors
15
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
16
 */
17
class ImageExtractor extends AbstractModule implements ModuleInterface {
18
    use ArticleMutatorTrait;
19
20
    /**
     * Regular-expression fragments describing unwanted image sources.
     * isOkImageFileName() joins them with "|" and matches them
     * case-insensitively against an image's src attribute.
     *
     * @var string[]
     */
    private $badFileNames = [
        '\.html',
        '\.gif',
        '\.ico',
        'button',
        'twitter\.jpg',
        'facebook\.jpg',
        'ap_buy_photo',
        'digg\.jpg',
        'digg\.png',
        'delicious\.png',
        'facebook\.png',
        'reddit\.jpg',
        'doubleclick',
        'diggthis',
        'diggThis',
        'adserver',
        '\/ads\/',
        'ec\.atdmt\.com',
        'mediaplex\.com',
        'adsatt',
        'view\.atdmt',
    ];

    /**
     * Element ids / class names that checkForKnownElements() searches for
     * (first as "#name", then as ".name") on every site.
     *
     * @var string[]
     */
    private static $KNOWN_IMG_DOM_NAMES = [
        'yn-story-related-media',
        'cnn_strylccimg300cntr',
        'big_photo',
        'ap-smallphoto-a',
    ];

    /**
     * Upper bound that getDepthLevel() applies to the parent depth counter.
     *
     * @var int
     */
    private static $MAX_PARENT_DEPTH = 2;

    /**
     * Per-domain id / class names, filled lazily by customSiteMapping().
     *
     * @var string[]
     */
    private static $CUSTOM_SITE_MAPPING = [];
42
43
    /**
     * Attach the article, then populate its top image and, optionally, the
     * list of all images found in the top node.
     *
     * The "all images" step only runs when best-image fetching is enabled as
     * well and the article's top node is an Element.
     *
     * @inheritdoc
     */
    public function run(Article $article): self {
        $this->article($article);

        // Everything below is gated on the best-image switch.
        if (!$this->config()->get('image_fetch_best')) {
            return $this;
        }

        $article->setTopImage($this->getBestImage());

        $wantsAllImages = $this->config()->get('image_fetch_all');

        if ($wantsAllImages && $article->getTopNode() instanceof Element) {
            $article->setAllImages($this->getAllImages());
        }

        return $this;
    }
58
59
    /**
     * Try each image source in order of preference and return the first hit:
     * known page elements, then meta/link tags, then large images found in
     * or near the article's top node.
     *
     * @return Image|null
     */
    private function getBestImage(): ?Image {
        // The second lookup only runs when the first one found nothing.
        $fromMarkup = $this->checkForKnownElements() ?: $this->checkForMetaTag();

        if ($fromMarkup) {
            return $fromMarkup;
        }

        $topNode = $this->article()->getTopNode();

        if (!($topNode instanceof Element)) {
            return null;
        }

        return $this->checkForLargeImages($topNode, 0, 0);
    }
85
86
    /**
     * Look for an image advertised in the document head.
     *
     * Twitter tags are tried first (they tend to have the right size for us),
     * then Open Graph tags (which seem to be smaller), and finally
     * link[rel="image_src"] tags.
     *
     * @return Image|null
     */
    private function checkForMetaTag(): ?Image {
        // Each lookup is only evaluated when the previous one returned null.
        return $this->checkForTwitterTag()
            ?: $this->checkForOpenGraphTag()
            ?: $this->checkForLinkTag();
    }
113
114
    /**
     * Find the best "large" image in or near the given node.
     *
     * Although slow, the most reliable way to judge an image is to fetch it
     * and look at its real dimensions, so the work is phased:
     * 1. collect every <img> below the node
     * 2. drop images with known-bad file names (gifs, ads, etc.)
     * 3. drop images whose byte size is outside the configured limits
     * 4. score what is left by DOM position and relative area
     * 5. return the highest scoring image
     *
     * When the node yields no candidates the search moves on to the node
     * chosen by getDepthLevel() (previous sibling element, else the parent).
     *
     * @param Element $node
     * @param int $parentDepthLevel
     * @param int $siblingDepthLevel
     *
     * @return Image|null
     */
    private function checkForLargeImages(Element $node, int $parentDepthLevel, int $siblingDepthLevel): ?Image {
        $scoredLocalImages = $this->scoreLocalImages($this->getImageCandidates($node));

        if (empty($scoredLocalImages)) {
            $depthObj = $this->getDepthLevel($node, $parentDepthLevel, $siblingDepthLevel);

            if ($depthObj === null || !($depthObj->node instanceof Element)) {
                return null;
            }

            return $this->checkForLargeImages($depthObj->node, $depthObj->parentDepth, $depthObj->siblingDepth);
        }

        // Highest score wins; the strict ">" keeps the earliest image on a tie.
        $best = null;

        foreach ($scoredLocalImages as $candidate) {
            if ($best === null || $candidate['score'] > $best['score']) {
                $best = $candidate;
            }
        }

        $bestLocalImage = $best['image'];

        $mainImage = new Image();
        $mainImage->setImageSrc($bestLocalImage->getImgSrc());
        $mainImage->setImageExtractionType('bigimage');
        $mainImage->setConfidenceScore(100 / count($scoredLocalImages));
        $mainImage->setImageScore($best['score']);
        $mainImage->setBytes($bestLocalImage->getBytes());
        $mainImage->setHeight($bestLocalImage->getHeight());
        $mainImage->setWidth($bestLocalImage->getWidth());

        return $mainImage;
    }

    /**
     * Decide which node checkForLargeImages() should inspect next.
     *
     * The previous sibling element is preferred; when there is none the
     * parent is used and the parent depth is increased. The search stops
     * (null) when the node has no Element parent or the parent depth has
     * exceeded self::$MAX_PARENT_DEPTH.
     *
     * @param Element $node
     * @param int $parentDepth
     * @param int $siblingDepth
     *
     * @return object|null Object with `node`, `parentDepth` and `siblingDepth` properties
     */
    private function getDepthLevel(Element $node, int $parentDepth, int $siblingDepth): ?object {
        // $node is type-hinted as Element and can never be null, so only the
        // parent needs checking.
        $parent = $node->parent();

        if (!($parent instanceof Element)) {
            return null;
        }

        if ($parentDepth > self::$MAX_PARENT_DEPTH) {
            return null;
        }

        // Find previous sibling element node
        $siblingNode = $node->preceding(function($sibling) {
            return $sibling instanceof Element;
        });

        if (!($siblingNode instanceof Element)) {
            return (object)[
                'node' => $parent,
                'parentDepth' => $parentDepth + 1,
                'siblingDepth' => 0,
            ];
        }

        return (object)[
            'node' => $siblingNode,
            'parentDepth' => $parentDepth,
            'siblingDepth' => $siblingDepth + 1,
        ];
    }

    /**
     * Score locally downloaded images by DOM order and relative area.
     *
     * Images are scored in the order in which they appear, so images higher
     * up carry more weight. The first image with a usable area becomes the
     * baseline and scores 1; every later image scores
     * (1 / position) * (its area / baseline area), e.g. the 3rd image found
     * has a sequence score of 1 / 3 = .33 multiplied by its area relative to
     * the baseline. Images without a usable area score 0.
     *
     * Scores are returned as values rather than array keys: PHP truncates
     * float keys to integers, which would collapse fractional scores and let
     * equal scores overwrite each other.
     *
     * @param LocallyStoredImage[] $locallyStoredImages
     *
     * @return array[] List of ['score' => float, 'image' => LocallyStoredImage] in input order
     */
    private function scoreLocalImages($locallyStoredImages): array {
        $results = [];
        $position = 0;
        $initialArea = 0;

        // Limit to the first 30 images
        foreach (array_slice($locallyStoredImages, 0, 30) as $locallyStoredImage) {
            $position++;
            $area = $locallyStoredImage->getWidth() * $locallyStoredImage->getHeight();

            if ($area <= 0) {
                // Nothing to compare against; never prefer such an image.
                $totalScore = 0.0;
            } elseif ($initialArea == 0) {
                // NOTE(review): the 1.48 factor is carried over unchanged; its
                // rationale is not visible in this file.
                $initialArea = $area * 1.48;
                $totalScore = 1.0;
            } else {
                $totalScore = (1 / $position) * ($area / $initialArea);
            }

            $results[] = [
                'score' => $totalScore,
                'image' => $locallyStoredImage,
            ];
        }

        return $results;
    }
235
236
    /**
     * Build an Image for every <img> found below the article's top node.
     *
     * Each src is resolved to a full URL, handed to getLocallyStoredImages(),
     * and the returned local copies supply source, bytes and dimensions.
     * Every resulting Image is tagged with extraction type 'all' and a
     * confidence score of 0.
     *
     * @return Image[]
     */
    private function getAllImages(): array {
        $imageUrls = [];

        // Generate a complete URL for each image
        foreach ($this->article()->getTopNode()->find('img')->toArray() as $key => $imgNode) {
            $imageUrls[$key] = $this->buildImagePath($imgNode->attr('src'));
        }

        $toImage = function($localImage) {
            $image = new Image();
            $image->setImageSrc($localImage->getImgSrc());
            $image->setBytes($localImage->getBytes());
            $image->setHeight($localImage->getHeight());
            $image->setWidth($localImage->getWidth());
            $image->setImageExtractionType('all');
            $image->setConfidenceScore(0);

            return $image;
        };

        // array_values() yields a plain 0..n-1 list.
        return array_values(array_map($toImage, $this->getLocallyStoredImages($imageUrls)));
    }
265
266
    /**
     * Returns true if the dimensions look like a banner, i.e. the longer
     * side is more than 5 times the shorter side
     * (600 / 100 = 6 may be a fishy dimension for a good image).
     *
     * A zero-length side next to a positive one counts as a banner; this is
     * decided without dividing, so no division by zero can occur.
     *
     * @param int $width
     * @param int $height
     *
     * @return bool
     */
    private function isBannerDimensions(int $width, int $height): bool {
        $longSide = max($width, $height);
        $shortSide = min($width, $height);

        // Square images (including 0 x 0) are never banners.
        if ($longSide === $shortSide) {
            return false;
        }

        // Guard the division below: the ratio would be infinite here.
        if ($shortSide === 0) {
            return $longSide > 0;
        }

        return ($longSide / $shortSide) > 5;
    }
296
297
    /**
     * Split a list of image elements by file name: elements accepted by
     * isOkImageFileName() are returned, every other element has remove()
     * called on it.
     *
     * @param \DOMWrap\NodeList $images
     *
     * @return Element[]
     */
    private function filterBadNames(NodeList $images): array {
        $kept = [];

        foreach ($images as $candidate) {
            if (!$this->isOkImageFileName($candidate)) {
                $candidate->remove();

                continue;
            }

            $kept[] = $candidate;
        }

        return $kept;
    }
317
318
    /**
     * Check an image's src against the list of bad file name patterns
     * (buttons, ad servers, etc.).
     *
     * An image with an empty src is rejected outright.
     *
     * @param Element $imageNode
     *
     * @return bool False when src is empty or matches a bad pattern
     */
    private function isOkImageFileName(Element $imageNode): bool {
        $src = $imageNode->attr('src');

        if (empty($src)) {
            return false;
        }

        // One case-insensitive alternation of all bad-name fragments.
        $pattern = sprintf('@%s@i', implode('|', $this->badFileNames));

        return !preg_match($pattern, $src);
    }
340
341
    /**
     * Collect the <img> elements below a node, discard those with bad file
     * names, then keep only the ones that pass the byte size test.
     *
     * @param Element $node
     *
     * @return LocallyStoredImage[]
     */
    private function getImageCandidates(Element $node): array {
        return $this->findImagesThatPassByteSizeTest(
            $this->filterBadNames($node->find('img'))
        );
    }
353
354
    /**
     * Fetch the given images and keep the ones whose byte size makes them a
     * candidate.
     *
     * An image is rejected when its size is non-zero but below
     * 'image_min_bytes', or above 'image_max_bytes'. The <img> element that
     * belongs to a rejected image has remove() called on it.
     *
     * @todo Re-factor how the LocallyStoredImage => Image relation works ?
     *
     * @param Element[] $images
     *
     * @return LocallyStoredImage[] Accepted images, keyed as returned by getLocallyStoredImages()
     */
    private function findImagesThatPassByteSizeTest(array $images): array {
        // Limit to the first 30 images (array_slice also re-indexes from 0)
        $images = array_slice($images, 0, 30);

        // Generate a complete URL for each image
        $imageUrls = array_map(function($image) {
            return $this->buildImagePath($image->attr('src'));
        }, $images);

        $localImages = $this->getLocallyStoredImages($imageUrls, true);

        $minBytes = $this->config()->get('image_min_bytes');
        $maxBytes = $this->config()->get('image_max_bytes');

        $results = [];

        // Local images are paired with their <img> element by position.
        // NOTE(review): this assumes one local image per URL, in the same
        // order, when $returnAll is true - confirm against ImageUtils.
        // A plain counter is used because a closure that captures the index
        // by value would see index 0 on every call.
        $position = 0;

        foreach ($localImages as $key => $localImage) {
            $image = $images[$position] ?? null;
            $position++;

            $bytes = $localImage->getBytes();

            $tooSmall = $bytes != 0 && $bytes < $minBytes;
            $tooLarge = $bytes > $maxBytes;

            if ($tooSmall || $tooLarge) {
                if ($image !== null) {
                    $image->remove();
                }

                continue;
            }

            $results[$key] = $localImage;
        }

        return $results;
    }
390
391
    /**
     * Look for a feature image declared through a link[rel="image_src"] tag;
     * the image URL is read from its href attribute.
     *
     * @return Image|null
     */
    private function checkForLinkTag(): ?Image {
        $selector = 'link[rel="image_src"]';

        return $this->checkForTag($selector, 'href', 'linktag');
    }
399
400
    /**
     * Look for an Open Graph image: a meta tag whose property or name is
     * "og:image"; the image URL is read from its content attribute.
     *
     * @return Image|null
     */
    private function checkForOpenGraphTag(): ?Image {
        $selector = implode(',', [
            'meta[property="og:image"]',
            'meta[name="og:image"]',
        ]);

        return $this->checkForTag($selector, 'content', 'opengraph');
    }
408
409
    /**
     * Look for a Twitter card image: a meta tag whose property or name is
     * "twitter:image" or "twitter:image:src"; the image URL is read from its
     * content attribute.
     *
     * @return Image|null
     */
    private function checkForTwitterTag(): ?Image {
        $selector = implode(',', [
            'meta[property="twitter:image"]',
            'meta[name="twitter:image"]',
            'meta[property="twitter:image:src"]',
            'meta[name="twitter:image:src"]',
        ]);

        return $this->checkForTag($selector, 'content', 'twitter');
    }
417
418
    /**
     * Build an Image from the first element in the raw document that matches
     * a selector, reading the image URL from the given attribute.
     *
     * Returns null when there is no raw document, nothing matches, the
     * attribute is missing or blank, or the image does not meet the
     * configured minimum dimensions. Only the first match is considered.
     *
     * @param string $selector CSS selector to search the raw document with
     * @param string $attr Attribute holding the image URL
     * @param string $type Value stored as the image's extraction type
     *
     * @return Image|null
     */
    private function checkForTag(string $selector, string $attr, string $type): ?Image {
        $rawDoc = $this->article()->getRawDoc();

        // Same guard as checkForKnownElements(): without a document there is
        // nothing to query.
        if (!$rawDoc) {
            return null;
        }

        $meta = $rawDoc->find($selector);

        if (!$meta->count()) {
            return null;
        }

        $node = $meta->first();

        if (!($node instanceof Element) || !$node->hasAttribute($attr)) {
            return null;
        }

        // A blank value would otherwise be resolved to the article's own URL.
        $attrValue = trim((string)$node->attr($attr));

        if ($attrValue === '') {
            return null;
        }

        $mainImage = new Image();
        $mainImage->setImageSrc($this->buildImagePath($attrValue));
        $mainImage->setImageExtractionType($type);
        $mainImage->setConfidenceScore(100);

        $locallyStoredImage = $this->getLocallyStoredImage($mainImage->getImageSrc());

        // Size details are only known when a local copy could be obtained.
        if (!empty($locallyStoredImage)) {
            $mainImage->setBytes($locallyStoredImage->getBytes());
            $mainImage->setHeight($locallyStoredImage->getHeight());
            $mainImage->setWidth($locallyStoredImage->getWidth());
        }

        return $this->ensureMinimumImageSize($mainImage);
    }
458
459
    /**
     * Pass the image through only when both its width and its height reach
     * the configured 'image_min_width' / 'image_min_height'.
     *
     * @param Image $mainImage
     *
     * @return Image|null The same image, or null when it is too small
     */
    private function ensureMinimumImageSize(Image $mainImage): ?Image {
        $wideEnough = $mainImage->getWidth() >= $this->config()->get('image_min_width');

        // The height is only looked at once the width has passed.
        $bigEnough = $wideEnough
            && $mainImage->getHeight() >= $this->config()->get('image_min_height');

        return $bigEnough ? $mainImage : null;
    }
472
473
    /**
     * Single-image variant of getLocallyStoredImages(): hands one URL to
     * ImageUtils::storeImagesToLocalFile() and returns the first entry of
     * the result, or null when the result is empty.
     *
     * @param string $imageSrc
     * @param bool $returnAll
     *
     * @return LocallyStoredImage|null
     */
    private function getLocallyStoredImage(string $imageSrc, bool $returnAll = false): ?LocallyStoredImage {
        $stored = ImageUtils::storeImagesToLocalFile([$imageSrc], $returnAll, $this->config());

        // array_shift() yields null for an empty array.
        $first = array_shift($stored);

        return $first;
    }
484
485
    /**
     * Hand a list of image URLs to ImageUtils::storeImagesToLocalFile(),
     * together with this module's configuration, and return its result.
     *
     * @param string[] $imageSrcs
     * @param bool $returnAll Forwarded unchanged to ImageUtils
     *
     * @return LocallyStoredImage[]
     */
    private function getLocallyStoredImages($imageSrcs, bool $returnAll = false): array {
        $config = $this->config();

        return ImageUtils::storeImagesToLocalFile($imageSrcs, $returnAll, $config);
    }
494
495
    /**
     * Reduce the article's domain to its last two dot-separated labels,
     * e.g. "news.example.com" becomes "example.com" (and "example.co.uk"
     * becomes "co.uk").
     *
     * @return string
     */
    private function getCleanDomain(): string {
        $labels = explode('.', $this->article()->getDomain());
        $lastTwo = array_slice($labels, -2, 2);

        return implode('.', $lastTwo);
    }
501
502
    /**
     * Check for known image containers from sites we've looked at (yahoo,
     * techcrunch, etc.) that have known places to look for good images.
     *
     * The built-in names plus any names mapped to the article's domain are
     * each looked up as an element id and, failing that, as a class name.
     * The loop does not stop at the first hit, so when several names match
     * the image from the last matching name is used.
     *
     * @todo enable this to use a series of settings files so people can define what the image ids/classes are on specific sites
     *
     * @return Image|null
     */
    private function checkForKnownElements(): ?Image {
        $rawDoc = $this->article()->getRawDoc();

        if (!$rawDoc) {
            return null;
        }

        $candidateNames = self::$KNOWN_IMG_DOM_NAMES;
        $domain = $this->getCleanDomain();
        $siteMapping = $this->customSiteMapping();

        // Site-specific names are "|"-separated and appended to the defaults.
        if (isset($siteMapping[$domain])) {
            $candidateNames = array_merge($candidateNames, explode('|', $siteMapping[$domain]));
        }

        $knownImage = null;

        foreach ($candidateNames as $name) {
            $container = $rawDoc->find('#' . $name);

            if (!$container->count()) {
                $container = $rawDoc->find('.' . $name);
            }

            if (!$container->count()) {
                continue;
            }

            $containedImages = $container->first()->find('img');

            if ($containedImages->count()) {
                $knownImage = $containedImages->first();
            }
        }

        if ($knownImage === null) {
            return null;
        }

        $mainImage = new Image();
        $mainImage->setImageSrc($this->buildImagePath($knownImage->attr('src')));
        $mainImage->setImageExtractionType('known');
        $mainImage->setConfidenceScore(90);

        $localCopy = $this->getLocallyStoredImage($mainImage->getImageSrc());

        // Size details are only known when a local copy could be obtained.
        if (!empty($localCopy)) {
            $mainImage->setBytes($localCopy->getBytes());
            $mainImage->setHeight($localCopy->getHeight());
            $mainImage->setWidth($localCopy->getWidth());
        }

        return $this->ensureMinimumImageSize($mainImage);
    }
566
567
    /**
     * Take an image path and build the absolute URL of that image, using the
     * final URL of the crawled article as the base, so images referenced by
     * relative URLs like ../myimage.jpg can still be located.
     *
     * Resolution rules:
     * - a relative path is appended to the article path up to and including
     *   its last "/" (so "/blog/post/" keeps "post", "/post.html" yields "/")
     * - URL components are inherited from the article only until the first
     *   component the image URL defines itself; an image with its own host
     *   therefore never receives the article's port, and an image with its
     *   own path never receives the article's query string
     *
     * NOTE(review): "." and ".." segments are not collapsed here.
     *
     * @param string $imageSrc
     *
     * @return string
     */
    private function buildImagePath(string $imageSrc): string {
        // Ordered from most to least significant component.
        $parts = [
            'scheme',
            'host',
            'port',
            'path',
            'query',
        ];

        $imageUrlParts = parse_url($imageSrc);
        $articleUrlParts = parse_url($this->article()->getFinalUrl());

        // parse_url() returns false for seriously malformed URLs.
        if (!is_array($imageUrlParts)) {
            $imageUrlParts = [];
        }

        if (!is_array($articleUrlParts)) {
            $articleUrlParts = [];
        }

        if (isset($imageUrlParts['path'], $articleUrlParts['path'])
          && $imageUrlParts['path'] !== ''
          && $imageUrlParts['path'][0] !== '/') {
            // Keep the article path up to (not including) its last slash.
            $lastSlash = strrpos($articleUrlParts['path'], '/');
            $articleUrlDir = $lastSlash === false ? '' : substr($articleUrlParts['path'], 0, $lastSlash);

            $imageUrlParts['path'] = $articleUrlDir . '/' . $imageUrlParts['path'];
        }

        foreach ($parts as $part) {
            // From the first component the image defines, the rest is its own.
            if (isset($imageUrlParts[$part])) {
                break;
            }

            if (isset($articleUrlParts[$part])) {
                $imageUrlParts[$part] = $articleUrlParts[$part];
            }
        }

        return http_build_url($imageUrlParts, []);
    }
602
603
    /**
     * Return the per-domain list of known image ids / class names.
     *
     * The mapping is read from resources/images/known-image-css.txt the
     * first time it is needed and kept in self::$CUSTOM_SITE_MAPPING. Each
     * line has the form "domain^names"; blank lines and lines without a "^"
     * separator are skipped. When the file cannot be read, PHP's own warning
     * is emitted and an empty mapping is returned.
     *
     * @return string[] Names string (as stored in the file) keyed by domain
     */
    private function customSiteMapping(): array {
        if (!empty(self::$CUSTOM_SITE_MAPPING)) {
            return self::$CUSTOM_SITE_MAPPING;
        }

        $file = __DIR__ . '/../../../resources/images/known-image-css.txt';

        $contents = file_get_contents($file);

        // Degrade to "no custom mapping" instead of failing on a type error.
        if ($contents === false) {
            return [];
        }

        // Normalise Windows and old Mac line endings before splitting.
        $lines = explode("\n", str_replace(["\r\n", "\r"], "\n", $contents));

        foreach ($lines as $line) {
            $fields = explode('^', $line);

            // Skips blank lines (e.g. a trailing newline) and malformed rows.
            if (count($fields) < 2 || $fields[0] === '') {
                continue;
            }

            self::$CUSTOM_SITE_MAPPING[$fields[0]] = $fields[1];
        }

        return self::$CUSTOM_SITE_MAPPING;
    }
623
624
}
625