Completed
Push — master ( 84b1b9...90721d )
by Andrew
02:11
created

ImageExtractor::buildImagePath()   D

Complexity

Conditions 9
Paths 8

Size

Total Lines 27
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 0 Features 1
Metric Value
c 3
b 0
f 1
dl 0
loc 27
rs 4.909
cc 9
eloc 18
nc 8
nop 1
1
<?php
2
3
namespace Goose\Modules\Extractors;
4
5
use Goose\Article;
6
use Goose\Images\Image;
7
use Goose\Images\ImageUtils;
8
use Goose\Images\LocallyStoredImage;
9
use Goose\Traits\ArticleMutatorTrait;
10
use Goose\Modules\AbstractModule;
11
use Goose\Modules\ModuleInterface;
12
use DOMWrap\Element;
13
use DOMWrap\NodeList;
14
15
/**
16
 * Image Extractor
17
 *
18
 * @package Goose\Modules\Extractors
19
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
20
 */
21
class ImageExtractor extends AbstractModule implements ModuleInterface {
22
    use ArticleMutatorTrait;
23
24
    /**
     * Regular-expression fragments describing image URLs that should be ignored.
     * They are joined with '|' and matched case-insensitively against the src attribute
     * in isOkImageFileName().
     *
     * @var string[]
     */
    private $badFileNames = [
        '\.html',
        '\.gif',
        '\.ico',
        'button',
        'twitter\.jpg',
        'facebook\.jpg',
        'ap_buy_photo',
        'digg\.jpg',
        'digg\.png',
        'delicious\.png',
        'facebook\.png',
        'reddit\.jpg',
        'doubleclick',
        'diggthis',
        'diggThis',
        'adserver',
        '\/ads\/',
        'ec\.atdmt\.com',
        'mediaplex\.com',
        'adsatt',
        'view\.atdmt',
    ];

    /**
     * Element ids / class names that checkForKnownElements() searches for an image.
     *
     * @var string[]
     */
    private static $KNOWN_IMG_DOM_NAMES = [
        'yn-story-related-media',
        'cnn_strylccimg300cntr',
        'big_photo',
        'ap-smallphoto-a'
    ];

    /**
     * Parent depth beyond which getDepthLevel() stops widening the image search.
     *
     * @var int
     */
    private static $MAX_PARENT_DEPTH = 2;

    /**
     * Cache filled by customSiteMapping(): domain => CSS names.
     *
     * @var string[]
     */
    private static $CUSTOM_SITE_MAPPING = [];
46
47
    /**
     * Module entry point: extract images for the given article.
     *
     * The article is registered on this module first. Nothing else happens unless the
     * 'image_fetch_best' option is enabled; in that case the best image becomes the
     * article's top image. The complete image list is additionally collected when
     * 'image_fetch_all' is enabled and the article's top node is an Element.
     *
     * @param Article $article
     */
    public function run(Article $article) {
        $this->article($article);

        if (!$this->config()->get('image_fetch_best')) {
            return;
        }

        $article->setTopImage($this->getBestImage());

        if (!$this->config()->get('image_fetch_all')) {
            return;
        }

        if ($article->getTopNode() instanceof Element) {
            $article->setAllImages($this->getAllImages());
        }
    }
62
63
    /**
     * Pick the best image for the article.
     *
     * Sources are tried in order and the first hit wins: known site-specific elements,
     * then meta/link tags, then a scored search around the article's top node (only when
     * that node is an Element).
     *
     * @return Image|null
     */
    private function getBestImage() {
        $fromMarkup = $this->checkForKnownElements() ?: $this->checkForMetaTag();

        if ($fromMarkup) {
            return $fromMarkup;
        }

        if (!($this->article()->getTopNode() instanceof Element)) {
            return null;
        }

        // Depth counters start at zero: the search begins at the top node itself
        return $this->checkForLargeImages($this->article()->getTopNode(), 0, 0) ?: null;
    }
89
90
    /**
     * Look for an image advertised in the document's meta data.
     *
     * Twitter images are preferred (they tend to have the right size for us), then Open Graph
     * images (which seem to be smaller), and finally linked images.
     *
     * @return Image|null
     */
    private function checkForMetaTag() {
        // Each lookup only runs when the previous one produced nothing
        return $this->checkForTwitterTag()
            ?: $this->checkForOpenGraphTag()
            ?: $this->checkForLinkTag()
            ?: null;
    }
117
118
    /**
     * Find the best-scoring image in or near a node.
     *
     * Phases:
     * 1. collect ALL images below the node
     * 2. filter out any bad image names that we know of (gifs, ads, etc.)
     * 3. drop images whose byte size is outside the configured bounds
     * 4. score the remaining images (see scoreLocalImages()) and keep the HIGHEST score
     *
     * When the node yields no candidates at all, the search continues recursively at the node
     * suggested by getDepthLevel() until that method gives up.
     *
     * @param Element $node
     * @param int $parentDepthLevel
     * @param int $siblingDepthLevel
     *
     * @return Image|null
     */
    private function checkForLargeImages(Element $node, $parentDepthLevel, $siblingDepthLevel) {
        $goodLocalImages = $this->getImageCandidates($node);

        $scoredLocalImages = $this->scoreLocalImages($goodLocalImages, $parentDepthLevel);

        if (empty($scoredLocalImages)) {
            $depthObj = $this->getDepthLevel($node, $parentDepthLevel, $siblingDepthLevel);

            if ($depthObj && null !== $depthObj->node) {
                return $this->checkForLargeImages($depthObj->node, $depthObj->parentDepth, $depthObj->siblingDepth);
            }

            return null;
        }

        // Keep the entry with the highest score; the strict comparison makes the
        // earliest image win when two scores are equal.
        $best = null;

        foreach ($scoredLocalImages as $candidate) {
            if ($best === null || $candidate->score > $best->score) {
                $best = $candidate;
            }
        }

        $mainImage = new Image();
        $mainImage->setImageSrc($best->image->getImgSrc());
        $mainImage->setImageExtractionType('bigimage');
        $mainImage->setConfidenceScore(100 / count($scoredLocalImages));
        $mainImage->setImageScore($best->score);
        $mainImage->setBytes($best->image->getBytes());
        $mainImage->setHeight($best->image->getHeight());
        $mainImage->setWidth($best->image->getWidth());

        return $mainImage;
    }

    /**
     * Decide where checkForLargeImages() should look next.
     *
     * The previous sibling element is preferred; when there is none, the parent is used and
     * the parent depth is increased. Returns null when the node has no Element parent or the
     * parent depth already exceeds self::$MAX_PARENT_DEPTH.
     *
     * @param Element $node
     * @param int $parentDepth
     * @param int $siblingDepth
     *
     * @return object|null Object with the properties node, parentDepth and siblingDepth
     */
    private function getDepthLevel(Element $node, $parentDepth, $siblingDepth) {
        if (!($node->parent() instanceof Element)) {
            return null;
        }

        if ($parentDepth > self::$MAX_PARENT_DEPTH) {
            return null;
        }

        // Find previous sibling element node
        $siblingNode = $node->preceding(function($node) {
            return $node instanceof Element;
        });

        if (is_null($siblingNode)) {
            // No earlier sibling: climb one level and restart the sibling counter
            return (object)[
                'node' => $node->parent(),
                'parentDepth' => $parentDepth + 1,
                'siblingDepth' => 0,
            ];
        }

        return (object)[
            'node' => $siblingNode,
            'parentDepth' => $parentDepth,
            'siblingDepth' => $siblingDepth + 1,
        ];
    }

    /**
     * Score locally downloaded images.
     *
     * Images are scored in the order in which they appear, so images higher up have more
     * importance. The first image with a non-zero area gets a score of 1 and serves as the
     * reference; every later image is scored as
     *
     *     (1 / position) * (area / (reference area * 1.48))
     *
     * so the 3rd image found has a sequence score of 1 / 3 = .33, multiplied by its area
     * relative to the reference. (The 1.48 factor is taken over unchanged; its origin is not
     * documented in this file.)
     *
     * Scores are returned as object properties rather than array keys: PHP truncates float
     * array keys to integers, which would make all scores below 1 collide on key 0.
     *
     * @param LocallyStoredImage[] $locallyStoredImages
     * @param int $depthLevel Currently unused; kept so the signature stays stable
     *
     * @return object[] List of objects with the properties score (int|float) and
     *                  image (LocallyStoredImage), in document order
     */
    private function scoreLocalImages($locallyStoredImages, $depthLevel) {
        $results = [];
        $i = 1;
        $initialArea = 0;

        // Limit to the first 30 images
        $locallyStoredImages = array_slice($locallyStoredImages, 0, 30);

        foreach ($locallyStoredImages as $locallyStoredImage) {
            $sequenceScore = 1 / $i;
            $area = $locallyStoredImage->getWidth() * $locallyStoredImage->getHeight();

            if ($initialArea == 0) {
                // No usable reference yet: this image becomes the reference (score 1).
                // While the area is 0 the next image is treated the same way, so the
                // division below never sees a zero denominator.
                $initialArea = $area * 1.48;
                $totalScore = 1;
            } else {
                $areaDifference = $area / $initialArea;
                $totalScore = $sequenceScore * $areaDifference;
            }

            $i++;

            $results[] = (object)[
                'score' => $totalScore,
                'image' => $locallyStoredImage,
            ];
        }

        return $results;
    }
240
241
    /**
     * Decide whether a downloaded image is acceptable as a candidate.
     *
     * An image is rejected when any of the following holds (checked in this order):
     * its width or height is not above the configured minimum, its file extension is 'NA',
     * it is narrower than 300 at depth level 0, the depth level is 1 or more, or it has
     * banner-like proportions.
     *
     * Note: this method is currently not called from anywhere in this class.
     *
     * @param LocallyStoredImage $locallyStoredImage
     * @param int $depthLevel
     *
     * @return bool
     */
    private function isWorthyImage($locallyStoredImage, $depthLevel) {
        if ($locallyStoredImage->getWidth() <= $this->config()->get('image_min_width')) {
            return false;
        }

        if ($locallyStoredImage->getHeight() <= $this->config()->get('image_min_height')) {
            return false;
        }

        if ($locallyStoredImage->getFileExtension() == 'NA') {
            return false;
        }

        if ($depthLevel < 1 && $locallyStoredImage->getWidth() < 300) {
            return false;
        }

        // Every image found at depth level 1 or deeper is rejected outright
        if ($depthLevel >= 1) {
            return false;
        }

        if ($this->isBannerDimensions($locallyStoredImage->getWidth(), $locallyStoredImage->getHeight())) {
            return false;
        }

        return true;
    }
258
259
    /**
     * Build an Image for every <img> found below the article's top node.
     *
     * Each src is resolved to a complete URL, handed to getLocallyStoredImages(), and the
     * returned entries are converted to Image objects of extraction type 'all' with a
     * confidence score of 0.
     *
     * @return Image[]
     */
    private function getAllImages() {
        $imgNodes = $this->article()->getTopNode()->find('img');

        // Generate a complete URL for each image
        $absoluteUrls = [];

        foreach ($imgNodes->toArray() as $position => $imgNode) {
            $absoluteUrls[$position] = $this->buildImagePath($imgNode->attr('src'));
        }

        $collected = [];

        foreach ($this->getLocallyStoredImages($absoluteUrls) as $stored) {
            $entry = new Image();
            $entry->setImageSrc($stored->getImgSrc());
            $entry->setBytes($stored->getBytes());
            $entry->setHeight($stored->getHeight());
            $entry->setWidth($stored->getWidth());
            $entry->setImageExtractionType('all');
            $entry->setConfidenceScore(0);

            $collected[] = $entry;
        }

        return $collected;
    }
288
289
    /**
     * Returns true if we think this is kind of a bannery dimension,
     * like 600 / 100 = 6 may be a fishy dimension for a good image.
     *
     * The longer side is divided by the shorter side; a ratio above 5 counts as a banner.
     * Equal sides are never a banner. When exactly one side is 0 the image is treated as a
     * banner instead of dividing by zero.
     *
     * @param int $width
     * @param int $height
     *
     * @return bool
     */
    private function isBannerDimensions($width, $height) {
        if ($width == $height) {
            return false;
        }

        $longer = max($width, $height);
        $shorter = min($width, $height);

        // A zero-sized side would make the ratio undefined (DivisionByZeroError on PHP 8)
        if ($shorter == 0) {
            return true;
        }

        return $longer / $shorter > 5;
    }
317
318
    /**
     * Split a list of image elements by file name: elements accepted by isOkImageFileName()
     * are returned, every other element has remove() called on it.
     *
     * @param \DOMWrap\NodeList $images
     *
     * @return Element[]
     */
    private function filterBadNames(NodeList $images) {
        $kept = [];

        foreach ($images as $candidate) {
            if (!$this->isOkImageFileName($candidate)) {
                $candidate->remove();

                continue;
            }

            $kept[] = $candidate;
        }

        return $kept;
    }
338
339
    /**
     * Check an image's src against the list of bad image files we know of (buttons, ad
     * servers, etc.). An image without a src is never ok.
     *
     * @param Element $imageNode
     *
     * @return bool
     */
    private function isOkImageFileName(Element $imageNode) {
        $src = $imageNode->attr('src');

        if (empty($src)) {
            return false;
        }

        // One case-insensitive alternation over all known bad fragments
        $pattern = '@' . implode('|', $this->badFileNames) . '@i';

        return !preg_match($pattern, $src);
    }
361
362
    /**
     * Collect the candidate images below a node: every <img> that survives the file name
     * filter and then the byte size test.
     *
     * @param Element $node
     *
     * @return LocallyStoredImage[]
     */
    private function getImageCandidates(Element $node) {
        $wellNamed = $this->filterBadNames($node->find('img'));

        return $this->findImagesThatPassByteSizeTest($wellNamed);
    }
374
375
    /**
     * Loop through the images and keep the ones whose byte size makes them a candidate.
     *
     * Only the first 30 images are considered. An image is rejected when its size is non-zero
     * but below 'image_min_bytes', or above 'image_max_bytes'; the <img> element belonging to
     * a rejected image has remove() called on it. A size of exactly 0 is not rejected by the
     * minimum check.
     *
     * @todo Re-factor how the LocallyStoredImage => Image relation works ?
     *
     * @param Element[] $images
     *
     * @return LocallyStoredImage[] Accepted images, keyed as returned by getLocallyStoredImages()
     */
    private function findImagesThatPassByteSizeTest($images) {
        // Limit to the first 30 images (this also re-indexes the list from 0)
        $images = array_slice($images, 0, 30);

        // Generate a complete URL for each image
        $imageUrls = array_map(function($image) {
            return $this->buildImagePath($image->attr('src'));
        }, $images);

        $localImages = $this->getLocallyStoredImages($imageUrls, true);

        $minBytes = $this->config()->get('image_min_bytes');
        $maxBytes = $this->config()->get('image_max_bytes');

        $results = [];
        $position = 0;

        foreach ($localImages as $key => $localImage) {
            // NOTE(review): local images are matched to their elements by position, which
            // assumes one result per URL, in URL order — confirm against ImageUtils.
            $image = isset($images[$position]) ? $images[$position] : null;
            $position++;

            $bytes = $localImage->getBytes();

            $tooSmall = $bytes != 0 && $bytes < $minBytes;
            $tooLarge = $bytes > $maxBytes;

            if ($tooSmall || $tooLarge) {
                if ($image !== null) {
                    $image->remove();
                }

                continue;
            }

            $results[$key] = $localImage;
        }

        return $results;
    }
411
412
    /**
     * Look for a <link rel="image_src"> tag and use its href as the image.
     *
     * @return Image|null
     */
    private function checkForLinkTag() {
        $selector = 'link[rel="image_src"]';

        return $this->checkForTag($selector, 'href', 'linktag');
    }
420
421
    /**
     * Look for an Open Graph image meta tag (og:image given as either property or name)
     * and use its content as the image.
     *
     * @return Image|null
     */
    private function checkForOpenGraphTag() {
        $selector = implode(',', [
            'meta[property="og:image"]',
            'meta[name="og:image"]',
        ]);

        return $this->checkForTag($selector, 'content', 'opengraph');
    }
429
430
    /**
     * Look for a Twitter image meta tag (twitter:image or twitter:image:src, given as either
     * property or name) and use its content as the image.
     *
     * @return Image|null
     */
    private function checkForTwitterTag() {
        $selector = implode(',', [
            'meta[property="twitter:image"]',
            'meta[name="twitter:image"]',
            'meta[property="twitter:image:src"]',
            'meta[name="twitter:image:src"]',
        ]);

        return $this->checkForTag($selector, 'content', 'twitter');
    }
438
439
    /**
     * Build an Image from the first element in the raw document matching a selector.
     *
     * Returns null when nothing matches, when the first match is not an Element, when the
     * attribute is missing, empty or not a string, or when the resulting image does not meet
     * the configured minimum dimensions.
     *
     * @param string $selector CSS selector to look up in the raw document
     * @param string $attr Name of the attribute holding the image URL
     * @param string $type Extraction type recorded on the resulting Image
     *
     * @return Image|null
     */
    private function checkForTag($selector, $attr, $type) {
        $meta = $this->article()->getRawDoc()->find($selector);

        if (!$meta->count()) {
            return null;
        }

        $node = $meta->first();

        if (!($node instanceof Element)) {
            return null;
        }

        if (!$node->hasAttribute($attr)) {
            return null;
        }

        $attrValue = $node->attr($attr);

        // buildImagePath() needs a string, and an empty value would resolve to the article URL
        // itself, so there is nothing useful to fetch in either case.
        if (!is_string($attrValue) || trim($attrValue) === '') {
            return null;
        }

        $imagePath = $this->buildImagePath($attrValue);
        $mainImage = new Image();
        $mainImage->setImageSrc($imagePath);
        $mainImage->setImageExtractionType($type);
        $mainImage->setConfidenceScore(100);

        $locallyStoredImage = $this->getLocallyStoredImage($mainImage->getImageSrc());

        // Size information is only available when the image could be stored locally
        if (!empty($locallyStoredImage)) {
            $mainImage->setBytes($locallyStoredImage->getBytes());
            $mainImage->setHeight($locallyStoredImage->getHeight());
            $mainImage->setWidth($locallyStoredImage->getWidth());
        }

        return $this->ensureMinimumImageSize($mainImage);
    }
479
480
    /**
     * Pass an image through only when both its width and its height reach the configured
     * 'image_min_width' / 'image_min_height'; otherwise return null.
     *
     * @param Image $mainImage
     *
     * @return Image|null
     */
    private function ensureMinimumImageSize(Image $mainImage) {
        $wideEnough = $mainImage->getWidth() >= $this->config()->get('image_min_width');

        // The height is only looked at once the width has been accepted
        $largeEnough = $wideEnough
            && $mainImage->getHeight() >= $this->config()->get('image_min_height');

        return $largeEnough ? $mainImage : null;
    }
493
494
    /**
     * Store a single image locally and return the first resulting entry, or null when the
     * storage call produced none.
     *
     * @param string $imageSrc
     * @param bool $returnAll Passed through unchanged to ImageUtils::storeImagesToLocalFile()
     *
     * @return LocallyStoredImage|null
     */
    private function getLocallyStoredImage($imageSrc, $returnAll = false) {
        $stored = $this->getLocallyStoredImages([$imageSrc], $returnAll);

        return array_shift($stored);
    }
505
506
    /**
     * Store a list of images locally via ImageUtils::storeImagesToLocalFile(), using this
     * module's configuration.
     *
     * @param string[] $imageSrcs
     * @param bool $returnAll Passed through unchanged to ImageUtils::storeImagesToLocalFile()
     *
     * @return LocallyStoredImage[]
     */
    private function getLocallyStoredImages($imageSrcs, $returnAll = false) {
        $config = $this->config();

        return ImageUtils::storeImagesToLocalFile($imageSrcs, $returnAll, $config);
    }
515
516
    /**
     * Reduce the article's domain to its last two dot-separated labels
     * (e.g. "www.example.com" becomes "example.com").
     *
     * @return string
     */
    private function getCleanDomain() {
        $labels = explode('.', $this->article()->getDomain());
        $lastTwo = array_slice($labels, -2, 2);

        return implode('.', $lastTwo);
    }
522
523
    /**
     * Check known image containers: element ids / class names that are known places to look
     * for good images, plus any names configured for the article's domain in the custom
     * site mapping.
     *
     * Every name is looked up as an id first and as a class second. All names are tried; when
     * several containers hold an image, the one found last is used.
     *
     * @todo enable this to use a series of settings files so people can define what the image ids/classes are on specific sites
     *
     * @return Image|null Null when there is no raw document, no known container holds an image
     *                    with a usable src, or the image is below the configured minimum size
     */
    private function checkForKnownElements() {
        $rawDoc = $this->article()->getRawDoc();

        if (!$rawDoc) {
            return null;
        }

        $knownImgDomNames = self::$KNOWN_IMG_DOM_NAMES;

        $domain = $this->getCleanDomain();

        $customSiteMapping = $this->customSiteMapping();

        if (isset($customSiteMapping[$domain])) {
            // Several names for one domain are separated by '|'
            $knownImgDomNames = array_merge($knownImgDomNames, explode('|', $customSiteMapping[$domain]));
        }

        $knownImage = null;

        foreach ($knownImgDomNames as $knownName) {
            // An empty name would turn into the bare selectors '#' and '.'
            if ($knownName === '') {
                continue;
            }

            $known = $rawDoc->find('#' . $knownName);

            if (!$known->count()) {
                $known = $rawDoc->find('.' . $knownName);
            }

            if ($known->count()) {
                $mainImage = $known->first()->find('img');

                if ($mainImage->count()) {
                    $knownImage = $mainImage->first();
                }
            }
        }

        if (is_null($knownImage)) {
            return null;
        }

        $knownImgSrc = $knownImage->attr('src');

        // Without a src there is nothing to resolve or download
        if (!is_string($knownImgSrc) || trim($knownImgSrc) === '') {
            return null;
        }

        $mainImage = new Image();
        $mainImage->setImageSrc($this->buildImagePath($knownImgSrc));
        $mainImage->setImageExtractionType('known');
        $mainImage->setConfidenceScore(90);

        $locallyStoredImage = $this->getLocallyStoredImage($mainImage->getImageSrc());

        // Size information is only available when the image could be stored locally
        if (!empty($locallyStoredImage)) {
            $mainImage->setBytes($locallyStoredImage->getBytes());
            $mainImage->setHeight($locallyStoredImage->getHeight());
            $mainImage->setWidth($locallyStoredImage->getWidth());
        }

        return $this->ensureMinimumImageSize($mainImage);
    }
587
588
    /**
     * Take an image path and build out the absolute URL to that image, using the final URL
     * of the crawled article as the base, so images referenced with relative URLs can be
     * located.
     *
     * A relative image path (one not starting with '/') is appended to the directory of the
     * article path, i.e. everything up to and including its last '/'. Afterwards any of
     * scheme, host, port, path and query that the image URL lacks is copied from the article
     * URL, stopping at the first part that the image URL has but the article URL lacks.
     *
     * @param string $imageSrc
     *
     * @return string
     */
    private function buildImagePath($imageSrc) {
        $parts = [
            'scheme',
            'host',
            'port',
            'path',
            'query',
        ];

        // parse_url() returns false for seriously malformed URLs; treat that as "no parts"
        $imageUrlParts = parse_url((string)$imageSrc) ?: [];
        $articleUrlParts = parse_url((string)$this->article()->getFinalUrl()) ?: [];

        if (isset($imageUrlParts['path'], $articleUrlParts['path'])
          && $imageUrlParts['path'] !== ''
          && $imageUrlParts['path'][0] !== '/') {
            // Keep the article path up to and including its last '/'. Unlike dirname() this
            // gives '/' for '/article' (no doubled slash) and '/news/' for '/news/'.
            $lastSlash = strrpos($articleUrlParts['path'], '/');
            $articleUrlDir = $lastSlash === false
                ? '/'
                : substr($articleUrlParts['path'], 0, $lastSlash + 1);

            $imageUrlParts['path'] = $articleUrlDir . $imageUrlParts['path'];
        }

        // NOTE(review): this also copies the article's query string onto image URLs that
        // have none of their own — confirm that this is intended.
        foreach ($parts as $part) {
            if (!isset($imageUrlParts[$part]) && isset($articleUrlParts[$part])) {
                $imageUrlParts[$part] = $articleUrlParts[$part];

            } else if (isset($imageUrlParts[$part]) && !isset($articleUrlParts[$part])) {
                break;
            }
        }

        return http_build_url($imageUrlParts, []);
    }
623
624
    /**
     * Load (once) and return the custom site mapping from
     * resources/images/known-image-css.txt.
     *
     * Each line of the file has the form "domain^css"; the second field is split on '|' by
     * checkForKnownElements(). Blank lines and lines without a '^' are skipped. When the
     * file cannot be read the mapping stays empty.
     *
     * @return string[] Map of domain => CSS names
     */
    private function customSiteMapping() {
        // Remembers a completed load attempt so an empty or unreadable file is not re-read
        static $loaded = false;

        if ($loaded || !empty(self::$CUSTOM_SITE_MAPPING)) {
            return self::$CUSTOM_SITE_MAPPING;
        }

        $loaded = true;

        $file = __DIR__ . '/../../../resources/images/known-image-css.txt';

        $contents = is_readable($file) ? file_get_contents($file) : false;

        if ($contents === false) {
            return self::$CUSTOM_SITE_MAPPING;
        }

        // Normalise Windows / old Mac line endings before splitting into lines
        $lines = explode("\n", str_replace(["\r\n", "\r"], "\n", $contents));

        foreach ($lines as $line) {
            $fields = explode('^', $line);

            if (count($fields) < 2) {
                continue;
            }

            list($domain, $css) = $fields;

            self::$CUSTOM_SITE_MAPPING[$domain] = $css;
        }

        return self::$CUSTOM_SITE_MAPPING;
    }
642
643
}
644