Completed
Push — master ( 9c6482...84b1b9 )
by Andrew
01:51
created

AdditionalDataExtractor::getVideos()   C

Complexity

Conditions 7
Paths 2

Size

Total Lines 30
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 30
rs 6.7272
c 0
b 0
f 0
cc 7
eloc 17
nc 2
nop 0
1
<?php
2
3
namespace Goose\Modules\Extractors;
4
5
use Goose\Article;
6
use Goose\Utils\Helper;
7
use Goose\Traits\ArticleMutatorTrait;
8
use Goose\Modules\AbstractModule;
9
use Goose\Modules\ModuleInterface;
10
use DOMWrap\Element;
11
12
/**
 * Additional Data Extractor
 *
 * Populates an Article with secondary data: tags, embedded videos from known
 * providers, outbound links and the most frequent non-stop-words.
 *
 * @package Goose\Modules\Extractors
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class AdditionalDataExtractor extends AbstractModule implements ModuleInterface {
    use ArticleMutatorTrait;

    /** @var string CSS selector matching anchors that look like tag links */
    protected static $A_REL_TAG_SELECTOR = "a[rel='tag'], a[href*='/tag/']";

    /** @var string[] Regex fragments (already escaped) for accepted video host names */
    protected static $VIDEO_PROVIDERS = [
        'youtube\.com',
        'youtu\.be',
        'vimeo\.com',
        'blip\.tv',
        'dailymotion\.com',
        'dai\.ly',
        'flickr\.com',
        'flic\.kr',
    ];

    /**
     * Extracts tags and, when a top node was found, videos, links and popular words.
     *
     * @param Article $article Article to read from and write the results to
     */
    public function run(Article $article) {
        $this->article($article);

        $article->setTags($this->getTags());

        // The remaining extractors all depend on the top node being a real element.
        if (!$this->article()->getTopNode() instanceof Element) {
            return;
        }

        $article->setVideos($this->getVideos());
        $article->setLinks($this->getLinks());
        $article->setPopularWords($this->getPopularWords());
    }

    /**
     * Collects the normalised text of every tag-like anchor in the document.
     *
     * @return string[]
     */
    private function getTags() {
        $tags = [];

        foreach ($this->article()->getDoc()->find(self::$A_REL_TAG_SELECTOR) as $anchor) {
            $tags[] = Helper::textNormalise($anchor->text());
        }

        return $tags;
    }

    /**
     * Pulls out videos we like
     *
     * Looks at embed/object/iframe elements below the parent of the top node and
     * keeps the source URL of those hosted by one of the known video providers.
     *
     * @return string[] Source URLs of the accepted video elements
     */
    private function getVideos() {
        $videos = [];

        $parentNode = $this->article()->getTopNode()->parent();

        if (!$parentNode instanceof Element) {
            return $videos;
        }

        foreach ($parentNode->find('embed, object, iframe') as $node) {
            // <object> elements carry their URL in "data" rather than "src".
            $src = $node->hasAttribute('src') ? $node->attr('src') : $node->attr('data');

            if ($this->isVideoProviderUrl($src)) {
                $videos[] = $src;
            }
        }

        return $videos;
    }

    /**
     * Tells whether a URL is an http(s) URL hosted by a known video provider.
     *
     * @param mixed $url Candidate URL; anything that is not a non-empty string is rejected
     *
     * @return bool
     */
    private function isVideoProviderUrl($url) {
        if (!is_string($url) || $url === '') {
            return false;
        }

        // Parse once per URL instead of once per provider.
        $scheme = parse_url($url, PHP_URL_SCHEME);
        $host = parse_url($url, PHP_URL_HOST);

        // parse_url() yields null for a missing component and false for a malformed URL.
        if (!is_string($scheme) || !is_string($host)) {
            return false;
        }

        // URL schemes are case-insensitive, so normalise before comparing.
        if (!in_array(strtolower($scheme), ['http', 'https'], true)) {
            return false;
        }

        foreach (self::$VIDEO_PROVIDERS as $domain) {
            // Require a label boundary so "notyoutube.com" does not pass as "youtube.com".
            if (preg_match('@(^|\.)' . $domain . '$@i', $host) === 1) {
                return true;
            }
        }

        return false;
    }

    /**
     * Pulls out links we like
     *
     * Collects every anchor below the parent of the top node whose href is
     * neither blank nor a bare "#".
     *
     * @return array[] List of ['url' => string, 'text' => string] entries
     */
    private function getLinks() {
        $goodLinks = [];

        $parentNode = $this->article()->getTopNode()->parent();

        if (!$parentNode instanceof Element) {
            return $goodLinks;
        }

        foreach ($parentNode->find('a[href]') as $el) {
            $href = $el->attr('href');

            if ($href === '#' || trim($href) === '') {
                continue;
            }

            $goodLinks[] = [
                'url' => $href,
                'text' => Helper::textNormalise($el->text()),
            ];
        }

        return $goodLinks;
    }

    /**
     * Determines the most frequent words of the title, meta description and
     * cleaned article text, ignoring the configured stop words.
     *
     * @return int[] Up to five entries mapping word => number of occurrences, most frequent first
     */
    private function getPopularWords() {
        $limit = 5;
        $minimumFrequency = 1;
        // Presumably a list of stop-word strings for the current language; confirm against StopWords.
        $stopWords = $this->config()->getStopWords()->getCurrentStopWords();

        $text = $this->article()->getTitle();
        $text .= ' ' . $this->article()->getMetaDescription();

        if ($this->article()->getTopNode()) {
            $text .= ' ' . $this->article()->getCleanedArticleText();
        }

        // Decode and split words by white-space
        $text = html_entity_decode($text, ENT_COMPAT | ENT_HTML5, 'UTF-8');
        $words = preg_split('@[\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);

        // Remove stop words case-insensitively. Diffing against the stop-word list
        // directly drops "The" even when lowercase "the" never occurs in the text.
        $words = array_udiff($words, $stopWords, 'strcasecmp');

        // Count and sort $words
        $words = array_count_values($words);
        arsort($words);

        // Limit $words. Keys must be preserved: numeric words such as "2015" are
        // integer keys and would otherwise be renumbered to 0, 1, ...
        $words = array_slice($words, 0, $limit, true);

        // Drop words that occur less often than the minimum frequency
        $words = array_filter($words, function($count) use ($minimumFrequency) {
            return $count >= $minimumFrequency;
        });

        return $words;
    }
}
165