AdditionalDataExtractor   A
last analyzed

Complexity

Total Complexity 21

Size/Duplication

Total Lines 169
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
wmc 21
eloc 83
c 1
b 0
f 0
dl 0
loc 169
rs 10

5 Methods

Rating   Name   Duplication   Size   Complexity  
A getTags() 0 10 2
A getPopularWords() 0 36 3
A getLinks() 0 19 5
B getVideos() 0 36 9
A run() 0 12 2
1
<?php declare(strict_types=1);
2
3
namespace Goose\Modules\Extractors;
4
5
use Goose\Article;
6
use Goose\Utils\Helper;
7
use Goose\Traits\ArticleMutatorTrait;
8
use Goose\Modules\{AbstractModule, ModuleInterface};
9
use DOMWrap\Element;
10
11
/**
 * Additional Data Extractor
 *
 * Populates an Article with secondary data: tags found anywhere in the
 * document and, when the article's top node is an Element, the videos and
 * links found under the top node's parent plus the most frequent words.
 *
 * @package Goose\Modules\Extractors
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class AdditionalDataExtractor extends AbstractModule implements ModuleInterface {
    use ArticleMutatorTrait;

    /** @var string CSS selector for anchors that are rel="tag" or whose href contains "/tag/" */
    protected static $A_REL_TAG_SELECTOR = "a[rel='tag'], a[href*='/tag/']";

    /** @var string[] Regex fragments (dots escaped) for the host names of accepted video providers */
    protected static $VIDEO_PROVIDERS = [
        'youtube\.com',
        'youtu\.be',
        'vimeo\.com',
        'blip\.tv',
        'dailymotion\.com',
        'dai\.ly',
        'flickr\.com',
        'flic\.kr',
    ];

    /** @var string[] Lower-case file extensions treated as video files */
    protected static $VIDEO_EXTENSIONS = [
        'mpg',
        'mp4',
        'avi',
        'flv',
        'mov',
        'wmv',
        'ogv',
        'gifv',
        'mpeg',
    ];

    /**
     * Stores the article on this module and fills in its tags; videos, links
     * and popular words are only extracted when the top node is an Element.
     *
     * @inheritdoc
     */
    public function run(Article $article): self {
        $this->article($article);

        $article->setTags($this->getTags());

        if ($this->article()->getTopNode() instanceof Element) {
            $article->setVideos($this->getVideos());
            $article->setLinks($this->getLinks());
            $article->setPopularWords($this->getPopularWords());
        }

        return $this;
    }

    /**
     * Collects the normalised text of every anchor in the document that
     * matches the tag selector.
     *
     * @return string[]
     */
    private function getTags(): array {
        $nodes = $this->article()->getDoc()->find(self::$A_REL_TAG_SELECTOR);

        $tags = [];

        foreach ($nodes as $node) {
            $tags[] = Helper::textNormalise($node->text());
        }

        return $tags;
    }

    /**
     * Pulls out videos we like
     *
     * Inspects embed/object/iframe/video elements under the top node's parent
     * and keeps the source URL ("src", falling back to "data") when it points
     * at a known provider or ends in a known video file extension.
     *
     * @return string[]
     */
    private function getVideos(): array {
        $videos = [];

        $topNode = $this->article()->getTopNode();

        if (!$topNode instanceof Element) {
            return $videos;
        }

        $parentNode = $topNode->parent();

        if (!$parentNode instanceof Element) {
            return $videos;
        }

        foreach ($parentNode->find('embed, object, iframe, video') as $node) {
            $src = $node->hasAttribute('src') ? $node->attr('src') : $node->attr('data');

            // Nothing to classify when neither attribute carries a usable value.
            if (!is_string($src) || trim($src) === '') {
                continue;
            }

            if ($this->matchesVideoProvider($src) || $this->hasVideoExtension($src)) {
                $videos[] = $src;
            }
        }

        return $videos;
    }

    /**
     * Tells whether a URL is an http(s) URL hosted by one of the known providers.
     *
     * @param string $src Candidate URL
     *
     * @return bool
     */
    private function matchesVideoProvider(string $src): bool {
        // parse_url() yields null/false for missing or malformed parts; the casts turn both into ''.
        // URI schemes are case-insensitive, hence the lower-casing.
        $scheme = strtolower((string)parse_url($src, PHP_URL_SCHEME));

        if (!in_array($scheme, ['http', 'https'], true)) {
            return false;
        }

        $host = (string)parse_url($src, PHP_URL_HOST);

        foreach (self::$VIDEO_PROVIDERS as $domain) {
            // Anchor on a label boundary so "notyoutube.com" is not taken for "youtube.com".
            if (preg_match('@(?:^|\.)' . $domain . '$@i', $host) === 1) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tells whether the path component of a URL ends in a known video file extension.
     *
     * @param string $src Candidate URL
     *
     * @return bool
     */
    private function hasVideoExtension(string $src): bool {
        $path = (string)parse_url(strtolower($src), PHP_URL_PATH);

        return in_array(pathinfo($path, PATHINFO_EXTENSION), self::$VIDEO_EXTENSIONS, true);
    }

    /**
     * Pulls out links we like
     *
     * Collects every anchor under the top node's parent whose href is neither
     * blank nor a bare "#".
     *
     * @return array[] List of ['url' => href attribute, 'text' => normalised anchor text] entries
     */
    private function getLinks(): array {
        $goodLinks = [];

        $topNode = $this->article()->getTopNode();

        // Same guard as getVideos(): without a top node there is no parent to search.
        if (!$topNode instanceof Element) {
            return $goodLinks;
        }

        $parentNode = $topNode->parent();

        if (!$parentNode instanceof Element) {
            return $goodLinks;
        }

        foreach ($parentNode->find('a[href]') as $el) {
            $href = $el->attr('href');

            if (!is_string($href)) {
                continue;
            }

            $trimmedHref = trim($href);

            if ($trimmedHref === '' || $trimmedHref === '#') {
                continue;
            }

            $goodLinks[] = [
                'url' => $href,
                'text' => Helper::textNormalise($el->text()),
            ];
        }

        return $goodLinks;
    }

    /**
     * Counts the words of the title, the meta description and (when a top node
     * exists) the cleaned article text, ignoring stop words, and returns the
     * most frequent ones.
     *
     * Words are counted case-sensitively; stop words are matched
     * case-insensitively. Purely numeric words end up under integer keys
     * because PHP converts such array keys.
     *
     * @return int[] Occurrence counts keyed by word, most frequent first, at most five entries
     */
    private function getPopularWords(): array {
        $limit = 5;
        $minimumFrequency = 1;
        $stopWords = $this->config()->getStopWords()->getWordList();

        $text = $this->article()->getTitle();
        $text .= ' ' . $this->article()->getMetaDescription();

        if ($this->article()->getTopNode()) {
            $text .= ' ' . $this->article()->getCleanedArticleText();
        }

        // Decode and split words by white-space
        $text = html_entity_decode($text, ENT_COMPAT | ENT_HTML5, 'UTF-8');
        $words = preg_split('@[\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on failure, e.g. invalid UTF-8 combined with the "u" modifier.
        if (!is_array($words)) {
            return [];
        }

        // Drop stop words regardless of case, so "The" is removed even when "the" never occurs in the text.
        $words = array_udiff($words, $stopWords, 'strcasecmp');

        // Count and sort $words
        $counts = array_count_values($words);
        arsort($counts);

        // Limit $words; preserve keys so numeric words (stored under integer keys) are not re-indexed away.
        $counts = array_slice($counts, 0, $limit, true);

        return array_filter($counts, static function ($count) use ($minimumFrequency) {
            return $count >= $minimumFrequency;
        });
    }
}
188