Completed
Push — master ( c7a267...479701 )
by Andrew
03:34
created

AdditionalDataExtractor::getTags()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 11
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 11
rs 9.4285
cc 2
eloc 6
nc 2
nop 0
1
<?php
2
3
namespace Goose\Modules\Extractors;
4
5
use Goose\Article;
6
use Goose\Utils\Helper;
7
use Goose\Traits\ArticleMutatorTrait;
8
use Goose\Modules\AbstractModule;
9
use Goose\Modules\ModuleInterface;
10
use DOMWrap\Element;
11
12
/**
13
 * Additional Data Extractor
14
 *
15
 * @package Goose\Modules\Extractors
16
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
17
 */
18
class AdditionalDataExtractor extends AbstractModule implements ModuleInterface {
    use ArticleMutatorTrait;

    /** @var string CSS selector used to locate tag links in the document */
    protected static $A_REL_TAG_SELECTOR = "a[rel='tag'], a[href*='/tag/']";

    /** @var string[] Regex fragments (dots already escaped) of accepted video host names */
    protected static $VIDEO_PROVIDERS = [
        'youtube\.com',
        'youtu\.be',
        'vimeo\.com',
        'blip\.tv',
        'dailymotion\.com',
        'dai\.ly',
        'flickr\.com',
        'flic\.kr',
    ];

    /**
     * Populates the article with tags and, when a top node is available,
     * with videos, links and popular words.
     *
     * @param Article $article
     */
    public function run(Article $article) {
        $this->article($article);

        $article->setTags($this->getTags());

        // Videos, links and popular words are only extracted when the top
        // node is a DOM element.
        if ($this->article()->getTopNode() instanceof Element) {
            $article->setVideos($this->getVideos());
            $article->setLinks($this->getLinks());
            $article->setPopularWords($this->getPopularWords());
        }
    }

    /**
     * Collects the text of every element matching the tag selector,
     * passed through Helper::textNormalise().
     *
     * @return string[]
     */
    private function getTags() {
        $nodes = $this->article()->getDoc()->find(self::$A_REL_TAG_SELECTOR);

        $tags = [];

        foreach ($nodes as $node) {
            $tags[] = Helper::textNormalise($node->text());
        }

        return $tags;
    }

    /**
     * Pulls out videos we like
     *
     * Looks at embed/object/iframe elements under the top node's parent and
     * keeps the source URL of those hosted by a known video provider.
     *
     * @return string[]
     */
    private function getVideos() {
        $videos = [];

        $nodes = $this->article()->getTopNode()->parent()->find('embed, object, iframe');

        foreach ($nodes as $node) {
            // Prefer "src"; fall back to "data" when "src" is absent.
            $src = $node->hasAttribute('src') ? $node->attr('src') : $node->attr('data');

            if ($this->isVideoProviderUrl($src)) {
                $videos[] = $src;
            }
        }

        return $videos;
    }

    /**
     * Tells whether a URL is an http(s) URL whose host is one of the
     * configured video providers or a sub-domain of one.
     *
     * @param mixed $src Candidate URL; anything but a non-empty string is rejected
     *
     * @return bool
     */
    private function isVideoProviderUrl($src) {
        if (!is_string($src) || $src === '') {
            return false;
        }

        // Parse once per URL rather than once per provider.
        $host = parse_url($src, PHP_URL_HOST);
        $scheme = parse_url($src, PHP_URL_SCHEME);

        // parse_url() yields null/false for missing components or malformed URLs.
        if (!is_string($host) || !is_string($scheme)) {
            return false;
        }

        if (!in_array($scheme, ['http', 'https'], true)) {
            return false;
        }

        foreach (self::$VIDEO_PROVIDERS as $domain) {
            // Require the provider domain to be the whole host or to follow a
            // dot, so that e.g. "notyoutube.com" does not pass as "youtube.com".
            if (preg_match('@(?:^|\.)' . $domain . '$@i', $host)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Pulls out links we like
     *
     * Considers every anchor with an href attribute under the top node's
     * parent, skipping hrefs that are exactly "#" or blank.
     *
     * @return array[] List of ['url' => string, 'text' => string] entries
     */
    private function getLinks() {
        $goodLinks = [];

        $candidates = $this->article()->getTopNode()->parent()->find('a[href]');

        foreach ($candidates as $el) {
            $href = $el->attr('href');

            if ($href != '#' && trim($href) != '') {
                $goodLinks[] = [
                    'url' => $href,
                    'text' => Helper::textNormalise($el->text()),
                ];
            }
        }

        return $goodLinks;
    }

    /**
     * Counts the words of the title, meta description and (when a top node
     * exists) the cleaned article text, ignoring the configured stop words,
     * and returns the most frequent ones.
     *
     * @return int[] Occurrence counts keyed by word, most frequent first,
     *               at most five entries. Numeric-looking words appear as
     *               integer keys because of array_count_values().
     */
    private function getPopularWords() {
        $limit = 5;
        $minimumFrequency = 1;
        $stopWords = $this->config()->getStopWords()->getCurrentStopWords();

        $text = $this->article()->getTitle();
        $text .= ' ' . $this->article()->getMetaDescription();

        if ($this->article()->getTopNode()) {
            $text .= ' ' . $this->article()->getCleanedArticleText();
        }

        // Decode and split words by white-space
        $text = html_entity_decode($text, ENT_COMPAT | ENT_HTML5, 'UTF-8');
        $words = preg_split('@[\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);

        // With the "u" modifier preg_split() returns false on invalid UTF-8.
        if ($words === false) {
            return [];
        }

        // Remove stop words from $words (the comparison is case-sensitive)
        $words = array_diff($words, $stopWords);

        // Count and sort $words
        $words = array_count_values($words);
        arsort($words);

        // Limit and filter $words. Keys must be preserved: array_slice()
        // would otherwise renumber integer keys and lose numeric words.
        $words = array_slice($words, 0, $limit, true);
        $words = array_filter($words, function($value) use ($minimumFrequency) {
            return $value >= $minimumFrequency;
        });

        return $words;
    }
}
157