Completed
Push — master ( 140b40...7cb46a )
by Andrew
03:24
created

MetaExtractor::getOpenGraph()   B

Complexity

Conditions 4
Paths 4

Size

Total Lines 24
Code Lines 12

Duplication

Lines 10
Ratio 41.67 %

Importance

Changes 2
Bugs 0 Features 1
Metric Value
dl 10
loc 24
c 2
b 0
f 1
rs 8.6845
cc 4
eloc 12
nc 4
nop 0
1
<?php
2
3
namespace Goose\Modules\Extractors;
4
5
use Goose\Article;
6
use Goose\Utils\Helper;
7
use Goose\Traits\ArticleMutatorTrait;
8
use Goose\Modules\AbstractModule;
9
use Goose\Modules\ModuleInterface;
10
use DOMWrap\Element;
11
use DOMWrap\Document;
12
13
/**
14
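 * Meta Extractor: fills an Article with metadata (OpenGraph, title, description, keywords, canonical link, tags, videos, links, popular words, language)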
15
 *
16
 * @package Goose\Modules\Extractors
17
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
18
 */
19
class MetaExtractor extends AbstractModule implements ModuleInterface {
20
    use ArticleMutatorTrait;
21
22
    /**
     * Separator tokens that cleanTitle() strips when one of them is the
     * first or last whitespace-delimited word of a title.
     *
     * @var string[]
     */
23
    protected static $SPLITTER_CHARS = [
24
        '|', '-', '»', ':',
25
    ];
26
27
    /**
     * CSS selector used by getTags() to locate tag anchors: links marked
     * rel='tag' or whose href contains '/tag/'.
     *
     * @var string
     */
28
    protected static $A_REL_TAG_SELECTOR = "a[rel='tag'], a[href*='/tag/']";
29
30
    /**
     * Host names of recognised video providers, written as regular
     * expression fragments (dots escaped). getVideos() embeds each fragment
     * in a pattern that is matched against the host of an embed source URL.
     *
     * @var string[]
     */
31
    protected static $VIDEO_PROVIDERS = [
32
        'youtube\.com',
33
        'youtu\.be',
34
        'vimeo\.com',
35
        'blip\.tv',
36
        'dailymotion\.com',
37
        'dai\.ly',
38
        'flickr\.com',
39
        'flic\.kr',
40
    ];
41
42
    /**
     * Extract metadata from the article's document and store it on the article.
     *
     * The setters are called in a fixed order because later extractors read
     * values back from the article: the title lookup reads the stored
     * OpenGraph data, and the popular-words lookup reads the stored title and
     * meta description.
     *
     * @param Article $article Article to populate
     */
    public function run(Article $article) {
        $this->article($article);

        $openGraph = $this->getOpenGraph();
        $article->setOpenGraph($openGraph);

        $title = $this->getTitle();
        $article->setTitle($title);

        $description = $this->getMetaDescription();
        $article->setMetaDescription($description);

        $keywords = $this->getMetaKeywords();
        $article->setMetaKeywords($keywords);

        $canonicalLink = $this->getCanonicalLink();
        $article->setCanonicalLink($canonicalLink);

        $tags = $this->getTags();
        $article->setTags($tags);

        // Videos, links and popular words are only collected when a top node exists
        $topNode = $this->article()->getTopNode();

        if ($topNode instanceof Element) {
            $videos = $this->getVideos();
            $article->setVideos($videos);

            $links = $this->getLinks();
            $article->setLinks($links);

            $popularWords = $this->getPopularWords();
            $article->setPopularWords($popularWords);
        }

        $language = $this->getMetaLanguage();
        $article->setLanguage($language);

        // Publish the article's language to the shared configuration
        $this->config()->set('language', $article->getLanguage());
    }
65
66
    /**
     * Retrieve all OpenGraph meta data
     *
     * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
     *
     * Each result key is the meta "property" value with its leading namespace
     * removed: "og:title" is stored as "title". Structured properties keep
     * everything after the first colon, so "og:image:width" is stored as
     * "image:width" and no longer overwrites the value read from "og:image".
     *
     * When an og:type is present, the meta tags in that type's namespace
     * (e.g. "article:author") are merged in the same way, overriding any
     * "og:" entry with the same key.
     *
     * @return string[] Map of property name (without namespace) to content
     */
    private function getOpenGraph() {
        $results = $this->collectMetaProperties('og');

        // Additionally retrieve type values based on provided og:type (http://ogp.me/#types)
        if (isset($results['type'])) {
            // array_replace() rather than array_merge(): numeric-looking keys must not be renumbered
            $results = array_replace($results, $this->collectMetaProperties($results['type']));
        }

        return $results;
    }

    /**
     * Collect the content of every meta tag whose property starts with "<namespace>:".
     *
     * @param string $namespace Property namespace without the trailing colon, e.g. "og"
     *
     * @return string[] Map of property name (namespace stripped) to content
     */
    private function collectMetaProperties($namespace) {
        $properties = [];

        $nodes = $this->article()->getDoc()->find('meta[property^="' . $namespace . ':"]');

        foreach ($nodes as $node) {
            // A limit of 2 splits off the namespace only and keeps nested names such as "image:width" intact
            $parts = explode(':', $node->attr('property'), 2);

            if (!isset($parts[1])) {
                continue;
            }

            $properties[$parts[1]] = $node->attr('content');
        }

        return $properties;
    }
97
98
    /**
     * Clean title text
     *
     * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
     *
     * Removes the OpenGraph site name and the article domain from the title,
     * then drops a single leading and a single trailing separator token
     * (see self::$SPLITTER_CHARS) and collapses runs of whitespace.
     *
     * @param string $title Raw title text
     *
     * @return string Cleaned title, or an empty string when nothing is left
     */
    private function cleanTitle($title) {
        // A missing attribute may yield null; normalise so the string functions below get a string
        $title = (string)$title;

        $openGraph = $this->article()->getOpenGraph();

        // Check if we have the site name in OpenGraph data
        if (isset($openGraph['site_name'])) {
            $title = str_replace($openGraph['site_name'], '', $title);
        }

        // Try to remove the domain from URL
        if ($this->article()->getDomain()) {
            $title = str_ireplace($this->article()->getDomain(), '', $title);
        }

        // Split the title in words
        // TechCrunch | my wonderful article
        // my wonderful article | TechCrunch
        //
        // PREG_SPLIT_NO_EMPTY matters: without it an empty title splits into
        // [''] and the empty check below can never trigger.
        $titleWords = preg_split('@[\s]+@', trim($title), -1, PREG_SPLIT_NO_EMPTY);

        // Check for an empty title (also covers preg_split() returning false)
        if (empty($titleWords)) {
            return '';
        }

        // Check if the last word is in self::$SPLITTER_CHARS
        // if so remove it
        if (in_array($titleWords[count($titleWords) - 1], self::$SPLITTER_CHARS, true)) {
            array_pop($titleWords);
        }

        // Check if the first word is in self::$SPLITTER_CHARS
        // if so remove it
        if (isset($titleWords[0]) && in_array($titleWords[0], self::$SPLITTER_CHARS, true)) {
            array_shift($titleWords);
        }

        // Rebuild the title
        return trim(implode(' ', $titleWords));
    }
147
148
    /**
     * Get article title
     *
     * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
     *
     * Sources are tried in order and the first one present wins:
     * the OpenGraph title, a meta tag named "headline", then the
     * document's <title> element. The chosen text is passed through
     * cleanTitle().
     *
     * @return string Cleaned title, or an empty string when no source exists
     */
    private function getTitle() {
        $article = $this->article();
        $og = $article->getOpenGraph();

        // Rely on OpenGraph in case we have the data
        if (isset($og['title'])) {
            return $this->cleanTitle($og['title']);
        }

        // Next best: <meta name="headline" content="...">
        $headlineNodes = $this->getNodesByLowercasePropertyValue($article->getDoc(), 'meta', 'name', 'headline');
        if ($headlineNodes->count()) {
            $headline = $headlineNodes->first()->attr('content');

            return $this->cleanTitle($headline);
        }

        // Last resort: the document <title>
        $titleNodes = $article->getDoc()->find('html > head > title');
        if ($titleNodes->count()) {
            $titleText = Helper::textNormalise($titleNodes->first()->text());

            return $this->cleanTitle($titleText);
        }

        return '';
    }
175
176
    /**
     * Find descendant elements whose attribute, lower-cased (A-Z only),
     * equals the given value.
     *
     * The arguments are inserted into the XPath expression verbatim, so
     * $value should already be lower-case and must not contain a single quote.
     *
     * @param Document $doc Document to search
     * @param string $tag Element name to match
     * @param string $property Attribute name to compare
     * @param string $value Expected attribute value
     *
     * @return \DOMWrap\NodeList
     */
    private function getNodesByLowercasePropertyValue(Document $doc, $tag, $property, $value) {
        $xpath = sprintf(
            "descendant::%s[translate(@%s, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')='%s']",
            $tag,
            $property,
            $value
        );

        return $doc->findXPath($xpath);
    }
187
188
    /**
     * Read an attribute from the first meta tag whose $property attribute
     * equals $value (compared case-insensitively for A-Z).
     *
     * @param Document $doc Document to search
     * @param string $property Meta attribute to match on, e.g. "name" or "property"
     * @param string $value Expected value of that attribute
     * @param string $attr Attribute to read from the matched tag
     *
     * @return string Trimmed attribute value, or an empty string when no tag matches
     */
    private function getMetaContent(Document $doc, $property, $value, $attr = 'content') {
        $matches = $this->getNodesByLowercasePropertyValue($doc, 'meta', $property, $value);

        if ($matches->count()) {
            return trim($matches->first()->attr($attr));
        }

        return '';
    }
208
209
    /**
     * If the article has meta language set in the source, use that
     *
     * The lang attribute of <html> is preferred; when it is missing or empty
     * the content-language / lang meta tags are consulted. Region or script
     * qualified tags such as "en-US" or "pt_BR" are reduced to their
     * two-letter primary language ("en", "pt") instead of being rejected.
     *
     * @return string Lower-case two-letter language code, or an empty string
     */
    private function getMetaLanguage() {
        $doc = $this->article()->getDoc();
        $lang = '';

        $el = $doc->find('html[lang]');

        if ($el->count()) {
            $lang = $el->first()->attr('lang');
        }

        if (empty($lang)) {
            $selectors = [
                'html > head > meta[http-equiv=content-language]',
                'html > head > meta[name=lang]',
            ];

            foreach ($selectors as $selector) {
                $el = $doc->find($selector);

                if ($el->count()) {
                    $lang = $el->first()->attr('content');
                    break;
                }
            }
        }

        // Accept "xx" optionally followed by -/_ separated subtags; \z (unlike $)
        // does not tolerate a trailing newline.
        if (preg_match('@^([A-Za-z]{2})(?:[-_][A-Za-z0-9]+)*\z@', trim((string)$lang), $matches)) {
            return strtolower($matches[1]);
        }

        return '';
    }
245
246
    /**
     * If the article has meta description set in the source, use that
     *
     * Tries, in order, the "description" meta name, the "og:description"
     * meta property and the "twitter:description" meta name, stopping at
     * the first non-empty value.
     *
     * @return string Trimmed description, or an empty string when none is found
     */
    private function getMetaDescription() {
        $lookups = [
            ['name', 'description'],
            ['property', 'og:description'],
            ['name', 'twitter:description'],
        ];

        $desc = '';

        foreach ($lookups as $lookup) {
            $desc = $this->getMetaContent($this->article()->getDoc(), $lookup[0], $lookup[1]);

            if (!empty($desc)) {
                break;
            }
        }

        return trim($desc);
    }
264
265
    /**
     * If the article has meta keywords set in the source, use that
     *
     * @return string Content of the "keywords" meta tag, or an empty string when absent
     */
    private function getMetaKeywords() {
        $doc = $this->article()->getDoc();
        $keywords = $this->getMetaContent($doc, 'name', 'keywords');

        return $keywords;
    }
273
274
    /**
     * Determine the canonical link of the article
     *
     * Sources are tried in order: <link rel="canonical">, the "og:url"
     * meta property, then the "twitter:url" meta name. When none is present
     * the article's final URL is returned.
     *
     * @return string
     */
    private function getCanonicalLink() {
        // Each entry: element name, attribute to match, expected value, attribute holding the URL
        $sources = [
            ['link', 'rel', 'canonical', 'href'],
            ['meta', 'property', 'og:url', 'content'],
            ['meta', 'name', 'twitter:url', 'content'],
        ];

        foreach ($sources as $source) {
            list($tag, $property, $value, $urlAttribute) = $source;

            $found = $this->getNodesByLowercasePropertyValue($this->article()->getDoc(), $tag, $property, $value);

            if ($found->count()) {
                return trim($found->first()->attr($urlAttribute));
            }
        }

        return $this->article()->getFinalUrl();
    }
300
301
    /**
     * Collect the normalised text of every tag anchor in the document
     * (anchors matched by self::$A_REL_TAG_SELECTOR).
     *
     * @return string[] Tag texts in document order; duplicates are kept
     */
    private function getTags() {
        $collected = [];

        $anchors = $this->article()->getDoc()->find(self::$A_REL_TAG_SELECTOR);

        foreach ($anchors as $anchor) {
            $label = Helper::textNormalise($anchor->text());
            $collected[] = $label;
        }

        return $collected;
    }
315
316
    /**
     * Pulls out videos we like
     *
     * Looks at embed, object and iframe elements under the parent of the
     * article's top node and keeps the source URL (the src attribute, or
     * data when src is absent) of those hosted by a known video provider.
     *
     * @return string[] Source URLs of recognised video embeds
     */
    private function getVideos() {
        $videos = [];

        $nodes = $this->article()->getTopNode()->parent()->find('embed, object, iframe');

        foreach ($nodes as $node) {
            $src = $node->hasAttribute('src') ? $node->attr('src') : $node->attr('data');

            if ($this->isVideoProviderUrl($src)) {
                $videos[] = $src;
            }
        }

        return $videos;
    }

    /**
     * Tell whether a URL is an http(s) URL hosted by one of self::$VIDEO_PROVIDERS.
     *
     * @param string $url Candidate URL
     *
     * @return bool
     */
    private function isVideoProviderUrl($url) {
        $url = (string)$url;

        // Parsed once per URL rather than once per provider
        $host = parse_url($url, PHP_URL_HOST);
        $scheme = parse_url($url, PHP_URL_SCHEME);

        // parse_url() yields null/false for missing or malformed components
        if (!is_string($host) || !is_string($scheme)) {
            return false;
        }

        // URL schemes are case-insensitive
        if (!in_array(strtolower($scheme), ['http', 'https'], true)) {
            return false;
        }

        foreach (self::$VIDEO_PROVIDERS as $domain) {
            // The provider must be the whole host or follow a dot, so that
            // "notyoutube.com" is not mistaken for "youtube.com".
            if (preg_match('@(?:^|\.)' . $domain . '$@i', $host)) {
                return true;
            }
        }

        return false;
    }
347
348
    /**
     * Pulls out links we like
     *
     * Scans anchors carrying an href under the parent of the article's top
     * node and skips those whose href is "#" or blank.
     *
     * @return array[] List of ['url' => string, 'text' => string] entries
     */
    private function getLinks() {
        $links = [];

        $anchors = $this->article()->getTopNode()->parent()->find('a[href]');

        foreach ($anchors as $anchor) {
            // Ignore placeholder and empty targets
            if ($anchor->attr('href') == '#' || trim($anchor->attr('href')) == '') {
                continue;
            }

            $entry = [
                'url' => $anchor->attr('href'),
                'text' => Helper::textNormalise($anchor->text()),
            ];

            $links[] = $entry;
        }

        return $links;
    }
369
370
    /**
     * Count the most frequent words of the article
     *
     * The text is built from the article title, its meta description and,
     * when a top node exists, the cleaned article text. Words are split on
     * white-space, the current stop words are removed, and the five most
     * frequent remaining words are returned.
     *
     * @return int[] Occurrence counts keyed by word, most frequent first
     */
    private function getPopularWords() {
        $limit = 5;
        $minimumFrequency = 1;
        $stopWords = $this->config()->getStopWords()->getCurrentStopWords();

        $text = $this->article()->getTitle();
        $text .= ' ' . $this->article()->getMetaDescription();

        if ($this->article()->getTopNode()) {
            $text .= ' ' . $this->article()->getCleanedArticleText();
        }

        // Decode and split words by white-space
        $text = html_entity_decode($text, ENT_COMPAT | ENT_HTML5, 'UTF-8');
        $words = preg_split('@[\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);

        // Determine stop words currently in $words
        $ignoreWords = array_intersect($words, $stopWords);
        // Remove ignored words from $words
        $words = array_diff($words, $ignoreWords);

        // Count and sort $words
        $words = array_count_values($words);
        arsort($words);

        // Limit and filter $words.
        // preserve_keys must be true: numeric words such as "2015" become
        // integer keys, which array_slice() would otherwise renumber to 0, 1, ...
        $words = array_slice($words, 0, $limit, true);
        $words = array_filter($words, function($value) use ($minimumFrequency) {
            return $value >= $minimumFrequency;
        });

        return $words;
    }
406
}
407