Completed
Push — master ( c7a267...479701 )
by Andrew
03:34
created

MetaExtractor   A

Complexity

Total Complexity 33

Size/Duplication

Total Lines 260
Duplicated Lines 3.85 %

Coupling/Cohesion

Components 1
Dependencies 7

Importance

Changes 9
Bugs 2 Features 2
Metric Value
wmc 33
c 9
b 2
f 2
lcom 1
cbo 7
dl 10
loc 260
rs 9.3999

10 Methods

Rating   Name   Duplication   Size   Complexity  
A run() 0 13 1
B getOpenGraph() 10 24 4
C cleanTitle() 0 40 7
A getTitle() 0 20 4
A getNodesByLowercasePropertyValue() 0 3 1
A getMetaContent() 0 12 2
B getMetaLanguage() 0 31 6
A getMetaDescription() 0 13 3
A getMetaKeywords() 0 3 1
A getCanonicalLink() 0 21 4

How to fix   Duplicated Code   

Duplicated Code

Duplicated code is one of the most pungent code smells. A commonly used rule is to restructure code once it is duplicated in three or more places.

Common duplication problems and their corresponding solutions are:

1
<?php
2
3
namespace Goose\Modules\Extractors;
4
5
use Goose\Article;
6
use Goose\Utils\Helper;
7
use Goose\Traits\ArticleMutatorTrait;
8
use Goose\Modules\AbstractModule;
9
use Goose\Modules\ModuleInterface;
10
use DOMWrap\Document;
11
12
/**
13
 * Meta Extractor
14
 *
15
 * @package Goose\Modules\Extractors
16
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
17
 */
18
class MetaExtractor extends AbstractModule implements ModuleInterface {
19
    use ArticleMutatorTrait;
20
21
    /**
     * Separator tokens that cleanTitle() strips when one of them is the
     * first or the last word of a title.
     *
     * @var string[]
     */
    protected static $SPLITTER_CHARS = [
        '|',
        '-',
        '»',
        ':',
    ];
25
26
    /**
     * Populate the given article with the metadata found in its document.
     *
     * Sets the OpenGraph data, title, meta description, meta keywords,
     * canonical link and language on the article, then copies the language
     * the article reports into the module configuration.
     *
     * @param Article $article Article to populate
     */
    public function run(Article $article) {
        // Hand the article to the module first: every extractor below reads it back via $this->article()
        $this->article($article);

        // OpenGraph must be stored before the title is resolved, because
        // getTitle() and cleanTitle() read it back from the article
        $openGraph = $this->getOpenGraph();
        $article->setOpenGraph($openGraph);

        $title = $this->getTitle();
        $article->setTitle($title);

        $description = $this->getMetaDescription();
        $article->setMetaDescription($description);

        $keywords = $this->getMetaKeywords();
        $article->setMetaKeywords($keywords);

        $canonicalLink = $this->getCanonicalLink();
        $article->setCanonicalLink($canonicalLink);

        $language = $this->getMetaLanguage();
        $article->setLanguage($language);

        // Read the language back from the article so the configuration
        // receives whatever the article now reports
        $this->config()->set('language', $article->getLanguage());
    }
42
43
    /**
     * Retrieve all OpenGraph meta data
     *
     * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
     *
     * Each result key is the property name with its prefix removed, so
     * "og:title" is stored as "title". Structured properties keep the rest of
     * their name ("og:image:width" is stored as "image:width"), which stops
     * them from overwriting the value of their parent property ("og:image").
     *
     * When an og:type is present, meta tags whose property starts with
     * "<type>:" are collected into the same array in the same way.
     *
     * @return string[]
     */
    private function getOpenGraph() {
        $doc = $this->article()->getDoc();

        $results = $this->collectPrefixedMeta($doc->find('meta[property^="og:"]'), 'og:');

        // Additionally retrieve type values based on provided og:type (http://ogp.me/#types)
        $type = isset($results['type']) ? trim($results['type']) : '';

        // The type comes from the page and is placed inside a quoted selector
        // string, so only plain tokens (word characters, '.', ':', '-') are
        // accepted; anything else could break out of the quotes.
        if (preg_match('@^[\w.:-]+$@D', $type)) {
            $prefix = $type . ':';

            $results = $this->collectPrefixedMeta(
                $doc->find('meta[property^="' . $prefix . '"]'),
                $prefix,
                $results
            );
        }

        return $results;
    }

    /**
     * Add the content of each meta node to $results, keyed by the node's
     * property name with $prefix stripped from the front.
     *
     * @param \Traversable|array $nodes Meta nodes whose property attribute starts with $prefix
     * @param string $prefix Prefix to strip, including the trailing colon
     * @param string[] $results Values collected so far; later nodes overwrite equal keys
     *
     * @return string[]
     */
    private function collectPrefixedMeta($nodes, $prefix, array $results = []) {
        $prefixLength = strlen($prefix);

        foreach ($nodes as $node) {
            $key = substr($node->attr('property'), $prefixLength);

            // Nothing after the prefix (substr() yields '' or, before PHP 8, false): no usable key
            if ($key === false || $key === '') {
                continue;
            }

            $results[$key] = $node->attr('content');
        }

        return $results;
    }
74
75
    /**
     * Clean title text
     *
     * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
     *
     * Removes the OpenGraph site name (case-sensitive) and the article domain
     * (case-insensitive) from the title, collapses runs of whitespace to a
     * single space, and drops one leading and one trailing separator word
     * listed in self::$SPLITTER_CHARS.
     *
     * @param string $title
     *
     * @return string Cleaned title, or an empty string when nothing is left
     */
    private function cleanTitle($title) {
        $openGraph = $this->article()->getOpenGraph();

        // Check if we have the site name in OpenGraph data
        if (isset($openGraph['site_name'])) {
            $title = str_replace($openGraph['site_name'], '', $title);
        }

        // Try to remove the domain from URL
        $domain = $this->article()->getDomain();

        if ($domain) {
            $title = str_ireplace($domain, '', $title);
        }

        // Split the title in words
        // TechCrunch | my wonderfull article
        // my wonderfull article | TechCrunch
        // PREG_SPLIT_NO_EMPTY makes an empty title yield [] rather than [''],
        // so the emptiness check below can actually trigger.
        $titleWords = preg_split('@[\s]+@', trim($title), -1, PREG_SPLIT_NO_EMPTY);

        // Check for an empty title (also covers preg_split() returning false on failure)
        if (empty($titleWords)) {
            return '';
        }

        // Check if the last word is in self::$SPLITTER_CHARS
        // if so remove it
        if (in_array(end($titleWords), self::$SPLITTER_CHARS, true)) {
            array_pop($titleWords);
        }

        // Check if the first word is in self::$SPLITTER_CHARS
        // if so remove it (the list may be empty after the pop above)
        if (!empty($titleWords) && in_array(reset($titleWords), self::$SPLITTER_CHARS, true)) {
            array_shift($titleWords);
        }

        // Rebuild the title; the words contain no whitespace, so no further trimming is needed
        return implode(' ', $titleWords);
    }
124
125
    /**
     * Get article title
     *
     * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
     *
     * Sources are tried in this order, and the first one present is passed
     * through cleanTitle(): the OpenGraph "title" entry, a meta tag named
     * "headline", and the document's <title> element.
     *
     * @return string Cleaned title, or an empty string when no source is present
     */
    private function getTitle() {
        $og = $this->article()->getOpenGraph();

        // OpenGraph wins whenever it carries a title
        if (isset($og['title'])) {
            $ogTitle = $og['title'];

            return $this->cleanTitle($ogTitle);
        }

        // Next: <meta name="headline">, matched case-insensitively on the attribute value
        $headlineNodes = $this->getNodesByLowercasePropertyValue($this->article()->getDoc(), 'meta', 'name', 'headline');
        if ($headlineNodes->count()) {
            $headline = $headlineNodes->first()->attr('content');

            return $this->cleanTitle($headline);
        }

        // Last resort: the <title> element in the document head
        $titleNodes = $this->article()->getDoc()->find('html > head > title');
        if (!$titleNodes->count()) {
            return '';
        }

        $normalised = Helper::textNormalise($titleNodes->first()->text());

        return $this->cleanTitle($normalised);
    }
152
153
    /**
     * Find descendant elements whose attribute, once its ASCII letters are
     * lowercased, equals the given value.
     *
     * Only the attribute is lowercased (via XPath translate()); $value is
     * compared as given, so it has to be passed in lowercase. All three
     * string arguments are placed into the XPath expression unescaped.
     *
     * @param Document $doc Document to search
     * @param string $tag Element name to match
     * @param string $property Attribute name to compare
     * @param string $value Expected attribute value
     *
     * @return \DOMWrap\NodeList
     */
    private function getNodesByLowercasePropertyValue(Document $doc, $tag, $property, $value) {
        $upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
        $lower = 'abcdefghijklmnopqrstuvwxyz';

        $expression = sprintf(
            "descendant::%s[translate(@%s, '%s', '%s')='%s']",
            $tag,
            $property,
            $upper,
            $lower,
            $value
        );

        return $doc->findXPath($expression);
    }
164
165
    /**
     * Read an attribute from the first meta tag whose $property attribute
     * matches $value (compared after lowercasing the attribute).
     *
     * @param Document $doc Document to search
     * @param string $property Meta attribute to match on, e.g. "name" or "property"
     * @param string $value Expected value of that attribute
     * @param string $attr Attribute to read from the matching tag
     *
     * @return string Trimmed attribute value, or an empty string when no tag matches
     */
    private function getMetaContent(Document $doc, $property, $value, $attr = 'content') {
        $matches = $this->getNodesByLowercasePropertyValue($doc, 'meta', $property, $value);

        if ($matches->count()) {
            return trim($matches->first()->attr($attr));
        }

        return '';
    }
185
186
    /**
     * If the article has meta language set in the source, use that
     *
     * The lang attribute of the <html> element is preferred. When it is
     * missing or empty, the content of the first matching head meta tag is
     * used instead (http-equiv=content-language, then name=lang).
     *
     * A bare two-letter code ("en") is accepted, as is a two-letter primary
     * subtag followed by further subtags ("en-US", "pt_BR"); in both cases
     * only the two-letter part is returned.
     *
     * @return string Lowercase two-letter language code, or an empty string when none is usable
     */
    private function getMetaLanguage() {
        $doc = $this->article()->getDoc();
        $lang = '';

        $el = $doc->find('html[lang]');

        if ($el->count()) {
            $lang = $el->first()->attr('lang');
        }

        if (empty($lang)) {
            $selectors = [
                'html > head > meta[http-equiv=content-language]',
                'html > head > meta[name=lang]',
            ];

            foreach ($selectors as $selector) {
                $el = $doc->find($selector);

                // The first selector that matches decides, even if its content turns out to be unusable
                if ($el->count()) {
                    $lang = $el->first()->attr('content');
                    break;
                }
            }
        }

        // The D modifier pins "$" to the very end, so a trailing newline cannot slip
        // through; subtags after the two-letter code are matched but discarded.
        if (preg_match('@^([A-Za-z]{2})(?:[-_][A-Za-z0-9]+)*$@D', trim((string) $lang), $match)) {
            return strtolower($match[1]);
        }

        return '';
    }
222
223
    /**
     * If the article has meta description set in the source, use that
     *
     * Meta tags are consulted in order: name="description",
     * property="og:description", name="twitter:description". The first value
     * that is not empty() is used.
     *
     * @return string Trimmed description, or the result of the last lookup when all are empty
     */
    private function getMetaDescription() {
        // [attribute to match on, expected value], in priority order
        $candidates = [
            ['name', 'description'],
            ['property', 'og:description'],
            ['name', 'twitter:description'],
        ];

        $desc = '';

        foreach ($candidates as $candidate) {
            list($property, $value) = $candidate;

            $desc = $this->getMetaContent($this->article()->getDoc(), $property, $value);

            if (!empty($desc)) {
                break;
            }
        }

        return trim($desc);
    }
241
242
    /**
     * If the article has meta keywords set in the source, use that
     *
     * Reads the content of the meta tag named "keywords" via getMetaContent().
     *
     * @return string Keywords string, or an empty string when the tag is absent
     */
    private function getMetaKeywords() {
        $doc = $this->article()->getDoc();
        $keywords = $this->getMetaContent($doc, 'name', 'keywords');

        return $keywords;
    }
250
251
    /**
     * Resolve the canonical link of the article
     *
     * Sources are tried in order and the first one present wins:
     * <link rel="canonical"> (href), <meta property="og:url"> (content),
     * <meta name="twitter:url"> (content). When none is present the
     * article's final URL is returned.
     *
     * @return string
     */
    private function getCanonicalLink() {
        // [tag, attribute to match on, expected value, attribute holding the URL], in priority order
        $lookups = [
            ['link', 'rel', 'canonical', 'href'],
            ['meta', 'property', 'og:url', 'content'],
            ['meta', 'name', 'twitter:url', 'content'],
        ];

        foreach ($lookups as $lookup) {
            list($tag, $property, $value, $urlAttr) = $lookup;

            $nodes = $this->getNodesByLowercasePropertyValue($this->article()->getDoc(), $tag, $property, $value);

            if ($nodes->count()) {
                return trim($nodes->first()->attr($urlAttr));
            }
        }

        return $this->article()->getFinalUrl();
    }
277
}
278