MetaExtractor::getMetaContent()   A
last analyzed

Complexity

Conditions 3
Paths 3

Size

Total Lines 14
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 7
c 1
b 0
f 0
dl 0
loc 14
rs 10
cc 3
nc 3
nop 4
1
<?php declare(strict_types=1);
2
3
namespace Goose\Modules\Extractors;
4
5
use Goose\Article;
6
use Goose\Utils\Helper;
7
use Goose\Traits\ArticleMutatorTrait;
8
use Goose\Modules\{AbstractModule, ModuleInterface};
9
use DOMWrap\{Document, NodeList};
10
11
/**
12
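 * Meta Extractor: fills in an article's OpenGraph data, title, meta description, meta keywords, canonical link and language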
13
 *
14
 * @package Goose\Modules\Extractors
15
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
16
 */
17
class MetaExtractor extends AbstractModule implements ModuleInterface {
18
    use ArticleMutatorTrait;
19
20
    /** @var string[] */
21
    protected static $SPLITTER_CHARS = [
22
        '|', '-', '»', ':',
23
    ];
24
25
    /**
     * Populate the given article with the metadata found in its document.
     *
     * Sets OpenGraph data, title, meta description, meta keywords, canonical
     * link and language on the article, then writes the article's language
     * back to the module configuration.
     *
     * @inheritdoc
     */
    public function run(Article $article): self {
        $this->article($article);

        // OpenGraph data is stored first: the title lookup reads it back from the article
        $openGraph = $this->getOpenGraph();
        $article->setOpenGraph($openGraph);

        $title = $this->getTitle();
        $article->setTitle($title);

        $description = $this->getMetaDescription();
        $article->setMetaDescription($description);

        $keywords = $this->getMetaKeywords();
        $article->setMetaKeywords($keywords);

        $canonicalLink = $this->getCanonicalLink();
        $article->setCanonicalLink($canonicalLink);

        // Fall back to the configured language when the document does not declare a usable one
        $language = $this->getMetaLanguage();
        if (!$language) {
            $language = $this->config()->get('language');
        }
        $article->setLanguage($language);

        $this->config()->set('language', $article->getLanguage());

        return $this;
    }
40
41
    /**
     * Retrieve all OpenGraph meta data
     *
     * Collects every `og:*` meta property and, when an `og:type` is present,
     * the meta properties prefixed with that type (http://ogp.me/#types).
     * Keys are the property names without their leading namespace, e.g.
     * `og:site_name` is stored under `site_name`. Type-specific values
     * overwrite `og:` values that share the same key.
     *
     * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
     *
     * @return string[]
     */
    private function getOpenGraph(): array {
        $results = $this->getMetaByPropertyPrefix('og:');

        // Additionally retrieve type values based on provided og:type (http://ogp.me/#types).
        // The type is supplied by the page, so it is matched in PHP rather than being
        // concatenated into a CSS selector, where a quote could yield a malformed selector.
        if (isset($results['type'])) {
            // array_replace keeps existing key positions and lets the later values win,
            // exactly like assigning the type-specific entries one by one.
            $results = array_replace(
                $results,
                $this->getMetaByPropertyPrefix((string)$results['type'] . ':')
            );
        }

        return $results;
    }

    /**
     * Collect the content of every meta tag whose `property` attribute starts with a prefix.
     *
     * @param string $prefix Case-sensitive prefix ending with a colon, e.g. `og:`
     *
     * @return string[] Content values keyed by the property name after its first colon
     */
    private function getMetaByPropertyPrefix(string $prefix): array {
        $results = [];
        $prefixLength = strlen($prefix);

        foreach ($this->article()->getDoc()->find('meta[property]') as $node) {
            $property = (string)$node->attr('property');

            if (strncmp($property, $prefix, $prefixLength) !== 0) {
                continue;
            }

            // Strip the namespace: keep everything after the first colon
            $results[explode(':', $property, 2)[1]] = $node->attr('content');
        }

        return $results;
    }
72
73
    /**
     * Clean title text
     *
     * Strips the OpenGraph site name and the article's domain from the title,
     * then drops one leading and one trailing separator word (see
     * self::$SPLITTER_CHARS) that the removal may have left behind.
     *
     * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
     *
     * @param string $title Raw title text
     *
     * @return string Cleaned title, or an empty string when nothing is left
     */
    private function cleanTitle(string $title): string {
        $siteName = $this->article()->getOpenGraph()['site_name'] ?? null;

        // Remove the site name, unless the site name is the whole title
        if ($siteName !== null && $siteName != $title) {
            $title = str_replace($siteName, '', $title);
        }

        // Remove the domain (case-insensitively) when the article has one
        $domain = $this->article()->getDomain();
        if ($domain) {
            $title = str_ireplace($domain, '', $title);
        }

        // Break the title into whitespace-separated words, e.g.
        // "TechCrunch | my wonderful article" or "my wonderful article | TechCrunch"
        $words = preg_split('@[\s]+@', trim($title));

        if (empty($words)) {
            return '';
        }

        // Drop a trailing separator word
        if (in_array(end($words), self::$SPLITTER_CHARS)) {
            array_pop($words);
        }

        // Drop a leading separator word (the list may be empty after the pop above)
        if ($words && in_array(reset($words), self::$SPLITTER_CHARS)) {
            array_shift($words);
        }

        return trim(implode(' ', $words));
    }
122
123
    /**
     * Get article title
     *
     * Sources are tried in order: the OpenGraph title, a `headline` meta tag,
     * then the document's `<title>` element. A source that is missing or that
     * cleans down to an empty string is skipped, so an empty `og:title` no
     * longer hides a usable `<title>`.
     *
     * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
     *
     * @return string Cleaned title, or an empty string when no source yields one
     */
    private function getTitle(): string {
        $openGraph = $this->article()->getOpenGraph();
        $doc = $this->article()->getDoc();

        // Rely on OpenGraph in case we have the data
        if (isset($openGraph['title']) && is_string($openGraph['title'])) {
            $title = $this->cleanTitle($openGraph['title']);

            if ($title !== '') {
                return $title;
            }
        }

        // getMetaContent() only returns strings, which keeps cleanTitle()'s strict string parameter safe
        $headline = $this->getMetaContent($doc, 'name', 'headline');
        if ($headline !== '') {
            $title = $this->cleanTitle($headline);

            if ($title !== '') {
                return $title;
            }
        }

        $nodes = $doc->find('html > head > title');
        if ($nodes->count()) {
            return $this->cleanTitle(Helper::textNormalise($nodes->first()->text()));
        }

        return '';
    }
150
151
    /**
     * Find descendant elements whose attribute, lowercased (A-Z only), equals a value.
     *
     * The arguments are placed into the XPath expression verbatim, and only the
     * attribute side is lowercased, so $value has to be given in lowercase.
     *
     * @param Document $doc      Document to search
     * @param string   $tag      Element name to match
     * @param string   $property Attribute name to compare
     * @param string   $value    Expected lowercase attribute value
     *
     * @return NodeList
     */
    private function getNodesByLowercasePropertyValue(Document $doc, string $tag, string $property, string $value): NodeList {
        $upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
        $lower = 'abcdefghijklmnopqrstuvwxyz';

        $xpath = sprintf(
            "descendant::%s[translate(@%s, '%s', '%s')='%s']",
            $tag,
            $property,
            $upper,
            $lower,
            $value
        );

        return $doc->findXPath($xpath);
    }
162
163
    /**
     * Read an attribute from the first meta tag matching a property/value pair.
     *
     * @param Document $doc      Document to search
     * @param string   $property Attribute used to identify the meta tag (e.g. `name`)
     * @param string   $value    Lowercase value that attribute must have
     * @param string   $attr     Attribute to read from the matched tag
     *
     * @return string Trimmed attribute value, or an empty string when no tag
     *                matches or the attribute does not yield a string
     */
    private function getMetaContent(Document $doc, string $property, string $value, string $attr = 'content'): string {
        $matches = $this->getNodesByLowercasePropertyValue($doc, 'meta', $property, $value);

        $content = $matches->count() ? $matches->first()->attr($attr) : null;

        return is_string($content) ? trim($content) : '';
    }
186
187
    /**
     * If the article has meta language set in the source, use that
     *
     * Sources are tried in order: the `lang` attribute of `<html>`, then the
     * `content-language` and `lang` meta tags in `<head>`. The first value that
     * starts with a two-letter language code wins, so region-qualified tags
     * such as `en-US` or `pt_BR` and lists such as `en, fr` resolve to their
     * leading code instead of being discarded.
     *
     * @return string Lowercase two-letter language code, or an empty string when none is found
     */
    private function getMetaLanguage(): string {
        $doc = $this->article()->getDoc();

        // [selector, attribute holding the language]
        $sources = [
            ['html[lang]', 'lang'],
            ['html > head > meta[http-equiv=content-language]', 'content'],
            ['html > head > meta[name=lang]', 'content'],
        ];

        foreach ($sources as [$selector, $attribute]) {
            $el = $doc->find($selector);

            if (!$el->count()) {
                continue;
            }

            $lang = $el->first()->attr($attribute);

            // Exactly two letters, followed by the end of the value or a subtag/list separator;
            // longer alphabetic codes are rejected rather than truncated.
            if (is_string($lang) && preg_match('@^\s*([A-Za-z]{2})(?:[-_,;\s]|$)@', $lang, $match)) {
                return strtolower($match[1]);
            }
        }

        return '';
    }
223
224
    /**
     * If the article has meta description set in the source, use that
     *
     * Looks at the `description` meta tag first, then `og:description`, then
     * `twitter:description`, stopping at the first non-empty value.
     *
     * @return string Trimmed description; the value of the last source tried when none is non-empty
     */
    private function getMetaDescription(): string {
        $doc = $this->article()->getDoc();

        // [identifying attribute, expected value]
        $sources = [
            ['name', 'description'],
            ['property', 'og:description'],
            ['name', 'twitter:description'],
        ];

        $desc = '';

        foreach ($sources as [$property, $value]) {
            $desc = $this->getMetaContent($doc, $property, $value);

            if (!empty($desc)) {
                break;
            }
        }

        return trim($desc);
    }
242
243
    /**
     * If the article has meta keywords set in the source, use that
     *
     * @return string Trimmed content of the `keywords` meta tag, or an empty string when absent
     */
    private function getMetaKeywords(): string {
        $doc = $this->article()->getDoc();

        return $this->getMetaContent($doc, 'name', 'keywords');
    }
251
252
    /**
     * If the article has a canonical link set in the source, use that
     *
     * Sources are tried in order: `<link rel="canonical">`, the `og:url` meta
     * property, then the `twitter:url` meta tag. A matching tag whose value is
     * missing or blank is skipped, so it no longer produces an empty canonical
     * link. When no source yields a value, the article's final URL is returned.
     *
     * @return string|null
     */
    private function getCanonicalLink(): ?string {
        $doc = $this->article()->getDoc();

        // [tag, identifying attribute, expected value, attribute holding the URL]
        $sources = [
            ['link', 'rel', 'canonical', 'href'],
            ['meta', 'property', 'og:url', 'content'],
            ['meta', 'name', 'twitter:url', 'content'],
        ];

        foreach ($sources as [$tag, $property, $value, $attr]) {
            $nodes = $this->getNodesByLowercasePropertyValue($doc, $tag, $property, $value);

            if (!$nodes->count()) {
                continue;
            }

            $link = $nodes->first()->attr($attr);

            // Guard the type before trim(): strict_types would reject a non-string value
            if (is_string($link) && trim($link) !== '') {
                return trim($link);
            }
        }

        return $this->article()->getFinalUrl();
    }
278
}
279