1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Goose\Modules\Extractors; |
4
|
|
|
|
5
|
|
|
use Goose\Article; |
6
|
|
|
use Goose\Utils\Helper; |
7
|
|
|
use Goose\Traits\ArticleMutatorTrait; |
8
|
|
|
use Goose\Modules\AbstractModule; |
9
|
|
|
use Goose\Modules\ModuleInterface; |
10
|
|
|
use DOMWrap\Document; |
11
|
|
|
|
12
|
|
|
/** |
13
|
|
|
* Meta Extractor |
14
|
|
|
* |
15
|
|
|
* @package Goose\Modules\Extractors |
16
|
|
|
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0 |
17
|
|
|
*/ |
18
|
|
|
class MetaExtractor extends AbstractModule implements ModuleInterface { |
19
|
|
|
use ArticleMutatorTrait; |
20
|
|
|
|
21
|
|
|
/**
 * Separator tokens that cleanTitle() strips when one is the first or last
 * word of a title.
 *
 * @var string[]
 */
protected static $SPLITTER_CHARS = [
    '|',
    '-',
    '»',
    ':',
];
25
|
|
|
|
26
|
|
|
/**
 * Extract the meta information of an article and store each value on it.
 *
 * The detected language is also written to the module configuration under
 * the 'language' key.
 *
 * @param Article $article
 */
public function run(Article $article) {
    $this->article($article);

    // OpenGraph data goes first: getTitle() reads it back via
    // $this->article()->getOpenGraph() (presumably the article stored above).
    $openGraph = $this->getOpenGraph();
    $article->setOpenGraph($openGraph);

    $title = $this->getTitle();
    $article->setTitle($title);

    $description = $this->getMetaDescription();
    $article->setMetaDescription($description);

    $keywords = $this->getMetaKeywords();
    $article->setMetaKeywords($keywords);

    $canonicalLink = $this->getCanonicalLink();
    $article->setCanonicalLink($canonicalLink);

    $language = $this->getMetaLanguage();
    $article->setLanguage($language);

    // The value is read back from the article rather than reusing $language.
    $storedLanguage = $article->getLanguage();
    $this->config()->set('language', $storedLanguage);
}
42
|
|
|
|
43
|
|
|
/**
 * Retrieve all OpenGraph meta data
 *
 * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
 *
 * Each value is keyed by its property name with the leading prefix removed:
 * "og:title" is stored as "title", and a structured property such as
 * "og:image:width" is stored as "image:width" so that it no longer
 * overwrites the "image" entry.
 *
 * @return string[]
 */
private function getOpenGraph() {
    $results = $this->collectMetaProperties('og');

    // Additionally retrieve type values based on provided og:type (http://ogp.me/#types)
    // The type comes from the page markup, so it is only placed into the
    // selector when it is a plain token that cannot break out of the quotes.
    if (isset($results['type']) && preg_match('@^[A-Za-z0-9_.:-]+$@', $results['type'])) {
        $results = $this->collectMetaProperties($results['type'], $results);
    }

    return $results;
}

/**
 * Add the content of every meta element whose property starts with
 * "<prefix>:" to the given result set.
 *
 * @param string   $prefix  Property prefix without the trailing colon
 * @param string[] $results Values collected so far; entries with the same key are replaced
 *
 * @return string[] The result set, keyed by property name minus "<prefix>:"
 */
private function collectMetaProperties($prefix, array $results = []) {
    $nodes = $this->article()->getDoc()->find('meta[property^="' . $prefix . ':"]');

    // Number of leading characters to drop: the prefix plus its colon
    $offset = strlen($prefix) + 1;

    foreach ($nodes as $node) {
        $key = (string)substr((string)$node->attr('property'), $offset);

        // A property that is nothing but the prefix has no usable name
        if ($key === '') {
            continue;
        }

        $results[$key] = $node->attr('content');
    }

    return $results;
}
74
|
|
|
|
75
|
|
|
/**
 * Clean title text
 *
 * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
 *
 * Removes the OpenGraph site name and the article domain from the title,
 * then drops one leading and one trailing separator word (see
 * self::$SPLITTER_CHARS) that the removal may have left behind.
 *
 * @param string $title
 *
 * @return string The cleaned title, or an empty string when nothing is left
 */
private function cleanTitle($title) {
    // Callers pass attribute values straight through; normalise a missing one
    $title = (string)$title;

    $openGraph = $this->article()->getOpenGraph();

    // Check if we have the site name in OpenGraph data
    if (isset($openGraph['site_name'])) {
        $title = str_replace($openGraph['site_name'], '', $title);
    }

    // Try to remove the domain from URL
    if ($this->article()->getDomain()) {
        $title = str_ireplace($this->article()->getDomain(), '', $title);
    }

    // Split the title in words
    // TechCrunch | my wonderfull article
    // my wonderfull article | TechCrunch
    $title = trim($title);

    // The "u" modifier splits on characters rather than bytes, so multibyte
    // text is never cut in the middle of a character. PREG_SPLIT_NO_EMPTY
    // makes an empty title yield an empty array.
    $titleWords = preg_split('@\s+@u', $title, -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false in UTF-8 mode when the subject is not valid
    // UTF-8; fall back to the byte-wise split in that case.
    if ($titleWords === false) {
        $titleWords = preg_split('@\s+@', $title, -1, PREG_SPLIT_NO_EMPTY);
    }

    // Check for an empty title
    if (empty($titleWords)) {
        return '';
    }

    // Check if last word is in self::$SPLITTER_CHARS
    // if so remove it
    if (in_array($titleWords[count($titleWords) - 1], self::$SPLITTER_CHARS, true)) {
        array_pop($titleWords);
    }

    // Check if first word is in self::$SPLITTER_CHARS
    // if so remove it
    if (isset($titleWords[0]) && in_array($titleWords[0], self::$SPLITTER_CHARS, true)) {
        array_shift($titleWords);
    }

    // Rebuild the title
    return trim(implode(' ', $titleWords));
}
124
|
|
|
|
125
|
|
|
/**
 * Get article title
 *
 * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
 *
 * Sources are tried in this order and the first one present is used:
 * the OpenGraph title, a meta element named "headline", the document's
 * title element. The chosen value is passed through cleanTitle().
 *
 * @return string The cleaned title, or an empty string when no source exists
 */
private function getTitle() {
    $og = $this->article()->getOpenGraph();

    // Rely on OpenGraph in case we have the data
    if (isset($og['title'])) {
        return $this->cleanTitle($og['title']);
    }

    $document = $this->article()->getDoc();

    $headlineNodes = $this->getNodesByLowercasePropertyValue($document, 'meta', 'name', 'headline');
    if ($headlineNodes->count()) {
        $headline = $headlineNodes->first()->attr('content');

        return $this->cleanTitle($headline);
    }

    $titleNodes = $document->find('html > head > title');
    if (!$titleNodes->count()) {
        return '';
    }

    $normalised = Helper::textNormalise($titleNodes->first()->text());

    return $this->cleanTitle($normalised);
}
152
|
|
|
|
153
|
|
|
/**
 * Find descendant elements whose attribute, once its ASCII letters are
 * lower-cased, equals the given value.
 *
 * The value itself is compared as given, so callers pass it in lower case.
 *
 * @param Document $doc
 * @param string $tag Element name to look for
 * @param string $property Attribute whose value is compared
 * @param string $value Expected attribute value
 *
 * @return \DOMWrap\NodeList
 */
private function getNodesByLowercasePropertyValue(Document $doc, $tag, $property, $value) {
    // translate() maps A-Z onto a-z inside the XPath expression
    $expression = "descendant::{$tag}[translate(@{$property}, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')='{$value}']";

    return $doc->findXPath($expression);
}
164
|
|
|
|
165
|
|
|
/**
 * Read an attribute of the first meta element matching an attribute/value pair.
 *
 * @param Document $doc
 * @param string $property Attribute used to select the meta element (for example "name")
 * @param string $value Value that attribute must have, compared case-insensitively
 * @param string $attr Attribute to read from the matched element
 *
 * @return string The trimmed attribute value, or an empty string when nothing matches
 */
private function getMetaContent(Document $doc, $property, $value, $attr = 'content') {
    $matches = $this->getNodesByLowercasePropertyValue($doc, 'meta', $property, $value);

    if ($matches->count()) {
        return trim($matches->first()->attr($attr));
    }

    return '';
}
185
|
|
|
|
186
|
|
|
/**
 * If the article has meta language set in the source, use that
 *
 * The lang attribute of the html element is checked first, then the
 * content-language and lang meta elements. A bare two-letter code ("en") is
 * accepted, as is a longer tag whose primary subtag has two letters
 * ("en-US", "pt_BR", "en, fr"); only the two-letter part is returned.
 *
 * @return string Lower-case two-letter language code, or an empty string
 */
private function getMetaLanguage() {
    $doc = $this->article()->getDoc();
    $lang = '';

    $el = $doc->find('html[lang]');

    if ($el->count()) {
        $lang = trim((string)$el->first()->attr('lang'));
    }

    if (empty($lang)) {
        $selectors = [
            'html > head > meta[http-equiv=content-language]',
            'html > head > meta[name=lang]',
        ];

        foreach ($selectors as $selector) {
            $el = $doc->find($selector);

            if (!$el->count()) {
                continue;
            }

            $lang = trim((string)$el->first()->attr('content'));

            // An element without a usable value must not stop the search
            if (!empty($lang)) {
                break;
            }
        }
    }

    // Two letters followed by the end of the value or by a subtag/list
    // separator; three-letter primary subtags are deliberately not truncated.
    if (preg_match('@^([A-Za-z]{2})(?:[-_,;\s]|$)@', $lang, $matches)) {
        return strtolower($matches[1]);
    }

    return '';
}
222
|
|
|
|
223
|
|
|
/**
 * If the article has meta description set in the source, use that
 *
 * The description, og:description and twitter:description meta elements are
 * consulted in that order; the first non-empty value is used.
 *
 * @return string
 */
private function getMetaDescription() {
    // Attribute/value pairs identifying each candidate meta element
    $lookups = [
        ['name', 'description'],
        ['property', 'og:description'],
        ['name', 'twitter:description'],
    ];

    $desc = '';

    foreach ($lookups as $lookup) {
        list($property, $value) = $lookup;

        $desc = $this->getMetaContent($this->article()->getDoc(), $property, $value);

        if (!empty($desc)) {
            break;
        }
    }

    return trim($desc);
}
241
|
|
|
|
242
|
|
|
/**
 * If the article has meta keywords set in the source, use that
 *
 * @return string Content of the keywords meta element, or an empty string when there is none
 */
private function getMetaKeywords() {
    $document = $this->article()->getDoc();
    $keywords = $this->getMetaContent($document, 'name', 'keywords');

    return $keywords;
}
250
|
|
|
|
251
|
|
|
/**
 * If the article has meta canonical link set in the url
 *
 * The canonical link element, the og:url meta element and the twitter:url
 * meta element are checked in that order. The first one present supplies
 * the link; when none exists the article's final URL is returned.
 *
 * @return string
 */
private function getCanonicalLink() {
    // Each entry: element name, selecting attribute, expected value, attribute holding the URL
    $sources = [
        ['link', 'rel', 'canonical', 'href'],
        ['meta', 'property', 'og:url', 'content'],
        ['meta', 'name', 'twitter:url', 'content'],
    ];

    foreach ($sources as $source) {
        list($tag, $property, $value, $attribute) = $source;

        $matches = $this->getNodesByLowercasePropertyValue($this->article()->getDoc(), $tag, $property, $value);

        if ($matches->count()) {
            return trim($matches->first()->attr($attribute));
        }
    }

    return $this->article()->getFinalUrl();
}
277
|
|
|
} |
278
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.