1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Goose\Modules\Extractors; |
4
|
|
|
|
5
|
|
|
use Goose\Article; |
6
|
|
|
use Goose\Utils\Helper; |
7
|
|
|
use Goose\Traits\ArticleMutatorTrait; |
8
|
|
|
use Goose\Modules\AbstractModule; |
9
|
|
|
use Goose\Modules\ModuleInterface; |
10
|
|
|
use DOMWrap\Element; |
11
|
|
|
use DOMWrap\Document; |
12
|
|
|
|
13
|
|
|
/** |
14
|
|
|
* Meta Extractor
15
|
|
|
* |
16
|
|
|
* @package Goose\Modules\Extractors |
17
|
|
|
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0 |
18
|
|
|
*/ |
19
|
|
|
class MetaExtractor extends AbstractModule implements ModuleInterface { |
20
|
|
|
use ArticleMutatorTrait; |
21
|
|
|
|
22
|
|
|
/**
 * Separator tokens; cleanTitle() drops one when it is the first or last word of a title.
 *
 * @var string[]
 */
protected static $SPLITTER_CHARS = ['|', '-', '»', ':'];

/**
 * CSS selector getTags() uses to find tag anchors.
 *
 * @var string
 */
protected static $A_REL_TAG_SELECTOR = "a[rel='tag'], a[href*='/tag/']";

/**
 * Regular-expression fragments (dots escaped) that getVideos() matches
 * against the host of an embed/object/iframe source.
 *
 * @var string[]
 */
protected static $VIDEO_PROVIDERS = [
    // YouTube
    'youtube\.com',
    'youtu\.be',
    // Vimeo
    'vimeo\.com',
    // Blip
    'blip\.tv',
    // Dailymotion
    'dailymotion\.com',
    'dai\.ly',
    // Flickr
    'flickr\.com',
    'flic\.kr',
];
41
|
|
|
|
42
|
|
|
/**
 * Fill the given article with the meta information found in its document.
 *
 * The order of the calls matters: getTitle() reads the OpenGraph data back
 * from the article, and getPopularWords() reads the stored title and meta
 * description, so those values have to be set before they are consumed.
 *
 * @param Article $article
 */
public function run(Article $article) {
    $this->article($article);

    // OpenGraph first: the title lookup depends on it.
    $article->setOpenGraph($this->getOpenGraph());
    $article->setTitle($this->getTitle());
    $article->setMetaDescription($this->getMetaDescription());
    $article->setMetaKeywords($this->getMetaKeywords());
    $article->setCanonicalLink($this->getCanonicalLink());
    $article->setTags($this->getTags());

    // Videos, links and popular words are only extracted when a top node exists.
    $topNode = $this->article()->getTopNode();

    if ($topNode instanceof Element) {
        $article->setVideos($this->getVideos());
        $article->setLinks($this->getLinks());
        $article->setPopularWords($this->getPopularWords());
    }

    $article->setLanguage($this->getMetaLanguage());

    // Publish the detected language to the shared configuration.
    $this->config()->set('language', $article->getLanguage());
}
65
|
|
|
|
66
|
|
|
/**
 * Retrieve all OpenGraph meta data
 *
 * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
 *
 * Keys are the property names without their leading namespace, e.g.
 * "og:title" is stored as "title" and "og:image:width" as "image:width".
 * Properties of the namespace announced by og:type (http://ogp.me/#types)
 * are merged in afterwards and win over og: values with the same key.
 *
 * @return string[]
 */
private function getOpenGraph() {
    $results = $this->collectMetaProperties('og');

    // Additionally retrieve type values based on provided og:type (http://ogp.me/#types)
    if (isset($results['type'])) {
        // Dotted types such as "video.movie" publish their properties under
        // the vertical's namespace ("video:"), so only the part before the
        // first dot is used as prefix.
        $typeParts = explode('.', trim($results['type']), 2);
        $namespace = $typeParts[0];

        // The value comes from the page, so only a plain identifier may be
        // interpolated into the CSS selector.
        if (preg_match('@^[A-Za-z0-9_-]+$@', $namespace)) {
            foreach ($this->collectMetaProperties($namespace) as $key => $content) {
                $results[$key] = $content;
            }
        }
    }

    return $results;
}

/**
 * Collect the content of every meta element whose property starts with "<namespace>:".
 *
 * @param string $namespace Property prefix without the trailing colon; must be safe to embed in a CSS selector
 *
 * @return string[] Content values keyed by the property name with the prefix removed
 */
private function collectMetaProperties($namespace) {
    $values = [];

    $nodes = $this->article()->getDoc()->find('meta[property^="' . $namespace . ':"]');

    foreach ($nodes as $node) {
        // Split on the first colon only: structured properties such as
        // og:image:width must not overwrite their parent (og:image).
        $parts = explode(':', $node->attr('property'), 2);

        if (isset($parts[1]) && $parts[1] !== '') {
            $values[$parts[1]] = $node->attr('content');
        }
    }

    return $values;
}
97
|
|
|
|
98
|
|
|
/**
 * Clean title text
 *
 * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
 *
 * Removes the OpenGraph site name and the article's domain from the title,
 * then strips any separator words (self::$SPLITTER_CHARS) left dangling at
 * either end, e.g. "TechCrunch | my article" becomes "my article".
 *
 * @param string $title
 *
 * @return string Cleaned title, or an empty string when nothing is left
 */
private function cleanTitle($title) {
    $openGraph = $this->article()->getOpenGraph();

    // Check if we have the site name in OpenGraph data
    if (isset($openGraph['site_name'])) {
        $title = str_replace($openGraph['site_name'], '', $title);
    }

    // Try to remove the domain from URL
    $domain = $this->article()->getDomain();

    if ($domain) {
        $title = str_ireplace($domain, '', $title);
    }

    // Split the title in words
    // TechCrunch | my wonderfull article
    // my wonderfull article | TechCrunch
    // PREG_SPLIT_NO_EMPTY makes an empty title yield an empty list, so the
    // guard below actually triggers (without it the result would be ['']).
    $titleWords = preg_split('@[\s]+@', trim((string)$title), -1, PREG_SPLIT_NO_EMPTY);

    // Check for an empty title (also covers a failed split)
    if (empty($titleWords)) {
        return '';
    }

    // Drop every trailing separator word. More than one can remain when
    // both the site name and the domain were removed ("title - | ").
    while ($titleWords && in_array(end($titleWords), self::$SPLITTER_CHARS, true)) {
        array_pop($titleWords);
    }

    // Same for leading separator words.
    while ($titleWords && in_array(reset($titleWords), self::$SPLITTER_CHARS, true)) {
        array_shift($titleWords);
    }

    // Rebuild the title
    return trim(implode(' ', $titleWords));
}
147
|
|
|
|
148
|
|
|
/**
 * Get article title
 *
 * Ported from python-goose https://github.com/grangier/python-goose/ by Xavier Grangier
 *
 * Sources are tried in this order and the first one that provides a
 * non-blank value is cleaned and returned:
 *  1. the OpenGraph title stored on the article,
 *  2. a meta element named "headline",
 *  3. the document's title element.
 *
 * @return string Cleaned title, or an empty string when no source provides one
 */
private function getTitle() {
    $openGraph = $this->article()->getOpenGraph();

    // Rely on OpenGraph in case we have the data. A blank og:title must not
    // hide the headline / title element, so it is skipped.
    if (isset($openGraph['title']) && trim((string)$openGraph['title']) !== '') {
        return $this->cleanTitle($openGraph['title']);
    }

    $doc = $this->article()->getDoc();

    $nodes = $this->getNodesByLowercasePropertyValue($doc, 'meta', 'name', 'headline');
    if ($nodes->count()) {
        $headline = (string)$nodes->first()->attr('content');

        // Same rule as above: an empty headline falls through to the title element.
        if (trim($headline) !== '') {
            return $this->cleanTitle($headline);
        }
    }

    $nodes = $doc->find('html > head > title');
    if ($nodes->count()) {
        return $this->cleanTitle(Helper::textNormalise($nodes->first()->text()));
    }

    return '';
}
175
|
|
|
|
176
|
|
|
/**
 * Find descendant elements whose attribute, once its ASCII letters are
 * lower-cased, equals the given value.
 *
 * The arguments are concatenated into the XPath expression as-is, so they
 * must not contain quotes or other XPath syntax.
 *
 * @param Document $doc Document to search
 * @param string $tag Element name to match
 * @param string $property Attribute name to compare
 * @param string $value Expected attribute value, in lower case
 *
 * @return \DOMWrap\NodeList
 */
private function getNodesByLowercasePropertyValue(Document $doc, $tag, $property, $value) {
    // XPath 1.0 has no lower-case(); translate() maps A-Z onto a-z instead.
    $predicate = "[translate(@" . $property . ", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')='" . $value . "']";
    $expression = "descendant::" . $tag . $predicate;

    return $doc->findXPath($expression);
}
187
|
|
|
|
188
|
|
|
/**
 * Read an attribute from the first meta element whose $property attribute
 * matches $value (compared in lower case).
 *
 * @param Document $doc Document to search
 * @param string $property Attribute used to identify the meta element (e.g. "name")
 * @param string $value Lower-case value that attribute must have
 * @param string $attr Attribute to read from the matching element
 *
 * @return string Trimmed attribute value, or an empty string when no element matches
 */
private function getMetaContent(Document $doc, $property, $value, $attr = 'content') {
    $matches = $this->getNodesByLowercasePropertyValue($doc, 'meta', $property, $value);

    return $matches->count()
        ? trim($matches->first()->attr($attr))
        : '';
}
208
|
|
|
|
209
|
|
|
/**
 * If the article has meta language set in the source, use that
 *
 * The lang attribute of the html element is preferred; when it is missing
 * or empty the content-language / lang meta elements are consulted. From
 * the value found, only the two-letter primary language is returned, so
 * "en", "en-US", "pt_BR" and "de, en" yield "en", "en", "pt" and "de".
 *
 * @return string Lower-case two-letter language code, or an empty string when none is found
 */
private function getMetaLanguage() {
    $doc = $this->article()->getDoc();
    $lang = '';

    $el = $doc->find('html[lang]');

    if ($el->count()) {
        $lang = $el->first()->attr('lang');
    }

    if (empty($lang)) {
        $selectors = [
            'html > head > meta[http-equiv=content-language]',
            'html > head > meta[name=lang]',
        ];

        foreach ($selectors as $selector) {
            $el = $doc->find($selector);

            if ($el->count()) {
                $lang = $el->first()->attr('content');
                break;
            }
        }
    }

    // A content-language value may list several languages; use the first.
    $candidates = explode(',', (string)$lang);
    $lang = trim($candidates[0]);

    // Accept a two-letter primary subtag, optionally followed by region,
    // script or variant subtags separated by "-" or "_".
    if (preg_match('@^([A-Za-z]{2})(?:[-_][A-Za-z0-9]{1,8})*$@', $lang, $matches)) {
        return strtolower($matches[1]);
    }

    return '';
}
245
|
|
|
|
246
|
|
|
/**
 * Look up the article description in the document's meta elements.
 *
 * The plain description, the OpenGraph description and the Twitter
 * description are tried in that order; the first non-empty value wins.
 *
 * @return string
 */
private function getMetaDescription() {
    // [identifying attribute, expected value] pairs, in order of preference
    $lookups = [
        ['name', 'description'],
        ['property', 'og:description'],
        ['name', 'twitter:description'],
    ];

    $desc = '';

    foreach ($lookups as $lookup) {
        $desc = $this->getMetaContent($this->article()->getDoc(), $lookup[0], $lookup[1]);

        if (!empty($desc)) {
            break;
        }
    }

    return trim($desc);
}
264
|
|
|
|
265
|
|
|
/**
 * Read the content of the document's "keywords" meta element.
 *
 * @return string Trimmed keywords value, or an empty string when the element is absent
 */
private function getMetaKeywords() {
    $doc = $this->article()->getDoc();

    return $this->getMetaContent($doc, 'name', 'keywords');
}
273
|
|
|
|
274
|
|
|
/**
 * Determine the canonical link of the article.
 *
 * Sources are tried in this order and the first one that provides a
 * non-blank URL is returned:
 *  1. link element with rel="canonical" (href),
 *  2. meta element with property="og:url" (content),
 *  3. meta element with name="twitter:url" (content).
 * When none of them yields a value, the article's final URL is used.
 *
 * @return string
 */
private function getCanonicalLink() {
    $doc = $this->article()->getDoc();

    // [tag, identifying attribute, expected value, attribute holding the URL]
    $candidates = [
        ['link', 'rel', 'canonical', 'href'],
        ['meta', 'property', 'og:url', 'content'],
        ['meta', 'name', 'twitter:url', 'content'],
    ];

    foreach ($candidates as $candidate) {
        list($tag, $property, $value, $attr) = $candidate;

        $nodes = $this->getNodesByLowercasePropertyValue($doc, $tag, $property, $value);

        if (!$nodes->count()) {
            continue;
        }

        $url = trim($nodes->first()->attr($attr));

        // An element that is present but blank must not shadow the
        // remaining sources or the final URL.
        if ($url !== '') {
            return $url;
        }
    }

    return $this->article()->getFinalUrl();
}
300
|
|
|
|
301
|
|
|
/**
 * Collect the normalised text of every anchor matched by self::$A_REL_TAG_SELECTOR.
 *
 * @return string[] One entry per matching anchor, in document order
 */
private function getTags() {
    $tags = [];

    foreach ($this->article()->getDoc()->find(self::$A_REL_TAG_SELECTOR) as $anchor) {
        array_push($tags, Helper::textNormalise($anchor->text()));
    }

    return $tags;
}
315
|
|
|
|
316
|
|
|
/**
 * Pulls out videos we like
 *
 * Looks at every embed, object and iframe element below the parent of the
 * article's top node and keeps the source URL (src attribute, or data when
 * src is absent) of those hosted by one of self::$VIDEO_PROVIDERS.
 *
 * @return string[] Source URLs of the matching elements
 */
private function getVideos() {
    $videos = [];

    $nodes = $this->article()->getTopNode()->parent()->find('embed, object, iframe');

    foreach ($nodes as $node) {
        $src = $node->hasAttribute('src') ? $node->attr('src') : $node->attr('data');

        if ($this->isVideoProviderUrl($src)) {
            $videos[] = $src;
        }
    }

    return $videos;
}

/**
 * Tell whether a URL is an http(s) URL on a known video provider's domain.
 *
 * @param string $src URL to inspect
 *
 * @return bool
 */
private function isVideoProviderUrl($src) {
    $parts = parse_url((string)$src);

    // parse_url() yields false for seriously malformed URLs, and relative
    // or scheme-less URLs lack the components needed below.
    if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
        return false;
    }

    // URL schemes are case-insensitive.
    if (!in_array(strtolower($parts['scheme']), ['http', 'https'], true)) {
        return false;
    }

    foreach (self::$VIDEO_PROVIDERS as $domain) {
        // Anchor on a label boundary: the host must be the provider domain
        // itself or a subdomain of it, so "notyoutube.com" does not pass.
        if (preg_match('@(^|\.)' . $domain . '$@i', $parts['host'])) {
            return true;
        }
    }

    return false;
}
347
|
|
|
|
348
|
|
|
/**
 * Collect the links found below the parent of the article's top node.
 *
 * Anchors whose href is "#" or blank are skipped.
 *
 * @return array[] One ['url' => href, 'text' => normalised anchor text] entry per kept anchor
 */
private function getLinks() {
    $anchors = $this->article()->getTopNode()->parent()->find('a[href]');

    $goodLinks = [];

    foreach ($anchors as $anchor) {
        $href = $anchor->attr('href');

        // Ignore placeholder and empty targets
        if ($href == '#' || trim($href) == '') {
            continue;
        }

        $goodLinks[] = [
            'url' => $href,
            'text' => Helper::textNormalise($anchor->text()),
        ];
    }

    return $goodLinks;
}
369
|
|
|
|
370
|
|
|
/**
 * Determine the most frequent words of the article.
 *
 * The title, the meta description and (when a top node exists) the cleaned
 * article text are split on white-space, the current stop words are
 * removed and the remaining words are counted.
 *
 * @return int[] Number of occurrences keyed by word, most frequent first, at most five entries
 */
private function getPopularWords() {
    $limit = 5;
    $minimumFrequency = 1;
    $stopWords = $this->config()->getStopWords()->getCurrentStopWords();

    $text = $this->article()->getTitle();
    $text .= ' ' . $this->article()->getMetaDescription();

    if ($this->article()->getTopNode()) {
        $text .= ' ' . $this->article()->getCleanedArticleText();
    }

    // Decode and split words by white-space
    $text = html_entity_decode($text, ENT_COMPAT | ENT_HTML5, 'UTF-8');
    $words = preg_split('@[\s]+@iu', $text, -1, PREG_SPLIT_NO_EMPTY);

    if ($words === false) {
        // The /u modifier makes preg_split() fail on invalid UTF-8;
        // fall back to a byte-wise split instead of passing false on.
        $words = preg_split('@[\s]+@', $text, -1, PREG_SPLIT_NO_EMPTY);
    }

    // Remove stop words from $words
    $words = array_diff($words, $stopWords);

    // Count and sort $words
    $words = array_count_values($words);
    arsort($words);

    // Limit and filter $words. Keys are preserved explicitly: numeric words
    // such as "2015" are stored under integer keys, which array_slice()
    // would otherwise renumber to 0, 1, ...
    $words = array_slice($words, 0, $limit, true);
    $words = array_filter($words, function($count) use ($minimumFrequency) {
        return $count >= $minimumFrequency;
    });

    return $words;
}
406
|
|
|
} |
407
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.