1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace andreskrey\Readability; |
4
|
|
|
|
5
|
|
|
use andreskrey\Readability\Nodes\DOM\DOMDocument; |
6
|
|
|
use andreskrey\Readability\Nodes\DOM\DOMElement; |
7
|
|
|
use andreskrey\Readability\Nodes\DOM\DOMNode; |
8
|
|
|
use andreskrey\Readability\Nodes\DOM\DOMText; |
9
|
|
|
use andreskrey\Readability\Nodes\NodeUtility; |
10
|
|
|
use Psr\Log\LoggerInterface; |
11
|
|
|
|
12
|
|
|
/** |
13
|
|
|
* Class Readability. |
14
|
|
|
*/ |
15
|
|
|
class Readability |
16
|
|
|
{ |
17
|
|
|
/**
 * Main DOMDocument where all the magic happens.
 *
 * Assigned by parse() from loadHTML(), and replaced by a fresh load of the same HTML
 * every time parse() retries with a flag disabled.
 *
 * @var DOMDocument
 */
protected $dom;

/**
 * Title of the article.
 *
 * Set by getMetadata(), either from a title meta tag or from getArticleTitle().
 *
 * @var string|null
 */
protected $title = null;

/**
 * Final DOMDocument with the fully parsed HTML.
 *
 * Set at the end of parse() with the post-processed result.
 *
 * @var DOMDocument|null
 */
protected $content = null;

/**
 * Excerpt of the article.
 *
 * Set by getMetadata() from a description meta tag; parse() falls back to the text
 * of the first paragraph of the result when the metadata has none.
 *
 * @var string|null
 */
protected $excerpt = null;

/**
 * Main image of the article.
 *
 * Set by getMetadata() from an image meta tag and possibly updated by getMainImage().
 *
 * @var string|null
 */
protected $image = null;

/**
 * Author of the article. Extracted from the byline tags and other social media properties.
 *
 * Set by getMetadata() from creator/author meta tags, or by checkByline() while nodes are scanned.
 *
 * @var string|null
 */
protected $author = null;

/**
 * Website name.
 *
 * Set by getMetadata() from the og:site_name meta tag.
 *
 * @var string|null
 */
protected $siteName = null;

/**
 * Direction of the text.
 *
 * NOTE(review): never assigned in the part of the file visible here - confirm where it is set.
 *
 * @var string|null
 */
protected $direction = null;

/**
 * Configuration object.
 *
 * Injected through the constructor. parse() switches some of its flags off while retrying.
 *
 * @var Configuration
 */
private $configuration;

/**
 * Logger object.
 *
 * Taken from the configuration in the constructor.
 *
 * @var LoggerInterface
 */
private $logger;

/**
 * Collection of attempted text extractions.
 *
 * parse() appends one entry per attempt that did not reach the character threshold,
 * shaped as ['articleContent' => <result>, 'textLength' => <int>].
 *
 * @var array
 */
private $attempts = [];

/**
 * Lowercase tag names that getNodes() adds to the list of elements to score.
 *
 * @var array
 */
private $defaultTagsToScore = [
    'section',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'p',
    'td',
    'pre',
];

/**
 * Tag names treated as exceptions when altering nodes to DIVs.
 *
 * NOTE(review): not referenced in the part of the file visible here; the description is
 * inferred from the property name - confirm against the code that reads it.
 *
 * @var array
 */
private $alterToDIVExceptions = [
    'div',
    'article',
    'section',
    'p',
];
118
|
|
|
|
119
|
|
|
/**
 * Builds a Readability instance around the given configuration.
 *
 * The logger is read from the configuration, so both are always in sync.
 *
 * @param Configuration $configuration Settings (and logger provider) used by every parsing step
 */
public function __construct(Configuration $configuration)
{
    $this->logger = $configuration->getLogger();
    $this->configuration = $configuration;
}
129
|
|
|
|
130
|
|
|
/**
 * Main parse function.
 *
 * Loads the HTML, collects metadata and the main image, then extracts and rates the
 * candidate nodes. When the extracted text is shorter than the configured character
 * threshold, the HTML is reloaded and the extraction is retried with one more flag
 * disabled each time (StripUnlikelyCandidates, then WeightClasses, then
 * CleanConditionally). If every retry falls short, the longest attempt is used.
 *
 * Note that the retries switch those flags off on the configuration object itself and
 * do not switch them back on.
 *
 * @param string $html Raw HTML of the page
 *
 * @throws ParseException When there is no body tag, the body is empty, or no attempt produced any text
 *
 * @return bool Always true; failures are signalled through ParseException
 */
public function parse($html)
{
    $this->logger->info('*** Starting parse process...');

    // Attempts recorded while parsing a previous document must never be offered as a
    // fallback for this one.
    $this->attempts = [];

    $this->dom = $this->loadHTML($html);

    // Checking for minimum HTML to work with.
    if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
        $this->logger->emergency('No body tag present or body tag empty');

        throw new ParseException('Invalid or incomplete HTML.');
    }

    $this->getMetadata();

    $this->getMainImage();

    while (true) {
        $root = $root->firstChild;

        $elementsToScore = $this->getNodes($root);
        $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));

        $result = $this->rateNodes($elementsToScore);

        /*
         * Now that we've gone through the full algorithm, check to see if
         * we got any meaningful content. If we didn't, we may need to re-run
         * the extraction with different flags set. This gives us a higher likelihood of
         * finding the content, and the sieve approach gives us a higher likelihood of
         * finding the -right- content.
         */

        // Character count of the extracted text once the 'onlyWhitespace' pattern has been stripped.
        $length = mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent));

        $this->logger->info(sprintf('[Parsing] Article parsed. Amount of characters: %s. Current threshold is: %s', $length, $this->configuration->getCharThreshold()));

        if (!$result || $length >= $this->configuration->getCharThreshold()) {
            break;
        }

        // Threshold not met: start over from a pristine DOM and remember this attempt.
        $this->dom = $this->loadHTML($html);
        $root = $this->dom->getElementsByTagName('body')->item(0);
        $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];

        if ($this->configuration->getStripUnlikelyCandidates()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
            $this->configuration->setStripUnlikelyCandidates(false);
        } elseif ($this->configuration->getWeightClasses()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
            $this->configuration->setWeightClasses(false);
        } elseif ($this->configuration->getCleanConditionally()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
            $this->configuration->setCleanConditionally(false);
        } else {
            $this->logger->debug('[Parsing] Threshold not met, searching across attempts for some content.');

            // No luck after removing flags, just return the longest text we found during the different loops.
            // The comparator must return an integer; longest attempt first.
            usort($this->attempts, function ($a, $b) {
                return $b['textLength'] - $a['textLength'];
            });

            // But first check if we actually have something
            if (!$this->attempts[0]['textLength']) {
                $this->logger->emergency('[Parsing] Could not parse text, giving up :(');

                throw new ParseException('Could not parse text.');
            }

            $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');

            $result = $this->attempts[0]['articleContent'];
            break;
        }
    }

    $result = $this->postProcessContent($result);

    // If we haven't found an excerpt in the article's metadata, use the article's
    // first paragraph as the excerpt. This can be used for displaying a preview of
    // the article's content.
    if (!$this->getExcerpt()) {
        $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
        $paragraphs = $result->getElementsByTagName('p');
        if ($paragraphs->length > 0) {
            $this->setExcerpt(trim($paragraphs->item(0)->textContent));
        }
    }

    $this->setContent($result);

    $this->logger->info('*** Parse successful :)');

    return true;
}
237
|
|
|
|
238
|
|
|
/**
 * Creates a DOM Document object and loads the provided HTML on it.
 *
 * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
 * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
 * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
 * objects and ruining the backup.
 *
 * Besides loading, this method removes script/noscript nodes and runs prepDocument() on the new document.
 *
 * @param string $html
 *
 * @return DOMDocument
 */
private function loadHTML($html)
{
    $this->logger->debug('[Loading] Loading HTML...');

    // To avoid throwing a gazillion of errors on malformed HTMLs
    libxml_use_internal_errors(true);

    $dom = new DOMDocument('1.0', 'utf-8');

    if (!$this->configuration->getSubstituteEntities()) {
        // Keep the original HTML entities
        $dom->substituteEntities = false;
    }

    if ($this->configuration->getNormalizeEntities()) {
        $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
        // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
        // NOTE(review): the 'HTML-ENTITIES' target is deprecated as of PHP 8.2 - plan a replacement.
        $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
    }

    if ($this->configuration->getSummonCthulhu()) {
        $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
        // Case-insensitive so <SCRIPT> blocks are stripped as well.
        $withoutScripts = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/i', '', $html);

        if ($withoutScripts !== null) {
            $html = $withoutScripts;
        } else {
            // preg_replace() returns null when PCRE fails (e.g. backtrack limit on huge pages).
            // Keep the untouched HTML instead of loading an empty document.
            $this->logger->warning('[Loading] Could not remove script tags via regex, keeping the original HTML.');
        }
    }

    // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
    $dom->loadHTML('<?xml encoding="UTF-8">'.$html);
    $dom->encoding = 'UTF-8';

    // The collected parse errors are never inspected here; drop them so the libxml error
    // buffer does not keep growing across loads.
    libxml_clear_errors();

    $this->removeScripts($dom);

    $this->prepDocument($dom);

    $this->logger->debug('[Loading] Loaded HTML successfully.');

    return $dom;
}
287
|
|
|
|
288
|
|
|
/**
 * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties.
 *
 * Reads every meta tag, keeps the content of the ones whose "property" or "name" attribute looks
 * like a known metadata key, and then sets title, author, excerpt, image and site name from the
 * first key found in a fixed priority list. When no title is found in the metadata the title is
 * resolved through getArticleTitle().
 */
private function getMetadata()
{
    $this->logger->debug('[Metadata] Retrieving metadata...');

    $values = [];
    // property is a space-separated list of values
    $propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image|site_name)(?!:)\s*/i';

    // name is a single value
    $namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)(?!:)\s*$/i';

    // Find description tags.
    foreach ($this->dom->getElementsByTagName('meta') as $meta) {
        /* @var DOMNode $meta */
        $elementName = $meta->getAttribute('name');
        $elementProperty = $meta->getAttribute('property');
        $content = $meta->getAttribute('content');
        $propertyMatched = false;

        if ($elementProperty && preg_match_all($propertyPattern, $elementProperty, $matches)) {
            $propertyMatched = true;

            // Only the full matches (index 0) are metadata keys. The capture groups hold
            // fragments such as "og" or "title" and must not be stored as keys of their own.
            foreach ($matches[0] as $property) {
                // Convert to lowercase, and remove any whitespace
                // so we can match below.
                $name = preg_replace('/\s/', '', mb_strtolower($property));
                $values[$name] = trim($content);
            }
        }

        if (!$propertyMatched && $elementName && preg_match($namePattern, $elementName)) {
            if ($content) {
                // Convert to lowercase, remove any whitespace, and convert dots
                // to colons so we can match below.
                $name = preg_replace(['/\s/', '/\./'], ['', ':'], mb_strtolower($elementName));
                $values[$name] = trim($content);
            }
        }
    }

    // Returns the value of the first key of $keys present in the collected metadata, or null.
    $firstValue = function (array $keys) use ($values) {
        foreach ($keys as $key) {
            if (isset($values[$key])) {
                return $values[$key];
            }
        }

        return null;
    };

    // get title
    $this->setTitle($firstValue([
        'dc:title',
        'dcterm:title',
        'og:title',
        'weibo:article:title',
        'weibo:webpage:title',
        'title',
        'twitter:title',
    ]));

    if (!$this->getTitle()) {
        $this->setTitle($this->getArticleTitle());
    }

    // get author
    $this->setAuthor($firstValue([
        'dc:creator',
        'dcterm:creator',
        'author',
    ]));

    // get description
    $this->setExcerpt($firstValue([
        'dc:description',
        'dcterm:description',
        'og:description',
        'weibo:article:description',
        'weibo:webpage:description',
        'description',
        'twitter:description',
    ]));

    // get main image
    $this->setImage($firstValue([
        'image',
        'og:image',
        'twitter:image',
    ]));

    // get site name
    $this->setSiteName($firstValue([
        'og:site_name',
    ]));
}
395
|
|
|
|
396
|
|
|
/**
 * Returns all the images of the parsed article.
 *
 * The main image (when there is one) comes first, followed by the src attribute of every img
 * node of the parsed document. Empty and repeated entries are dropped; the array keys of the
 * surviving entries are left as they were.
 *
 * @return array
 */
public function getImages()
{
    $images = $this->getImage() ? [$this->getImage()] : [];

    // Without a parsed document there is nothing else to collect.
    if (null == $this->getDOMDocument()) {
        return $images;
    }

    foreach ($this->getDOMDocument()->getElementsByTagName('img') as $imgNode) {
        $source = $imgNode->getAttribute('src');
        if ($source) {
            $images[] = $source;
        }
    }

    if ($this->configuration->getFixRelativeURLs()) {
        foreach (array_keys($images) as $position) {
            $images[$position] = $this->toAbsoluteURI($images[$position]);
        }
    }

    return array_unique(array_filter($images));
}
428
|
|
|
|
429
|
|
|
/**
 * Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't
 * find a correct image.
 *
 * When the metadata has no image, the first link tag whose rel attribute is exactly "img_src" or
 * "image_src" and that has an href is used. The URL is made absolute only when the
 * FixRelativeURLs option is enabled; otherwise it is stored as found.
 */
public function getMainImage()
{
    $imgUrl = false;

    if ($this->getImage() !== null) {
        $imgUrl = $this->getImage();
    }

    if (!$imgUrl) {
        foreach ($this->dom->getElementsByTagName('link') as $link) {
            /** @var \DOMElement $link */
            /*
             * Check for the rel attribute, then check if the rel attribute is either img_src or image_src, and
             * finally check for the existence of the href attribute, which should hold the image url.
             */
            if ($link->hasAttribute('rel') && ($link->getAttribute('rel') === 'img_src' || $link->getAttribute('rel') === 'image_src') && $link->hasAttribute('href')) {
                $imgUrl = $link->getAttribute('href');
                break;
            }
        }
    }

    if (!empty($imgUrl)) {
        // Store the image even when relative URLs are not being fixed; an image found in a
        // link tag used to be thrown away in that case.
        $this->setImage($this->configuration->getFixRelativeURLs() ? $this->toAbsoluteURI($imgUrl) : $imgUrl);
    }
}
459
|
|
|
|
460
|
|
|
/**
 * Returns the title of the html. Prioritizes the title from the metadata against the title tag.
 *
 * The raw title is then cleaned up:
 *  - with a " | ", " - ", " \ ", " / ", " > " or " » " separator, the part after the last separator
 *    is dropped (or the part before the first one, when that leaves fewer than 3 words);
 *  - with a ": " and no h1/h2 holding the exact title, the part after the last colon is kept, with
 *    fallbacks to the first colon or to the whole title;
 *  - when longer than 150 or shorter than 15 characters, the text of the only h1 is used.
 * If the cleaned title ends up with 4 words or fewer and the shortening looks unjustified, the
 * original title is returned instead.
 *
 * @return string|null Null when neither the metadata nor a title tag provide a title
 */
private function getArticleTitle()
{
    $originalTitle = null;

    if ($this->getTitle()) {
        $originalTitle = $this->getTitle();
    } else {
        $this->logger->debug('[Metadata] Could not find title in metadata, searching for the title tag...');
        $titleTag = $this->dom->getElementsByTagName('title');
        if ($titleTag->length > 0) {
            $this->logger->info(sprintf('[Metadata] Using title tag as article title: \'%s\'', $titleTag->item(0)->nodeValue));
            $originalTitle = $titleTag->item(0)->nodeValue;
        }
    }

    if ($originalTitle === null) {
        return null;
    }

    $curTitle = $originalTitle = trim($originalTitle);
    $titleHadHierarchicalSeparators = false;

    /*
     * If there's a separator in the title, first remove the final part.
     *
     * Every pattern holding "»" needs the "u" modifier: without it the character class is made of
     * the two separate bytes of "»", so " » " never matches and a replacement can cut a
     * multibyte character in half.
     */
    if (preg_match('/ [\|\-\\\\\/>»] /iu', $curTitle)) {
        $titleHadHierarchicalSeparators = (bool) preg_match('/ [\\\\\/>»] /u', $curTitle);
        $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/iu', '$1', $originalTitle);

        $this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle));

        // If the resulting title is too short (fewer than 3 words), remove
        // the first part instead:
        if (count(preg_split('/\s+/', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/iu', '$1', $originalTitle);
            $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
        }
    } elseif (strpos($curTitle, ': ') !== false) {
        // Check if we have an heading containing this exact string, so we
        // could assume it's the full title.
        $match = false;
        for ($i = 1; $i <= 2 && !$match; $i++) {
            foreach ($this->dom->getElementsByTagName('h'.$i) as $hTag) {
                // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs
                if (trim($hTag->nodeValue) === trim($curTitle)) {
                    $match = true;
                    break;
                }
            }
        }

        // If we don't, let's extract the title out of the original title string.
        if (!$match) {
            $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);

            $this->logger->info(sprintf('[Metadata] Title has a colon in the middle, new title is: \'%s\'', $curTitle));

            // If the title is now too short, try the first colon instead:
            if (count(preg_split('/\s+/', $curTitle)) < 3) {
                $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
                $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
            } elseif (count(preg_split('/\s+/', substr($originalTitle, 0, strpos($originalTitle, ':')))) > 5) {
                // But if we have too many words before the colon there's something weird
                // with the titles and the H tags so let's just use the original title instead.
                // The words are counted on the original title: $curTitle no longer has a colon here.
                $curTitle = $originalTitle;
            }
        }
    } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
        $hOnes = $this->dom->getElementsByTagName('h1');

        if ($hOnes->length === 1) {
            $curTitle = $hOnes->item(0)->nodeValue;
            $this->logger->info(sprintf('[Metadata] Using title from an H1 node: \'%s\'', $curTitle));
        }
    }

    $curTitle = trim($curTitle);

    /*
     * If we now have 4 words or fewer as our title, and either no
     * 'hierarchical' separators (\, /, > or ») were found in the original
     * title or we decreased the number of words by more than 1 word, use
     * the original title.
     */
    $curTitleWordCount = count(preg_split('/\s+/', $curTitle));
    $originalTitleWordCount = count(preg_split('/\s+/', preg_replace('/[\|\-\\\\\/>»]+/u', '', $originalTitle))) - 1;

    if ($curTitleWordCount <= 4 &&
        (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
        $curTitle = $originalTitle;

        $this->logger->info(sprintf('[Metadata] Shortened title rejected, using the original title: \'%s\'', $curTitle));
    }

    return $curTitle;
}
563
|
|
|
|
564
|
|
|
/**
 * Convert URI to an absolute URI.
 *
 * URIs that already carry a scheme and hash-only URIs are returned untouched. Everything else is
 * resolved against the path information of the original URL taken from the configuration.
 *
 * @param $uri string URI to convert
 *
 * @return string
 */
private function toAbsoluteURI($uri)
{
    // If this is already an absolute URI, return it.
    if (preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri)) {
        return $uri;
    }

    // Ignore hash URIs:
    if (substr($uri, 0, 1) === '#') {
        return $uri;
    }

    // Only the branches below need the base information, so it is resolved after the early
    // returns instead of on every call.
    list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL());

    // Scheme-rooted relative URI.
    if (substr($uri, 0, 2) === '//') {
        return $scheme.'://'.substr($uri, 2);
    }

    // Prepath-rooted relative URI.
    if (substr($uri, 0, 1) === '/') {
        return $prePath.$uri;
    }

    // Dotslash relative URI.
    if (strpos($uri, './') === 0) {
        return $pathBase.substr($uri, 2);
    }

    // Standard relative URI; add entire path. pathBase already includes a
    // trailing "/".
    return $pathBase.$uri;
}
603
|
|
|
|
604
|
|
|
/**
 * Returns full path info of an URL.
 *
 * The result holds, in order:
 *  - $pathBase: scheme, host, optional port and directory of the URL (combined with the base URI
 *    of the document when it has one);
 *  - $scheme: scheme of $pathBase;
 *  - $prePath: scheme, host and optional port of $pathBase, without trailing slash.
 *
 * NOTE(review): a base URI that is itself an absolute URL is not handled specially and is
 * appended to the directory of $url like any relative base - confirm whether that can happen.
 *
 * @param string $url
 *
 * @return array [$pathBase, $scheme, $prePath]
 */
public function getPathInfo($url)
{
    $port = parse_url($url, PHP_URL_PORT);

    // Keep an explicit port, otherwise links of pages served from e.g. host:8080 point to the wrong origin.
    $origin = parse_url($url, PHP_URL_SCHEME).'://'.parse_url($url, PHP_URL_HOST).($port ? ':'.$port : '');

    // dirname() returns "/" (or "\" on Windows) for top level paths and "" for an empty path.
    // Normalize all of them to a single "/" so the base never ends in a double slash.
    // NOTE: dirname() ignores a trailing slash, so "/dir/" still resolves to its parent directory.
    $directory = rtrim(dirname((string) parse_url($url, PHP_URL_PATH)), '/\\').'/';

    // Check for base URLs
    if ($this->dom->baseURI !== null) {
        if (substr($this->dom->baseURI, 0, 1) === '/') {
            // URLs starting with '/' override completely the URL defined in the link
            $pathBase = $origin.$this->dom->baseURI;
        } else {
            // Otherwise just prepend the base to the actual path
            $pathBase = $origin.$directory.rtrim($this->dom->baseURI, '/').'/';
        }
    } else {
        $pathBase = $origin.$directory;
    }

    $scheme = parse_url($pathBase, PHP_URL_SCHEME);
    $basePort = parse_url($pathBase, PHP_URL_PORT);
    $prePath = $scheme.'://'.parse_url($pathBase, PHP_URL_HOST).($basePort ? ':'.$basePort : '');

    return [$pathBase, $scheme, $prePath];
}
631
|
|
|
|
632
|
|
|
/**
 * Gets nodes from the root element.
 *
 * Walks the tree starting at $node and, along the way:
 *  - removes comment nodes, nodes that are not probably visible, bylines, unlikely candidates
 *    (only while StripUnlikelyCandidates is enabled) and a set of tags that have no content;
 *  - wraps runs of phrasing content found directly inside a div into new p nodes;
 *  - replaces a div by its single p child, or retags the div as p, when the checks below allow it.
 *
 * The walk modifies the document in place.
 *
 * @param $node DOMNode|DOMText
 *
 * @return array Nodes to be scored
 */
private function getNodes($node)
{
    $this->logger->info('[Get Nodes] Retrieving nodes...');

    $stripUnlikelyCandidates = $this->configuration->getStripUnlikelyCandidates();

    $elementsToScore = [];

    /*
     * First, node prepping. Trash nodes that look cruddy (like ones with the
     * class name "comment", etc), and turn divs into P tags where they have been
     * used inappropriately (as in, where they contain no other block level elements.)
     */

    while ($node) {
        // Remove DOMComments nodes as we don't need them and mess up children counting
        if ($node->nodeType === XML_COMMENT_NODE) {
            $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        $matchString = $node->getAttribute('class').' '.$node->getAttribute('id');

        if (!$node->isProbablyVisible()) {
            $this->logger->debug(sprintf('[Get Nodes] Removing hidden node... Match string was: \'%s\'', $matchString));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Check to see if this node is a byline, and remove it if it is.
        if ($this->checkByline($node, $matchString)) {
            $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Remove unlikely candidates
        if ($stripUnlikelyCandidates) {
            if (
                preg_match(NodeUtility::$regexps['unlikelyCandidates'], $matchString) &&
                !preg_match(NodeUtility::$regexps['okMaybeItsACandidate'], $matchString) &&
                $node->nodeName !== 'body' &&
                $node->nodeName !== 'a'
            ) {
                $this->logger->debug(sprintf('[Get Nodes] Removing unlikely candidate. Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
                $node = NodeUtility::removeAndGetNext($node);
                continue;
            }
        }

        // Remove DIV, SECTION, HEADER, H1-H6 and P nodes without any content(e.g. text, image, video, or iframe).
        if (($node->nodeName === 'div' || $node->nodeName === 'section' || $node->nodeName === 'header' ||
                $node->nodeName === 'h1' || $node->nodeName === 'h2' || $node->nodeName === 'h3' ||
                $node->nodeName === 'h4' || $node->nodeName === 'h5' || $node->nodeName === 'h6' ||
                $node->nodeName === 'p') &&
            $node->isElementWithoutContent()) {
            $this->logger->debug(sprintf('[Get Nodes] Removing empty \'%s\' node.', $node->nodeName));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        if (in_array(strtolower($node->nodeName), $this->defaultTagsToScore, true)) {
            $this->logger->debug(sprintf('[Get Nodes] Adding node to score list, node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
            $elementsToScore[] = $node;
        }

        // Turn all divs that don't have children block level elements into p's
        if ($node->nodeName === 'div') {
            // Put phrasing content into paragraphs.
            $p = null;
            $childNode = $node->firstChild;
            while ($childNode) {
                // Remember the sibling first: the child may be moved into a new p below.
                $nextSibling = $childNode->nextSibling;
                if ($childNode->isPhrasingContent()) {
                    if ($p !== null) {
                        $p->appendChild($childNode);
                    } elseif (!$childNode->isWhitespace()) {
                        // First non-whitespace phrasing node of a run: open a new paragraph in its place.
                        $p = $this->dom->createElement('p');
                        $node->replaceChild($p, $childNode);
                        $p->appendChild($childNode);
                    }
                } elseif ($p !== null) {
                    // The run ended: drop trailing whitespace from the paragraph and close it.
                    while ($p->lastChild && $p->lastChild->isWhitespace()) {
                        $p->removeChild($p->lastChild);
                    }
                    $p = null;
                }
                $childNode = $nextSibling;
            }

            /*
             * Sites like http://mobile.slate.com encloses each paragraph with a DIV
             * element. DIVs with only a P element inside and no text content can be
             * safely converted into plain P elements to avoid confusing the scoring
             * algorithm with DIVs with are, in practice, paragraphs.
             */
            if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) {
                $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
                $pNode = NodeUtility::filterTextNodes($node->childNodes)->item(0);
                $node->parentNode->replaceChild($pNode, $node);
                $node = $pNode;
                $elementsToScore[] = $node;
            } elseif (!$node->hasSingleChildBlockElement()) {
                // The message now agrees with the negated check above.
                $this->logger->debug(sprintf('[Get Nodes] Found DIV without a single child block element, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
                $node = NodeUtility::setNodeTag($node, 'p');
                $elementsToScore[] = $node;
            }
        }

        $node = NodeUtility::getNextNode($node);
    }

    return $elementsToScore;
}
754
|
|
|
|
755
|
|
|
/**
 * Checks if the node is a byline.
 *
 * A node counts as a byline when its rel attribute is "author" or its class/id string matches the
 * byline pattern, and in both cases its text passes isValidByline(). When a byline is found its
 * trimmed text is stored as the author.
 *
 * Always returns false when the ArticleByLine option is disabled or an author is already set.
 *
 * @param DOMNode $node
 * @param string $matchString Class and id of the node, as built by getNodes()
 *
 * @return bool
 */
private function checkByline($node, $matchString)
{
    if (!$this->configuration->getArticleByLine()) {
        return false;
    }

    /*
     * Check if the byline is already set
     */
    if ($this->getAuthor()) {
        return false;
    }

    $rel = $node->getAttribute('rel');

    // The parentheses matter: "&&" binds tighter than "||", so without them a rel="author" node
    // was accepted without validating its text (empty or overly long bylines included).
    if (($rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString)) && $this->isValidByline($node->getTextContent())) {
        $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $node->getTextContent()));
        $this->setAuthor(trim($node->getTextContent()));

        return true;
    }

    return false;
}
787
|
|
|
|
788
|
|
|
/**
 * Checks the validity of a byLine. Based on string length.
 *
 * A byline is valid when it is a string whose trimmed length is between 1 and 99 characters.
 *
 * @param string $text
 *
 * @return bool
 */
private function isValidByline($text)
{
    // Anything that is not a string can't be a byline.
    if (!is_string($text)) {
        return false;
    }

    $length = mb_strlen(trim($text));

    return $length > 0 && $length < 100;
}
805
|
|
|
|
806
|
|
|
/**
 * Strips every <script> and <noscript> element from the given document.
 *
 * @param DOMDocument $dom
 */
private function removeScripts(DOMDocument $dom)
{
    $tagsToDrop = ['script', 'noscript'];

    foreach ($tagsToDrop as $tagName) {
        // Copy the matches into a plain array before removing anything.
        $matches = iterator_to_array($dom->getElementsByTagName($tagName));

        array_walk($matches, function ($element) {
            NodeUtility::removeNode($element);
        });
    }
}
820
|
|
|
|
821
|
|
|
/**
 * Prepares the document for parsing.
 *
 * Two normalisations are applied in place:
 *  - a chain of two or more <br> elements is collapsed into a single <p>
 *    which then adopts the phrasing content that follows the chain;
 *  - every <font> element is converted into a <span>.
 *
 * @param DOMDocument $dom
 */
private function prepDocument(DOMDocument $dom)
{
    $this->logger->info('[PrepDocument] Preparing document for parsing...');

    foreach ($dom->shiftingAwareGetElementsByTagName('br') as $br) {
        // Set once at least one <br> following this one has been removed,
        // i.e. this <br> was the head of a chain and must become a <p>.
        $chainRemoved = false;

        /*
         * Walk forward from this <br> and drop every further <br> of the
         * chain, stopping at the first other element or non-whitespace.
         * Only the head <br> is left behind.
         */
        $cursor = $br->nextSibling;
        while (true) {
            $cursor = NodeUtility::nextElement($cursor);
            if (!$cursor || $cursor->nodeName !== 'br') {
                break;
            }

            $this->logger->debug('[PrepDocument] Removing chain of BR nodes...');

            $chainRemoved = true;
            $afterRemoved = $cursor->nextSibling;
            $cursor->parentNode->removeChild($cursor);
            $cursor = $afterRemoved;
        }

        if (!$chainRemoved) {
            continue;
        }

        // Swap the head <br> for a fresh <p> ...
        $paragraph = $dom->createElement('p');
        $br->parentNode->replaceChild($paragraph, $br);

        // ... and move the following siblings into it until the next <br>
        // chain or the first node that is not phrasing content.
        $candidate = $paragraph->nextSibling;
        while ($candidate) {
            if ($candidate->nodeName === 'br') {
                $following = NodeUtility::nextElement($candidate->nextSibling);
                if ($following && $following->nodeName === 'br') {
                    // Another <br><br>: this paragraph is complete.
                    break;
                }
            }

            if (!$candidate->isPhrasingContent()) {
                break;
            }

            $this->logger->debug('[PrepDocument] Replacing BR with a P node...');

            // Remember the next sibling before the move detaches the node.
            $upcoming = $candidate->nextSibling;
            $paragraph->appendChild($candidate);
            $candidate = $upcoming;
        }

        // Trim trailing whitespace nodes from the new paragraph.
        while ($paragraph->lastChild && $paragraph->lastChild->isWhitespace()) {
            $paragraph->removeChild($paragraph->lastChild);
        }

        // A <p> must not sit inside another <p>: turn the parent into a <div>.
        if ($paragraph->parentNode->tagName === 'p') {
            NodeUtility::setNodeTag($paragraph->parentNode, 'div');
        }
    }

    // Replace font tags with span, walking the node list from its end.
    $fonts = $dom->getElementsByTagName('font');
    for ($index = $fonts->length - 1; $index >= 0; $index--) {
        $this->logger->debug('[PrepDocument] Converting font tag into a span tag.');
        NodeUtility::setNodeTag($fonts->item($index), 'span');
    }
}
904
|
|
|
|
905
|
|
|
/**
 * Assign scores to each node. Returns full article parsed or false on error.
 *
 * The method works in four stages:
 *  1. every node with enough text spreads a content score over its ancestors,
 *     which become the candidates;
 *  2. candidate scores are scaled by link density and the best ones are kept
 *     as top candidates;
 *  3. the best candidate may be swapped for one of its ancestors (shared
 *     ancestor of several good candidates, better scored parent, or parent
 *     of an only child);
 *  4. the top candidate and its related siblings are copied into a new
 *     document, which is then passed through prepArticle().
 *
 * @param array $nodes
 *
 * @return DOMDocument|bool The article document, or false when nothing was appended
 */
private function rateNodes($nodes)
{
    $this->logger->info('[Rating] Rating nodes...');

    $candidates = [];

    /** @var DOMElement $node */
    foreach ($nodes as $node) {
        if (is_null($node->parentNode)) {
            continue;
        }

        // Fetched once and reused by the length and comma checks below.
        $innerText = $node->getTextContent(true);

        // Discard nodes with less than 25 characters, without blank space
        if (mb_strlen($innerText) < 25) {
            continue;
        }

        $ancestors = $node->getNodeAncestors();

        // Exclude nodes with no ancestor
        if (count($ancestors) === 0) {
            continue;
        }

        // Start with a point for the paragraph itself as a base.
        $contentScore = 1;

        // Add points for any commas within this paragraph.
        $contentScore += count(explode(',', $innerText));

        // For every 100 characters in this paragraph, add another point. Up to 3 points.
        $contentScore += min(floor(mb_strlen($innerText) / 100), 3);

        $this->logger->debug(sprintf('[Rating] Node score %s, content: \'%s\'', $contentScore, substr($node->nodeValue, 0, 128)));

        /** @var $ancestor DOMElement */
        foreach ($ancestors as $level => $ancestor) {
            $this->logger->debug('[Rating] Found ancestor, initializing and adding it as a candidate...');
            if (!$ancestor->isInitialized()) {
                $ancestor->initializeNode($this->configuration->getWeightClasses());
                $candidates[] = $ancestor;
            }

            /*
             * Node score divider:
             *  - parent:             1 (no division)
             *  - grandparent:        2
             *  - great grandparent+: ancestor level * 3
             */
            if ($level === 0) {
                $scoreDivider = 1;
            } elseif ($level === 1) {
                $scoreDivider = 2;
            } else {
                $scoreDivider = $level * 3;
            }

            $currentScore = $ancestor->contentScore;
            $ancestor->contentScore = $currentScore + ($contentScore / $scoreDivider);

            $this->logger->debug(sprintf('[Rating] Ancestor score %s, value: \'%s\'', $ancestor->contentScore, substr($ancestor->nodeValue, 0, 128)));
        }
    }

    /*
     * After we've calculated scores, loop through all of the possible
     * candidate nodes we found and find the one with the highest score.
     */
    $maxTopCandidates = $this->configuration->getMaxTopCandidates();
    $topCandidates = [];
    foreach ($candidates as $candidate) {
        /*
         * Scale the final candidates score based on link density. Good content
         * should have a relatively small link density (5% or less) and be mostly
         * unaffected by this operation.
         */
        $candidate->contentScore = $candidate->contentScore * (1 - $candidate->getLinkDensity());

        // Insert the candidate at its rank, keeping the list sorted by
        // descending score and capped at $maxTopCandidates entries.
        for ($i = 0; $i < $maxTopCandidates; $i++) {
            $aTopCandidate = isset($topCandidates[$i]) ? $topCandidates[$i] : null;

            if (!$aTopCandidate || $candidate->contentScore > $aTopCandidate->contentScore) {
                array_splice($topCandidates, $i, 0, [$candidate]);
                if (count($topCandidates) > $maxTopCandidates) {
                    array_pop($topCandidates);
                }
                break;
            }
        }
    }

    $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;
    $parentOfTopCandidate = null;

    /*
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     */
    if ($topCandidate === null || $topCandidate->nodeName === 'body') {
        $this->logger->info('[Rating] No top candidate found or top candidate is the body tag. Moving all child nodes to a new DIV node.');

        // Move all of the page's children into topCandidate
        $topCandidate = new DOMDocument('1.0', 'utf-8');
        $topCandidate->encoding = 'UTF-8';
        $topCandidate->appendChild($topCandidate->createElement('div', ''));
        $kids = $this->dom->getElementsByTagName('body')->item(0)->childNodes;

        // Cannot be foreached, don't ask me why.
        for ($i = 0; $i < $kids->length; $i++) {
            $import = $topCandidate->importNode($kids->item($i), true);
            $topCandidate->firstChild->appendChild($import);
        }

        // Candidate must be created using firstChild to grab the DOMElement instead of the DOMDocument.
        $topCandidate = $topCandidate->firstChild;
    } elseif ($topCandidate) {
        $this->logger->info(sprintf('[Rating] Found top candidate, score: %s', $topCandidate->contentScore));

        // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
        // and whose scores are quite close to the current `topCandidate` node.
        $alternativeCandidateAncestors = [];
        $topCandidatesCount = count($topCandidates);
        for ($i = 1; $i < $topCandidatesCount; $i++) {
            // In some cases we may end up with a top candidate with zero content score. To avoid dividing by zero
            // we have to use max() and replace zero with a low value like 0.1
            if ($topCandidates[$i]->contentScore / max($topCandidate->contentScore, 0.1) >= 0.75) {
                array_push($alternativeCandidateAncestors, $topCandidates[$i]->getNodeAncestors(false));
            }
        }

        $MINIMUM_TOPCANDIDATES = 3;
        $alternativeCount = count($alternativeCandidateAncestors);
        if ($alternativeCount >= $MINIMUM_TOPCANDIDATES) {
            $parentOfTopCandidate = $topCandidate->parentNode;

            // Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher
            while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
                $listsContainingThisAncestor = 0;
                for ($ancestorIndex = 0; $ancestorIndex < $alternativeCount && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) {
                    // Strict search: the ancestor must be the very same node object.
                    // A loose search compares objects by their property values and
                    // could count a different node that merely looks alike.
                    $listsContainingThisAncestor += (int) in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true);
                }
                if ($listsContainingThisAncestor >= $MINIMUM_TOPCANDIDATES) {
                    $topCandidate = $parentOfTopCandidate;
                    break;
                }
                $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
            }
        }

        /*
         * Because of our bonus system, parents of candidates might have scores
         * themselves. They get half of the node. There won't be nodes with higher
         * scores than our topCandidate, but if we see the score going *up* in the first
         * few steps up the tree, that's a decent sign that there might be more content
         * lurking in other places that we want to unify in. The sibling stuff
         * below does some of that - but only if we've looked high enough up the DOM
         * tree.
         */
        $parentOfTopCandidate = $topCandidate->parentNode;
        $lastScore = $topCandidate->contentScore;

        // The scores shouldn't get too low.
        $scoreThreshold = $lastScore / 3;

        /* @var DOMElement $parentOfTopCandidate */
        while ($parentOfTopCandidate->nodeName !== 'body') {
            $parentScore = $parentOfTopCandidate->contentScore;
            if ($parentScore < $scoreThreshold) {
                break;
            }

            if ($parentScore > $lastScore) {
                // Alright! We found a better parent to use.
                $topCandidate = $parentOfTopCandidate;
                $this->logger->info('[Rating] Found a better top candidate.');
                break;
            }
            $lastScore = $parentOfTopCandidate->contentScore;
            $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
        }

        // If the top candidate is the only child, use parent instead. This will help sibling
        // joining logic when adjacent content is actually located in parent's sibling node.
        $parentOfTopCandidate = $topCandidate->parentNode;
        while ($parentOfTopCandidate->nodeName !== 'body' && count(NodeUtility::filterTextNodes($parentOfTopCandidate->childNodes)) === 1) {
            $topCandidate = $parentOfTopCandidate;
            $parentOfTopCandidate = $topCandidate->parentNode;
        }
    }

    /*
     * Now that we have the top candidate, look through its siblings for content
     * that might also be related. Things like preambles, content split by ads
     * that we removed, etc.
     */
    $this->logger->info('[Rating] Creating final article content document...');

    // The selected nodes are appended directly to this document.
    $articleContent = new DOMDocument('1.0', 'utf-8');

    $siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2);
    // Keep potential top candidate's parent node to try to get text direction of it later.
    $parentOfTopCandidate = $topCandidate->parentNode;
    $siblings = $parentOfTopCandidate->childNodes;

    $hasContent = false;

    $this->logger->info('[Rating] Adding top candidate siblings...');

    /* @var DOMElement $sibling */
    // Can't foreach here because down there we might change the tag name and that causes the foreach to skip items
    for ($i = 0; $i < $siblings->length; $i++) {
        $sibling = $siblings->item($i);
        $append = false;

        if ($sibling === $topCandidate) {
            $this->logger->debug('[Rating] Sibling is equal to the top candidate, adding to the final article...');

            $append = true;
        } else {
            $contentBonus = 0;

            // Give a bonus if sibling nodes and top candidates have the exact same classname
            if ($sibling->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
                $contentBonus += $topCandidate->contentScore * 0.2;
            }
            if ($sibling->contentScore + $contentBonus >= $siblingScoreThreshold) {
                $append = true;
            } elseif ($sibling->nodeName === 'p') {
                $linkDensity = $sibling->getLinkDensity();
                $nodeContent = $sibling->getTextContent(true);

                if (mb_strlen($nodeContent) > 80 && $linkDensity < 0.25) {
                    $append = true;
                } elseif ($nodeContent && mb_strlen($nodeContent) < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) {
                    $append = true;
                }
            }
        }

        if ($append) {
            $this->logger->debug(sprintf('[Rating] Appending sibling to final article, content is: \'%s\'', substr($sibling->nodeValue, 0, 128)));

            $hasContent = true;

            if (!in_array(strtolower($sibling->nodeName), $this->alterToDIVExceptions)) {
                /*
                 * We have a node that isn't a common block level element, like a form or td tag.
                 * Turn it into a div so it doesn't get filtered out later by accident.
                 */
                $sibling = NodeUtility::setNodeTag($sibling, 'div');
            }

            /*
             * importNode() with the deep flag makes a copy of the sibling for the
             * new document, and that copy is what gets appended. The index $i
             * therefore needs no adjustment after an append.
             */
            $import = $articleContent->importNode($sibling, true);
            $articleContent->appendChild($import);
        }
    }

    $articleContent = $this->prepArticle($articleContent);

    if ($hasContent) {
        // Find out text direction from ancestors of final top candidate.
        $ancestors = array_merge([$parentOfTopCandidate, $topCandidate], $parentOfTopCandidate->getNodeAncestors());
        foreach ($ancestors as $ancestor) {
            $articleDir = $ancestor->getAttribute('dir');
            if ($articleDir) {
                $this->setDirection($articleDir);
                $this->logger->debug(sprintf('[Rating] Found article direction: %s', $articleDir));
                break;
            }
        }

        return $articleContent;
    } else {
        return false;
    }
}
1199
|
|
|
|
1200
|
|
|
/**
 * Cleans up the final article.
 *
 * Runs the clean-up passes over the article document in a fixed order
 * (styles, data table marking, junk tags, "share" widgets, duplicated
 * title, form controls, headers, conditional clean-up, empty paragraphs,
 * stray <br> nodes, single-cell tables) and returns the same document.
 *
 * @param DOMDocument $article
 *
 * @return DOMDocument
 */
public function prepArticle(DOMDocument $article)
{
    $this->logger->info('[PrepArticle] Preparing final article...');

    $this->_cleanStyles($article);
    $this->_clean($article, 'style');

    // Data tables are marked before anything else so the passes below can
    // avoid removing items that live inside them.
    $this->_markDataTables($article);

    // Clean out junk from the article content
    foreach (['form', 'fieldset'] as $conditionalTag) {
        $this->_cleanConditionally($article, $conditionalTag);
    }
    foreach (['object', 'embed', 'h1', 'footer', 'link', 'aside'] as $junkTag) {
        $this->_clean($article, $junkTag);
    }

    // Remove elements with "share" in their id/class from inside each
    // top-level node; the top-level nodes themselves are kept.
    foreach ($article->childNodes as $topLevelNode) {
        $this->_cleanMatchedNodes($topLevelNode, '/share/i');
    }

    /*
     * A lone h2 whose text substantially equals the article title is most
     * likely the header, not a subheader. The title is extracted
     * separately, so the h2 is dropped.
     */
    $h2List = $article->getElementsByTagName('h2');
    if ($h2List->length === 1) {
        $headingText = $h2List->item(0)->textContent;
        $title = $this->getTitle();

        $lengthSimilarRate = (mb_strlen($headingText) - mb_strlen($title)) / max(mb_strlen($title), 1);

        if (abs($lengthSimilarRate) < 0.5) {
            // Look for the shorter text inside the longer one.
            $titlesMatch = ($lengthSimilarRate > 0)
                ? (strpos($headingText, $title) !== false)
                : (strpos($title, $headingText) !== false);

            if ($titlesMatch) {
                $this->logger->info('[PrepArticle] Found title repeated in an H2 node, removing...');
                $this->_clean($article, 'h2');
            }
        }
    }

    foreach (['iframe', 'input', 'textarea', 'select', 'button'] as $controlTag) {
        $this->_clean($article, $controlTag);
    }
    $this->_cleanHeaders($article);

    // These run last because the earlier passes may have removed junk
    // that affects their outcome.
    foreach (['table', 'ul', 'div'] as $containerTag) {
        $this->_cleanConditionally($article, $containerTag);
    }

    $this->_cleanExtraParagraphs($article);

    // Drop every <br> that is immediately followed by a <p>.
    foreach (iterator_to_array($article->getElementsByTagName('br')) as $br) {
        $follower = $br->nextSibling;
        if (!$follower || $follower->nodeName !== 'p') {
            continue;
        }

        $this->logger->debug('[PrepArticle] Removing br node next to a p node.');
        $br->parentNode->removeChild($br);
    }

    // Remove single-cell tables
    foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
        /** @var DOMNode $table */
        $tbody = $table->hasSingleTagInsideElement('tbody') ? $table->getFirstElementChild() : $table;
        if (!$tbody->hasSingleTagInsideElement('tr')) {
            continue;
        }

        $row = $tbody->getFirstElementChild();
        if (!$row->hasSingleTagInsideElement('td')) {
            continue;
        }

        $cell = $row->getFirstElementChild();

        // The cell becomes a <p> when all its children are phrasing content, a <div> otherwise.
        $onlyPhrasing = true;
        foreach (iterator_to_array($cell->childNodes) as $cellChild) {
            $onlyPhrasing = $cellChild->isPhrasingContent() && $onlyPhrasing;
        }

        $cell = NodeUtility::setNodeTag($cell, $onlyPhrasing ? 'p' : 'div');
        $table->parentNode->replaceChild($cell, $table);
    }

    return $article;
}
1298
|
|
|
|
1299
|
|
|
/**
 * Look for 'data' (as opposed to 'layout') tables, for which we use
 * similar checks as
 * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920.
 *
 * Every <table> of the document is flagged through setReadabilityDataTable().
 * The checks run in this order and the first one that applies decides:
 *  - role="presentation"                                  -> layout
 *  - datatable="0"                                        -> layout
 *  - a non-empty summary attribute                        -> data
 *  - a <caption> with child nodes                         -> data
 *  - a col, colgroup, tfoot, thead or th descendant       -> data
 *  - a nested <table>                                     -> layout
 *  - at least 10 rows or more than 4 columns              -> data
 *  - otherwise: data when rows * columns is above 10
 *
 * @param DOMDocument $article
 *
 * @return void
 */
public function _markDataTables(DOMDocument $article)
{
    $tables = $article->getElementsByTagName('table');
    foreach ($tables as $table) {
        /** @var DOMElement $table */
        $role = $table->getAttribute('role');
        if ($role === 'presentation') {
            $table->setReadabilityDataTable(false);
            continue;
        }

        // Strict comparison: a loose == would also match other numeric
        // strings such as "0.0" or "00".
        $datatable = $table->getAttribute('datatable');
        if ($datatable === '0') {
            $table->setReadabilityDataTable(false);
            continue;
        }

        $summary = $table->getAttribute('summary');
        if ($summary) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        $caption = $table->getElementsByTagName('caption');
        if ($caption->length > 0 && $caption->item(0)->childNodes->length > 0) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        // If the table has a descendant with any of these tags, consider a data table:
        foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $dataTableDescendants) {
            if ($table->getElementsByTagName($dataTableDescendants)->length > 0) {
                $table->setReadabilityDataTable(true);
                // Move on to the next table, not just the next tag name.
                continue 2;
            }
        }

        // Nested tables indicate a layout table:
        if ($table->getElementsByTagName('table')->length > 0) {
            $table->setReadabilityDataTable(false);
            continue;
        }

        $sizeInfo = $table->getRowAndColumnCount();
        if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        // Now just go by size entirely:
        $table->setReadabilityDataTable($sizeInfo['rows'] * $sizeInfo['columns'] > 10);
    }
}
1358
|
|
|
|
1359
|
|
|
/**
 * Removes the `style` attribute and the deprecated presentational attributes
 * from the given node and, recursively, from all of its descendants.
 *
 * An <svg> node is left untouched, together with everything below it.
 * `width` and `height` are additionally removed from table, th, td, hr and
 * pre elements.
 *
 * @param $node DOMDocument|DOMNode
 **/
public function _cleanStyles($node)
{
    $hasTagName = property_exists($node, 'tagName');

    if ($hasTagName && $node->tagName === 'svg') {
        return;
    }

    // Nodes without a removeAttribute() method are skipped, but their
    // children are still visited below.
    if (method_exists($node, 'removeAttribute')) {
        // `style` and deprecated presentational attributes
        $presentationalAttributes = ['align', 'background', 'bgcolor', 'border', 'cellpadding', 'cellspacing', 'frame', 'hspace', 'rules', 'style', 'valign', 'vspace'];
        foreach ($presentationalAttributes as $attributeName) {
            $node->removeAttribute($attributeName);
        }

        $sizedElements = ['table', 'th', 'td', 'hr', 'pre'];
        if ($hasTagName && in_array($node->tagName, $sizedElements)) {
            foreach (['width', 'height'] as $sizeAttribute) {
                $node->removeAttribute($sizeAttribute);
            }
        }
    }

    for ($child = $node->firstChild; $child !== null; $child = $child->nextSibling) {
        $this->_cleanStyles($child);
    }
}
1391
|
|
|
|
1392
|
|
|
/**
 * Clean out elements whose id/class combinations match specific string.
 *
 * Walks the nodes that follow $node (as returned by
 * NodeUtility::getNextNode()) until the end marker is reached, removing
 * every node whose "class id" string matches $regex. $node itself is never
 * tested.
 *
 * @param $node DOMElement Node to clean
 * @param $regex string Match id/class combination.
 *
 * @return void
 **/
public function _cleanMatchedNodes($node, $regex)
{
    // NOTE(review): with the second argument set to true getNextNode()
    // presumably skips the children of $node, making this the first node
    // outside its subtree - confirm in NodeUtility.
    $stopAt = NodeUtility::getNextNode($node, true);
    $current = NodeUtility::getNextNode($node);

    while ($current && $current !== $stopAt) {
        $class = $current->getAttribute('class');
        $id = $current->getAttribute('id');

        if (!preg_match($regex, sprintf('%s %s', $class, $id))) {
            $current = NodeUtility::getNextNode($current);
            continue;
        }

        $this->logger->debug(sprintf('Removing matched node with regex: \'%s\', node class was: \'%s\', id: \'%s\'', $regex, $class, $id));
        $current = NodeUtility::removeAndGetNext($current);
    }
}
1413
|
|
|
|
1414
|
|
|
/**
 * Removes the paragraphs that carry no content at all: no img, embed,
 * object or iframe element, and no text left once the `onlyWhitespace`
 * regex has been applied to it.
 *
 * @param DOMDocument $article
 *
 * @return void
 */
public function _cleanExtraParagraphs(DOMDocument $article)
{
    $paragraphs = $article->getElementsByTagName('p');
    $length = $paragraphs->length;

    // Walk the node list from its end so that removing a paragraph does
    // not shift the entries that are still to be visited.
    for ($i = 0; $i < $length; $i++) {
        $paragraph = $paragraphs->item($length - 1 - $i);

        $imgCount = $paragraph->getElementsByTagName('img')->length;
        $embedCount = $paragraph->getElementsByTagName('embed')->length;
        $objectCount = $paragraph->getElementsByTagName('object')->length;
        // At this point, nasty iframes have been removed, only remain embedded video ones.
        $iframeCount = $paragraph->getElementsByTagName('iframe')->length;
        $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;

        if ($totalCount !== 0) {
            continue;
        }

        /*
         * Compare against the empty string instead of relying on
         * truthiness: the string "0" is falsy in PHP, so a paragraph whose
         * only text is "0" would otherwise be deleted. The cast keeps a
         * null result (regex failure) counted as empty.
         */
        $remainingText = (string) preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent);
        if ($remainingText !== '') {
            continue;
        }

        $this->logger->debug(sprintf('[PrepArticle] Removing extra paragraph. Text content was: \'%s\'', substr($paragraph->textContent, 0, 128)));
        $paragraph->parentNode->removeChild($paragraph);
    }
}
1440
|
|
|
|
1441
|
|
|
/**
 * Removes the elements of the given tag that look like junk rather than
 * content. Does nothing when conditional cleaning is disabled in the
 * configuration.
 *
 * An element is kept when it sits inside a table flagged as a data table.
 * It is removed when its class weight is negative, or when it has fewer
 * than 10 commas and one of the heuristics below (too many images, list
 * items or inputs, too little text, too high a link density, suspicious
 * embeds) applies.
 *
 * @param DOMDocument $article
 * @param string $tag Tag to clean conditionally
 *
 * @return void
 */
public function _cleanConditionally(DOMDocument $article, $tag)
{
    if (!$this->configuration->getCleanConditionally()) {
        return;
    }

    $isList = in_array($tag, ['ul', 'ol']);

    /*
     * Traverse backwards so we can remove nodes at the same time
     * without affecting the traversal.
     */
    $elements = $article->getElementsByTagName($tag);
    for ($index = $elements->length - 1; $index >= 0; $index--) {
        /** @var $node DOMElement */
        $node = $elements->item($index);

        // First check if we're in a data table, in which case don't remove us.
        $insideDataTable = $node->hasAncestorTag('table', -1, function($ancestor) {
            return $ancestor->isReadabilityDataTable();
        });
        if ($insideDataTable) {
            continue;
        }

        $weight = $this->configuration->getWeightClasses() ? $node->getClassWeight() : 0;

        if ($weight < 0) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\' with 0 or less weight', $tag));

            NodeUtility::removeNode($node);
            continue;
        }

        // Plenty of commas: treated as real content, nothing else to check.
        if (substr_count($node->getTextContent(), ',') >= 10) {
            continue;
        }

        /*
         * Not very many commas: remove the element when the number of
         * non-paragraph elements is more than paragraphs, or when other
         * ominous signs show up.
         */
        $paragraphCount = $node->getElementsByTagName('p')->length;
        $imageCount = $node->getElementsByTagName('img')->length;
        $listItemCount = $node->getElementsByTagName('li')->length - 100;
        $inputCount = $node->getElementsByTagName('input')->length;

        // Only the embeds matching the `videos` regex are counted.
        $embedCount = 0;
        foreach ($node->getElementsByTagName('embed') as $embedNode) {
            if (preg_match(NodeUtility::$regexps['videos'], $embedNode->C14N())) {
                $embedCount++;
            }
        }

        $linkDensity = $node->getLinkDensity();
        $contentLength = mb_strlen($node->getTextContent(true));

        $haveToRemove =
            ($imageCount > 1 && $paragraphCount / $imageCount < 0.5 && !$node->hasAncestorTag('figure')) ||
            (!$isList && $listItemCount > $paragraphCount) ||
            ($inputCount > floor($paragraphCount / 3)) ||
            (!$isList && $contentLength < 25 && ($imageCount === 0 || $imageCount > 2) && !$node->hasAncestorTag('figure')) ||
            (!$isList && $weight < 25 && $linkDensity > 0.2) ||
            ($weight >= 25 && $linkDensity > 0.5) ||
            (($embedCount === 1 && $contentLength < 75) || $embedCount > 1);

        if ($haveToRemove) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\'.', $tag));

            NodeUtility::removeNode($node);
        }
    }
}
1527
|
|
|
|
1528
|
|
|
/**
 * Removes every element of the given tag from the article.
 *
 * For object, embed and iframe tags an element is kept when the `videos`
 * regex matches either its attribute values or its canonicalised markup,
 * so that embedded videos survive the clean-up.
 *
 * @param $article DOMDocument
 * @param $tag string tag to clean
 *
 * @return void
 **/
public function _clean(DOMDocument $article, $tag)
{
    $isEmbed = in_array($tag, ['object', 'embed', 'iframe']);

    // Walk the node list from its end so removals do not disturb the
    // entries that are still to be visited.
    $elements = $article->getElementsByTagName($tag);
    for ($index = $elements->length - 1; $index >= 0; $index--) {
        $element = $elements->item($index);

        // Allow youtube and vimeo videos through as people usually want to see those.
        if ($isEmbed) {
            $values = [];
            foreach ($element->attributes as $attribute) {
                $values[] = $attribute->nodeValue;
            }
            $joinedValues = implode('|', $values);

            // Check the element's own attributes first, then whatever it contains.
            if (preg_match(NodeUtility::$regexps['videos'], $joinedValues)
                || preg_match(NodeUtility::$regexps['videos'], $element->C14N())) {
                continue;
            }
        }

        $this->logger->debug(sprintf('[PrepArticle] Removing node \'%s\'.', $element->tagName));

        NodeUtility::removeNode($element);
    }
}
1569
|
|
|
|
1570
|
|
|
/**
 * Remove spurious h1 and h2 headers from the article.
 *
 * A header is removed when its class weight is negative. Class weights are
 * only computed when the configuration's getWeightClasses() is enabled;
 * otherwise every header keeps a weight of 0 and is left alone.
 *
 * @param DOMDocument $article
 *
 * @return void
 **/
public function _cleanHeaders(DOMDocument $article)
{
    $useClassWeights = $this->configuration->getWeightClasses();

    for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {
        // Snapshot the node list before removing anything: deleting nodes while
        // iterating the live list directly makes the loop skip the header that
        // follows each removed one.
        $headers = iterator_to_array($article->getElementsByTagName('h' . $headerIndex));

        /** @var DOMElement $header */
        foreach ($headers as $header) {
            $weight = $useClassWeights ? $header->getClassWeight() : 0;

            if ($weight < 0) {
                // mb_substr keeps the logged excerpt from being cut in the middle of a multibyte character.
                $this->logger->debug(sprintf('[PrepArticle] Removing H node with negative weight. Content was: \'%s\'', mb_substr($header->nodeValue, 0, 128)));

                NodeUtility::removeNode($header);
            }
        }
    }
}
1596
|
|
|
|
1597
|
|
|
/**
 * Removes the class attribute from the given node and from every element
 * beneath it.
 *
 * Readability.js has a special filter to avoid cleaning the classes that the algorithm adds. We don't add classes
 * here so no need to filter those.
 *
 * @param DOMDocument|DOMNode $node root of the subtree to clean
 *
 * @return void
 **/
public function _cleanClasses($node)
{
    // Only elements carry attributes; the root may also be the document itself.
    if ($node instanceof \DOMElement && $node->getAttribute('class') !== '') {
        $node->removeAttribute('class');
    }

    for ($child = $node->getFirstElementChild(); $child !== null; $child = $child->nextSibling) {
        // nextSibling also yields text, comment and other non-element nodes;
        // those have no class attribute, so only elements are recursed into.
        if ($child instanceof \DOMElement) {
            $this->_cleanClasses($child);
        }
    }
}
1618
|
|
|
|
1619
|
|
|
/**
 * Final pass over the extracted article.
 *
 * When the configuration's getFixRelativeURLs() is enabled, links whose href
 * starts with 'javascript:' are replaced by their text content, every other
 * truthy href is run through toAbsoluteURI(), and each image gets its src set
 * to the absolute form of the first truthy value among src, data-src,
 * data-original, data-orig and data-url. Class attributes are then cleaned
 * via _cleanClasses() regardless of that setting.
 *
 * @param DOMDocument $article
 *
 * @return DOMDocument the same document instance, after processing
 */
public function postProcessContent(DOMDocument $article)
{
    $this->logger->info('[PostProcess] PostProcessing content...');

    // Readability cannot open relative uris so we convert them to absolute uris.
    if ($this->configuration->getFixRelativeURLs()) {
        $anchors = iterator_to_array($article->getElementsByTagName('a'));
        foreach ($anchors as $link) {
            /** @var DOMElement $link */
            $href = $link->getAttribute('href');
            if (!$href) {
                continue;
            }

            if (strpos($href, 'javascript:') !== 0) {
                $this->logger->debug(sprintf('[PostProcess] Converting link to absolute URI: \'%s\'', substr($href, 0, 128)));

                $link->setAttribute('href', $this->toAbsoluteURI($href));

                continue;
            }

            // javascript: URIs stop working once scripts have been removed from
            // the page, so only the text of the link is kept.
            $this->logger->debug(sprintf('[PostProcess] Removing \'javascript:\' link. Content is: \'%s\'', substr($link->textContent, 0, 128)));

            $replacement = $article->createTextNode($link->textContent);
            $link->parentNode->replaceChild($replacement, $link);
        }

        // Attributes that may hold the image URL, in order of preference.
        $sourceAttributes = ['src', 'data-src', 'data-original', 'data-orig', 'data-url'];
        foreach ($article->getElementsByTagName('img') as $img) {
            /** @var DOMElement $img */
            $found = [];
            foreach ($sourceAttributes as $attributeName) {
                $value = $img->getAttribute($attributeName);
                if ($value) {
                    $found[] = $value;
                }
            }

            // reset() gives the first collected value, or false when none was found.
            $src = reset($found);
            if ($src) {
                $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($src, 0, 128)));

                $img->setAttribute('src', $this->toAbsoluteURI($src));
            }
        }
    }

    $this->_cleanClasses($article);

    return $article;
}
1676
|
|
|
|
1677
|
|
|
/**
 * Render the article as a string: the title wrapped in an <h1> element,
 * followed by whatever getContent() returns.
 *
 * @return string
 */
public function __toString()
{
    $title = $this->getTitle();
    $content = $this->getContent();

    return sprintf('<h1>%s</h1>%s', $title, $content);
}
1684
|
|
|
|
1685
|
|
|
/**
 * Title of the article.
 *
 * @return string|null the stored title, or null when none has been set
 */
public function getTitle()
{
    return $this->title;
}
1692
|
|
|
|
1693
|
|
|
/**
 * Store the title of the article.
 *
 * @param string $title
 */
protected function setTitle($title)
{
    $this->title = $title;
}
1700
|
|
|
|
1701
|
|
|
/**
 * Canonical (C14N) serialization of the parsed content document.
 *
 * @return string|null null when no content DOMDocument has been stored
 */
public function getContent()
{
    if (!($this->content instanceof DOMDocument)) {
        return null;
    }

    return $this->content->C14N();
}
1708
|
|
|
|
1709
|
|
|
/**
 * The content document itself, as opposed to its serialized form.
 *
 * @return DOMDocument|null the stored content document, or null when none has been set
 */
public function getDOMDocument()
{
    return $this->content;
}
1716
|
|
|
|
1717
|
|
|
/**
 * Store the document holding the parsed content.
 *
 * @param DOMDocument $content
 */
protected function setContent(DOMDocument $content)
{
    $this->content = $content;
}
1724
|
|
|
|
1725
|
|
|
/**
 * Excerpt of the article.
 *
 * @return string|null the stored excerpt, or null when none has been set
 */
public function getExcerpt()
{
    return $this->excerpt;
}
1732
|
|
|
|
1733
|
|
|
/**
 * Store the excerpt of the article.
 *
 * @param string|null $excerpt
 */
public function setExcerpt($excerpt)
{
    $this->excerpt = $excerpt;
}
1740
|
|
|
|
1741
|
|
|
/**
 * Main image of the article.
 *
 * @return string|null the stored image, or null when none has been set
 */
public function getImage()
{
    return $this->image;
}
1748
|
|
|
|
1749
|
|
|
/**
 * Store the main image of the article.
 *
 * @param string $image
 */
protected function setImage($image)
{
    $this->image = $image;
}
1756
|
|
|
|
1757
|
|
|
/**
 * Author of the article.
 *
 * @return string|null the stored author, or null when none has been set
 */
public function getAuthor()
{
    return $this->author;
}
1764
|
|
|
|
1765
|
|
|
/**
 * Store the author of the article.
 *
 * @param string $author
 */
protected function setAuthor($author)
{
    $this->author = $author;
}
1772
|
|
|
|
1773
|
|
|
/**
 * Website name.
 *
 * @return string|null the stored site name, or null when none has been set
 */
public function getSiteName()
{
    return $this->siteName;
}
1780
|
|
|
|
1781
|
|
|
/**
 * Store the website name.
 *
 * @param string $siteName
 */
protected function setSiteName($siteName)
{
    $this->siteName = $siteName;
}
1788
|
|
|
|
1789
|
|
|
/**
 * Direction of the text.
 *
 * @return string|null the stored direction, or null when none has been set
 */
public function getDirection()
{
    return $this->direction;
}
1796
|
|
|
|
1797
|
|
|
/**
 * Store the direction of the text.
 *
 * @param string|null $direction
 */
public function setDirection($direction)
{
    $this->direction = $direction;
}
1804
|
|
|
} |
1805
|
|
|
|