|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace andreskrey\Readability; |
|
4
|
|
|
|
|
5
|
|
|
use andreskrey\Readability\Nodes\DOM\DOMDocument; |
|
6
|
|
|
use andreskrey\Readability\Nodes\DOM\DOMElement; |
|
7
|
|
|
use andreskrey\Readability\Nodes\DOM\DOMNode; |
|
8
|
|
|
use andreskrey\Readability\Nodes\DOM\DOMText; |
|
9
|
|
|
use andreskrey\Readability\Nodes\NodeUtility; |
|
10
|
|
|
use Psr\Log\LoggerInterface; |
|
11
|
|
|
|
|
12
|
|
|
/** |
|
13
|
|
|
* Class Readability. |
|
14
|
|
|
*/ |
|
15
|
|
|
class Readability |
|
16
|
|
|
{ |
|
17
|
|
|
/**
 * Working DOMDocument. parse() assigns it from loadHTML() and reloads it on every retry.
 *
 * @var DOMDocument
 */
protected $dom;

/**
 * Title of the article, or null while none has been set.
 *
 * @var string|null
 */
protected $title;

/**
 * Final DOMDocument with the fully parsed HTML, or null before a successful parse.
 *
 * @var DOMDocument|null
 */
protected $content;

/**
 * Excerpt of the article. Taken from the metadata when available; parse() otherwise
 * falls back to the text of the first paragraph of the result.
 *
 * @var string|null
 */
protected $excerpt;

/**
 * Main image of the article.
 *
 * @var string|null
 */
protected $image;

/**
 * Author of the article. Extracted from the metadata or from a byline node.
 *
 * @var string|null
 */
protected $author;

/**
 * Website name.
 *
 * @var string|null
 */
protected $siteName;

/**
 * Direction of the text.
 *
 * @var string|null
 */
protected $direction;

/**
 * Configuration object handed to the constructor.
 *
 * @var Configuration
 */
private $configuration;

/**
 * Logger object, obtained from the configuration in the constructor.
 *
 * @var LoggerInterface
 */
private $logger;

/**
 * Collection of attempted text extractions. parse() appends one
 * ['articleContent' => ..., 'textLength' => ...] entry per attempt that missed the threshold.
 *
 * @var array
 */
private $attempts = [];

/**
 * Tag names that getNodes() adds to the list of elements to score.
 *
 * @var array
 */
private $defaultTagsToScore = ['section', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'td', 'pre'];

/**
 * Tag names exempt from being altered to DIV.
 * NOTE(review): the consumer of this list is outside this part of the file - confirm usage there.
 *
 * @var array
 */
private $alterToDIVExceptions = ['div', 'article', 'section', 'p'];
|
118
|
|
|
|
|
119
|
|
|
/**
 * Readability constructor.
 *
 * Stores the configuration and picks up the logger that the configuration exposes.
 *
 * @param Configuration $configuration
 */
public function __construct(Configuration $configuration)
{
    $this->logger = $configuration->getLogger();
    $this->configuration = $configuration;
}
|
129
|
|
|
|
|
130
|
|
|
/**
 * Main parse function.
 *
 * Loads the HTML, collects metadata and the main image, then scores the document.
 * When the extracted text is shorter than the configured character threshold the
 * document is reloaded and scored again with one more flag disabled
 * (StripUnlikelyCandidates, then WeightClasses, then CleanConditionally). Once every
 * flag is off, the longest attempt seen so far is used.
 *
 * @param string $html
 *
 * @throws ParseException when there is no usable body or no attempt produced any text
 *
 * @return bool always true; failures are reported through ParseException
 */
public function parse($html)
{
    $this->logger->info('*** Starting parse process...');

    $this->dom = $this->loadHTML($html);

    // Checking for minimum HTML to work with.
    if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
        $this->logger->emergency('No body tag present or body tag empty');

        throw new ParseException('Invalid or incomplete HTML.');
    }

    $this->getMetadata();

    $this->getMainImage();

    while (true) {
        $root = $root->firstChild;

        $elementsToScore = $this->getNodes($root);
        $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));

        $result = $this->rateNodes($elementsToScore);

        /*
         * Now that we've gone through the full algorithm, check to see if
         * we got any meaningful content. If we didn't, we may need to re-run
         * grabArticle with different flags set. This gives us a higher likelihood of
         * finding the content, and the sieve approach gives us a higher likelihood of
         * finding the -right- content.
         */

        // The check below treats $result as possibly falsy, so do not dereference it blindly.
        $length = $result
            ? mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent))
            : 0;

        $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getCharThreshold()));

        if (!$result || $length >= $this->configuration->getCharThreshold()) {
            break;
        }

        // Threshold not met: start over from a pristine DOM and remember this attempt.
        $this->dom = $this->loadHTML($html);
        $root = $this->dom->getElementsByTagName('body')->item(0);
        $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];

        if ($this->configuration->getStripUnlikelyCandidates()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
            $this->configuration->setStripUnlikelyCandidates(false);
        } elseif ($this->configuration->getWeightClasses()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
            $this->configuration->setWeightClasses(false);
        } elseif ($this->configuration->getCleanConditionally()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
            $this->configuration->setCleanConditionally(false);
        } else {
            $this->logger->debug('[Parsing] Threshold not met, searching across attempts for some content.');

            // No luck after removing flags, just return the longest text we found during the different loops.
            // The comparator must return an integer: a boolean result is deprecated
            // since PHP 8.0 and does not define a reliable ordering.
            usort($this->attempts, function ($a, $b) {
                return $b['textLength'] - $a['textLength'];
            });

            // But first check if we actually have something
            if (!$this->attempts[0]['textLength']) {
                $this->logger->emergency('[Parsing] Could not parse text, giving up :(');

                throw new ParseException('Could not parse text.');
            }

            $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');

            $result = $this->attempts[0]['articleContent'];
            break;
        }
    }

    $result = $this->postProcessContent($result);

    // If we haven't found an excerpt in the article's metadata, use the article's
    // first paragraph as the excerpt. This can be used for displaying a preview of
    // the article's content.
    if (!$this->getExcerpt()) {
        $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
        $paragraphs = $result->getElementsByTagName('p');
        if ($paragraphs->length > 0) {
            $this->setExcerpt(trim($paragraphs->item(0)->textContent));
        }
    }

    $this->setContent($result);

    $this->logger->info('*** Parse successful :)');

    return true;
}
|
237
|
|
|
|
|
238
|
|
|
/**
 * Creates a DOM Document object and loads the provided HTML on it.
 *
 * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text).
 * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
 * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
 * objects and ruining the backup.
 *
 * Depending on the configuration the HTML is first entity-normalized and/or stripped of script tags
 * with a regular expression. After loading, scripts are removed and the document is prepared.
 *
 * @param string $html
 *
 * @return DOMDocument
 */
private function loadHTML($html)
{
    $this->logger->debug('[Loading] Loading HTML...');

    // To avoid throwing a gazillion of errors on malformed HTMLs
    libxml_use_internal_errors(true);

    $dom = new DOMDocument('1.0', 'utf-8');

    if (!$this->configuration->getSubstituteEntities()) {
        // Keep the original HTML entities
        $dom->substituteEntities = false;
    }

    if ($this->configuration->getNormalizeEntities()) {
        $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
        // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
        $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
    }

    if ($this->configuration->getSummonCthulhu()) {
        // Tag names are case-insensitive in HTML, hence the "i" modifier.
        $withoutScripts = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/i', '', $html);

        if ($withoutScripts !== null) {
            $html = $withoutScripts;
            $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
        } else {
            // preg_replace() returns null on PCRE errors (e.g. backtrack limit on huge inputs).
            // Keep the untouched HTML instead of loading an empty document; removeScripts()
            // below still drops the script nodes from the DOM.
            $this->logger->warning('[Loading] Could not remove script tags via regex, keeping the original HTML.');
        }
    }

    // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
    $dom->loadHTML('<?xml encoding="UTF-8">'.$html);
    $dom->encoding = 'UTF-8';

    $this->removeScripts($dom);

    $this->prepDocument($dom);

    $this->logger->debug('[Loading] Loaded HTML successfully.');

    return $dom;
}
|
287
|
|
|
|
|
288
|
|
|
/**
 * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties.
 *
 * Every <meta> tag is inspected: its "property" attribute may carry several space-separated
 * names (e.g. "og:title twitter:title"), its "name" attribute a single one. The collected
 * values then feed title, author, excerpt, image and site name, each picked from a priority list.
 */
private function getMetadata()
{
    $this->logger->debug('[Metadata] Retrieving metadata...');

    $values = [];
    // property is a space-separated list of values
    $propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image|site_name)(?!:)\s*/i';

    // name is a single value
    $namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)(?!:)\s*$/i';

    // Find description tags.
    foreach ($this->dom->getElementsByTagName('meta') as $meta) {
        /* @var DOMNode $meta */
        $elementName = $meta->getAttribute('name');
        $elementProperty = $meta->getAttribute('property');
        $content = $meta->getAttribute('content');
        $propertyMatched = false;

        if ($elementProperty && preg_match_all($propertyPattern, $elementProperty, $matches)) {
            $propertyMatched = true;

            // $matches[0] lists the complete "prefix:name" occurrences. Walking the capture
            // groups instead would also register bare keys such as "og" or "image".
            for ($i = count($matches[0]) - 1; $i >= 0; $i--) {
                // Convert to lowercase, and remove any whitespace
                // so we can match below.
                $name = preg_replace('/\s/', '', mb_strtolower($matches[0][$i]));
                // multiple authors
                $values[$name] = trim($content);
            }
        }

        if (!$propertyMatched && $elementName && preg_match($namePattern, $elementName)) {
            if ($content) {
                // Convert to lowercase, remove any whitespace, and convert dots
                // to colons so we can match below.
                $name = preg_replace(['/\s/', '/\./'], ['', ':'], mb_strtolower($elementName));
                $values[$name] = trim($content);
            }
        }
    }

    // Returns the value of the first candidate key present in $values, or null.
    $firstAvailable = function (array $candidates) use ($values) {
        foreach ($candidates as $candidate) {
            if (isset($values[$candidate])) {
                return $values[$candidate];
            }
        }

        return null;
    };

    // get title
    $title = $firstAvailable([
        'dc:title',
        'dcterm:title',
        'og:title',
        'weibo:article:title',
        'weibo:webpage:title',
        'title',
        'twitter:title',
    ]);

    $this->setTitle($title !== null ? trim($title) : null);

    if (!$this->getTitle()) {
        $this->setTitle($this->getArticleTitle());
    }

    // get author
    $this->setAuthor($firstAvailable([
        'dc:creator',
        'dcterm:creator',
        'author',
    ]));

    // get description
    $this->setExcerpt($firstAvailable([
        'dc:description',
        'dcterm:description',
        'og:description',
        'weibo:article:description',
        'weibo:webpage:description',
        'description',
        'twitter:description',
    ]));

    // get main image
    $this->setImage($firstAvailable([
        'image',
        'og:image',
        'twitter:image',
    ]));

    // get site name
    $this->setSiteName($firstAvailable([
        'og:site_name',
    ]));
}
|
395
|
|
|
|
|
396
|
|
|
/**
 * Returns all the images of the parsed article.
 *
 * The main image (when set) comes first, followed by the "src" of every <img> found in the
 * parsed document. With FixRelativeURLs enabled every entry is converted to an absolute URI.
 * Empty entries and duplicates are dropped; array keys are not renumbered afterwards.
 *
 * @return array
 */
public function getImages()
{
    $result = [];
    if ($this->getImage()) {
        $result[] = $this->getImage();
    }

    // Nothing parsed yet: only the main image (if any) can be reported.
    if (null == $this->getDOMDocument()) {
        return $result;
    }

    foreach ($this->getDOMDocument()->getElementsByTagName('img') as $img) {
        if ($src = $img->getAttribute('src')) {
            $result[] = $src;
        }
    }

    if ($this->configuration->getFixRelativeURLs()) {
        // Write back by key rather than iterating by reference, so no reference to the
        // last element outlives the loop.
        foreach ($result as $index => $imgSrc) {
            $result[$index] = $this->toAbsoluteURI($imgSrc);
        }
    }

    return array_unique(array_filter($result));
}
|
428
|
|
|
|
|
429
|
|
|
/**
 * Tries to get the main article image.
 *
 * Keeps the image already found by getMetadata() when there is one; otherwise looks for a
 * <link rel="img_src"> or <link rel="image_src"> tag and uses its href. The chosen URL is
 * stored through setImage(), converted to an absolute URI when FixRelativeURLs is enabled.
 */
public function getMainImage()
{
    $imgUrl = $this->getImage();

    if (!$imgUrl) {
        foreach ($this->dom->getElementsByTagName('link') as $link) {
            /** @var \DOMElement $link */
            /*
             * Check for the rel attribute, then check if the rel attribute is either img_src or image_src, and
             * finally check for the existence of the href attribute, which should hold the image url.
             */
            if (
                $link->hasAttribute('rel') &&
                in_array($link->getAttribute('rel'), ['img_src', 'image_src'], true) &&
                $link->hasAttribute('href')
            ) {
                $imgUrl = $link->getAttribute('href');
                break;
            }
        }
    }

    if (empty($imgUrl)) {
        return;
    }

    // Making the URL absolute is optional; storing the image that was found is not.
    if ($this->configuration->getFixRelativeURLs()) {
        $imgUrl = $this->toAbsoluteURI($imgUrl);
    }

    $this->setImage($imgUrl);
}
|
459
|
|
|
|
|
460
|
|
|
/**
 * Returns the title of the html. Prioritizes the title from the metadata against the title tag.
 *
 * The raw title is then cleaned up:
 *  - with a " | ", " - ", " \ ", " / ", " > " or " » " separator, the last part is dropped
 *    (or the first part, when what remains is shorter than 3 words);
 *  - with a ": ", and no H1/H2 carrying the exact title, the text after the last colon is used
 *    (or after the first colon when that is too short);
 *  - when it is longer than 150 or shorter than 15 characters and the page has exactly one H1,
 *    the H1 text is used.
 * If the cleaned title ends up with 4 words or fewer, the original title may be restored.
 *
 * @return string|null null when neither metadata nor a <title> tag provides a title
 */
private function getArticleTitle()
{
    $originalTitle = null;

    if ($this->getTitle()) {
        $originalTitle = $this->getTitle();
    } else {
        $this->logger->debug('[Metadata] Could not find title in metadata, searching for the title tag...');
        $titleTag = $this->dom->getElementsByTagName('title');
        if ($titleTag->length > 0) {
            $this->logger->info(sprintf('[Metadata] Using title tag as article title: \'%s\'', $titleTag->item(0)->nodeValue));
            $originalTitle = $titleTag->item(0)->nodeValue;
        }
    }

    if ($originalTitle === null) {
        return null;
    }

    $countWords = function ($text) {
        return count(preg_split('/\s+/', $text));
    };

    $curTitle = $originalTitle = trim($originalTitle);
    $titleHadHierarchicalSeparators = false;

    /*
     * If there's a separator in the title, first remove the final part.
     *
     * The "u" modifier is required on every pattern containing "»": without it the
     * character class works on single bytes, never matches " » " and can cut a
     * multibyte character in half.
     */
    if (preg_match('/ [\|\-\\\\\/>»] /iu', $curTitle)) {
        $titleHadHierarchicalSeparators = (bool) preg_match('/ [\\\\\/>»] /u', $curTitle);
        $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/iu', '$1', $originalTitle);

        $this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle));

        // If the resulting title is too short (3 words or fewer), remove
        // the first part instead:
        if ($countWords($curTitle) < 3) {
            $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/iu', '$1', $originalTitle);
            $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
        }
    } elseif (strpos($curTitle, ': ') !== false) {
        // Check if we have an heading containing this exact string, so we
        // could assume it's the full title.
        $match = false;
        for ($i = 1; $i <= 2 && !$match; $i++) {
            foreach ($this->dom->getElementsByTagName('h'.$i) as $hTag) {
                // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs
                if (trim($hTag->nodeValue) === trim($curTitle)) {
                    $match = true;
                    break;
                }
            }
        }

        // If we don't, let's extract the title out of the original title string.
        if (!$match) {
            $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);

            $this->logger->info(sprintf('[Metadata] Title has a colon in the middle, new title is: \'%s\'', $curTitle));

            // If the title is now too short, try the first colon instead:
            if ($countWords($curTitle) < 3) {
                $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
                $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
            } elseif ($countWords(substr($originalTitle, 0, strpos($originalTitle, ':'))) > 5) {
                // But if we have too many words before the colon there's something weird
                // with the titles and the H tags so let's just use the original title instead.
                // The words are counted on the original title: $curTitle no longer holds a colon here.
                $curTitle = $originalTitle;
            }
        }
    } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
        $hOnes = $this->dom->getElementsByTagName('h1');

        if ($hOnes->length === 1) {
            $curTitle = $hOnes->item(0)->nodeValue;
            $this->logger->info(sprintf('[Metadata] Using title from an H1 node: \'%s\'', $curTitle));
        }
    }

    $curTitle = trim($curTitle);

    /*
     * If we now have 4 words or fewer as our title, and either no
     * 'hierarchical' separators (\, /, > or ») were found in the original
     * title or we decreased the number of words by more than 1 word, use
     * the original title.
     */
    $curTitleWordCount = $countWords($curTitle);
    $originalTitleWordCount = $countWords(preg_replace('/[\|\-\\\\\/>»]+/u', '', $originalTitle)) - 1;

    if ($curTitleWordCount <= 4 &&
        (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
        $curTitle = $originalTitle;

        $this->logger->info(sprintf('[Metadata] Resulting title is too short, using the original title instead: \'%s\'', $curTitle));
    }

    return $curTitle;
}
|
563
|
|
|
|
|
564
|
|
|
/**
 * Convert URI to an absolute URI.
 *
 * Resolution is based on the path info of the configured original URL:
 *  - URIs that already start with a scheme and hash-only URIs are returned untouched;
 *  - "//host/..." gets the scheme prepended;
 *  - "/path" is appended to scheme and host;
 *  - "./path" and plain relative paths are appended to the base path.
 *
 * @param $uri string URI to convert
 *
 * @return string
 */
private function toAbsoluteURI($uri)
{
    list($base, $protocol, $origin) = $this->getPathInfo($this->configuration->getOriginalURL());

    // If this is already an absolute URI, return it.
    if (preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri)) {
        return $uri;
    }

    // Scheme-rooted relative URI.
    if (strncmp($uri, '//', 2) === 0) {
        return $protocol.'://'.substr($uri, 2);
    }

    // Prepath-rooted relative URI.
    if (strncmp($uri, '/', 1) === 0) {
        return $origin.$uri;
    }

    // Dotslash relative URI.
    if (strncmp($uri, './', 2) === 0) {
        return $base.substr($uri, 2);
    }

    // Hash URIs are left alone; anything else is a standard relative URI that gets
    // the entire base path, which already ends with "/".
    return strncmp($uri, '#', 1) === 0 ? $uri : $base.$uri;
}
|
603
|
|
|
|
|
604
|
|
|
/**
 * Returns full path info of an URL.
 *
 * The base path is built from scheme, host and the directory part of the URL path, i.e.
 * everything up to and including the last "/" ("/" when the URL has no path). If the
 * document declares a base URI it is taken into account: a base starting with "/"
 * replaces the directory entirely, any other base is appended to it.
 *
 * @param string $url
 *
 * @return array [$pathBase, $scheme, $prePath]
 */
public function getPathInfo($url)
{
    $origin = parse_url($url, PHP_URL_SCHEME).'://'.parse_url($url, PHP_URL_HOST);

    // Directory of the document, always ending with "/". dirname() is avoided on purpose:
    // it returns "/" for root-level files (which produced "//" once the separator was
    // appended), drops the last segment of paths ending in "/", and is OS dependent.
    $path = (string) parse_url($url, PHP_URL_PATH);
    $lastSlash = strrpos($path, '/');
    $directory = $lastSlash === false ? '/' : substr($path, 0, $lastSlash + 1);

    // Check for base URLs
    $baseURI = $this->dom->baseURI;
    if ($baseURI === null) {
        $pathBase = $origin.$directory;
    } elseif (substr($baseURI, 0, 1) === '/') {
        // URLs starting with '/' override completely the URL defined in the link
        $pathBase = $origin.$baseURI;
    } else {
        // Otherwise just prepend the base to the actual path
        $pathBase = $origin.$directory.rtrim($baseURI, '/').'/';
    }

    $scheme = parse_url($pathBase, PHP_URL_SCHEME);
    $prePath = $scheme.'://'.parse_url($pathBase, PHP_URL_HOST);

    return [$pathBase, $scheme, $prePath];
}
|
631
|
|
|
|
|
632
|
|
|
/**
 * Gets nodes from the root element.
 *
 * Walks the tree starting at $node and, for every node in turn:
 *  - removes comment nodes, nodes that are probably hidden, bylines, unlikely candidates
 *    (only while StripUnlikelyCandidates is enabled) and empty container/heading/paragraph nodes;
 *  - collects nodes whose tag is listed in $defaultTagsToScore;
 *  - for DIVs, wraps runs of phrasing content in new P nodes, then either replaces the DIV
 *    with its single P child or, when it has no single child block element, turns it into a P.
 *
 * @param $node DOMNode|DOMText
 *
 * @return array nodes to be scored
 */
private function getNodes($node)
{
    $this->logger->info('[Get Nodes] Retrieving nodes...');

    $stripUnlikelyCandidates = $this->configuration->getStripUnlikelyCandidates();

    // Tags that are dropped when they carry no content (e.g. text, image, video, or iframe).
    $removableWhenEmpty = ['div', 'section', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'];

    // Log excerpts are cut with mb_substr so a multibyte character is never split in half.
    $excerpt = function ($loggedNode) {
        return mb_substr($loggedNode->nodeValue, 0, 128);
    };

    $elementsToScore = [];

    /*
     * First, node prepping. Trash nodes that look cruddy (like ones with the
     * class name "comment", etc), and turn divs into P tags where they have been
     * used inappropriately (as in, where they contain no other block level elements.)
     */

    while ($node) {
        // Remove DOMComments nodes as we don't need them and mess up children counting
        if ($node->nodeType === XML_COMMENT_NODE) {
            $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', $excerpt($node)));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        $matchString = $node->getAttribute('class').' '.$node->getAttribute('id');

        if (!$node->isProbablyVisible()) {
            $this->logger->debug(sprintf('[Get Nodes] Removing hidden node... Match string was: \'%s\'', $matchString));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Check to see if this node is a byline, and remove it if it is.
        if ($this->checkByline($node, $matchString)) {
            $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', $excerpt($node)));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Remove unlikely candidates
        if (
            $stripUnlikelyCandidates &&
            preg_match(NodeUtility::$regexps['unlikelyCandidates'], $matchString) &&
            !preg_match(NodeUtility::$regexps['okMaybeItsACandidate'], $matchString) &&
            $node->nodeName !== 'body' &&
            $node->nodeName !== 'a'
        ) {
            $this->logger->debug(sprintf('[Get Nodes] Removing unlikely candidate. Node content was: \'%s\'', $excerpt($node)));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Remove DIV, SECTION, HEADER, heading and P nodes without any content.
        if (in_array($node->nodeName, $removableWhenEmpty, true) && $node->isElementWithoutContent()) {
            $this->logger->debug(sprintf('[Get Nodes] Removing empty \'%s\' node.', $node->nodeName));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        if (in_array(strtolower($node->nodeName), $this->defaultTagsToScore)) {
            $this->logger->debug(sprintf('[Get Nodes] Adding node to score list, node content is: \'%s\'', $excerpt($node)));
            $elementsToScore[] = $node;
        }

        // Turn all divs that don't have children block level elements into p's
        if ($node->nodeName === 'div') {
            // Put phrasing content into paragraphs.
            $p = null;
            $childNode = $node->firstChild;
            while ($childNode) {
                // Remember the sibling first: the child may be moved into a new P below.
                $nextSibling = $childNode->nextSibling;
                if ($childNode->isPhrasingContent()) {
                    if ($p !== null) {
                        $p->appendChild($childNode);
                    } elseif (!$childNode->isWhitespace()) {
                        $p = $this->dom->createElement('p');
                        $node->replaceChild($p, $childNode);
                        $p->appendChild($childNode);
                    }
                } elseif ($p !== null) {
                    // A non-phrasing child closes the current paragraph: drop its trailing whitespace.
                    while ($p->lastChild && $p->lastChild->isWhitespace()) {
                        $p->removeChild($p->lastChild);
                    }
                    $p = null;
                }
                $childNode = $nextSibling;
            }

            /*
             * Sites like http://mobile.slate.com encloses each paragraph with a DIV
             * element. DIVs with only a P element inside and no text content can be
             * safely converted into plain P elements to avoid confusing the scoring
             * algorithm with DIVs with are, in practice, paragraphs.
             */
            if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) {
                $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', $excerpt($node)));
                $pNode = NodeUtility::filterTextNodes($node->childNodes)->item(0);
                $node->parentNode->replaceChild($pNode, $node);
                $node = $pNode;
                $elementsToScore[] = $node;
            } elseif (!$node->hasSingleChildBlockElement()) {
                $this->logger->debug(sprintf('[Get Nodes] Found DIV without a single child block element, converting to a P node. Node content is: \'%s\'', $excerpt($node)));
                $node = NodeUtility::setNodeTag($node, 'p');
                $elementsToScore[] = $node;
            }
        }

        $node = NodeUtility::getNextNode($node);
    }

    return $elementsToScore;
}
|
754
|
|
|
|
|
755
|
|
|
/**
 * Checks if the node is a byline.
 *
 * A node counts as a byline when its rel attribute is "author" or its class/id string
 * matches the byline regex, and in both cases its text passes isValidByline(). On a hit
 * the trimmed text is stored as the article author. Nothing is checked when ArticleByLine
 * is disabled or an author is already known.
 *
 * @param DOMNode $node
 * @param string $matchString class and id of the node, as built by the caller
 *
 * @return bool true when the node was recognized as the byline
 */
private function checkByline($node, $matchString)
{
    if (!$this->configuration->getArticleByLine()) {
        return false;
    }

    /*
     * Check if the byline is already set
     */
    if ($this->getAuthor()) {
        return false;
    }

    $rel = $node->getAttribute('rel');

    // "&&" binds tighter than "||", so the two byline hints are grouped explicitly:
    // the text must be a plausible byline whichever hint fired.
    $looksLikeByline = $rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString);

    if ($looksLikeByline && $this->isValidByline($node->getTextContent())) {
        $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $node->getTextContent()));
        $this->setAuthor(trim($node->getTextContent()));

        return true;
    }

    return false;
}
|
787
|
|
|
|
|
788
|
|
|
/**
 * Tells whether a text is acceptable as a byline.
 *
 * Only strings qualify. After trimming, the text must contain at least one
 * character and fewer than 100 characters (counted with mb_strlen).
 *
 * @param string $text Candidate byline text
 *
 * @return bool
 */
private function isValidByline($text)
{
    // Anything that is not a string can never be a byline.
    if (!is_string($text)) {
        return false;
    }

    $characters = mb_strlen(trim($text));

    return $characters > 0 && $characters < 100;
}
|
805
|
|
|
|
|
806
|
|
|
/**
 * Strips every <script> and <noscript> element from the document.
 *
 * @param DOMDocument $dom Document to clean, modified in place
 */
private function removeScripts(DOMDocument $dom)
{
    $unwantedTags = ['script', 'noscript'];

    foreach ($unwantedTags as $tagName) {
        // Copy the matches into a plain array first so that removing nodes
        // does not interfere with the iteration.
        $matches = iterator_to_array($dom->getElementsByTagName($tagName));

        foreach ($matches as $match) {
            NodeUtility::removeNode($match);
        }
    }
}
|
820
|
|
|
|
|
821
|
|
|
/**
 * Prepares the document for parsing.
 *
 * Two in-place passes are made:
 *  - every chain of two or more <br> elements is collapsed: the extra <br>s are
 *    removed and the first one is replaced by a <p> that adopts the phrasing
 *    content following it;
 *  - every <font> element is converted into a <span>.
 *
 * @param DOMDocument $dom Document to normalise
 */
private function prepDocument(DOMDocument $dom)
{
    $this->logger->info('[PrepDocument] Preparing document for parsing...');

    foreach ($dom->shiftingAwareGetElementsByTagName('br') as $br) {
        // Becomes true once at least one extra <br> of a chain has been removed.
        $chainRemoved = false;

        /*
         * Walk forward from the <br>: while the next relevant node is another
         * <br>, drop it. The first <br> of the chain is left in place and gets
         * replaced with a <p> below.
         */
        $cursor = NodeUtility::nextElement($br->nextSibling);
        while ($cursor && $cursor->nodeName === 'br') {
            $this->logger->debug('[PrepDocument] Removing chain of BR nodes...');

            $chainRemoved = true;
            $afterRemoved = $cursor->nextSibling;
            $cursor->parentNode->removeChild($cursor);
            $cursor = NodeUtility::nextElement($afterRemoved);
        }

        // A lone <br> is left untouched.
        if (!$chainRemoved) {
            continue;
        }

        // Swap the surviving <br> for a fresh paragraph.
        $paragraph = $dom->createElement('p');
        $br->parentNode->replaceChild($paragraph, $br);

        // Move the following siblings into the paragraph until another
        // <br><br> chain or a non-phrasing node is reached.
        $cursor = $paragraph->nextSibling;
        while ($cursor) {
            if ($cursor->nodeName === 'br') {
                $following = NodeUtility::nextElement($cursor->nextSibling);
                if ($following && $following->nodeName === 'br') {
                    // Start of the next chain: this paragraph is complete.
                    break;
                }
            }

            if (!$cursor->isPhrasingContent()) {
                break;
            }

            $this->logger->debug('[PrepDocument] Replacing BR with a P node...');

            // Remember the successor before the node is moved.
            $upcoming = $cursor->nextSibling;
            $paragraph->appendChild($cursor);
            $cursor = $upcoming;
        }

        // Trim trailing whitespace nodes from the new paragraph.
        while ($paragraph->lastChild && $paragraph->lastChild->isWhitespace()) {
            $paragraph->removeChild($paragraph->lastChild);
        }

        // A <p> must not sit directly inside another <p>: turn the parent into a <div>.
        if ($paragraph->parentNode->tagName === 'p') {
            NodeUtility::setNodeTag($paragraph->parentNode, 'div');
        }
    }

    // Replace font tags with span, walking the list from its end to its start.
    $fonts = $dom->getElementsByTagName('font');
    for ($index = $fonts->length - 1; $index >= 0; $index--) {
        $this->logger->debug('[PrepDocument] Converting font tag into a span tag.');
        NodeUtility::setNodeTag($fonts->item($index), 'span');
    }
}
|
904
|
|
|
|
|
905
|
|
|
/**
 * Assign scores to each node. Returns full article parsed or false on error.
 *
 * Steps, in order:
 *  1. every node with enough text adds a score to each of its ancestors, which
 *     become candidates;
 *  2. candidate scores are scaled by link density and the best ones are kept;
 *  3. the top candidate is chosen (falling back to a copy of the body children)
 *     and possibly replaced by one of its ancestors;
 *  4. the top candidate and its qualifying siblings are imported into a new
 *     document, which is then passed through prepArticle().
 *
 * @param array $nodes Nodes to score
 *
 * @return DOMDocument|bool The article document, or false when nothing was appended to it
 */
private function rateNodes($nodes)
{
    $this->logger->info('[Rating] Rating nodes...');

    $candidates = [];

    /** @var DOMElement $node */
    foreach ($nodes as $node) {
        if (is_null($node->parentNode)) {
            continue;
        }

        // The node text is needed several times below, so it is read once.
        $innerText = $node->getTextContent(true);

        // Discard nodes with less than 25 characters, without blank space
        if (mb_strlen($innerText) < 25) {
            continue;
        }

        $ancestors = $node->getNodeAncestors();

        // Exclude nodes with no ancestor
        if (count($ancestors) === 0) {
            continue;
        }

        // Start with a point for the paragraph itself as a base.
        $contentScore = 1;

        // Add points for any commas within this paragraph (number of comma-separated pieces).
        $contentScore += count(explode(',', $innerText));

        // For every 100 characters in this paragraph, add another point. Up to 3 points.
        $contentScore += min(floor(mb_strlen($innerText) / 100), 3);

        $this->logger->debug(sprintf('[Rating] Node score %s, content: \'%s\'', $contentScore, substr($node->nodeValue, 0, 128)));

        /** @var $ancestor DOMElement */
        foreach ($ancestors as $level => $ancestor) {
            $this->logger->debug('[Rating] Found ancestor, initializing and adding it as a candidate...');
            if (!$ancestor->isInitialized()) {
                $ancestor->initializeNode($this->configuration->getWeightClasses());
                $candidates[] = $ancestor;
            }

            /*
             * Node score divider:
             *  - parent:             1 (no division)
             *  - grandparent:        2
             *  - great grandparent+: ancestor level * 3
             */
            if ($level === 0) {
                $scoreDivider = 1;
            } elseif ($level === 1) {
                $scoreDivider = 2;
            } else {
                $scoreDivider = $level * 3;
            }

            $currentScore = $ancestor->contentScore;
            $ancestor->contentScore = $currentScore + ($contentScore / $scoreDivider);

            $this->logger->debug(sprintf('[Rating] Ancestor score %s, value: \'%s\'', $ancestor->contentScore, substr($ancestor->nodeValue, 0, 128)));
        }
    }

    /*
     * After we've calculated scores, loop through all of the possible
     * candidate nodes we found and find the one with the highest score.
     */
    $topCandidates = [];
    foreach ($candidates as $candidate) {
        /*
         * Scale the final candidates score based on link density. Good content
         * should have a relatively small link density (5% or less) and be mostly
         * unaffected by this operation.
         */
        $candidate->contentScore = $candidate->contentScore * (1 - $candidate->getLinkDensity());

        // Insert the candidate at its position in the list, which is kept
        // sorted by descending score and capped at the configured maximum.
        for ($i = 0; $i < $this->configuration->getMaxTopCandidates(); $i++) {
            $aTopCandidate = isset($topCandidates[$i]) ? $topCandidates[$i] : null;

            if (!$aTopCandidate || $candidate->contentScore > $aTopCandidate->contentScore) {
                array_splice($topCandidates, $i, 0, [$candidate]);
                if (count($topCandidates) > $this->configuration->getMaxTopCandidates()) {
                    array_pop($topCandidates);
                }
                break;
            }
        }
    }

    $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;
    $parentOfTopCandidate = null;

    /*
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     */
    if ($topCandidate === null || $topCandidate->nodeName === 'body') {
        $this->logger->info('[Rating] No top candidate found or top candidate is the body tag. Moving all child nodes to a new DIV node.');

        // Move all of the page's children into topCandidate
        $topCandidate = new DOMDocument('1.0', 'utf-8');
        $topCandidate->encoding = 'UTF-8';
        $topCandidate->appendChild($topCandidate->createElement('div', ''));
        $kids = $this->dom->getElementsByTagName('body')->item(0)->childNodes;

        // Cannot be foreached, don't ask me why.
        for ($i = 0; $i < $kids->length; $i++) {
            $import = $topCandidate->importNode($kids->item($i), true);
            $topCandidate->firstChild->appendChild($import);
        }

        // Candidate must be created using firstChild to grab the DOMElement instead of the DOMDocument.
        $topCandidate = $topCandidate->firstChild;
    } elseif ($topCandidate) {
        $this->logger->info(sprintf('[Rating] Found top candidate, score: %s', $topCandidate->contentScore));
        // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
        // and whose scores are quite close to the current `topCandidate` node.
        $alternativeCandidateAncestors = [];
        for ($i = 1; $i < count($topCandidates); $i++) {
            // In some cases we may end up with a top candidate with zero content score. To avoid dividing by zero
            // we have to use max() and replace zero with a low value like 0.1
            if ($topCandidates[$i]->contentScore / max($topCandidate->contentScore, 0.1) >= 0.75) {
                array_push($alternativeCandidateAncestors, $topCandidates[$i]->getNodeAncestors(false));
            }
        }

        $MINIMUM_TOPCANDIDATES = 3;
        if (count($alternativeCandidateAncestors) >= $MINIMUM_TOPCANDIDATES) {
            $parentOfTopCandidate = $topCandidate->parentNode;

            // Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher
            while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
                $listsContainingThisAncestor = 0;
                for ($ancestorIndex = 0; $ancestorIndex < count($alternativeCandidateAncestors) && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) {
                    // Strict search: the ancestor must be the very same node object. A loose
                    // search would compare the objects' properties instead of their identity.
                    $listsContainingThisAncestor += (int) in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true);
                }
                if ($listsContainingThisAncestor >= $MINIMUM_TOPCANDIDATES) {
                    $topCandidate = $parentOfTopCandidate;
                    break;
                }
                $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
            }
        }

        /*
         * Because of our bonus system, parents of candidates might have scores
         * themselves. They get half of the node. There won't be nodes with higher
         * scores than our topCandidate, but if we see the score going *up* in the first
         * few steps up the tree, that's a decent sign that there might be more content
         * lurking in other places that we want to unify in. The sibling stuff
         * below does some of that - but only if we've looked high enough up the DOM
         * tree.
         */
        $parentOfTopCandidate = $topCandidate->parentNode;
        $lastScore = $topCandidate->contentScore;

        // The scores shouldn't get too low.
        $scoreThreshold = $lastScore / 3;

        /* @var DOMElement $parentOfTopCandidate */
        while ($parentOfTopCandidate->nodeName !== 'body') {
            $parentScore = $parentOfTopCandidate->contentScore;
            if ($parentScore < $scoreThreshold) {
                break;
            }

            if ($parentScore > $lastScore) {
                // Alright! We found a better parent to use.
                $topCandidate = $parentOfTopCandidate;
                $this->logger->info('[Rating] Found a better top candidate.');
                break;
            }
            $lastScore = $parentOfTopCandidate->contentScore;
            $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
        }

        // If the top candidate is the only child, use parent instead. This will help sibling
        // joining logic when adjacent content is actually located in parent's sibling node.
        $parentOfTopCandidate = $topCandidate->parentNode;
        while ($parentOfTopCandidate->nodeName !== 'body' && count(NodeUtility::filterTextNodes($parentOfTopCandidate->childNodes)) === 1) {
            $topCandidate = $parentOfTopCandidate;
            $parentOfTopCandidate = $topCandidate->parentNode;
        }
    }

    /*
     * Now that we have the top candidate, look through its siblings for content
     * that might also be related. Things like preambles, content split by ads
     * that we removed, etc.
     */
    $this->logger->info('[Rating] Creating final article content document...');

    // Selected siblings are imported directly into this new, empty document.
    $articleContent = new DOMDocument('1.0', 'utf-8');

    $siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2);
    // Keep potential top candidate's parent node to try to get text direction of it later.
    $parentOfTopCandidate = $topCandidate->parentNode;
    $siblings = $parentOfTopCandidate->childNodes;

    // Set to true as soon as one sibling has been appended to the article.
    $hasContent = false;

    $this->logger->info('[Rating] Adding top candidate siblings...');

    /* @var DOMElement $sibling */
    // Can't foreach here because down there we might change the tag name and that causes the foreach to skip items
    for ($i = 0; $i < $siblings->length; $i++) {
        $sibling = $siblings[$i];
        $append = false;

        if ($sibling === $topCandidate) {
            $this->logger->debug('[Rating] Sibling is equal to the top candidate, adding to the final article...');

            $append = true;
        } else {
            $contentBonus = 0;

            // Give a bonus if sibling nodes and top candidates have the exact same classname
            if ($sibling->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
                $contentBonus += $topCandidate->contentScore * 0.2;
            }
            if ($sibling->contentScore + $contentBonus >= $siblingScoreThreshold) {
                $append = true;
            } elseif ($sibling->nodeName === 'p') {
                $linkDensity = $sibling->getLinkDensity();
                $nodeContent = $sibling->getTextContent(true);

                if (mb_strlen($nodeContent) > 80 && $linkDensity < 0.25) {
                    // Long paragraph with few links
                    $append = true;
                } elseif ($nodeContent && mb_strlen($nodeContent) < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) {
                    // Short paragraph without links that contains a full stop followed by a space or the end of the text
                    $append = true;
                }
            }
        }

        if ($append) {
            $this->logger->debug(sprintf('[Rating] Appending sibling to final article, content is: \'%s\'', substr($sibling->nodeValue, 0, 128)));

            $hasContent = true;

            if (!in_array(strtolower($sibling->nodeName), $this->alterToDIVExceptions)) {
                /*
                 * We have a node that isn't a common block level element, like a form or td tag.
                 * Turn it into a div so it doesn't get filtered out later by accident.
                 */
                $sibling = NodeUtility::setNodeTag($sibling, 'div');
            }

            // importNode() hands back a copy bound to the article document; the
            // copy is what gets appended, the original sibling is not moved.
            $import = $articleContent->importNode($sibling, true);
            $articleContent->appendChild($import);
        }
    }

    $articleContent = $this->prepArticle($articleContent);

    if ($hasContent) {
        // Find out text direction from ancestors of final top candidate.
        $ancestors = array_merge([$parentOfTopCandidate, $topCandidate], $parentOfTopCandidate->getNodeAncestors());
        foreach ($ancestors as $ancestor) {
            $articleDir = $ancestor->getAttribute('dir');
            if ($articleDir) {
                $this->setDirection($articleDir);
                $this->logger->debug(sprintf('[Rating] Found article direction: %s', $articleDir));
                break;
            }
        }

        return $articleContent;
    } else {
        return false;
    }
}
|
1199
|
|
|
|
|
1200
|
|
|
/**
 * Cleans up the final article.
 *
 * Runs the clean-up helpers over the article in a fixed order: presentational
 * attributes and <style>, data-table marking, removal of forms/embeds/headers and
 * similar tags, nodes whose class/id contains "share", an <h2> that repeats the
 * title, form controls, conditional cleaning of tables/lists/divs, empty
 * paragraphs, <br> elements placed right before a paragraph and single-cell tables.
 *
 * @param DOMDocument $article Article document, modified in place
 *
 * @return DOMDocument The same document that was passed in
 */
public function prepArticle(DOMDocument $article)
{
    $this->logger->info('[PrepArticle] Preparing final article...');

    $this->_cleanStyles($article);
    $this->_clean($article, 'style');

    // Check for data tables before we continue, to avoid removing items in
    // those tables, which will often be isolated even though they're
    // visually linked to other content-ful elements (text, images, etc.).
    $this->_markDataTables($article);

    // Clean out junk from the article content
    $this->_cleanConditionally($article, 'form');
    $this->_cleanConditionally($article, 'fieldset');
    $this->_clean($article, 'object');
    $this->_clean($article, 'embed');
    $this->_clean($article, 'h1');
    $this->_clean($article, 'footer');
    $this->_clean($article, 'link');
    $this->_clean($article, 'aside');

    // Clean out elements that have "share" in their id/class combinations from final top candidates,
    // which means we don't remove the top candidates even if they have "share".
    foreach ($article->childNodes as $child) {
        $this->_cleanMatchedNodes($child, '/share/i');
    }

    /*
     * If there is only one h2 and its text content substantially equals article title,
     * they are probably using it as a header and not a subheader,
     * so remove it since we already extract the title separately.
     */
    $h2 = $article->getElementsByTagName('h2');
    if ($h2->length === 1) {
        $headingText = $h2->item(0)->textContent;
        $title = $this->getTitle();

        // Relative length difference between the heading and the title.
        $lengthSimilarRate = (mb_strlen($headingText) - mb_strlen($title)) / max(mb_strlen($title), 1);

        if (abs($lengthSimilarRate) < 0.5) {
            // The shorter of the two texts must be contained in the longer one.
            if ($lengthSimilarRate > 0) {
                $titlesMatch = strpos($headingText, $title) !== false;
            } else {
                $titlesMatch = strpos($title, $headingText) !== false;
            }
            if ($titlesMatch) {
                $this->logger->info('[PrepArticle] Found title repeated in an H2 node, removing...');
                $this->_clean($article, 'h2');
            }
        }
    }

    $this->_clean($article, 'iframe');
    $this->_clean($article, 'input');
    $this->_clean($article, 'textarea');
    $this->_clean($article, 'select');
    $this->_clean($article, 'button');
    $this->_cleanHeaders($article);

    // Do these last as the previous stuff may have removed junk
    // that will affect these
    $this->_cleanConditionally($article, 'table');
    $this->_cleanConditionally($article, 'ul');
    $this->_cleanConditionally($article, 'div');

    $this->_cleanExtraParagraphs($article);

    // Drop every <br> whose next relevant sibling is a paragraph.
    foreach (iterator_to_array($article->getElementsByTagName('br')) as $br) {
        // nextElement() is the same lookup prepDocument() uses to find the node
        // following a <br>; it is used here so that whitespace between the <br>
        // and the <p> does not hide the paragraph.
        $next = NodeUtility::nextElement($br->nextSibling);
        if ($next && $next->nodeName === 'p') {
            $this->logger->debug('[PrepArticle] Removing br node next to a p node.');
            $br->parentNode->removeChild($br);
        }
    }

    // Remove single-cell tables
    foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
        /** @var DOMNode $table */
        // Rows may sit directly inside the table or inside a single <tbody>.
        $tbody = $table->hasSingleTagInsideElement('tbody') ? $table->getFirstElementChild() : $table;
        if ($tbody->hasSingleTagInsideElement('tr')) {
            $row = $tbody->getFirstElementChild();
            if ($row->hasSingleTagInsideElement('td')) {
                $cell = $row->getFirstElementChild();
                // The cell becomes a <p> when all of its children are phrasing content, a <div> otherwise.
                $cell = NodeUtility::setNodeTag($cell, (array_reduce(iterator_to_array($cell->childNodes), function($carry, $node) {
                    return $node->isPhrasingContent() && $carry;
                }, true)) ? 'p' : 'div');
                $table->parentNode->replaceChild($cell, $table);
            }
        }
    }

    return $article;
}
|
1298
|
|
|
|
|
1299
|
|
|
/**
 * Flags every table of the article as a 'data' table or a 'layout' table, using
 * checks similar to
 * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920.
 *
 * The checks are applied in order and the first one that matches decides:
 * role="presentation" or datatable="0" mean layout; a summary attribute, a
 * non-empty caption or a col/colgroup/tfoot/thead/th descendant mean data; a
 * nested table means layout; otherwise the row and column counts decide.
 *
 * @param DOMDocument $article
 *
 * @return void
 */
public function _markDataTables(DOMDocument $article)
{
    // Descendant tags whose presence marks a table as a data table.
    $dataTableTags = ['col', 'colgroup', 'tfoot', 'thead', 'th'];

    /** @var DOMElement $candidate */
    foreach ($article->getElementsByTagName('table') as $candidate) {
        if ($candidate->getAttribute('role') === 'presentation') {
            $candidate->setReadabilityDataTable(false);
            continue;
        }

        if ($candidate->getAttribute('datatable') == '0') {
            $candidate->setReadabilityDataTable(false);
            continue;
        }

        if ($candidate->getAttribute('summary')) {
            $candidate->setReadabilityDataTable(true);
            continue;
        }

        // A caption only counts when it has at least one child node.
        $captions = $candidate->getElementsByTagName('caption');
        if ($captions->length > 0 && $captions->item(0)->childNodes->length > 0) {
            $candidate->setReadabilityDataTable(true);
            continue;
        }

        // If the table has a descendant with any of these tags, consider a data table:
        $hasDataDescendant = false;
        foreach ($dataTableTags as $tagName) {
            if ($candidate->getElementsByTagName($tagName)->length > 0) {
                $hasDataDescendant = true;
                break;
            }
        }
        if ($hasDataDescendant) {
            $candidate->setReadabilityDataTable(true);
            continue;
        }

        // Nested tables indicate a layout table:
        if ($candidate->getElementsByTagName('table')->length > 0) {
            $candidate->setReadabilityDataTable(false);
            continue;
        }

        // Now just go by size: many rows, many columns or more than 10 cells.
        $dimensions = $candidate->getRowAndColumnCount();
        $rowCount = $dimensions['rows'];
        $columnCount = $dimensions['columns'];

        $candidate->setReadabilityDataTable(
            $rowCount >= 10 || $columnCount > 4 || $rowCount * $columnCount > 10
        );
    }
}
|
1358
|
|
|
|
|
1359
|
|
|
/**
 * Removes the style attribute and the deprecated presentational attributes
 * from the given node and, recursively, from all of its descendants.
 *
 * <svg> nodes are left untouched together with everything below them. The
 * width and height attributes are additionally removed from table, th, td, hr
 * and pre elements.
 *
 * @param $node DOMDocument|DOMNode Root of the subtree to clean
 **/
public function _cleanStyles($node)
{
    $hasTagName = property_exists($node, 'tagName');

    // Skip SVG subtrees completely.
    if ($hasTagName && $node->tagName === 'svg') {
        return;
    }

    // Do not bother if there's no method to remove an attribute
    if (method_exists($node, 'removeAttribute')) {
        // `style` and deprecated presentational attributes
        $attributesToDrop = ['align', 'background', 'bgcolor', 'border', 'cellpadding', 'cellspacing', 'frame', 'hspace', 'rules', 'style', 'valign', 'vspace'];

        // Elements on which width/height are deprecated size attributes
        if ($hasTagName && in_array($node->tagName, ['table', 'th', 'td', 'hr', 'pre'])) {
            $attributesToDrop[] = 'width';
            $attributesToDrop[] = 'height';
        }

        foreach ($attributesToDrop as $attributeName) {
            $node->removeAttribute($attributeName);
        }
    }

    // Recurse into every child node.
    for ($child = $node->firstChild; $child !== null; $child = $child->nextSibling) {
        $this->_cleanStyles($child);
    }
}
|
1391
|
|
|
|
|
1392
|
|
|
/**
 * Removes the nodes following $node whose "class id" string matches a regular expression.
 *
 * The walk starts at NodeUtility::getNextNode($node) and stops when it reaches
 * NodeUtility::getNextNode($node, true) or runs out of nodes.
 * NOTE(review): the second call presumably returns the first node outside
 * $node's subtree, which would limit the walk to its descendants — confirm in NodeUtility.
 *
 * @param $node DOMElement Node to clean
 * @param $regex string Pattern tested against the node's class and id, joined by a space
 *
 * @return void
 **/
public function _cleanMatchedNodes($node, $regex)
{
    $stopAt = NodeUtility::getNextNode($node, true);
    $current = NodeUtility::getNextNode($node);

    while ($current && $current !== $stopAt) {
        $className = $current->getAttribute('class');
        $identifier = $current->getAttribute('id');

        // No match: simply advance to the next node.
        if (!preg_match($regex, sprintf('%s %s', $className, $identifier))) {
            $current = NodeUtility::getNextNode($current);
            continue;
        }

        $this->logger->debug(sprintf('Removing matched node with regex: \'%s\', node class was: \'%s\', id: \'%s\'', $regex, $className, $identifier));

        // Removing the node also yields the node to continue from.
        $current = NodeUtility::removeAndGetNext($current);
    }
}
|
1413
|
|
|
|
|
1414
|
|
|
/**
 * Removes paragraphs that carry no content.
 *
 * A paragraph is removed when it contains no img, embed, object or iframe
 * element and nothing is left of its text once the 'onlyWhitespace' pattern
 * has been stripped from it.
 *
 * @param DOMDocument $article Article document, modified in place
 *
 * @return void
 */
public function _cleanExtraParagraphs(DOMDocument $article)
{
    $paragraphs = $article->getElementsByTagName('p');
    $length = $paragraphs->length;

    // Walk the list from its end to its start, since paragraphs are removed on the way.
    for ($i = 0; $i < $length; $i++) {
        $paragraph = $paragraphs->item($length - 1 - $i);

        $imgCount = $paragraph->getElementsByTagName('img')->length;
        $embedCount = $paragraph->getElementsByTagName('embed')->length;
        $objectCount = $paragraph->getElementsByTagName('object')->length;
        // At this point, nasty iframes have been removed, only remain embedded video ones.
        $iframeCount = $paragraph->getElementsByTagName('iframe')->length;
        $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;

        // Paragraphs holding media are always kept.
        if ($totalCount !== 0) {
            continue;
        }

        $remainingText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent);

        // Compare against the empty string instead of relying on truthiness:
        // the string "0" is falsy in PHP, so a paragraph whose whole text is "0"
        // would otherwise be treated as empty. The cast makes a null result
        // (preg_replace failure) count as empty.
        if ((string) $remainingText !== '') {
            continue;
        }

        $this->logger->debug(sprintf('[PrepArticle] Removing extra paragraph. Text content was: \'%s\'', substr($paragraph->textContent, 0, 128)));
        $paragraph->parentNode->removeChild($paragraph);
    }
}
|
1440
|
|
|
|
|
1441
|
|
|
/**
 * Removes elements of the given tag that look like boilerplate rather than content.
 *
 * Does nothing unless conditional cleaning is enabled in the configuration.
 * Matching elements are visited from last to first so that removing one does
 * not shift the positions of the elements still to be visited. An element is
 * skipped when it has a table ancestor flagged as a readability data table.
 * Otherwise it is removed when its class weight is negative, or when it
 * contains fewer than ten commas and at least one of the heuristics on image,
 * list item, input and embed counts, text length or link density fires.
 *
 * @param DOMDocument $article Document to clean in place
 * @param string $tag Tag to clean conditionally
 *
 * @return void
 */
public function _cleanConditionally(DOMDocument $article, $tag)
{
    if (!$this->configuration->getCleanConditionally()) {
        return;
    }

    $isList = in_array($tag, ['ul', 'ol'], true);

    /*
     * Gather counts for other typical elements embedded within.
     * Traverse backwards so we can remove nodes at the same time
     * without affecting the traversal.
     */
    $DOMNodeList = $article->getElementsByTagName($tag);
    $length = $DOMNodeList->length;
    for ($i = $length - 1; $i >= 0; $i--) {
        /** @var $node DOMElement */
        $node = $DOMNodeList->item($i);

        // First check if we're in a data table, in which case don't remove us.
        $insideDataTable = $node->hasAncestorTag('table', -1, function ($ancestor) {
            return $ancestor->isReadabilityDataTable();
        });
        if ($insideDataTable) {
            continue;
        }

        // Class weight only takes part in the decision when the configuration asks for it.
        $weight = $this->configuration->getWeightClasses() ? $node->getClassWeight() : 0;

        // Only strictly negative weights are removed here, whatever the log text says.
        if ($weight < 0) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\' with 0 or less weight', $tag));

            NodeUtility::removeNode($node);
            continue;
        }

        // Ten or more commas: treat the element as prose and keep it.
        if (substr_count($node->getTextContent(), ',') >= 10) {
            continue;
        }

        /*
         * If there are not very many commas, and the number of
         * non-paragraph elements is more than paragraphs or other
         * ominous signs, remove the element.
         */
        $p = $node->getElementsByTagName('p')->length;
        $img = $node->getElementsByTagName('img')->length;
        // The offset of 100 means list items only outnumber paragraphs in very long lists.
        $li = $node->getElementsByTagName('li')->length - 100;
        $input = $node->getElementsByTagName('input')->length;

        // Count only embeds that do NOT match the video pattern. _clean()
        // deliberately keeps video embeds, so they must not be used as
        // evidence that the surrounding element is junk.
        $embedCount = 0;
        foreach ($node->getElementsByTagName('embed') as $embedNode) {
            if (!preg_match(NodeUtility::$regexps['videos'], $embedNode->C14N())) {
                $embedCount++;
            }
        }

        $linkDensity = $node->getLinkDensity();
        $contentLength = mb_strlen($node->getTextContent(true));

        $haveToRemove =
            ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag('figure')) ||
            (!$isList && $li > $p) ||
            ($input > floor($p / 3)) ||
            (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag('figure')) ||
            (!$isList && $weight < 25 && $linkDensity > 0.2) ||
            ($weight >= 25 && $linkDensity > 0.5) ||
            (($embedCount === 1 && $contentLength < 75) || $embedCount > 1);

        if ($haveToRemove) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\'.', $tag));

            NodeUtility::removeNode($node);
        }
    }
}
|
1527
|
|
|
|
|
1528
|
|
|
/**
 * Strips every element with the given tag name out of the article.
 *
 * For object, embed and iframe tags an element is kept when either its
 * attribute values or its serialised markup match the video pattern.
 *
 * @param DOMDocument $article Document to clean in place
 * @param string $tag Tag to clean
 *
 * @return void
 */
public function _clean(DOMDocument $article, $tag)
{
    $mayHoldVideo = in_array($tag, ['object', 'embed', 'iframe']);

    $matches = $article->getElementsByTagName($tag);
    $total = $matches->length;

    // Walk back to front over a count captured once, so a removal never
    // shifts an element that is still waiting to be visited.
    for ($index = $total - 1; $index >= 0; $index--) {
        $element = $matches->item($index);

        if ($mayHoldVideo) {
            $values = [];
            foreach ($element->attributes as $attribute) {
                $values[] = $attribute->nodeValue;
            }
            $joinedValues = implode('|', $values);

            // Keep the element when its own attributes match the video
            // pattern or, failing that, when its serialised content does.
            if (preg_match(NodeUtility::$regexps['videos'], $joinedValues)
                || preg_match(NodeUtility::$regexps['videos'], $element->C14N())
            ) {
                continue;
            }
        }

        $this->logger->debug(sprintf('[PrepArticle] Removing node \'%s\'.', $element->tagName));

        NodeUtility::removeNode($element);
    }
}
|
1569
|
|
|
|
|
1570
|
|
|
/**
 * Cleans spurious h1 and h2 headers out of the article.
 *
 * A header is removed when class weighting is enabled in the configuration
 * and the header's class weight is negative.
 *
 * @param DOMDocument $article Document to clean in place
 *
 * @return void
 **/
public function _cleanHeaders(DOMDocument $article)
{
    for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {
        // Snapshot the node list before removing anything: deleting nodes
        // while iterating the live list itself makes the iteration skip
        // elements.
        $headers = iterator_to_array($article->getElementsByTagName('h' . $headerIndex));

        /** @var $header DOMElement */
        foreach ($headers as $header) {
            $weight = $this->configuration->getWeightClasses() ? $header->getClassWeight() : 0;

            // Only strictly negative weights are removed, whatever the log text says.
            if ($weight < 0) {
                // mb_substr keeps the preview on a character boundary; a
                // byte-wise cut could split a multibyte character.
                $this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', mb_substr($header->nodeValue, 0, 128, 'UTF-8')));

                NodeUtility::removeNode($header);
            }
        }
    }
}
|
1596
|
|
|
|
|
1597
|
|
|
/**
 * Strips the class attribute from the given node and, recursively, from
 * the nodes below it.
 *
 * The walk starts at the node's first element child and follows the
 * nextSibling chain from there, recursing into every node it meets.
 *
 * Readability.js filters out the classes the algorithm itself adds before
 * cleaning. No classes are added here, so no such filter is needed.
 *
 * @param DOMDocument|DOMNode $node
 *
 * @return void
 **/
public function _cleanClasses($node)
{
    if ($node->getAttribute('class') !== '') {
        $node->removeAttribute('class');
    }

    $child = $node->getFirstElementChild();
    while ($child !== null) {
        $this->_cleanClasses($child);
        $child = $child->nextSibling;
    }
}
|
1618
|
|
|
|
|
1619
|
|
|
/**
 * Final pass over the extracted article.
 *
 * When relative URL fixing is enabled in the configuration, every link
 * whose href uses the javascript: scheme is replaced by its text content,
 * every other non-empty href is converted to an absolute URI, and every
 * image gets an absolute src taken from the first non-empty of its src,
 * data-src, data-original, data-orig and data-url attributes. Class
 * attributes are then stripped from the whole document.
 *
 * @param DOMDocument $article Document to process in place
 *
 * @return DOMDocument The same document instance that was passed in
 */
public function postProcessContent(DOMDocument $article)
{
    $this->logger->info('[PostProcess] PostProcessing content...');

    // Readability cannot open relative uris so we convert them to absolute uris.
    if ($this->configuration->getFixRelativeURLs()) {
        // Work on a snapshot: links may be replaced while we iterate.
        foreach (iterator_to_array($article->getElementsByTagName('a')) as $link) {
            /** @var DOMElement $link */
            $href = $link->getAttribute('href');
            if (!$href) {
                continue;
            }

            // Replace links with javascript: URIs with text content, since
            // they won't work after scripts have been removed from the page.
            // URI schemes are case-insensitive and may be preceded by
            // whitespace, so match accordingly.
            if (stripos(ltrim($href), 'javascript:') === 0) {
                $this->logger->debug(sprintf('[PostProcess] Removing \'javascript:\' link. Content is: \'%s\'', mb_substr($link->textContent, 0, 128, 'UTF-8')));

                $text = $article->createTextNode($link->textContent);
                $link->parentNode->replaceChild($text, $link);
            } else {
                $this->logger->debug(sprintf('[PostProcess] Converting link to absolute URI: \'%s\'', mb_substr($href, 0, 128, 'UTF-8')));

                $link->setAttribute('href', $this->toAbsoluteURI($href));
            }
        }

        foreach ($article->getElementsByTagName('img') as $img) {
            /** @var DOMElement $img */
            /*
             * Extract all possible sources of img url and select the first one on the list.
             */
            $url = [
                $img->getAttribute('src'),
                $img->getAttribute('data-src'),
                $img->getAttribute('data-original'),
                $img->getAttribute('data-orig'),
                $img->getAttribute('data-url')
            ];

            // array_filter drops the empty candidates; reset() then yields
            // the first remaining one, or false when there is none.
            $src = array_filter($url);
            $src = reset($src);
            if ($src) {
                $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', mb_substr($src, 0, 128, 'UTF-8')));

                $img->setAttribute('src', $this->toAbsoluteURI($src));
            }
        }
    }

    $this->_cleanClasses($article);

    return $article;
}
|
1676
|
|
|
|
|
1677
|
|
|
/**
 * Renders the article as an HTML fragment: the title in an h1 element
 * followed by the serialised content.
 *
 * The title is HTML-escaped before being embedded so that characters such
 * as "<" or "&" in it cannot break or inject markup.
 * NOTE(review): assumes the stored title is unescaped plain text - confirm
 * against the code that calls setTitle().
 *
 * @return string
 */
public function __toString()
{
    // Cast first: the title may still be null and htmlspecialchars() expects a string.
    $title = htmlspecialchars((string) $this->getTitle(), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

    return sprintf('<h1>%s</h1>%s', $title, $this->getContent());
}
|
1684
|
|
|
|
|
1685
|
|
|
/**
 * Returns the stored article title.
 *
 * @return string|null The title, or null when none has been set
 */
public function getTitle()
{
    return $this->title;
}
|
1692
|
|
|
|
|
1693
|
|
|
/**
 * Stores the article title.
 *
 * @param string $title
 */
protected function setTitle($title)
{
    $this->title = $title;
}
|
1700
|
|
|
|
|
1701
|
|
|
/**
 * Returns the parsed article serialised through C14N().
 *
 * @return string|null The serialised content, or null when no content document is stored
 */
public function getContent()
{
    if ($this->content instanceof DOMDocument) {
        return $this->content->C14N();
    }

    return null;
}
|
1708
|
|
|
|
|
1709
|
|
|
/**
 * Returns the stored content document itself rather than its serialised form.
 *
 * @return DOMDocument|null The content document, or null when none has been set
 */
public function getDOMDocument()
{
    return $this->content;
}
|
1716
|
|
|
|
|
1717
|
|
|
/**
 * Stores the document holding the parsed article content.
 *
 * @param DOMDocument $content
 */
protected function setContent(DOMDocument $content)
{
    $this->content = $content;
}
|
1724
|
|
|
|
|
1725
|
|
|
/**
 * Returns the stored article excerpt.
 *
 * @return string|null The excerpt, or null when none has been set
 */
public function getExcerpt()
{
    return $this->excerpt;
}
|
1732
|
|
|
|
|
1733
|
|
|
/**
 * Stores the article excerpt.
 *
 * @param string|null $excerpt
 */
public function setExcerpt($excerpt)
{
    $this->excerpt = $excerpt;
}
|
1740
|
|
|
|
|
1741
|
|
|
/**
 * Returns the stored main image of the article.
 *
 * @return string|null The image, or null when none has been set
 */
public function getImage()
{
    return $this->image;
}
|
1748
|
|
|
|
|
1749
|
|
|
/**
 * Stores the main image of the article.
 *
 * @param string $image
 */
protected function setImage($image)
{
    $this->image = $image;
}
|
1756
|
|
|
|
|
1757
|
|
|
/**
 * Returns the stored author of the article.
 *
 * @return string|null The author, or null when none has been set
 */
public function getAuthor()
{
    return $this->author;
}
|
1764
|
|
|
|
|
1765
|
|
|
/**
 * Stores the author of the article.
 *
 * @param string $author
 */
protected function setAuthor($author)
{
    $this->author = $author;
}
|
1772
|
|
|
|
|
1773
|
|
|
/**
 * Returns the stored website name.
 *
 * @return string|null The site name, or null when none has been set
 */
public function getSiteName()
{
    return $this->siteName;
}
|
1780
|
|
|
|
|
1781
|
|
|
/**
 * Stores the website name.
 *
 * @param string $siteName
 */
protected function setSiteName($siteName)
{
    $this->siteName = $siteName;
}
|
1788
|
|
|
|
|
1789
|
|
|
/**
 * Returns the stored text direction.
 *
 * @return string|null The direction, or null when none has been set
 */
public function getDirection()
{
    return $this->direction;
}
|
1796
|
|
|
|
|
1797
|
|
|
/**
 * Stores the text direction.
 *
 * @param string|null $direction
 */
public function setDirection($direction)
{
    $this->direction = $direction;
}
|
1804
|
|
|
} |
|
1805
|
|
|
|