1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Wallabag\CoreBundle\Helper; |
4
|
|
|
|
5
|
|
|
use Graby\Graby; |
6
|
|
|
use Psr\Log\LoggerInterface; |
7
|
|
|
use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; |
8
|
|
|
use Symfony\Component\Validator\Constraints\Locale as LocaleConstraint; |
9
|
|
|
use Symfony\Component\Validator\Constraints\Url as UrlConstraint; |
10
|
|
|
use Symfony\Component\Validator\Validator\ValidatorInterface; |
11
|
|
|
use Wallabag\CoreBundle\Entity\Entry; |
12
|
|
|
use Wallabag\CoreBundle\Tools\Utils; |
13
|
|
|
|
14
|
|
|
/**
 * This kind of proxy class takes care of getting the content from an url
 * and updates the entry with what it found.
 */
class ContentProxy
{
    /** @var Graby Fetches and cleans up remote content */
    protected $graby;

    /** @var RuleBasedTagger Applied to every entry at the end of stockEntry() */
    protected $tagger;

    /** @var RuleBasedIgnoreOriginProcessor Decides whether the origin url must be ignored */
    protected $ignoreOriginProcessor;

    /** @var ValidatorInterface Validates the language and the preview picture url */
    protected $validator;

    /** @var LoggerInterface */
    protected $logger;

    /** @var MimeTypeExtensionGuesser Maps a mime type to a file extension */
    protected $mimeGuesser;

    /** @var string Content stored in the entry when no html is available */
    protected $fetchingErrorMessage;

    /** Never assigned by this class; kept for backward compatibility with subclasses. */
    protected $eventDispatcher;

    /** @var bool Whether the fetched headers must be saved in the entry */
    protected $storeArticleHeaders;

    /**
     * @param string $fetchingErrorMessage Content used when no html could be retrieved
     * @param bool   $storeArticleHeaders  Whether the fetched headers must be saved in the entry
     */
    public function __construct(Graby $graby, RuleBasedTagger $tagger, RuleBasedIgnoreOriginProcessor $ignoreOriginProcessor, ValidatorInterface $validator, LoggerInterface $logger, $fetchingErrorMessage, $storeArticleHeaders = false)
    {
        $this->graby = $graby;
        $this->tagger = $tagger;
        $this->ignoreOriginProcessor = $ignoreOriginProcessor;
        $this->validator = $validator;
        $this->logger = $logger;
        $this->mimeGuesser = new MimeTypeExtensionGuesser();
        $this->fetchingErrorMessage = $fetchingErrorMessage;
        $this->storeArticleHeaders = $storeArticleHeaders;
    }

    /**
     * Update entry using either fetched or provided content.
     *
     * Provided html is always passed through Graby's cleanup. Content is
     * fetched from the url only when the provided content is empty or
     * incomplete (see validateContent()) and fetching is not disabled.
     *
     * @param Entry  $entry                Entry to update
     * @param string $url                  Url of the content
     * @param array  $content              Array with content provided for import with AT LEAST keys title, html, url to skip the fetchContent from the url
     * @param bool   $disableContentUpdate Whether to skip trying to fetch content using Graby
     */
    public function updateEntry(Entry $entry, $url, array $content = [], $disableContentUpdate = false)
    {
        $this->graby->toggleImgNoReferrer(true);
        if (!empty($content['html'])) {
            $content['html'] = $this->graby->cleanupHtml($content['html'], $url);
        }

        if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
            $fetchedContent = $this->graby->fetchContent($url);

            $fetchedContent['title'] = $this->sanitizeContentTitle(
                $fetchedContent['title'],
                isset($fetchedContent['headers']['content-type']) ? $fetchedContent['headers']['content-type'] : ''
            );

            // when content is imported, we have information in $content
            // in case fetching content goes bad, we'll keep the imported information instead of overriding them
            if (empty($content) || $fetchedContent['html'] !== $this->fetchingErrorMessage) {
                $content = $fetchedContent;
            }
        }

        // be sure to keep the url in case of error
        // so we'll be able to refetch it in the future
        $content['url'] = !empty($content['url']) ? $content['url'] : $url;

        // In one case (at least in tests), url is empty here
        // so we set it using $url provided in the updateEntry call.
        // Not sure what are the other possible cases where this property is empty
        if (empty($entry->getUrl()) && !empty($url)) {
            $entry->setUrl($url);
        }

        $entry->setGivenUrl($url);

        $this->stockEntry($entry, $content);
    }

    /**
     * Use a Symfony validator to ensure the language is well formatted.
     *
     * The language is saved in the entry only when it passes the Locale
     * constraint; otherwise a warning is logged and the entry is left untouched.
     *
     * @param string $value Language to validate and save
     */
    public function updateLanguage(Entry $entry, $value)
    {
        // some lang are defined as fr-FR, es-ES.
        // replacing - by _ might increase language support
        $value = str_replace('-', '_', $value);

        $errors = $this->validator->validate(
            $value,
            (new LocaleConstraint())
        );

        if (0 === \count($errors)) {
            $entry->setLanguage($value);

            return;
        }

        $this->logger->warning('Language validation failed. ' . (string) $errors);
    }

    /**
     * Use a Symfony validator to ensure the preview picture is a real url.
     *
     * The picture is saved in the entry only when it passes the Url
     * constraint; otherwise a warning is logged and the entry is left untouched.
     *
     * @param string $value URL to validate and save
     */
    public function updatePreviewPicture(Entry $entry, $value)
    {
        $errors = $this->validator->validate(
            $value,
            (new UrlConstraint())
        );

        if (0 === \count($errors)) {
            $entry->setPreviewPicture($value);

            return;
        }

        $this->logger->warning('PreviewPicture validation failed. ' . (string) $errors);
    }

    /**
     * Update date.
     *
     * Integer-like values are handled as unix timestamps, \DateTime instances
     * are used as is and anything else is given to the \DateTime constructor.
     * When the date can't be built, a warning is logged and the entry is left
     * untouched.
     *
     * @param string|int|\DateTime $value Date to validate and save
     */
    public function updatePublishedAt(Entry $entry, $value)
    {
        $date = $value;

        // is it a timestamp?
        if (false !== filter_var($date, \FILTER_VALIDATE_INT)) {
            $date = '@' . $date;
        }

        try {
            // is it already a DateTime?
            // (it's inside the try/catch in case the time string fails to be parsed)
            if (!$date instanceof \DateTime) {
                $date = new \DateTime($date);
            }

            $entry->setPublishedAt($date);
        } catch (\Exception $e) {
            $this->logger->warning('Error while defining date', ['e' => $e, 'url' => $entry->getUrl(), 'date' => $value]);
        }
    }

    /**
     * Helper to extract and save host from entry url.
     *
     * Nothing is saved when parse_url() returns false. Any other result,
     * including null, is given to the entry as is.
     */
    public function setEntryDomainName(Entry $entry)
    {
        $domainName = parse_url($entry->getUrl(), \PHP_URL_HOST);
        if (false !== $domainName) {
            $entry->setDomainName($domainName);
        }
    }

    /**
     * Helper to set a default title using:
     *   - url basename, if applicable
     *   - hostname.
     */
    public function setDefaultEntryTitle(Entry $entry)
    {
        $url = parse_url($entry->getUrl());

        // parse_url() returns false for seriously malformed urls and leaves
        // out the components it did not find, so every lookup is guarded
        if (!\is_array($url)) {
            $url = [];
        }

        $path = isset($url['path']) ? pathinfo($url['path'], \PATHINFO_BASENAME) : '';

        if (empty($path)) {
            $path = isset($url['host']) ? $url['host'] : null;
        }

        $entry->setTitle($path);
    }

    /**
     * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 character.
     *
     * @param string $title
     * @param string $contentType Value of the Content-Type header, parameters (like "; charset=...") are allowed
     *
     * @return string
     */
    private function sanitizeContentTitle($title, $contentType)
    {
        // only the media type matters: drop the parameters and ignore the case
        $mediaType = strtolower(trim(explode(';', (string) $contentType, 2)[0]));

        if ('application/pdf' === $mediaType) {
            $title = $this->convertPdfEncodingToUTF8($title);
        }

        return $this->sanitizeUTF8Text($title);
    }

    /**
     * If the title from the fetched content comes from a PDF, then it's very possible that the character encoding is not
     * UTF-8. This method tries to identify the character encoding and translate the title to UTF-8.
     *
     * @param string $title
     *
     * @return string (maybe contains invalid UTF-8 character)
     */
    private function convertPdfEncodingToUTF8($title)
    {
        // first try UTF-8 because it's easier to detect its presence/absence
        foreach (['UTF-8', 'UTF-16BE', 'WINDOWS-1252'] as $encoding) {
            if (mb_check_encoding($title, $encoding)) {
                return mb_convert_encoding($title, 'UTF-8', $encoding);
            }
        }

        return $title;
    }

    /**
     * Remove invalid UTF-8 characters from the given string.
     *
     * @param string $rawText
     *
     * @return string
     */
    private function sanitizeUTF8Text($rawText)
    {
        if (mb_check_encoding($rawText, 'UTF-8')) {
            return $rawText;
        }

        $cleanText = iconv('UTF-8', 'UTF-8//IGNORE', $rawText);
        if (false !== $cleanText) {
            return $cleanText;
        }

        // iconv() returns false when the conversion fails: strip the invalid
        // sequences with mbstring instead of handing false to the caller
        $previousSubstitute = mb_substitute_character();
        mb_substitute_character('none');
        $cleanText = mb_convert_encoding($rawText, 'UTF-8', 'UTF-8');
        mb_substitute_character($previousSubstitute);

        return $cleanText;
    }

    /**
     * Stock entry with fetched or imported content.
     *
     * The preview picture is, in order of preference, the content url when
     * the content itself is an image, the "image" key of the content, or the
     * first image found in the html.
     *
     * @param Entry $entry   Entry to stock
     * @param array $content Array with at least title, url & html
     */
    private function stockEntry(Entry $entry, array $content)
    {
        $this->updateOriginUrl($entry, $content['url']);

        $this->setEntryDomainName($entry);

        if (!empty($content['title'])) {
            $entry->setTitle($content['title']);
        }

        if (empty($content['html'])) {
            $content['html'] = $this->fetchingErrorMessage;

            if (!empty($content['description'])) {
                $content['html'] .= '<p><i>But we found a short description: </i></p>';
                $content['html'] .= $content['description'];
            }
        }

        $entry->setContent($content['html']);
        $entry->setReadingTime(Utils::getReadingTime($content['html']));

        if (!empty($content['status'])) {
            $entry->setHttpStatus($content['status']);
        }

        if (!empty($content['authors']) && \is_array($content['authors'])) {
            $entry->setPublishedBy($content['authors']);
        }

        // headers are saved only when the storeArticleHeaders option given
        // to the constructor is enabled
        if ($this->storeArticleHeaders && !empty($content['headers'])) {
            $entry->setHeaders($content['headers']);
        }

        if (!empty($content['date'])) {
            $this->updatePublishedAt($entry, $content['date']);
        }

        if (!empty($content['language'])) {
            $this->updateLanguage($entry, $content['language']);
        }

        $previewPictureUrl = '';
        if (!empty($content['image'])) {
            $previewPictureUrl = $content['image'];
        }

        // if content is an image, define it as a preview too
        if (!empty($content['headers']['content-type']) && \in_array($this->mimeGuesser->guess($content['headers']['content-type']), ['jpeg', 'jpg', 'gif', 'png'], true)) {
            $previewPictureUrl = $content['url'];
        } elseif (empty($previewPictureUrl)) {
            $this->logger->debug('Extracting images from content to provide a default preview picture');
            $imagesUrls = DownloadImages::extractImagesUrlsFromHtml($content['html']);
            $this->logger->debug(\count($imagesUrls) . ' pictures found');

            if (!empty($imagesUrls)) {
                $previewPictureUrl = $imagesUrls[0];
            }
        }

        if (!empty($content['headers']['content-type'])) {
            $entry->setMimetype($content['headers']['content-type']);
        }

        if (!empty($previewPictureUrl)) {
            $this->updatePreviewPicture($entry, $previewPictureUrl);
        }

        // a failure of the automatic tagging must not prevent saving the entry
        try {
            $this->tagger->tag($entry);
        } catch (\Exception $e) {
            $this->logger->error('Error while trying to automatically tag an entry.', [
                'entry_url' => $content['url'],
                'error_msg' => $e->getMessage(),
            ]);
        }
    }

    /**
     * Update the origin_url field when a redirection occurs
     * This field is set if it is empty and new url does not match ignore list.
     *
     * Nothing is returned: the entry is updated in place.
     *
     * @param string $url Url of the content, to compare with the url of the entry
     */
    private function updateOriginUrl(Entry $entry, $url)
    {
        if (empty($url) || $entry->getUrl() === $url) {
            return;
        }

        $parsedEntryUrl = parse_url($entry->getUrl());
        $parsedContentUrl = parse_url($url);

        // parse_url() returns false for seriously malformed urls: handle them
        // as urls without any part so that the array functions below keep
        // working and the default case of the switch applies
        if (!\is_array($parsedEntryUrl)) {
            $parsedEntryUrl = [];
        }
        if (!\is_array($parsedContentUrl)) {
            $parsedContentUrl = [];
        }

        /**
         * The following part computes the list of part changes between two
         * parse_url arrays.
         *
         * As array_diff_assoc only computes changes to go from the left array
         * to the right one, we make two different arrays to have both
         * directions. We merge these two arrays and sort keys before passing
         * the result to the switch.
         *
         * The resulting array gives us all changing parts between the two
         * urls: scheme, host, path, query and/or fragment.
         */
        $diffKeys = array_keys(array_merge(
            array_diff_assoc($parsedEntryUrl, $parsedContentUrl),
            array_diff_assoc($parsedContentUrl, $parsedEntryUrl)
        ));
        sort($diffKeys);

        if ($this->ignoreOriginProcessor->process($entry)) {
            $entry->setUrl($url);

            return;
        }

        /**
         * This switch case lets us apply different behaviors according to
         * changing parts of urls.
         *
         * As $diffKeys is an array, we provide arrays as cases. ['path'] means
         * 'only the path is different between the two urls' whereas
         * ['fragment', 'query'] means 'only fragment and query string parts are
         * different between the two urls'.
         *
         * Note that values in $diffKeys are sorted.
         */
        switch ($diffKeys) {
            case ['path']:
                // one of the two urls may have no path at all (http://host vs http://host/)
                $entryPath = isset($parsedEntryUrl['path']) ? $parsedEntryUrl['path'] : '';
                $contentPath = isset($parsedContentUrl['path']) ? $parsedContentUrl['path'] : '';

                // diff is trailing slash, we only replace the url of the entry
                $isTrailingSlash = $entryPath . '/' === $contentPath;
                // we update entry url if new url is a decoded version of it, see EntryRepository#findByUrlAndUserId
                $isDecodedVersion = $url === urldecode($entry->getUrl());

                if ($isTrailingSlash || $isDecodedVersion) {
                    $entry->setUrl($url);
                }
                break;
            case ['scheme']:
                $entry->setUrl($url);
                break;
            case ['fragment']:
                // noop
                break;
            default:
                if (empty($entry->getOriginUrl())) {
                    $entry->setOriginUrl($entry->getUrl());
                }
                $entry->setUrl($url);
                break;
        }
    }

    /**
     * Validate that the given content has at least a title, an html and a url.
     *
     * @return bool true if valid otherwise false
     */
    private function validateContent(array $content)
    {
        return !empty($content['title']) && !empty($content['html']) && !empty($content['url']);
    }
}
410
|
|
|
|