1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Wallabag\CoreBundle\Helper; |
4
|
|
|
|
5
|
|
|
use Graby\Graby; |
6
|
|
|
use Psr\Log\LoggerInterface; |
7
|
|
|
use Symfony\Component\HttpFoundation\File\MimeType\MimeTypeExtensionGuesser; |
8
|
|
|
use Symfony\Component\Validator\Constraints\Locale as LocaleConstraint; |
9
|
|
|
use Symfony\Component\Validator\Constraints\Url as UrlConstraint; |
10
|
|
|
use Symfony\Component\Validator\Validator\ValidatorInterface; |
11
|
|
|
use Wallabag\CoreBundle\Entity\Entry; |
12
|
|
|
use Wallabag\CoreBundle\Tools\Utils; |
13
|
|
|
|
14
|
|
|
/**
 * This kind of proxy class takes care of getting the content from a url
 * and updating the entry with what it found.
 */
class ContentProxy
{
    protected $graby;
    protected $tagger;
    protected $validator;
    protected $logger;
    protected $mimeGuesser;
    protected $fetchingErrorMessage;
    // NOTE: not assigned anywhere in this class.
    protected $eventDispatcher;
    protected $storeArticleHeaders;

    /**
     * @param Graby              $graby                Used to fetch a url and to clean up provided html
     * @param RuleBasedTagger    $tagger               Used to automatically tag the entry once it is filled
     * @param ValidatorInterface $validator            Used to validate the language and the preview picture url
     * @param LoggerInterface    $logger               Receives validation, date parsing and tagging failures
     * @param string             $fetchingErrorMessage Html stored as content when the fetched html is `false`
     * @param bool               $storeArticleHeaders  Whether the `all_headers` of the content are saved on the entry
     */
    public function __construct(Graby $graby, RuleBasedTagger $tagger, ValidatorInterface $validator, LoggerInterface $logger, $fetchingErrorMessage, $storeArticleHeaders = false)
    {
        $this->graby = $graby;
        $this->tagger = $tagger;
        $this->validator = $validator;
        $this->logger = $logger;
        $this->mimeGuesser = new MimeTypeExtensionGuesser();
        $this->fetchingErrorMessage = $fetchingErrorMessage;
        $this->storeArticleHeaders = $storeArticleHeaders;
    }

    /**
     * Update entry using either fetched or provided content.
     *
     * Provided html is always passed through Graby's cleanup first. The url is
     * fetched only when the provided content is empty or incomplete (see
     * validateContent()) and fetching has not been disabled.
     *
     * @param Entry  $entry                Entry to update
     * @param string $url                  Url of the content
     * @param array  $content              Array with content provided for import with AT LEAST keys title, html, url to skip the fetchContent from the url
     * @param bool   $disableContentUpdate Whether to skip trying to fetch content using Graby
     */
    public function updateEntry(Entry $entry, $url, array $content = [], $disableContentUpdate = false)
    {
        if (!empty($content['html'])) {
            $content['html'] = $this->graby->cleanupHtml($content['html'], $url);
        }

        if ((empty($content) || false === $this->validateContent($content)) && false === $disableContentUpdate) {
            $fetchedContent = $this->graby->fetchContent($url);
            $fetchedContent['title'] = $this->sanitizeContentTitle($fetchedContent['title'], $fetchedContent['content_type']);

            // when content is imported, we have information in $content
            // in case fetching content goes bad, we'll keep the imported information instead of overriding it
            if (empty($content) || $fetchedContent['html'] !== $this->fetchingErrorMessage) {
                $content = $fetchedContent;
            }
        }

        // be sure to keep the url in case of error
        // so we'll be able to refetch it in the future
        $content['url'] = !empty($content['url']) ? $content['url'] : $url;

        $this->stockEntry($entry, $content);
    }

    /**
     * Use a Symfony validator to ensure the language is well formatted.
     *
     * The language is saved on the entry only when it validates as a locale;
     * otherwise a warning is logged and the entry is left untouched.
     *
     * @param Entry  $entry
     * @param string $value Language to validate and save
     */
    public function updateLanguage(Entry $entry, $value)
    {
        // some lang are defined as fr-FR, es-ES.
        // replacing - by _ might increase language support
        $value = str_replace('-', '_', $value);

        $errors = $this->validator->validate(
            $value,
            (new LocaleConstraint())
        );

        if (0 === \count($errors)) {
            $entry->setLanguage($value);

            return;
        }

        $this->logger->warning('Language validation failed. ' . (string) $errors);
    }

    /**
     * Use a Symfony validator to ensure the preview picture is a real url.
     *
     * The picture is saved on the entry only when it validates as a url;
     * otherwise a warning is logged and the entry is left untouched.
     *
     * @param Entry  $entry
     * @param string $value URL to validate and save
     */
    public function updatePreviewPicture(Entry $entry, $value)
    {
        $errors = $this->validator->validate(
            $value,
            (new UrlConstraint())
        );

        if (0 === \count($errors)) {
            $entry->setPreviewPicture($value);

            return;
        }

        $this->logger->warning('PreviewPicture validation failed. ' . (string) $errors);
    }

    /**
     * Update the publication date of the entry.
     *
     * Accepts an integer timestamp, a date string understood by \DateTime or
     * a \DateTime instance. When the value can not be parsed, a warning is
     * logged and the entry is left untouched.
     *
     * @param Entry            $entry
     * @param string|\DateTime $value Date to validate and save
     */
    public function updatePublishedAt(Entry $entry, $value)
    {
        $date = $value;

        // is it a timestamp? (\DateTime parses "@<timestamp>")
        if (false !== filter_var($date, FILTER_VALIDATE_INT)) {
            $date = '@' . $date;
        }

        try {
            // is it already a DateTime?
            // (it's inside the try/catch in case the time string fails to be parsed)
            if (!$date instanceof \DateTime) {
                $date = new \DateTime($date);
            }

            $entry->setPublishedAt($date);
        } catch (\Exception $e) {
            $this->logger->warning('Error while defining date', ['e' => $e, 'url' => $entry->getUrl(), 'date' => $value]);
        }
    }

    /**
     * Helper to extract and save host from entry url.
     *
     * Only a parsing failure (`false`) is skipped: when the url simply has no
     * host component, parse_url() yields `null` and that value is still saved.
     *
     * @param Entry $entry
     */
    public function setEntryDomainName(Entry $entry)
    {
        $domainName = parse_url($entry->getUrl(), PHP_URL_HOST);
        if (false !== $domainName) {
            $entry->setDomainName($domainName);
        }
    }

    /**
     * Helper to set a default title using:
     * - url basename, if applicable
     * - hostname.
     *
     * When neither is available (url without host, or url that can not be
     * parsed), the title is set to `null`.
     *
     * @param Entry $entry
     */
    public function setDefaultEntryTitle(Entry $entry)
    {
        $parts = parse_url($entry->getUrl());

        // parse_url() returns false on seriously malformed urls
        if (!\is_array($parts)) {
            $parts = [];
        }

        // the "path" key is absent for urls like http://example.com
        $title = pathinfo(isset($parts['path']) ? $parts['path'] : '', PATHINFO_BASENAME);

        if (empty($title)) {
            // the "host" key is absent for relative urls
            $title = isset($parts['host']) ? $parts['host'] : null;
        }

        $entry->setTitle($title);
    }

    /**
     * Try to sanitize the title of the fetched content from wrong character encodings and invalid UTF-8 characters.
     *
     * @param string $title       Title as returned by the fetcher
     * @param string $contentType Content type of the fetched document
     *
     * @return string
     */
    private function sanitizeContentTitle($title, $contentType)
    {
        // only PDF titles go through encoding detection
        if ('application/pdf' === $contentType) {
            $title = $this->convertPdfEncodingToUTF8($title);
        }

        return $this->sanitizeUTF8Text($title);
    }

    /**
     * If the title from the fetched content comes from a PDF, then it's very possible that the character encoding is not
     * UTF-8. This method tries to identify the character encoding and translate the title to UTF-8.
     *
     * @param string $title
     *
     * @return string (may contain invalid UTF-8 characters when no candidate encoding matched)
     */
    private function convertPdfEncodingToUTF8($title)
    {
        // first try UTF-8 because it's easier to detect its presence/absence
        foreach (['UTF-8', 'UTF-16BE', 'WINDOWS-1252'] as $encoding) {
            if (mb_check_encoding($title, $encoding)) {
                return mb_convert_encoding($title, 'UTF-8', $encoding);
            }
        }

        return $title;
    }

    /**
     * Remove invalid UTF-8 characters from the given string.
     *
     * A string that is already valid UTF-8 is returned unchanged.
     *
     * @param string $rawText
     *
     * @return string
     */
    private function sanitizeUTF8Text($rawText)
    {
        if (mb_check_encoding($rawText, 'UTF-8')) {
            return $rawText;
        }

        // converting UTF-8 to itself with //IGNORE asks iconv to discard the invalid sequences
        return iconv('UTF-8', 'UTF-8//IGNORE', $rawText);
    }

    /**
     * Stock entry with fetched or imported content.
     * Will fall back to OpenGraph data if available.
     *
     * @param Entry $entry   Entry to stock
     * @param array $content Array with at least title, url & html
     */
    private function stockEntry(Entry $entry, array $content)
    {
        $entry->setUrl($content['url']);

        $this->setEntryDomainName($entry);

        if (!empty($content['title'])) {
            $entry->setTitle($content['title']);
        } elseif (!empty($content['open_graph']['og_title'])) {
            $entry->setTitle($content['open_graph']['og_title']);
        }

        // "html" may be missing when incomplete content was provided and fetching is disabled
        $html = isset($content['html']) ? $content['html'] : null;
        if (false === $html) {
            $html = $this->fetchingErrorMessage;

            if (!empty($content['open_graph']['og_description'])) {
                $html .= '<p><i>But we found a short description: </i></p>';
                $html .= $content['open_graph']['og_description'];
            }
        }

        $entry->setContent($html);
        $entry->setReadingTime(Utils::getReadingTime($html));

        if (!empty($content['status'])) {
            $entry->setHttpStatus($content['status']);
        }

        if (!empty($content['authors']) && \is_array($content['authors'])) {
            $entry->setPublishedBy($content['authors']);
        }

        if (!empty($content['all_headers']) && $this->storeArticleHeaders) {
            $entry->setHeaders($content['all_headers']);
        }

        if (!empty($content['date'])) {
            $this->updatePublishedAt($entry, $content['date']);
        }

        if (!empty($content['language'])) {
            $this->updateLanguage($entry, $content['language']);
        }

        if (!empty($content['open_graph']['og_image'])) {
            $this->updatePreviewPicture($entry, $content['open_graph']['og_image']);
        }

        // if content is an image, define it as a preview too
        if (!empty($content['content_type']) && \in_array($this->mimeGuesser->guess($content['content_type']), ['jpeg', 'jpg', 'gif', 'png'], true)) {
            $this->updatePreviewPicture($entry, $content['url']);
        }

        if (!empty($content['content_type'])) {
            $entry->setMimetype($content['content_type']);
        }

        // a tagging failure must not prevent the entry from being stocked
        try {
            $this->tagger->tag($entry);
        } catch (\Exception $e) {
            $this->logger->error('Error while trying to automatically tag an entry.', [
                'entry_url' => $content['url'],
                'error_msg' => $e->getMessage(),
            ]);
        }
    }

    /**
     * Validate that the given content has at least a title, an html and a url.
     *
     * @param array $content
     *
     * @return bool true if valid otherwise false
     */
    private function validateContent(array $content)
    {
        return !empty($content['title']) && !empty($content['html']) && !empty($content['url']);
    }
}
320
|
|
|
|