1 | <?php |
||
2 | |||
3 | /** |
||
4 | * @file |
||
5 | * This file is part of the PdfParser library. |
||
6 | * |
||
7 | * @author Sébastien MALOT <[email protected]> |
||
8 | * |
||
9 | * @date 2017-01-03 |
||
10 | * |
||
11 | * @license LGPLv3 |
||
12 | * |
||
13 | * @url <https://github.com/smalot/pdfparser> |
||
14 | * |
||
15 | * PdfParser is a pdf library written in PHP, extraction oriented. |
||
16 | * Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
||
17 | * |
||
18 | * This program is free software: you can redistribute it and/or modify |
||
19 | * it under the terms of the GNU Lesser General Public License as published by |
||
20 | * the Free Software Foundation, either version 3 of the License, or |
||
21 | * (at your option) any later version. |
||
22 | * |
||
23 | * This program is distributed in the hope that it will be useful, |
||
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
26 | * GNU Lesser General Public License for more details. |
||
27 | * |
||
28 | * You should have received a copy of the GNU Lesser General Public License |
||
29 | * along with this program. |
||
30 | * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
||
31 | */ |
||
32 | |||
33 | namespace Smalot\PdfParser; |
||
34 | |||
35 | use Smalot\PdfParser\Encoding\PDFDocEncoding; |
||
36 | use Smalot\PdfParser\Exception\MissingCatalogException; |
||
37 | |||
38 | /** |
||
39 | * Technical references : |
||
40 | * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html |
||
41 | * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php |
||
42 | * - http://www.php.net/manual/en/ref.pdf.php#74211 |
||
43 | * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm |
||
44 | * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm |
||
45 | * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm |
||
46 | * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm |
||
47 | * |
||
48 | * Class Document |
||
49 | */ |
||
50 | class Document |
||
51 | { |
||
/**
 * Objects of the document, indexed by object id.
 *
 * @var PDFObject[]
 */
protected $objects = [];

/**
 * Objects grouped by the 'Type' value of their header. Each type entry
 * holds an 'all' list and a 'subtype' map, both indexed by object id.
 *
 * @var array
 */
protected $dictionary = [];

/**
 * Trailer of the document.
 *
 * @var Header
 */
protected $trailer;

/**
 * Values collected from the XMP metadata packets parsed so far.
 *
 * @var array<mixed>
 */
protected $metadata = [];

/**
 * Document details computed by buildDetails().
 *
 * Initialised to an empty array so that getDetails(), whose return type is
 * array, does not fail when it is called before init() has run.
 *
 * @var array
 */
protected $details = [];
/**
 * Creates a document whose trailer is a Header built from an empty array
 * and bound to this document.
 */
public function __construct()
{
    $emptyTrailer = new Header([], $this);

    $this->trailer = $emptyTrailer;
}
81 | |||
/**
 * Rebuilds the type dictionary and the details array, then initialises
 * every object: its header first, the object itself second.
 */
public function init()
{
    $this->buildDictionary();
    $this->buildDetails();

    // Cascade the initialisation down to each object.
    foreach ($this->objects as $pdfObject) {
        $header = $pdfObject->getHeader();
        $header->init();

        $pdfObject->init();
    }
}
94 | |||
/**
 * Indexes the document objects by the 'Type' and 'Subtype' entries of
 * their headers.
 *
 * The dictionary gets one entry per type, each holding:
 *  - 'all':     every object of that type, indexed by object id;
 *  - 'subtype': one list per subtype, each indexed by object id.
 */
protected function buildDictionary()
{
    // Always start from an empty dictionary.
    $this->dictionary = [];

    foreach ($this->objects as $id => $object) {
        $type = $object->getHeader()->get('Type')->getContent();

        // Loose comparison on purpose: any content that is loosely equal
        // to null (not just null itself) means "no usable type".
        if (null == $type) {
            continue;
        }

        if (!isset($this->dictionary[$type])) {
            $this->dictionary[$type] = ['all' => [], 'subtype' => []];
        }
        $this->dictionary[$type]['all'][$id] = $object;

        $subtype = $object->getHeader()->get('Subtype')->getContent();

        // Same loose comparison as for the type.
        if (null == $subtype) {
            continue;
        }

        if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
            $this->dictionary[$type]['subtype'][$subtype] = [];
        }
        $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
    }
}
127 | |||
/**
 * Build details array.
 *
 * Starts from the details exposed by the header of the trailer's 'Info'
 * entry (when that entry has a header), adds the page count under 'Pages'
 * (0 when the pages cannot be retrieved), decodes/repairs every string
 * value, and finally merges the XMP metadata over the result. The outcome
 * is stored in $this->details.
 */
protected function buildDetails()
{
    $details = [];

    // Extract document info
    if ($this->trailer->has('Info')) {
        /** @var PDFObject $info */
        $info = $this->trailer->get('Info');
        // This could be an ElementMissing object, so we need to check for
        // the getHeader method first.
        if (null !== $info && method_exists($info, 'getHeader')) {
            $details = $info->getHeader()->getDetails();
        }
    }

    // Retrieve the page count; any failure to list pages counts as zero.
    try {
        $details['Pages'] = \count($this->getPages());
    } catch (\Exception $e) {
        $details['Pages'] = 0;
    }

    // Decode and repair encoded document properties
    foreach ($details as $key => $value) {
        if (!\is_string($value)) {
            continue;
        }

        if (!mb_check_encoding($value, 'UTF-8')) {
            // The string is plain PDFDocEncoding: remove the
            // backslash + carriage-return pairs and decode the whole thing.
            $value = str_replace("\\\r", '', $value);
            $details[$key] = PDFDocEncoding::convertPDFDoc2UTF8($value);

            continue;
        }

        // The string is already UTF-8 encoded, so we only need to repair
        // Adobe's ham-fisted insertion of line breaks every ~127
        // characters, which doesn't seem to be multi-byte safe.

        // Remove literal backslash + carriage return (0x5C 0x0D)
        $value = str_replace("\x5c\x0d", '', $value);

        // Remove backslash plus bytes written into high part of
        // multibyte unicode character
        while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
            $diff = (\ord($match[1]) - 182) * 64;
            $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
            // Replace the matched bytes literally. Building a regex from
            // them would break (or match too much) whenever a matched
            // byte is a regex metacharacter or the delimiter, and the
            // replacement could be misread as a back-reference.
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove bytes written into low part of multibyte unicode
        // character
        while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
            $diff = \ord($match[2]) - 181;
            $newbyte = \chr(\ord($match[1]) + $diff);
            // Literal replacement, for the same reason as above.
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove this byte string that Adobe occasionally adds
        // between two single byte characters in a unicode string
        $value = str_replace("\xe5\xb0\x8d", '', $value);

        $details[$key] = $value;
    }

    // XMP metadata values override the entries gathered above.
    $details = array_merge($details, $this->metadata);

    $this->details = $details;
}
200 | |||
/**
 * Extract XMP Metadata
 *
 * Parses $content as XML and, when parsing succeeds, folds the flat
 * element list into a nested array that is merged into $this->metadata.
 * Nothing is merged when the packet declares a 'dc:format' other than
 * 'application/pdf'.
 */
public function extractXMPMetadata(string $content): void
{
    $parser = xml_parser_create();
    xml_parser_set_option($parser, \XML_OPTION_SKIP_WHITE, 1);
    $parsed = xml_parse_into_struct($parser, $content, $values, $index);
    xml_parser_free($parser);

    if (1 !== $parsed) {
        return;
    }

    /*
     * $values is a flat list of elements. $metadata always references the
     * array of the element currently being filled, while $stack keeps
     * references to its ancestors (last in, first out):
     *  - on an "open" element a child array is created, the current
     *    context is pushed on $stack and the child becomes the context;
     *  - on a "close" element the context is simplified when possible and
     *    the most recently pushed parent becomes the context again.
     */
    $metadata = [];
    $stack = [];

    foreach ($values as $val) {
        // Standardize to lowercase
        $tag = strtolower($val['tag']);
        $isListItem = 'rdf:li' === $tag;

        // Ignore structural x: and rdf: XML elements (rdf:li is kept)
        if (0 === strncmp($tag, 'x:', 2)) {
            continue;
        }
        if (0 === strncmp($tag, 'rdf:', 4) && !$isListItem) {
            continue;
        }

        $type = $val['type'];

        if ('open' === $type) {
            if ($isListItem) {
                // Append a new array to the list of items
                $metadata[] = [];
                $childKey = \count($metadata) - 1;
            } else {
                // Create an array of named values
                $metadata[$tag] = [];
                $childKey = $tag;
            }

            // Remember the parent, then descend into the child
            $stack[\count($stack)] = &$metadata;
            $metadata = &$metadata[$childKey];
        } elseif ('complete' === $type) {
            if (isset($val['value'])) {
                if ($isListItem) {
                    // Assign a value to this list item
                    $metadata[] = $val['value'];
                } else {
                    // Assign a value to this property
                    $metadata[$tag] = $val['value'];
                }
            }
        } elseif ('close' === $type) {
            if (\is_array($metadata)) {
                $itemCount = \count($metadata);

                if (1 == $itemCount && isset($metadata[0]) && \is_string($metadata[0])) {
                    // A single string item stands for the whole property
                    $metadata = $metadata[0];
                } elseif (0 == $itemCount) {
                    // An empty array becomes the empty string
                    $metadata = '';
                }
            }

            // Go back to the parent context and drop it from the stack
            $parentIndex = \count($stack) - 1;
            $metadata = &$stack[$parentIndex];
            unset($stack[$parentIndex]);
        }
    }

    // Only use this metadata if it's referring to a PDF
    if (isset($metadata['dc:format']) && 'application/pdf' != $metadata['dc:format']) {
        return;
    }

    // According to the XMP specifications: 'Conflict resolution
    // for separate packets that describe the same resource is
    // beyond the scope of this document.' - Section 6.1
    // Source: https://www.adobe.com/devnet/xmp.html
    // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
    // So if there are multiple XMP blocks, just merge the values
    // of each found block over top of the existing values
    $this->metadata = array_merge($this->metadata, $metadata);
}
304 | |||
/**
 * Returns the dictionary of objects grouped by type and subtype, as last
 * built by buildDictionary().
 */
public function getDictionary(): array
{
    return $this->dictionary;
}
309 | |||
/**
 * Replaces the objects of the document (the argument is cast to an array)
 * and re-runs init().
 *
 * @param PDFObject[] $objects
 */
public function setObjects($objects = [])
{
    $objectList = (array) $objects;
    $this->objects = $objectList;

    $this->init();
}
319 | |||
/**
 * Returns every object currently held by the document.
 *
 * @return PDFObject[]
 */
public function getObjects()
{
    return $this->objects;
}
327 | |||
/**
 * Looks an object up by its id.
 *
 * @return PDFObject|Font|Page|Element|null the object stored under $id,
 *                                          or null when there is none
 */
public function getObjectById(string $id)
{
    return $this->objects[$id] ?? null;
}
339 | |||
/**
 * Tells whether getObjectsByType() finds at least one object for the
 * given type (and optional subtype).
 */
public function hasObjectsByType(string $type, ?string $subtype = null): bool
{
    $matches = $this->getObjectsByType($type, $subtype);

    return [] !== $matches;
}
344 | |||
/**
 * Returns the objects registered in the dictionary for a type, optionally
 * narrowed down to one subtype.
 *
 * @param string      $type    type to look up
 * @param string|null $subtype subtype to filter on; only null means
 *                             "no filter"
 *
 * @return array the matching objects indexed by object id, or an empty
 *               array when the type or the subtype is unknown
 */
public function getObjectsByType(string $type, ?string $subtype = null): array
{
    if (!isset($this->dictionary[$type])) {
        return [];
    }

    // Strict check: a loose comparison would also treat an empty-string
    // subtype as "no filter" and return every object of the type.
    if (null === $subtype) {
        return $this->dictionary[$type]['all'];
    }

    return $this->dictionary[$type]['subtype'][$subtype] ?? [];
}
361 | |||
/**
 * Returns the objects registered under the 'Font' type.
 *
 * @return Font[]
 */
public function getFonts()
{
    $fonts = $this->getObjectsByType('Font');

    return $fonts;
}
369 | |||
/**
 * Returns the first font of the document, or null when it has none.
 */
public function getFirstFont(): ?Font
{
    $fonts = $this->getFonts();

    return [] === $fonts ? null : reset($fonts);
}
379 | |||
/**
 * Lists the pages of the document, trying three sources in turn:
 *  1. the 'Pages' entry of the first 'Catalog' object, when that entry
 *     offers a getPages() method;
 *  2. every 'Pages' object, their page lists being concatenated;
 *  3. the 'Page' objects themselves, re-indexed from zero.
 *
 * @return Page[]
 *
 * @throws MissingCatalogException when none of the sources yields pages
 */
public function getPages()
{
    if ($this->hasObjectsByType('Catalog')) {
        // Search for catalog to list pages.
        $catalogs = $this->getObjectsByType('Catalog');
        $firstCatalog = reset($catalogs);

        /** @var Pages $root */
        $root = $firstCatalog->get('Pages');
        if (method_exists($root, 'getPages')) {
            return $root->getPages(true);
        }
    }

    if ($this->hasObjectsByType('Pages')) {
        // Search for pages to list kids.
        $collected = [];

        /** @var Pages[] $pageTrees */
        $pageTrees = $this->getObjectsByType('Pages');
        foreach ($pageTrees as $pageTree) {
            $collected = array_merge($collected, $pageTree->getPages(true));
        }

        return $collected;
    }

    if ($this->hasObjectsByType('Page')) {
        // Search for 'page' (unordered pages).
        return array_values($this->getObjectsByType('Page'));
    }

    throw new MissingCatalogException('Missing catalog.');
}
421 | |||
/**
 * Returns the text of the document: the trimmed text of each page, pages
 * without text being left out, joined by a blank line.
 *
 * @param int|null $pageLimit when it is a positive integer, only that many
 *                            pages from the start are read; any other
 *                            value reads every page
 *
 * @throws MissingCatalogException when getPages() cannot list the pages
 */
public function getText(?int $pageLimit = null): string
{
    $pages = $this->getPages();

    // Only use the first X number of pages if $pageLimit is set and positive.
    if (\is_int($pageLimit) && 0 < $pageLimit) {
        $pages = \array_slice($pages, 0, $pageLimit);
    }

    $texts = [];
    foreach ($pages as $page) {
        // In some cases, the $page variable may be null.
        if (null === $page) {
            continue;
        }

        $text = trim($page->getText());
        // Compare against the empty string rather than relying on
        // truthiness, so that a page whose text is exactly "0" is kept.
        if ('' !== $text) {
            $texts[] = $text;
        }
    }

    return implode("\n\n", $texts);
}
446 | |||
/**
 * Returns the trailer currently attached to the document.
 */
public function getTrailer(): Header
{
    return $this->trailer;
}
451 | |||
/**
 * Replaces the trailer of the document. Details are not rebuilt here;
 * that only happens in init().
 */
public function setTrailer(Header $trailer)
{
    $this->trailer = $trailer;
}
456 | |||
/**
 * Returns the document details computed by buildDetails().
 *
 * @return array the details, or an empty array when they have not been
 *               built yet (init() has not run)
 */
public function getDetails(): array
{
    // The property may still be unset (null) before init(); returning it
    // as is would violate the array return type and raise a TypeError.
    return $this->details ?? [];
}
461 | } |
||
462 |