smalot /
pdfparser
| 1 | <?php |
||
| 2 | |||
| 3 | /** |
||
| 4 | * @file |
||
| 5 | * This file is part of the PdfParser library. |
||
| 6 | * |
||
| 7 | * @author Sébastien MALOT <[email protected]> |
||
| 8 | * |
||
| 9 | * @date 2017-01-03 |
||
| 10 | * |
||
| 11 | * @license LGPLv3 |
||
| 12 | * |
||
| 13 | * @url <https://github.com/smalot/pdfparser> |
||
| 14 | * |
||
| 15 | * PdfParser is a pdf library written in PHP, extraction oriented. |
||
| 16 | * Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
||
| 17 | * |
||
| 18 | * This program is free software: you can redistribute it and/or modify |
||
| 19 | * it under the terms of the GNU Lesser General Public License as published by |
||
| 20 | * the Free Software Foundation, either version 3 of the License, or |
||
| 21 | * (at your option) any later version. |
||
| 22 | * |
||
| 23 | * This program is distributed in the hope that it will be useful, |
||
| 24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
| 25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
| 26 | * GNU Lesser General Public License for more details. |
||
| 27 | * |
||
| 28 | * You should have received a copy of the GNU Lesser General Public License |
||
| 29 | * along with this program. |
||
| 30 | * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
||
| 31 | */ |
||
| 32 | |||
| 33 | namespace Smalot\PdfParser; |
||
| 34 | |||
| 35 | use Smalot\PdfParser\Encoding\PDFDocEncoding; |
||
| 36 | use Smalot\PdfParser\Exception\MissingCatalogException; |
||
| 37 | |||
/**
 * Technical references :
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
 *
 * Class Document
 *
 * Holds the objects of a parsed PDF, an index of those objects by type and
 * subtype, the trailer header, and the merged document details (entries of
 * the trailer's Info object, the page count and any extracted XMP metadata).
 */
class Document
{
    /**
     * Objects of the document, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Index built by buildDictionary(). Shape:
     * [type => ['all' => [id => object], 'subtype' => [subtype => [id => object]]]].
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * Values collected by extractXMPMetadata(); merged over the details in
     * buildDetails().
     *
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * Result of buildDetails(). Defaults to an empty array so that
     * getDetails() honours its `array` return type even before init() ran.
     *
     * @var array
     */
    protected $details = [];

    /**
     * Creates a document with an empty trailer header bound to this document.
     */
    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }

    /**
     * Rebuilds the type dictionary and the details, then initialises every
     * object (its header first, then the object itself).
     */
    public function init()
    {
        $this->buildDictionary();
        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $object) {
            $object->getHeader()->init();
            $object->init();
        }
    }

    /**
     * Build dictionary based on type header field.
     *
     * Every object whose header has a non-empty 'Type' is registered under
     * that type; if it also has a non-empty 'Subtype' it is additionally
     * registered under that subtype.
     */
    protected function buildDictionary()
    {
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            $type = $object->getHeader()->get('Type')->getContent();

            // Loose comparison kept on purpose: besides null it also skips
            // other "empty" contents such as false or ''.
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = [
                    'all' => [],
                    'subtype' => [],
                ];
            }

            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();
            if (null != $subtype) {
                // The per-subtype bucket is created on first write.
                $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
            }
        }
    }

    /**
     * Build details array.
     *
     * Sources, in order: the details of the trailer's 'Info' object (when
     * present and exposing getHeader()), the page count under 'Pages' (0 when
     * getPages() throws an \Exception), and finally $this->metadata, whose
     * entries override same-named keys. String values coming from the first
     * two sources are decoded/repaired before the metadata is merged in.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        // Decode and repair encoded document properties
        foreach ($details as $key => $value) {
            if (\is_string($value)) {
                $details[$key] = self::repairDetailString($value);
            }
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Decodes or repairs a single string property of the document details.
     *
     * Strings that are not valid UTF-8 are treated as PDFDocEncoding: the
     * backslash + carriage-return pairs are removed and the rest is handed to
     * PDFDocEncoding::convertPDFDoc2UTF8(). Strings that already are valid
     * UTF-8 only get the inserted line-break artefacts removed.
     *
     * @param string $value raw property value
     *
     * @return string repaired value
     */
    private static function repairDetailString(string $value)
    {
        if (!mb_check_encoding($value, 'UTF-8')) {
            // If the string is just PDFDocEncoding, remove any line breaks
            // and decode the whole thing.
            return PDFDocEncoding::convertPDFDoc2UTF8(str_replace("\\\r", '', $value));
        }

        // The string is already UTF-8 encoded, so we only need to repair
        // Adobe's ham-fisted insertion of line breaks every ~127 characters,
        // which doesn't seem to be multi-byte safe.

        // Remove literal backslash + carriage return (0x5C 0x0D)
        $value = str_replace("\x5c\x0d", '', $value);

        // Remove backslash plus bytes written into high part of multibyte
        // unicode character. The matched bytes are replaced literally via
        // str_replace() so that neither the matched bytes nor the replacement
        // can be misread as regex syntax or back-references.
        while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
            $diff = (\ord($match[1]) - 182) * 64;
            $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove bytes written into low part of multibyte unicode character
        while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
            $diff = \ord($match[2]) - 181;
            $newbyte = \chr(\ord($match[1]) + $diff);
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove this byte string that Adobe occasionally adds between two
        // single byte characters in a unicode string
        return str_replace("\xe5\xb0\x8d", '', $value);
    }

    /**
     * Extract XMP Metadata
     *
     * Parses $content as XML and, when parsing succeeds, merges the resulting
     * property tree into $this->metadata. Nothing is merged when parsing
     * fails or when the packet declares a 'dc:format' other than
     * 'application/pdf'.
     *
     * @param string $content XMP packet (XML)
     */
    public function extractXMPMetadata(string $content): void
    {
        $xml = xml_parser_create();
        xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);

        if (1 === xml_parse_into_struct($xml, $content, $values)) {
            /*
             * short overview about the following code parts:
             *
             * The output of xml_parse_into_struct is a single dimensional array (= $values), and the $stack is a
             * last-on, first-off array of pointers to positions in $metadata, while iterating through it, that
             * potentially turn the results into a more intuitive multi-dimensional array. When an "open" XML tag is
             * encountered, we save the current $metadata context in the $stack, then create a child array of
             * $metadata and make that the current $metadata context. When a "close" XML tag is encountered, the
             * operations are reversed: the most recently added $metadata context from $stack (IOW, the parent of
             * the current element) is set as the current $metadata context.
             */
            $metadata = [];
            $stack = [];
            foreach ($values as $val) {
                // Standardize to lowercase
                $tag = strtolower($val['tag']);
                $isListItem = 'rdf:li' === $tag;

                // Ignore structural x: and rdf: XML elements (rdf:li carries
                // the actual list values and is therefore kept)
                if (0 === strpos($tag, 'x:') || (0 === strpos($tag, 'rdf:') && !$isListItem)) {
                    continue;
                }

                switch ($val['type']) {
                    case 'open':
                        // List items are appended, everything else is stored
                        // under its tag name.
                        if ($isListItem) {
                            $metadata[] = [];
                            $childKey = \count($metadata) - 1;
                        } else {
                            $metadata[$tag] = [];
                            $childKey = $tag;
                        }

                        // Remember the current context, then descend into the
                        // freshly created child. The slot index must be
                        // count($stack): slots are unset on "close", so the
                        // append operator would not reuse them.
                        $stack[\count($stack)] = &$metadata;
                        $metadata = &$metadata[$childKey];
                        break;

                    case 'complete':
                        if (isset($val['value'])) {
                            if ($isListItem) {
                                // Assign a value to this list item
                                $metadata[] = $val['value'];
                            } else {
                                // Else assign a value to this property
                                $metadata[$tag] = $val['value'];
                            }
                        }
                        break;

                    case 'close':
                        if (\is_array($metadata)) {
                            if (1 === \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) {
                                // A single string list item becomes the value
                                // of the property itself
                                $metadata = $metadata[0];
                            } elseif (0 === \count($metadata)) {
                                // An empty array becomes the empty string
                                $metadata = '';
                            }
                        }

                        // Return to the parent context and drop its slot
                        $metadata = &$stack[\count($stack) - 1];
                        unset($stack[\count($stack) - 1]);
                        break;
                }
            }

            // Only use this metadata if it's referring to a PDF
            if (!isset($metadata['dc:format']) || 'application/pdf' == $metadata['dc:format']) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // Source: https://www.adobe.com/devnet/xmp.html
                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
                // So if there are multiple XMP blocks, just merge the values
                // of each found block over top of the existing values
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }

        // TODO: remove this if-clause and its content when dropping PHP 7 support
        if (version_compare(PHP_VERSION, '8.0.0', '<')) {
            // ref: https://www.php.net/manual/en/function.xml-parser-free.php
            xml_parser_free($xml);

            // to avoid memory leaks; documentation said:
            // > it was necessary to also explicitly unset the reference to parser to avoid memory leaks
            unset($xml);
        }
    }

    /**
     * @return array the type/subtype index built by buildDictionary()
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }

    /**
     * Replaces the objects of the document and re-runs init().
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        $this->init();
    }

    /**
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }

    /**
     * @return PDFObject|Font|Page|Element|null the object stored under $id, or null when there is none
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }

    /**
     * Tells whether getObjectsByType() would return at least one object.
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return [] !== $this->getObjectsByType($type, $subtype);
    }

    /**
     * Returns the indexed objects of a type, optionally narrowed to a subtype.
     *
     * @param string      $type    value of the 'Type' header field
     * @param string|null $subtype value of the 'Subtype' header field; null
     *                             (or an empty string, the comparison is
     *                             loose) means "all objects of that type"
     *
     * @return array objects keyed by id; empty when the type or subtype is unknown
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        if (null != $subtype) {
            return $this->dictionary[$type]['subtype'][$subtype] ?? [];
        }

        return $this->dictionary[$type]['all'];
    }

    /**
     * @return Font[] all objects indexed under the type 'Font'
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }

    /**
     * @return Font|null the first indexed font, or null when there is none
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();

        return [] === $fonts ? null : reset($fonts);
    }

    /**
     * Lists the pages of the document.
     *
     * Lookup order: the 'Pages' entry of the first 'Catalog' object (when it
     * exposes getPages()), then the merged pages of every 'Pages' object,
     * then the plain 'Page' objects (re-indexed from 0, unordered).
     *
     * NOTE(review): `true` is passed to getPages() of the Pages objects
     * exactly as before; presumably it requests a recursive listing -
     * confirm against Pages::getPages().
     *
     * @return Page[]
     *
     * @throws MissingCatalogException when none of the three lookups applies
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogues = $this->getObjectsByType('Catalog');
            $catalogue = reset($catalogues);

            /** @var Pages $object */
            $object = $catalogue->get('Pages');
            // Same guard as in buildDetails(): method_exists() rejects null
            // with a TypeError on PHP 8.
            if (null !== $object && method_exists($object, 'getPages')) {
                return $object->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $chunks = [];

            /** @var Pages[] $objects */
            $objects = $this->getObjectsByType('Pages');
            foreach ($objects as $object) {
                $chunks[] = $object->getPages(true);
            }

            // Merge once instead of re-copying the growing list per object.
            return array_merge([], ...$chunks);
        }

        if ($this->hasObjectsByType('Page')) {
            // Search for 'page' (unordered pages).
            return array_values($this->getObjectsByType('Page'));
        }

        throw new MissingCatalogException('Missing catalog.');
    }

    /**
     * Returns the text of the document, one block per page, separated by a
     * blank line. Pages that are null or whose trimmed text is empty are
     * skipped.
     *
     * @param int|null $pageLimit when a positive integer, only the first
     *                            $pageLimit pages are read
     *
     * @throws MissingCatalogException propagated from getPages()
     */
    public function getText(?int $pageLimit = null): string
    {
        $texts = [];
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set and numeric.
        if (\is_int($pageLimit) && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against '' explicitly: a truthiness test would also
            // drop a page whose whole text is "0".
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }

    public function getTrailer(): Header
    {
        return $this->trailer;
    }

    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }

    /**
     * @return array the details computed by buildDetails(); empty until init() has run
     */
    public function getDetails(): array
    {
        return $this->details;
    }
}
||
| 471 |