smalot /
pdfparser
| 1 | <?php |
||||
| 2 | |||||
| 3 | /** |
||||
| 4 | * @file |
||||
| 5 | * This file is part of the PdfParser library. |
||||
| 6 | * |
||||
| 7 | * @author Sébastien MALOT <[email protected]> |
||||
| 8 | * |
||||
| 9 | * @date 2017-01-03 |
||||
| 10 | * |
||||
| 11 | * @license LGPLv3 |
||||
| 12 | * |
||||
| 13 | * @url <https://github.com/smalot/pdfparser> |
||||
| 14 | * |
||||
| 15 | * PdfParser is a pdf library written in PHP, extraction oriented. |
||||
| 16 | * Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
||||
| 17 | * |
||||
| 18 | * This program is free software: you can redistribute it and/or modify |
||||
| 19 | * it under the terms of the GNU Lesser General Public License as published by |
||||
| 20 | * the Free Software Foundation, either version 3 of the License, or |
||||
| 21 | * (at your option) any later version. |
||||
| 22 | * |
||||
| 23 | * This program is distributed in the hope that it will be useful, |
||||
| 24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
| 25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
| 26 | * GNU Lesser General Public License for more details. |
||||
| 27 | * |
||||
| 28 | * You should have received a copy of the GNU Lesser General Public License |
||||
| 29 | * along with this program. |
||||
| 30 | * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
||||
| 31 | */ |
||||
| 32 | |||||
| 33 | namespace Smalot\PdfParser; |
||||
| 34 | |||||
| 35 | /** |
||||
| 36 | * Technical references : |
||||
| 37 | * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html |
||||
| 38 | * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php |
||||
| 39 | * - http://www.php.net/manual/en/ref.pdf.php#74211 |
||||
| 40 | * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm |
||||
| 41 | * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm |
||||
| 42 | * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm |
||||
| 43 | * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm |
||||
| 44 | * |
||||
| 45 | * Class Document |
||||
| 46 | */ |
||||
class Document
{
    /**
     * XMP container tags mapped to the metadata key they announce.
     *
     * While one of these tags is open, the value of a nested RDF:LI element
     * is stored under the mapped key. Tag names are upper-case because that
     * is how the tags produced by the XML parser are matched in this class.
     */
    private const XMP_CONTAINER_TAGS = [
        'DC:CREATOR' => 'Author',
        'DC:DESCRIPTION' => 'Description',
        'DC:TITLE' => 'Title',
        'DC:SUBJECT' => 'Subject',
    ];

    /**
     * XMP tags whose own value is stored directly under the mapped
     * metadata key.
     */
    private const XMP_VALUE_TAGS = [
        'DC:FORMAT' => 'Format',
        'PDF:KEYWORDS' => 'Keywords',
        'PDF:PRODUCER' => 'Producer',
        'PDFX:SOURCEMODIFIED' => 'SourceModified',
        'PDFX:COMPANY' => 'Company',
        'XMP:CREATEDATE' => 'CreationDate',
        'XMP:CREATORTOOL' => 'Creator',
        'XMP:MODIFYDATE' => 'ModifyDate',
        'XMP:METADATADATE' => 'MetadataDate',
        'XMPMM:DOCUMENTID' => 'DocumentUUID',
        'XMPMM:INSTANCEID' => 'InstanceUUID',
    ];

    /**
     * Objects of the document, indexed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Objects cached by type and subtype:
     * [type => ['all' => [id => object], 'subtype' => [subtype => [id => object]]]].
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * Key/value pairs collected by extractXMPMetadata().
     *
     * @var array
     */
    protected $metadata = [];

    /**
     * Document details, rebuilt by buildDetails(). Starts out empty so that
     * getDetails() can honour its array return type before init() has run.
     *
     * @var array
     */
    protected $details = [];

    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }

    /**
     * Rebuild the dictionary and the details, then initialise every object
     * (header first, then the object itself).
     */
    public function init()
    {
        $this->buildDictionary();

        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $object) {
            $object->getHeader()->init();
            $object->init();
        }
    }

    /**
     * Build dictionary based on type header field.
     *
     * Objects whose 'Type' content is null or empty are not indexed.
     */
    protected function buildDictionary()
    {
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            $header = $object->getHeader();
            $type = $header->get('Type')->getContent();

            // Loose comparison on purpose: an empty content is skipped too.
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = [
                    'all' => [],
                    'subtype' => [],
                ];
            }

            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $header->get('Subtype')->getContent();
            if (null == $subtype) {
                continue;
            }

            if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                $this->dictionary[$type]['subtype'][$subtype] = [];
            }

            $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
        }
    }

    /**
     * Build details array.
     *
     * Combines the trailer's 'Info' header details, the page count under the
     * 'Pages' key and the extracted XMP metadata. On a key clash the XMP
     * metadata wins, because it is merged last.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count; a document without any page source
        // reports zero pages instead of failing.
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Extract XMP Metadata
     *
     * Parses $content as XML and stores the recognised values in the
     * metadata array. Nothing is stored when the XML cannot be parsed.
     *
     * @param string $content XMP packet (XML)
     */
    public function extractXMPMetadata(string $content): void
    {
        $xml = xml_parser_create();
        xml_parser_set_option($xml, XML_OPTION_SKIP_WHITE, 1);

        if (!xml_parse_into_struct($xml, $content, $values)) {
            return;
        }

        // Metadata key announced by the container tag that is currently
        // open, or '' when no such container is open.
        $detail = '';

        foreach ($values as $val) {
            $tag = $val['tag'];
            $type = $val['type'];

            if (isset(self::XMP_CONTAINER_TAGS[$tag])) {
                // Any other event on the container (close, complete, cdata)
                // resets the pending key.
                $detail = 'open' === $type ? self::XMP_CONTAINER_TAGS[$tag] : '';
                continue;
            }

            if ('complete' !== $type || !isset($val['value'])) {
                continue;
            }

            if ('RDF:LI' === $tag) {
                if ('' !== $detail) {
                    $this->metadata[$detail] = $val['value'];
                }
            } elseif (isset(self::XMP_VALUE_TAGS[$tag])) {
                $this->metadata[self::XMP_VALUE_TAGS[$tag]] = $val['value'];
            }
        }
    }

    /**
     * @return array the type/subtype object cache built by buildDictionary()
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }

    /**
     * Replace the document objects and re-run init().
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        $this->init();
    }

    /**
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }

    /**
     * @return PDFObject|Font|Page|Element|null the object stored under $id, or null when there is none
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }

    /**
     * Tell whether at least one object of the given type (and subtype, when
     * provided) is present in the dictionary.
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return [] !== $this->getObjectsByType($type, $subtype);
    }

    /**
     * Return the dictionary objects of the given type, indexed by object id.
     *
     * A null or empty $subtype selects every object of the type.
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // Loose comparison on purpose: '' behaves like "no subtype".
        if (null == $subtype) {
            return $this->dictionary[$type]['all'];
        }

        return $this->dictionary[$type]['subtype'][$subtype] ?? [];
    }

    /**
     * @return Font[]
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }

    /**
     * @return Font|null the first font of the dictionary, or null when the document has none
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();
        if ([] === $fonts) {
            return null;
        }

        return reset($fonts);
    }

    /**
     * Return the pages from the first source that is present: the catalog's
     * 'Pages' entry, then the 'Pages' objects, then the bare 'Page' objects.
     *
     * @return Page[]
     *
     * @throws \Exception when none of those sources is present
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogues = $this->getObjectsByType('Catalog');
            $catalogue = reset($catalogues);

            /** @var Pages $object */
            $object = $catalogue->get('Pages');
            if (method_exists($object, 'getPages')) {
                return $object->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $pageLists = [];

            /** @var Pages[] $objects */
            $objects = $this->getObjectsByType('Pages');
            foreach ($objects as $object) {
                $pageLists[] = $object->getPages(true);
            }

            // Merge once instead of re-copying the accumulated result on
            // every iteration.
            return array_merge([], ...$pageLists);
        }

        if ($this->hasObjectsByType('Page')) {
            // Search for 'page' (unordered pages).
            return array_values($this->getObjectsByType('Page'));
        }

        throw new \Exception('Missing catalog.');
    }

    /**
     * Return the trimmed text of the pages, separated by a blank line.
     * Pages without text are left out.
     *
     * @param int|null $pageLimit only read the first $pageLimit pages; null, zero or a negative value reads all pages
     *
     * @throws \Exception when the pages cannot be listed (see getPages())
     */
    public function getText(?int $pageLimit = null): string
    {
        $texts = [];
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set.
        if (null !== $pageLimit && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            $text = trim($page->getText());
            // Compare against '' explicitly: a truthiness test would also
            // discard a page whose whole text is "0".
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }

    public function getTrailer(): Header
    {
        return $this->trailer;
    }

    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }

    /**
     * @return array the details built by buildDetails(); empty until init() has run
     */
    public function getDetails(): array
    {
        return $this->details;
    }
}
||||
| 422 |
The issue could also be caused by a filter entry in the build configuration. If the path has been excluded in your configuration, e.g.
excluded_paths: ["lib/*"], you can move it to the dependency path list as follows:For further information see https://scrutinizer-ci.com/docs/tools/php/php-scrutinizer/#list-dependency-paths