smalot /
pdfparser
| 1 | <?php |
||
| 2 | |||
| 3 | /** |
||
| 4 | * @file |
||
| 5 | * This file is part of the PdfParser library. |
||
| 6 | * |
||
| 7 | * @author Sébastien MALOT <[email protected]> |
||
| 8 | * |
||
| 9 | * @date 2017-01-03 |
||
| 10 | * |
||
| 11 | * @license LGPLv3 |
||
| 12 | * |
||
| 13 | * @url <https://github.com/smalot/pdfparser> |
||
| 14 | * |
||
| 15 | * PdfParser is a pdf library written in PHP, extraction oriented. |
||
| 16 | * Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
||
| 17 | * |
||
| 18 | * This program is free software: you can redistribute it and/or modify |
||
| 19 | * it under the terms of the GNU Lesser General Public License as published by |
||
| 20 | * the Free Software Foundation, either version 3 of the License, or |
||
| 21 | * (at your option) any later version. |
||
| 22 | * |
||
| 23 | * This program is distributed in the hope that it will be useful, |
||
| 24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
| 25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
| 26 | * GNU Lesser General Public License for more details. |
||
| 27 | * |
||
| 28 | * You should have received a copy of the GNU Lesser General Public License |
||
| 29 | * along with this program. |
||
| 30 | * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
||
| 31 | */ |
||
| 32 | |||
| 33 | namespace Smalot\PdfParser; |
||
| 34 | |||
| 35 | use Smalot\PdfParser\Element\ElementArray; |
||
| 36 | use Smalot\PdfParser\Element\ElementBoolean; |
||
| 37 | use Smalot\PdfParser\Element\ElementDate; |
||
| 38 | use Smalot\PdfParser\Element\ElementHexa; |
||
| 39 | use Smalot\PdfParser\Element\ElementName; |
||
| 40 | use Smalot\PdfParser\Element\ElementNull; |
||
| 41 | use Smalot\PdfParser\Element\ElementNumeric; |
||
| 42 | use Smalot\PdfParser\Element\ElementString; |
||
| 43 | use Smalot\PdfParser\Element\ElementXRef; |
||
| 44 | use Smalot\PdfParser\RawData\RawDataParser; |
||
| 45 | |||
/**
 * Class Parser
 *
 * Builds a Document from PDF data. The low-level decoding is delegated to
 * RawDataParser; this class converts the resulting nested structures into
 * PDFObject, Header and Element instances.
 */
class Parser
{
    /**
     * @var Config
     */
    private $config;

    /**
     * Objects collected while parsing the current content, indexed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * @var RawDataParser
     */
    protected $rawDataParser;

    /**
     * @param array       $cfg    settings forwarded untouched to RawDataParser
     * @param Config|null $config configuration to use; a default Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        $this->config = $config ?? new Config();
        $this->rawDataParser = new RawDataParser($cfg, $this->config);
    }

    public function getConfig(): Config
    {
        return $this->config;
    }

    /**
     * Reads a file from disk and parses its content.
     *
     * @param string $filename path of the PDF file to read
     *
     * @throws \Exception if the file cannot be read or its content cannot be parsed
     */
    public function parseFile(string $filename): Document
    {
        /*
         * 2018/06/20 @doganoo: users repeatedly complained that parseFile()
         * died silently, so the error control operator (@) was removed and
         * the failure is advertised through the @throws tag instead.
         *
         * See here for an example: https://github.com/smalot/pdfparser/issues/204
         */
        $content = file_get_contents($filename);

        // file_get_contents() signals failure by returning false; without this
        // check the failure would be coerced to an empty string and reported
        // by the parser as an unrelated error.
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }

    /**
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // Create structure from raw data.
        list($xref, $data) = $this->rawDataParser->parseData($content);

        if (isset($xref['trailer']['encrypt'])) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Create destination object.
        $document = new Document();
        $this->objects = [];

        foreach ($data as $id => $structure) {
            $this->parseObject($id, $structure, $document);
            // Drop each raw structure as soon as it has been converted.
            unset($data[$id]);
        }

        $document->setTrailer($this->parseTrailer($xref['trailer'], $document));
        $document->setObjects($this->objects);

        return $document;
    }

    /**
     * Converts the trailer structure into a Header.
     *
     * Entry names get their first letter upper-cased. Numeric values become
     * ElementNumeric, arrays are converted recursively and wrapped in an
     * ElementArray, strings containing "_" become ElementXRef and any other
     * value is parsed like a "(" (literal string) header element.
     *
     * @param array         $structure trailer entries indexed by name
     * @param Document|null $document  document the created elements are attached to
     *
     * @return Header
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $trailer = [];

        foreach ($structure as $name => $values) {
            $name = ucfirst($name);

            if (is_numeric($values)) {
                $trailer[$name] = new ElementNumeric($values);
            } elseif (\is_array($values)) {
                $value = $this->parseTrailer($values, null);
                $trailer[$name] = new ElementArray($value, null);
            } elseif (false !== strpos($values, '_')) {
                $trailer[$name] = new ElementXRef($values, $document);
            } else {
                $trailer[$name] = $this->parseHeaderElement('(', $values, $document);
            }
        }

        return new Header($trailer, $document);
    }

    /**
     * Converts one raw object structure into a PDFObject stored in $this->objects.
     *
     * A stream whose header type is "ObjStm" is expanded into the objects it
     * embeds and is not stored itself.
     *
     * @param string        $id        object id used as key in $this->objects
     * @param array         $structure parts of the object; each part is an array
     *                                 whose first item is the part type
     * @param Document|null $document  document the created objects are attached to
     *
     * @throws \Exception
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            // Integer parts carry no type/value pair; normalise them so they
            // fall through to the default branch below.
            if (\is_int($part)) {
                $part = [null, null];
            }

            // The switch deliberately keeps loose comparison.
            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $sub_type = $sub_element[0];
                        $sub_value = $sub_element[1];
                        $elements[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    $content = $part[3][0] ?? $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store this content.
                        return;
                    }

                    // Guard against a null document: the parameter is nullable.
                    if (null !== $document && $header->get('Type')->equals('Metadata')) {
                        // Attempt to parse XMP XML Metadata
                        $document->extractXMPMetadata($content);
                    }
                    break;

                default:
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Expands the content of an "ObjStm" stream into individual objects.
     *
     * The content starts with pairs of integers (object number, offset)
     * followed by the objects' data; every embedded object is registered in
     * $this->objects under the id "<number>_0".
     *
     * @param mixed         $content  stream content as extracted by parseObject()
     * @param Document|null $document document the created objects are attached to
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split xrefs and contents.
        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
        $content = $match[3];

        // Extract xrefs.
        $xrefs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );
        $table = [];

        foreach ($xrefs as $xref) {
            list($objectNumber, $offset) = preg_split("/\s+/", trim($xref));
            $table[$offset] = $objectNumber;
        }

        // Order by offset so each object ends where the next one starts.
        ksort($table);

        $ids = array_values($table);
        $positions = array_keys($table);

        foreach ($positions as $index => $position) {
            $id = $ids[$index].'_0';
            $next_position = $positions[$index + 1] ?? \strlen($content);
            $sub_content = substr($content, $position, (int) $next_position - (int) $position);

            $sub_header = Header::parse($sub_content, $document);
            $object = PDFObject::factory($document, $sub_header, '', $this->config);
            $this->objects[$id] = $object;
        }
    }

    /**
     * Converts a flat list of alternating name / value parts into a Header.
     *
     * @param array         $structure parts at even positions hold the entry name
     *                                 in index 1; the following part holds the
     *                                 value type (index 0) and value (index 1)
     * @param Document|null $document  document the created elements are attached to
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];
        $count = \count($structure);

        for ($position = 0; $position < $count; $position += 2) {
            $name = $structure[$position][1];
            $type = $structure[$position + 1][0];
            $value = $structure[$position + 1][1];

            $elements[$name] = $this->parseHeaderElement($type, $value, $document);
        }

        return new Header($elements, $document);
    }

    /**
     * Converts a single typed value into the matching Element or Header.
     *
     * @param string|null   $type     part type as produced by the raw data parser
     * @param string|array  $value
     * @param Document|null $document document the created element is attached to
     *
     * @return Element|Header|null null for types that carry no data
     *
     * @throws \Exception if the type is not recognised
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        // Loose comparisons are intentional: null, '', false, 0, '0' and []
        // are all treated as "no value" here.
        $valueIsEmpty = null == $value || '' == $value || false == $value;
        if (('<<' === $type || '>>' === $type) && $valueIsEmpty) {
            $value = [];
        }

        switch ($type) {
            case '<<':
            case '>>':
                $header = $this->parseHeader($value, $document);
                // The factory result is discarded; the call is kept as in the
                // original code. NOTE(review): presumably relied on for a side
                // effect of the factory - confirm.
                PDFObject::factory($document, $header, null, $this->config);

                return $header;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                // A literal string is first tried as a date, then as plain text.
                if ($date = ElementDate::parse('('.$value.')', $document)) {
                    return $date;
                }

                return ElementString::parse('('.$value.')', $document);

            case '<':
                // Hexadecimal strings are decoded, then handled like literal strings.
                return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $values = [];

                if (\is_array($value)) {
                    foreach ($value as $sub_element) {
                        $sub_type = $sub_element[0];
                        $sub_value = $sub_element[1];
                        $values[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }
                }

                return new ElementArray($values, $document);

            case 'endstream':
            case 'obj': // Meaning unclear to the original author; ignoring it fixed a real project.
            case '':
                // Nothing to do with.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
}
||
| 334 |
This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.
This is most likely a typographical error or the method has been renamed.