1 | <?php |
||||||
2 | |||||||
3 | /** |
||||||
4 | * @file |
||||||
5 | * This file is part of the PdfParser library. |
||||||
6 | * |
||||||
7 | * @author Sébastien MALOT <[email protected]> |
||||||
8 | * |
||||||
9 | * @date 2017-01-03 |
||||||
10 | * |
||||||
11 | * @license LGPLv3 |
||||||
12 | * |
||||||
13 | * @url <https://github.com/smalot/pdfparser> |
||||||
14 | * |
||||||
15 | * PdfParser is a pdf library written in PHP, extraction oriented. |
||||||
16 | * Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
||||||
17 | * |
||||||
18 | * This program is free software: you can redistribute it and/or modify |
||||||
19 | * it under the terms of the GNU Lesser General Public License as published by |
||||||
20 | * the Free Software Foundation, either version 3 of the License, or |
||||||
21 | * (at your option) any later version. |
||||||
22 | * |
||||||
23 | * This program is distributed in the hope that it will be useful, |
||||||
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||||
26 | * GNU Lesser General Public License for more details. |
||||||
27 | * |
||||||
28 | * You should have received a copy of the GNU Lesser General Public License |
||||||
29 | * along with this program. |
||||||
30 | * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
||||||
31 | */ |
||||||
32 | |||||||
33 | namespace Smalot\PdfParser; |
||||||
34 | |||||||
35 | use Smalot\PdfParser\Element\ElementArray; |
||||||
36 | use Smalot\PdfParser\Element\ElementBoolean; |
||||||
37 | use Smalot\PdfParser\Element\ElementDate; |
||||||
38 | use Smalot\PdfParser\Element\ElementHexa; |
||||||
39 | use Smalot\PdfParser\Element\ElementName; |
||||||
40 | use Smalot\PdfParser\Element\ElementNull; |
||||||
41 | use Smalot\PdfParser\Element\ElementNumeric; |
||||||
42 | use Smalot\PdfParser\Element\ElementString; |
||||||
43 | use Smalot\PdfParser\Element\ElementXRef; |
||||||
44 | use Smalot\PdfParser\RawData\RawDataParser; |
||||||
45 | |||||||
46 | /** |
||||||
47 | * Class Parser |
||||||
48 | */ |
||||||
49 | class Parser |
||||||
50 | { |
||||||
51 | /** |
||||||
52 | * @var Config |
||||||
53 | */ |
||||||
54 | private $config; |
||||||
55 | |||||||
56 | /** |
||||||
57 | * @var PDFObject[] |
||||||
58 | */ |
||||||
59 | protected $objects = []; |
||||||
60 | |||||||
61 | protected $rawDataParser; |
||||||
62 | |||||||
/**
 * Creates the parser and the raw data parser it delegates to.
 *
 * @param array       $cfg    settings handed over unchanged to RawDataParser
 * @param Config|null $config configuration to use; a new Config is created when none is given
 */
public function __construct($cfg = [], ?Config $config = null)
{
    if (!$config) {
        $config = new Config();
    }

    $this->config = $config;
    $this->rawDataParser = new RawDataParser($cfg, $config);
}
||||||
68 | |||||||
/**
 * Returns the configuration object this parser was constructed with
 * (or the one it created for itself when none was supplied).
 */
public function getConfig(): Config
{
    return $this->config;
}
||||||
73 | |||||||
/**
 * Reads a PDF file and parses its content.
 *
 * @param string $filename path of the file to read
 *
 * @throws \Exception if the file cannot be read, or if parseContent() rejects its content
 */
public function parseFile(string $filename): Document
{
    /*
     * 2018/06/20 @doganoo: users complained several times that parseFile()
     * dies silently, so the error control operator (@) was removed from this
     * call and the failure is advertised through the @throws tag instead.
     *
     * See here for an example: https://github.com/smalot/pdfparser/issues/204
     */
    $content = file_get_contents($filename);

    // file_get_contents() signals failure by returning false; stop here with
    // a message naming the file rather than handing a non-string to
    // parseContent().
    if (false === $content) {
        throw new \Exception('Unable to read file "'.$filename.'".');
    }

    return $this->parseContent($content);
}
||||||
93 | |||||||
/**
 * Parses raw PDF data and returns the resulting Document.
 *
 * Every object reported by the raw data parser is converted with
 * parseObject(); the trailer is converted with parseTrailer().
 *
 * @param string $content PDF content to parse
 *
 * @throws \Exception if secured PDF file was detected
 * @throws \Exception if no object list was found
 */
public function parseContent(string $content): Document
{
    // Create structure from raw data.
    [$xref, $data] = $this->rawDataParser->parseData($content);

    $hasEncryptEntry = isset($xref['trailer']['encrypt']);
    if ($hasEncryptEntry && false === $this->config->getIgnoreEncryption()) {
        throw new \Exception('Secured pdf file are currently not supported.');
    }

    if (empty($data)) {
        throw new \Exception('Object list not found. Possible secured file.');
    }

    // Create destination object and start from an empty object list.
    $document = new Document();
    $this->objects = [];

    foreach (array_keys($data) as $objectId) {
        $this->parseObject($objectId, $data[$objectId], $document);
        // Drop the raw structure as soon as it has been converted.
        unset($data[$objectId]);
    }

    $trailer = $this->parseTrailer($xref['trailer'], $document);
    $document->setTrailer($trailer);
    $document->setObjects($this->objects);

    return $document;
}
||||||
127 | |||||||
/**
 * Converts the raw trailer into a Header.
 *
 * Each entry name gets its first letter upper-cased. The value is turned
 * into an ElementNumeric (numeric value), an ElementArray wrapping a
 * recursively parsed trailer (array value), an ElementXRef (string
 * containing an underscore) or, otherwise, whatever parseHeaderElement()
 * builds for a '(' token.
 *
 * @param array $structure raw trailer entries indexed by name
 *
 * @return Header
 */
protected function parseTrailer(array $structure, ?Document $document)
{
    $entries = [];

    foreach ($structure as $key => $raw) {
        $key = ucfirst($key);

        if (is_numeric($raw)) {
            $entries[$key] = new ElementNumeric($raw);
            continue;
        }

        if (\is_array($raw)) {
            // Nested entries are parsed without a document.
            $nested = $this->parseTrailer($raw, null);
            $entries[$key] = new ElementArray($nested, null);
            continue;
        }

        if (false !== strpos($raw, '_')) {
            $entries[$key] = new ElementXRef($raw, $document);
            continue;
        }

        $entries[$key] = $this->parseHeaderElement('(', $raw, $document);
    }

    return new Header($entries, $document);
}
||||||
149 | |||||||
/**
 * Converts the raw parts of one object into a PDFObject stored in $this->objects.
 *
 * Parts are processed in order:
 *  - '['      builds the header from the converted array items,
 *  - '<<'     builds the header with parseHeader(),
 *  - 'stream' supplies the content; a stream whose header Type is 'ObjStm'
 *             is expanded into its embedded objects and nothing is stored
 *             for the stream itself, while a 'Metadata' stream is also
 *             handed to the document for XMP extraction,
 *  - anything else is converted with parseHeaderElement() and, when that
 *    yields an element, becomes a one-element header.
 *
 * @param string $id        key under which the object is stored in $this->objects
 * @param array  $structure raw parts; index 0 of a part is its type and index 1 its value
 *
 * @throws \Exception
 */
protected function parseObject(string $id, array $structure, ?Document $document)
{
    $header = new Header([], $document);
    $content = '';

    foreach ($structure as $part) {
        // An integer part carries no token: treat it as an empty one.
        if (\is_int($part)) {
            $part = [null, null];
        }

        switch ($part[0]) {
            case '[':
                $elements = [];

                foreach ($part[1] as $sub_element) {
                    $elements[] = $this->parseHeaderElement($sub_element[0], $sub_element[1], $document);
                }

                $header = new Header($elements, $document);
                break;

            case '<<':
                $header = $this->parseHeader($part[1], $document);
                break;

            case 'stream':
                $content = $part[3][0] ?? $part[1];

                if ($header->get('Type')->equals('ObjStm')) {
                    $this->parseObjectStream($content, $document);

                    // It is not necessary to store this content.
                    return;
                }

                // $document is nullable in the signature: only call into it
                // when it is actually there.
                if ($header->get('Type')->equals('Metadata') && null !== $document) {
                    // Attempt to parse XMP XML Metadata
                    $document->extractXMPMetadata($content);
                }
                break;

            default:
                // NOTE(review): this compares the whole part with 'null', not
                // its type; kept unchanged to preserve behaviour.
                if ('null' != $part) {
                    $element = $this->parseHeaderElement($part[0], $part[1], $document);

                    if ($element) {
                        $header = new Header([$element], $document);
                    }
                }
                break;
        }
    }

    // Do not overwrite an object already registered under this id.
    if (!isset($this->objects[$id])) {
        $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
    }
}

/**
 * Registers every object embedded in the content of an 'ObjStm' stream.
 *
 * The content starts with pairs of integers; each pair is read as
 * "<object number> <offset>", the offset locating the object inside the
 * data that follows the pairs. Objects are stored as "<object number>_0".
 *
 * @param string $content decoded stream content
 */
private function parseObjectStream($content, ?Document $document): void
{
    $match = [];

    // Split xrefs and contents. If PCRE gives up there is nothing to
    // extract, so stop instead of reading undefined capture groups.
    if (1 !== preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match)) {
        return;
    }

    $content = $match[3];

    // Extract xrefs.
    $xrefs = preg_split(
        '/(\d+\s+\d+\s*)/s',
        $match[1],
        -1,
        \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
    ) ?: [];
    $table = [];

    foreach ($xrefs as $xref) {
        list($objectNumber, $offset) = preg_split("/\s+/", trim($xref));
        $table[$offset] = $objectNumber;
    }

    // Order by offset so that each object ends where the next one starts.
    ksort($table);

    $objectNumbers = array_values($table);
    $offsets = array_keys($table);

    foreach ($offsets as $index => $offset) {
        $nextOffset = $offsets[$index + 1] ?? \strlen($content);
        $sub_content = substr($content, $offset, (int) $nextOffset - (int) $offset);

        $sub_header = Header::parse($sub_content, $document);
        $this->objects[$objectNumbers[$index].'_0'] = PDFObject::factory($document, $sub_header, '', $this->config);
    }
}
||||||
240 | |||||||
/**
 * Builds a Header from a flat list of alternating name and value tokens.
 *
 * Tokens at even positions provide the entry name (index 1 of the token);
 * the token that follows provides the value type (index 0) and the raw
 * value (index 1), which parseHeaderElement() converts.
 *
 * @param array $structure flat token list: name, value, name, value, ...
 *
 * @throws \Exception
 */
protected function parseHeader(array $structure, ?Document $document): Header
{
    $elements = [];
    $count = \count($structure);

    for ($position = 0; $position < $count; $position += 2) {
        $name = $structure[$position][1];

        // A list with an odd number of tokens ends on a name without value.
        // Use an empty token for it instead of reading past the end of the
        // list; parseHeaderElement() turns an empty token into null.
        $valueToken = $structure[$position + 1] ?? [null, null];

        $elements[$name] = $this->parseHeaderElement($valueToken[0], $valueToken[1], $document);
    }

    return new Header($elements, $document);
}
||||||
259 | |||||||
/**
 * Converts one raw token into the matching element object.
 *
 * Dictionary tokens ('<<' and '>>') become a Header, '[' becomes an
 * ElementArray whose items are converted recursively, '<' is decoded with
 * ElementHexa::decode() and then handled like a '(' token, and 'endstream',
 * 'obj' and the empty type yield null.
 *
 * @param string|array $value
 *
 * @return Element|Header|null
 *
 * @throws \Exception if the type is not one of the handled types
 */
protected function parseHeaderElement(?string $type, $value, ?Document $document)
{
    if ('<<' === $type || '>>' === $type) {
        // Loose comparisons are intentional: any empty-ish value stands for
        // a dictionary without entries.
        if (null == $value || '' == $value || false == $value) {
            $value = [];
        }

        $dictionary = $this->parseHeader($value, $document);
        // The object built here is not kept; only the header is returned.
        PDFObject::factory($document, $dictionary, null, $this->config);

        return $dictionary;
    }

    switch ($type) {
        case 'numeric':
            return new ElementNumeric($value);

        case 'boolean':
            return new ElementBoolean($value);

        case 'null':
            return new ElementNull();

        case '(':
            // Try a date first; fall back to a plain string otherwise.
            $date = ElementDate::parse('('.$value.')', $document);

            if (!$date) {
                return ElementString::parse('('.$value.')', $document);
            }

            return $date;

        case '<':
            return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);

        case '/':
            return ElementName::parse('/'.$value, $document);

        case 'ojbref': // old mistake in tcpdf parser
        case 'objref':
            return new ElementXRef($value, $document);

        case '[':
            $items = [];

            if (\is_array($value)) {
                foreach ($value as $item) {
                    $items[] = $this->parseHeaderElement($item[0], $item[1], $document);
                }
            }

            return new ElementArray($items, $document);

        case 'endstream':
        case 'obj': // I don't know what it means but got my project fixed.
        case '':
            // Nothing to do with.
            return null;

        default:
            throw new \Exception('Invalid type: "'.$type.'".');
    }
}
||||||
331 | } |
||||||
332 |