php_analyzer.check_variables.key_is_overwritten_by_foreach
1 | <?php |
||
2 | |||
3 | /** |
||
4 | * @file |
||
5 | * This file is part of the PdfParser library. |
||
6 | * |
||
7 | * @author Sébastien MALOT <[email protected]> |
||
8 | * |
||
9 | * @date 2017-01-03 |
||
10 | * |
||
11 | * @license LGPLv3 |
||
12 | * |
||
13 | * @url <https://github.com/smalot/pdfparser> |
||
14 | * |
||
15 | * PdfParser is a pdf library written in PHP, extraction oriented. |
||
16 | * Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
||
17 | * |
||
18 | * This program is free software: you can redistribute it and/or modify |
||
19 | * it under the terms of the GNU Lesser General Public License as published by |
||
20 | * the Free Software Foundation, either version 3 of the License, or |
||
21 | * (at your option) any later version. |
||
22 | * |
||
23 | * This program is distributed in the hope that it will be useful, |
||
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
26 | * GNU Lesser General Public License for more details. |
||
27 | * |
||
28 | * You should have received a copy of the GNU Lesser General Public License |
||
29 | * along with this program. |
||
30 | * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
||
31 | */ |
||
32 | |||
33 | namespace Smalot\PdfParser; |
||
34 | |||
35 | use Smalot\PdfParser\Element\ElementArray; |
||
36 | use Smalot\PdfParser\Element\ElementBoolean; |
||
37 | use Smalot\PdfParser\Element\ElementDate; |
||
38 | use Smalot\PdfParser\Element\ElementHexa; |
||
39 | use Smalot\PdfParser\Element\ElementName; |
||
40 | use Smalot\PdfParser\Element\ElementNull; |
||
41 | use Smalot\PdfParser\Element\ElementNumeric; |
||
42 | use Smalot\PdfParser\Element\ElementString; |
||
43 | use Smalot\PdfParser\Element\ElementXRef; |
||
44 | use Smalot\PdfParser\RawData\RawDataParser; |
||
45 | |||
/**
 * Class Parser
 *
 * Turns raw PDF data into a Document. Tokenising is delegated to
 * RawDataParser; this class converts the resulting nested arrays into
 * Header, Element and PDFObject instances.
 */
class Parser
{
    /**
     * @var Config
     */
    private $config;

    /**
     * Objects collected during the current parse, indexed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * @var RawDataParser
     */
    protected $rawDataParser;

    /**
     * @param array       $cfg    options handed over unchanged to RawDataParser
     * @param Config|null $config configuration to use; a default Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        $this->config = $config ?: new Config();
        $this->rawDataParser = new RawDataParser($cfg, $this->config);
    }

    public function getConfig(): Config
    {
        return $this->config;
    }

    /**
     * Reads a file and parses its content.
     *
     * The error control operator (@) is deliberately not used on the read:
     * users repeatedly reported that this method failed silently, so the PHP
     * warning stays visible and a failure is reported through an exception.
     *
     * @see https://github.com/smalot/pdfparser/issues/204
     *
     * @throws \Exception if the file cannot be read or its content cannot be parsed
     */
    public function parseFile(string $filename): Document
    {
        $content = file_get_contents($filename);

        // file_get_contents() returns false on failure; without this check the
        // value would be coerced to an empty string and fail later with an
        // unrelated error.
        if (false === $content) {
            throw new \Exception(sprintf('Unable to read file "%s".', $filename));
        }

        return $this->parseContent($content);
    }

    /**
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // Create structure from raw data.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        if (isset($xref['trailer']['encrypt']) && false === $this->config->getIgnoreEncryption()) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Create destination object.
        $document = new Document();
        $this->objects = [];

        foreach ($data as $id => $structure) {
            $this->parseObject($id, $structure, $document);
            // Drop each raw structure as soon as it has been converted.
            unset($data[$id]);
        }

        $document->setTrailer($this->parseTrailer($xref['trailer'], $document));
        $document->setObjects($this->objects);

        return $document;
    }

    /**
     * Converts a raw trailer dictionary into a Header.
     *
     * @param array         $structure trailer entries indexed by name
     * @param Document|null $document  document the elements are attached to; null for nested dictionaries
     *
     * @return Header
     *
     * @throws \Exception
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $trailer = [];

        foreach ($structure as $name => $values) {
            // Use a separate variable so the foreach key is not overwritten.
            $key = ucfirst($name);

            if (is_numeric($values)) {
                $trailer[$key] = new ElementNumeric($values);
            } elseif (\is_array($values)) {
                // Nested dictionaries are parsed recursively without a document.
                $trailer[$key] = new ElementArray($this->parseTrailer($values, null), null);
            } elseif (false !== strpos($values, '_')) {
                // NOTE(review): presumably an object reference such as "12_0" - confirm against RawDataParser.
                $trailer[$key] = new ElementXRef($values, $document);
            } else {
                $trailer[$key] = $this->parseHeaderElement('(', $values, $document);
            }
        }

        return new Header($trailer, $document);
    }

    /**
     * Converts one raw object structure into a PDFObject stored in $this->objects.
     *
     * A stream whose header type is "ObjStm" is not stored itself: the objects
     * embedded in it are registered instead and the method returns early.
     *
     * @param string        $id        key under which the object is stored
     * @param array         $structure raw parts of the object
     * @param Document|null $document  document the object belongs to
     *
     * @throws \Exception
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            if (\is_int($part)) {
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $elements[] = $this->parseHeaderElement($sub_element[0], $sub_element[1], $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    $content = $part[3][0] ?? $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store this content.
                        return;
                    }

                    if ($header->get('Type')->equals('Metadata')) {
                        // Attempt to parse XMP XML Metadata
                        $document->extractXMPMetadata($content);
                    }
                    break;

                default:
                    // Loose comparison kept on purpose: it is always true for array parts.
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Registers every object embedded in an object stream in $this->objects.
     *
     * The content is expected to begin with whitespace-separated pairs of
     * integers (object number, offset) followed by the data the offsets point
     * into. Each embedded object is stored under "<object number>_0".
     *
     * @param string        $content  decoded content of the object stream
     * @param Document|null $document document the objects belong to
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split xrefs and contents. If the pattern cannot be applied (PCRE
        // error) there is nothing that can be extracted.
        if (1 !== preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match)) {
            return;
        }

        $body = $match[3];

        // Extract xrefs.
        $pairs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        ) ?: [];

        $numberByOffset = [];

        foreach ($pairs as $pair) {
            [$number, $offset] = preg_split('/\s+/', trim($pair));
            $numberByOffset[$offset] = $number;
        }

        // Order by offset so that each object ends where the next one starts.
        ksort($numberByOffset);

        $numbers = array_values($numberByOffset);
        $offsets = array_keys($numberByOffset);

        foreach ($offsets as $index => $offset) {
            $nextOffset = $offsets[$index + 1] ?? \strlen($body);
            $subContent = substr($body, $offset, (int) $nextOffset - (int) $offset);

            $subHeader = Header::parse($subContent, $document);
            $this->objects[$numbers[$index].'_0'] = PDFObject::factory($document, $subHeader, '', $this->config);
        }
    }

    /**
     * Builds a Header from a flat list in which names and values alternate.
     *
     * @param array         $structure even positions hold the name entry, odd positions the matching value entry
     * @param Document|null $document  document the elements are attached to
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];
        $count = \count($structure);

        for ($position = 0; $position < $count; $position += 2) {
            $name = $structure[$position][1];
            $entry = $structure[$position + 1];

            $elements[$name] = $this->parseHeaderElement($entry[0], $entry[1], $document);
        }

        return new Header($elements, $document);
    }

    /**
     * Converts a single raw (type, value) pair into its element object.
     *
     * @param string|null   $type     raw type marker, e.g. '<<', '[', '(', '/', 'numeric'
     * @param string|array  $value    raw value belonging to the type
     * @param Document|null $document document the element is attached to
     *
     * @return Element|Header|null null for markers that carry no value ('endstream', 'obj', '')
     *
     * @throws \Exception if the type is not supported
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        // An empty dictionary may arrive as null, '' or false; normalise it to
        // an array so that parseHeader() accepts it.
        if (('<<' === $type || '>>' === $type) && empty($value)) {
            $value = [];
        }

        switch ($type) {
            case '<<':
            case '>>':
                $header = $this->parseHeader($value, $document);
                // NOTE(review): the returned object is not used; the call is kept as in
                // the original, presumably for a side effect - confirm.
                PDFObject::factory($document, $header, null, $this->config);

                return $header;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                // A literal string is tried as a date first, then as a plain string.
                $date = ElementDate::parse('('.$value.')', $document);

                if ($date) {
                    return $date;
                }

                return ElementString::parse('('.$value.')', $document);

            case '<':
                // Hexadecimal strings are decoded and then handled like literal strings.
                return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $values = [];

                if (\is_array($value)) {
                    foreach ($value as $sub_element) {
                        $values[] = $this->parseHeaderElement($sub_element[0], $sub_element[1], $document);
                    }
                }

                return new ElementArray($values, $document);

            case 'endstream':
            case 'obj': // I don't know what it means but got my project fixed.
            case '':
                // Nothing to do with.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
}
||
332 |