Issues (82)

src/Smalot/PdfParser/Parser.php (1 issue)

php_analyzer.check_variables.key_is_overwritten_by_foreach

Bug Comprehensibility Minor
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementBoolean;
37
use Smalot\PdfParser\Element\ElementDate;
38
use Smalot\PdfParser\Element\ElementHexa;
39
use Smalot\PdfParser\Element\ElementName;
40
use Smalot\PdfParser\Element\ElementNull;
41
use Smalot\PdfParser\Element\ElementNumeric;
42
use Smalot\PdfParser\Element\ElementString;
43
use Smalot\PdfParser\Element\ElementXRef;
44
use Smalot\PdfParser\RawData\RawDataParser;
45
46
/**
47
 * Class Parser
48
 */
49
class Parser
50
{
51
    /**
52
     * @var Config
53
     */
54
    private $config;
55
56
    /**
57
     * @var PDFObject[]
58
     */
59
    protected $objects = [];
60
61
    protected $rawDataParser;
62
63 73
    /**
     * Sets up the parser with its configuration and raw data parser.
     *
     * @param mixed       $cfg    forwarded untouched to the RawDataParser constructor
     * @param Config|null $config configuration to use; a default Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        if (null === $config) {
            $config = new Config();
        }

        $this->config = $config;
        // The raw data parser shares the very same Config instance.
        $this->rawDataParser = new RawDataParser($cfg, $config);
    }
68
69 1
    /**
     * Returns the configuration held by this parser.
     *
     * This is the Config instance received by the constructor, or the default
     * one the constructor created when none was supplied.
     */
    public function getConfig(): Config
    {
        return $this->config;
    }
73
74
    /**
     * Reads a file and parses its content into a Document.
     *
     * @param string $filename location handed to file_get_contents()
     *
     * @throws \Exception if the file cannot be read, or if parseContent() rejects its content
     */
    public function parseFile(string $filename): Document
    {
        /*
         * 2018/06/20 @doganoo: users complained several times that
         * parseFile() dies silently. The error control operator (@) was
         * therefore removed, and the method advertises its failures through
         * the @throws tag of its PHPDoc instead.
         *
         * See here for an example: https://github.com/smalot/pdfparser/issues/204
         */
        $content = file_get_contents($filename);

        // file_get_contents() returns false on failure; without this check the
        // boolean would be coerced into parseContent()'s string parameter and
        // the real cause (an unreadable file) would be hidden.
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }
93
94
    /**
     * Builds a Document from raw PDF content.
     *
     * The raw data parser yields the xref information and the object
     * structures; every structure is turned into an object, then the trailer
     * and the collected objects are attached to the new Document.
     *
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // Create structure from raw data.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        // Encrypted files are refused unless the configuration says to ignore encryption.
        $hasEncryptEntry = isset($xref['trailer']['encrypt']);
        if ($hasEncryptEntry && false === $this->config->getIgnoreEncryption()) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (!$data) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Create destination object and start from an empty object list.
        $document = new Document();
        $this->objects = [];

        foreach (array_keys($data) as $id) {
            $this->parseObject($id, $data[$id], $document);
            // Drop each raw structure as soon as it has been converted.
            unset($data[$id]);
        }

        $trailer = $this->parseTrailer($xref['trailer'], $document);
        $document->setTrailer($trailer);
        $document->setObjects($this->objects);

        return $document;
    }
127
128 66
    /**
     * Converts the raw trailer structure into a Header.
     *
     * Each entry name gets its first letter upper-cased. The value decides the
     * element type: numeric values become ElementNumeric, arrays are parsed
     * recursively (without a document) and wrapped in an ElementArray, strings
     * containing an underscore become ElementXRef, and anything else is parsed
     * as a '(' header element.
     *
     * @param array         $structure raw trailer entries, keyed by name
     * @param Document|null $document  document passed on to the created elements
     *
     * @return Header
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $entries = [];

        foreach ($structure as $key => $raw) {
            $label = ucfirst($key);

            if (is_numeric($raw)) {
                $entries[$label] = new ElementNumeric($raw);
                continue;
            }

            if (\is_array($raw)) {
                // Nested structures are parsed detached from the document.
                $entries[$label] = new ElementArray($this->parseTrailer($raw, null), null);
                continue;
            }

            // An underscore marks a cross reference; other strings are plain header elements.
            $entries[$label] = false !== strpos($raw, '_')
                ? new ElementXRef($raw, $document)
                : $this->parseHeaderElement('(', $raw, $document);
        }

        return new Header($entries, $document);
    }
149
150 67
    /**
     * Converts one raw object structure into a PDFObject stored in $this->objects.
     *
     * The structure is walked part by part: '[' and '<<' parts (and any other
     * part yielding an element) replace the current header, a 'stream' part
     * provides the content. A stream whose header type is 'ObjStm' is expanded
     * into its embedded objects instead, and nothing is stored under $id.
     *
     * @param string        $id        key under which the object is registered
     * @param array         $structure raw parts of the object
     * @param Document|null $document  document passed on to the created objects
     *
     * @throws \Exception
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            // Integer parts carry no type/value pair; neutralise them.
            if (\is_int($part)) {
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $elements[] = $this->parseHeaderElement($sub_element[0], $sub_element[1], $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    // Use the first entry of slot 3 when present, the raw stream otherwise.
                    $content = $part[3][0] ?? $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store this content.
                        return;
                    }

                    if (null !== $document && $header->get('Type')->equals('Metadata')) {
                        // Attempt to parse XMP XML Metadata
                        $document->extractXMPMetadata($content);
                    }
                    break;

                default:
                    // NOTE(review): loose comparison kept from the original; it only
                    // skips a part that is itself the string 'null', never an array.
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Expands the content of an 'ObjStm' stream into individual objects.
     *
     * The content starts with a run of integer pairs followed by the object
     * data. In each pair the first number is the object id and the second the
     * offset of that object inside the data. Every object is registered in
     * $this->objects under "<id>_0".
     *
     * @param string        $content  decoded stream content
     * @param Document|null $document document passed on to the created objects
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split xrefs and contents.
        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
        $data = $match[3];

        // Extract xrefs.
        $pairs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        $idsByOffset = [];

        foreach ($pairs as $pair) {
            [$objectId, $offset] = preg_split('/\s+/', trim($pair));
            $idsByOffset[$offset] = $objectId;
        }

        // Order by offset so each object ends where the next one starts.
        ksort($idsByOffset);

        $objectIds = array_values($idsByOffset);
        $offsets = array_keys($idsByOffset);

        foreach ($offsets as $index => $offset) {
            // The last object runs up to the end of the data.
            $nextOffset = $offsets[$index + 1] ?? \strlen($data);
            $sub_content = substr($data, $offset, (int) $nextOffset - (int) $offset);

            $sub_header = Header::parse($sub_content, $document);
            $this->objects[$objectIds[$index].'_0'] = PDFObject::factory($document, $sub_header, '', $this->config);
        }
    }
240
241
    /**
     * Converts a flat list of alternating name/value entries into a Header.
     *
     * Entries are consumed two at a time: index 1 of the first entry is the
     * element name, while indexes 0 and 1 of the following entry are the type
     * and raw value handed to parseHeaderElement().
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];
        $total = \count($structure);
        $index = 0;

        while ($index < $total) {
            $key = $structure[$index][1];

            $elements[$key] = $this->parseHeaderElement(
                $structure[$index + 1][0],
                $structure[$index + 1][1],
                $document
            );

            // Jump to the next name/value pair.
            $index += 2;
        }

        return new Header($elements, $document);
    }
259
260
    /**
     * Turns one raw (type, value) pair into the matching element object.
     *
     * @param string|array $value
     *
     * @return Element|Header|null null for the 'endstream', 'obj' and empty types
     *
     * @throws \Exception if the type is not one of the handled markers
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        // A dictionary marker without a usable value is treated as an empty dictionary.
        if ('<<' === $type || '>>' === $type) {
            if (null == $value || '' == $value || false == $value) {
                $value = [];
            }
        }

        // The switch compares loosely, so a null type ends up on the '' case.
        switch ($type) {
            case '<<':
            case '>>':
                $dictionary = $this->parseHeader($value, $document);
                // The factory result is not used here; the call itself is kept as in the original.
                PDFObject::factory($document, $dictionary, null, $this->config);

                return $dictionary;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                // A literal string is tried as a date first, then as a plain string.
                $date = ElementDate::parse('('.$value.')', $document);

                if ($date) {
                    return $date;
                }

                return ElementString::parse('('.$value.')', $document);

            case '<':
                // Hexadecimal strings are decoded, then handled like literal strings.
                return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                // Anything that is not an array yields an empty list.
                if (!\is_array($value)) {
                    return new ElementArray([], $document);
                }

                $items = [];

                foreach ($value as $entry) {
                    $items[] = $this->parseHeaderElement($entry[0], $entry[1], $document);
                }

                return new ElementArray($items, $document);

            case 'endstream':
            case 'obj': // I don't know what it means but got my project fixed.
            case '':
                // Nothing to do with.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
331
}
332