Issues (82)

src/Smalot/PdfParser/Parser.php (14 issues)

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementBoolean;
37
use Smalot\PdfParser\Element\ElementDate;
38
use Smalot\PdfParser\Element\ElementHexa;
39
use Smalot\PdfParser\Element\ElementName;
40
use Smalot\PdfParser\Element\ElementNull;
41
use Smalot\PdfParser\Element\ElementNumeric;
42
use Smalot\PdfParser\Element\ElementString;
43
use Smalot\PdfParser\Element\ElementXRef;
44
use Smalot\PdfParser\RawData\RawDataParser;
45
46
/**
47
 * Class Parser
48
 */
49
class Parser
50
{
51
    /**
52
     * @var Config
53
     */
54
    private $config;
55
56
    /**
57
     * @var PDFObject[]
58
     */
59
    protected $objects = [];
60
61
    protected $rawDataParser;
62
63 71
    /**
     * Sets up the parser and the raw-data parser it delegates to.
     *
     * @param mixed       $cfg    forwarded unchanged to the RawDataParser constructor (defaults to an empty array)
     * @param Config|null $config configuration to use; a new Config is created when none is given
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        if (!$config) {
            $config = new Config();
        }

        $this->config = $config;
        $this->rawDataParser = new RawDataParser($cfg, $this->config);
    }
68
69 1
    /**
     * Returns the configuration object held by this parser.
     */
    public function getConfig(): Config
    {
        return $this->config;
    }
73
74
    /**
     * Reads a file and parses its content into a Document.
     *
     * The error control operator (@) is deliberately not applied to the read:
     * users repeatedly reported that parseFile() died silently (2018/06/20,
     * @doganoo; see https://github.com/smalot/pdfparser/issues/204), so PHP's
     * own warning stays visible. A failed read is additionally reported as an
     * exception instead of handing `false` on to parseContent().
     *
     * @param string $filename location passed to file_get_contents()
     *
     * @throws \Exception if the file cannot be read, or if parseContent() fails
     */
    public function parseFile(string $filename): Document
    {
        $content = file_get_contents($filename);

        // file_get_contents() returns false on failure; parseContent() only
        // accepts a string, so stop here with an explicit error.
        if (false === $content) {
            throw new \Exception(sprintf('Unable to read file "%s".', $filename));
        }

        return $this->parseContent($content);
    }
93
94
    /**
     * Parses raw PDF content into a Document.
     *
     * The content is first decomposed by the raw-data parser into a
     * cross-reference structure and a list of object structures; every object
     * structure is then converted through parseObject() and the trailer through
     * parseTrailer().
     *
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // Build the intermediate structure from the raw data.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        $hasEncryptEntry = isset($xref['trailer']['encrypt']);
        if ($hasEncryptEntry && false === $this->config->getIgnoreEncryption()) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Start from a fresh document and an empty object registry.
        $document = new Document();
        $this->objects = [];

        foreach ($data as $objectId => $objectStructure) {
            $this->parseObject($objectId, $objectStructure, $document);
            // Drop each raw structure as soon as it has been converted.
            unset($data[$objectId]);
        }

        $trailer = $this->parseTrailer($xref['trailer'], $document);
        $document->setTrailer($trailer);
        $document->setObjects($this->objects);

        return $document;
    }
127
128 64
    /**
     * Converts a raw trailer structure into a Header of elements.
     *
     * Each entry name is passed through ucfirst(). The value decides the
     * element type, checked in this order: numeric values become
     * ElementNumeric, arrays are parsed recursively (without a document) and
     * wrapped in an ElementArray, strings containing "_" become ElementXRef,
     * and anything else is parsed as a "(" string element.
     *
     * @param array         $structure raw trailer entries keyed by name
     * @param Document|null $document  document the created elements are bound to
     *
     * @return Header
     *
     * @throws \Exception
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $entries = [];

        foreach ($structure as $key => $raw) {
            $key = ucfirst($key);

            if (is_numeric($raw)) {
                $entries[$key] = new ElementNumeric($raw);
                continue;
            }

            if (\is_array($raw)) {
                $nested = $this->parseTrailer($raw, null);
                $entries[$key] = new ElementArray($nested, null);
                continue;
            }

            if (false === strpos($raw, '_')) {
                $entries[$key] = $this->parseHeaderElement('(', $raw, $document);
                continue;
            }

            $entries[$key] = new ElementXRef($raw, $document);
        }

        return new Header($entries, $document);
    }
149
150 65
    /**
     * Converts the raw structure of one object into a PDFObject and stores it
     * in $this->objects under $id (unless that id is already registered).
     *
     * Parts are handled in order:
     *  - "[" and "<<" parts replace the current header;
     *  - a "stream" part provides the content; a stream whose header Type is
     *    "ObjStm" is expanded into its embedded objects and the container
     *    itself is not stored; a stream whose Type is "Metadata" is also handed
     *    to Document::extractXMPMetadata();
     *  - any other part is passed to parseHeaderElement() and, when that yields
     *    an element, becomes a single-element header.
     *
     * @param string        $id        identifier the resulting object is registered under
     * @param array         $structure raw parts of the object; each part is expected to be
     *                                 a [type, value, ...] array, bare integers are ignored
     * @param Document|null $document  owning document
     *
     * @throws \Exception if a part has a type parseHeaderElement() does not know
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            if (\is_int($part)) {
                // A bare integer has no type/value pair. With a null type none of
                // the cases below match and parseHeaderElement() returns null.
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $elements[] = $this->parseHeaderElement($sub_element[0], $sub_element[1], $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    $content = isset($part[3][0]) ? $part[3][0] : $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store the container's content.
                        return;
                    }

                    // The signature allows a null document; only attempt the XMP
                    // extraction when there is a document to receive it.
                    if (null !== $document && $header->get('Type')->equals('Metadata')) {
                        $document->extractXMPMetadata($content);
                    }
                    break;

                default:
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Expands the content of an object stream into individual objects.
     *
     * The content starts with "<number> <position>" pairs followed by the
     * concatenated object bodies. Each body is cut out using its position and
     * the next higher position (or the end of the content) and registered in
     * $this->objects under "<number>_0".
     *
     * NOTE(review): static analysis reports that Header::parse() and
     * PDFObject::factory() expect a Document instance; within this class the
     * method is only reached with the Document created by parseContent().
     *
     * @param mixed         $content  decoded stream content
     * @param Document|null $document owning document
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split the leading number/position pairs from the object bodies.
        if (1 !== preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match)) {
            // Nothing usable could be extracted (e.g. PCRE error).
            return;
        }

        $bodies = $match[3];

        // Extract the individual "<number> <position>" pairs.
        $pairs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        $numberByOffset = [];

        foreach ($pairs as $pair) {
            [$number, $offset] = preg_split('/\s+/', trim($pair));
            $numberByOffset[$offset] = $number;
        }

        // Process the objects in the order they appear in the bodies.
        ksort($numberByOffset);

        $numbers = array_values($numberByOffset);
        $offsets = array_keys($numberByOffset);

        foreach ($offsets as $index => $offset) {
            $next_offset = isset($offsets[$index + 1]) ? $offsets[$index + 1] : \strlen($bodies);
            $sub_content = substr($bodies, $offset, (int) $next_offset - (int) $offset);

            $sub_header = Header::parse($sub_content, $document);
            $this->objects[$numbers[$index].'_0'] = PDFObject::factory($document, $sub_header, '', $this->config);
        }
    }
240
241
    /**
     * Builds a Header from a flat list of alternating name and value entries.
     *
     * Entries are consumed two at a time: index 1 of the first entry is used
     * as the element name, and the [type, value] pair of the following entry
     * is converted through parseHeaderElement().
     *
     * @param array         $structure flat list of name/value entries
     * @param Document|null $document  document the created elements are bound to
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $parsed = [];
        $total = \count($structure);
        $offset = 0;

        while ($offset < $total) {
            $key = $structure[$offset][1];
            $valueSlot = $offset + 1;

            $parsed[$key] = $this->parseHeaderElement(
                $structure[$valueSlot][0],
                $structure[$valueSlot][1],
                $document
            );

            $offset += 2;
        }

        return new Header($parsed, $document);
    }
259
260
    /**
     * Creates the element matching one typed value of the raw structure.
     *
     * @param string|null   $type     type marker of the value ("<<", "[", "(", "/", "numeric", ...)
     * @param string|array  $value    raw value; an array is expected for "<<", ">>" and "["
     * @param Document|null $document document the created element is bound to
     *
     * @return Element|Header|null null for markers that carry no value ("endstream", "obj",
     *                             empty or null type) and when a string or name cannot be parsed
     *
     * @throws \Exception if $type is not one of the handled types
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        // An empty dictionary may arrive as null, '' or false; normalise it so
        // parseHeader() always receives an array.
        if (('<<' === $type || '>>' === $type) && empty($value)) {
            $value = [];
        }

        switch ($type) {
            case '<<':
            case '>>':
                $header = $this->parseHeader($value, $document);
                // The created object is not used here. NOTE(review): kept because
                // factory() may have effects not visible in this file; static
                // analysis also reports that it expects a non-null Document.
                PDFObject::factory($document, $header, null, $this->config);

                return $header;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                $literal = '('.$value.')';

                if ($date = ElementDate::parse($literal, $document)) {
                    return $date;
                }

                // parse() may report failure with false; the documented contract
                // of this method only allows null for "no element".
                return ElementString::parse($literal, $document) ?: null;

            case '<':
                // Hexadecimal strings are decoded, then treated like "(" strings.
                return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);

            case '/':
                // Same false-to-null normalisation as for strings.
                return ElementName::parse('/'.$value, $document) ?: null;

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $values = [];

                if (\is_array($value)) {
                    foreach ($value as $sub_element) {
                        $values[] = $this->parseHeaderElement($sub_element[0], $sub_element[1], $document);
                    }
                }

                return new ElementArray($values, $document);

            case 'endstream':
            case 'obj': // I don't know what it means but got my project fixed.
            case '':
                // Nothing to do with these. A null $type also lands here because
                // switch compares loosely and null == ''.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
331
}
332