Parser::parseHeader()   A
last analyzed

Complexity

Conditions 2
Paths 2

Size

Total Lines 14
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 9
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 8
dl 0
loc 14
rs 10
c 0
b 0
f 0
ccs 9
cts 9
cp 1
cc 2
nc 2
nop 2
crap 2
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementBoolean;
37
use Smalot\PdfParser\Element\ElementDate;
38
use Smalot\PdfParser\Element\ElementHexa;
39
use Smalot\PdfParser\Element\ElementName;
40
use Smalot\PdfParser\Element\ElementNull;
41
use Smalot\PdfParser\Element\ElementNumeric;
42
use Smalot\PdfParser\Element\ElementString;
43
use Smalot\PdfParser\Element\ElementXRef;
44
use Smalot\PdfParser\RawData\RawDataParser;
45
46
/**
47
 * Class Parser
48
 */
49
class Parser
50
{
51
    /**
52
     * @var Config
53
     */
54
    private $config;
55
56
    /**
57
     * @var PDFObject[]
58
     */
59
    protected $objects = [];
60
61
    /**
     * Low-level parser that parseContent() delegates to for splitting raw PDF
     * data into xref information and object structures.
     *
     * @var RawDataParser
     */
    protected $rawDataParser;
62
63 73
    /**
     * Creates the parser together with the raw data parser it delegates to.
     *
     * @param mixed       $cfg    settings (an empty array by default) forwarded as-is to RawDataParser
     * @param Config|null $config parser configuration; a fresh Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        if (null === $config) {
            $config = new Config();
        }

        $this->config = $config;
        $this->rawDataParser = new RawDataParser($cfg, $config);
    }
68
69 1
    /**
     * Gives access to the configuration this parser was constructed with.
     */
    public function getConfig(): Config
    {
        return $this->config;
    }
73
74
    /**
     * Reads a PDF file and parses its content into a Document.
     *
     * @param string $filename path handed to file_get_contents()
     *
     * @throws \Exception if the file cannot be read, or if parseContent() rejects its content
     */
    public function parseFile(string $filename): Document
    {
        /*
         * 2018/06/20 @doganoo: users repeatedly complained that parseFile()
         * died silently, so the error control operator (@) was removed and
         * a @throws tag was added to the PHPDoc to make failures visible.
         *
         * See here for an example: https://github.com/smalot/pdfparser/issues/204
         */
        $content = file_get_contents($filename);

        // file_get_contents() reports failure by returning false. Without this
        // check the false would be passed on to parseContent(), which is
        // declared to take a string, and the real cause would be hidden.
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }
93
94
    /**
     * Parses raw PDF data and assembles the resulting Document.
     *
     * The raw data parser splits the content into xref information and a list
     * of object structures; every structure is turned into an object, then the
     * trailer and the collected objects are attached to the document.
     *
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // Create structure from raw data.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        $hasEncryptEntry = isset($xref['trailer']['encrypt']);
        if ($hasEncryptEntry && false === $this->config->getIgnoreEncryption()) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Create destination object and start from an empty object registry.
        $document = new Document();
        $this->objects = [];

        foreach ($data as $objectId => $parts) {
            $this->parseObject($objectId, $parts, $document);
            // Drop the raw structure as soon as it has been converted.
            unset($data[$objectId]);
        }

        $trailer = $this->parseTrailer($xref['trailer'], $document);
        $document->setTrailer($trailer);
        $document->setObjects($this->objects);

        return $document;
    }
127
128 66
    /**
     * Converts the raw trailer entries into a Header of element objects.
     *
     * Entry names are capitalised with ucfirst(). Numeric values become
     * ElementNumeric, nested arrays are converted recursively (without a
     * document) and wrapped in an ElementArray, values containing "_" become
     * ElementXRef, and anything else is parsed as a "(" element.
     *
     * @param array         $structure raw trailer entries keyed by name
     * @param Document|null $document  document passed on to the created elements
     *
     * @return Header
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $entries = [];

        foreach ($structure as $key => $raw) {
            $key = ucfirst($key);

            if (is_numeric($raw)) {
                $entries[$key] = new ElementNumeric($raw);
                continue;
            }

            if (\is_array($raw)) {
                $nested = $this->parseTrailer($raw, null);
                $entries[$key] = new ElementArray($nested, null);
                continue;
            }

            if (false !== strpos($raw, '_')) {
                $entries[$key] = new ElementXRef($raw, $document);
                continue;
            }

            $entries[$key] = $this->parseHeaderElement('(', $raw, $document);
        }

        return new Header($entries, $document);
    }
149
150 67
    /**
     * Converts one raw object structure into a PDFObject stored in $this->objects.
     *
     * Each part of the structure is dispatched on its type marker ($part[0]):
     * "[" and "<<" parts replace the current header, a "stream" part supplies
     * the content, and any other part is parsed as a single header element.
     * A stream whose header type is "ObjStm" is unpacked into its embedded
     * objects and the method returns without storing the container itself.
     *
     * @param string        $id        key under which the object is registered
     * @param array         $structure raw parts of the object
     * @param Document|null $document  document the created objects belong to
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            // Integer entries are replaced by an empty type/value pair.
            if (\is_int($part)) {
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $item) {
                        $itemType = $item[0];
                        $itemValue = $item[1];
                        $elements[] = $this->parseHeaderElement($itemType, $itemValue, $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    $content = $part[3][0] ?? $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store this content.
                        return;
                    }

                    if ($header->get('Type')->equals('Metadata')) {
                        // Attempt to parse XMP XML Metadata
                        $document->extractXMPMetadata($content);
                    }
                    break;

                default:
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Unpacks the content of an "ObjStm" stream into individual objects.
     *
     * The content starts with "<number> <offset>" pairs followed by the packed
     * object data. Each embedded object is cut out of the data using its
     * offset and the next larger offset (or the end of the data), parsed with
     * Header::parse() and registered in $this->objects under "<number>_0".
     *
     * @param mixed         $content  stream content taken from the "stream" part
     * @param Document|null $document document the created objects belong to
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split xrefs and contents.
        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
        $packed = $match[3];

        // Extract xrefs.
        $pairs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        // Index object numbers by offset so they can be ordered by offset.
        $numberByOffset = [];

        foreach ($pairs as $pair) {
            [$number, $offset] = preg_split("/\s+/", trim($pair));
            $numberByOffset[$offset] = $number;
        }

        ksort($numberByOffset);

        $numbers = array_values($numberByOffset);
        $offsets = array_keys($numberByOffset);

        foreach ($offsets as $index => $offset) {
            $objectId = $numbers[$index].'_0';
            // An object ends where the next one starts, or at the end of the data.
            $nextOffset = $offsets[$index + 1] ?? \strlen($packed);
            $chunk = substr($packed, $offset, (int) $nextOffset - (int) $offset);

            $chunkHeader = Header::parse($chunk, $document);
            $this->objects[$objectId] = PDFObject::factory($document, $chunkHeader, '', $this->config);
        }
    }
240
241
    /**
     * Builds a Header from a flat list of alternating name and value entries.
     *
     * Entries are consumed two at a time: index 1 of the first entry is used
     * as the element name, and index 0 (type) and index 1 (value) of the
     * following entry are handed to parseHeaderElement().
     *
     * @param array         $structure flat list of name/value entries
     * @param Document|null $document  document passed on to the created elements
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];
        $total = \count($structure);
        $nameIndex = 0;

        while ($nameIndex < $total) {
            $valueIndex = $nameIndex + 1;

            $name = $structure[$nameIndex][1];
            $type = $structure[$valueIndex][0];
            $value = $structure[$valueIndex][1];

            $elements[$name] = $this->parseHeaderElement($type, $value, $document);

            $nameIndex += 2;
        }

        return new Header($elements, $document);
    }
259
260
    /**
     * Converts one raw (type, value) pair into the matching element object.
     *
     * The type marker selects the result: dictionaries ("<<", ">>") become a
     * Header, "[" becomes an ElementArray of recursively parsed items, "(" is
     * tried as a date first and as a string otherwise, "<" is hex-decoded and
     * then handled like "(", and "endstream", "obj", an empty or a null type
     * yield null.
     *
     * @param string|null   $type     type marker of the raw element
     * @param string|array  $value    raw value belonging to the marker
     * @param Document|null $document document passed on to the created elements
     *
     * @return Element|Header|null
     *
     * @throws \Exception if the type marker is not recognised
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        if ('<<' === $type || '>>' === $type) {
            // An empty value stands for a dictionary without entries.
            if (!$value) {
                $value = [];
            }

            $header = $this->parseHeader($value, $document);
            PDFObject::factory($document, $header, null, $this->config);

            return $header;
        }

        if ('numeric' === $type) {
            return new ElementNumeric($value);
        }

        if ('boolean' === $type) {
            return new ElementBoolean($value);
        }

        if ('null' === $type) {
            return new ElementNull();
        }

        if ('(' === $type) {
            // Prefer a date interpretation, fall back to a plain string.
            $date = ElementDate::parse('('.$value.')', $document);
            if ($date) {
                return $date;
            }

            return ElementString::parse('('.$value.')', $document);
        }

        if ('<' === $type) {
            return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);
        }

        if ('/' === $type) {
            return ElementName::parse('/'.$value, $document);
        }

        // 'ojbref' is an old mistake in tcpdf parser
        if ('ojbref' === $type || 'objref' === $type) {
            return new ElementXRef($value, $document);
        }

        if ('[' === $type) {
            $items = [];

            if (\is_array($value)) {
                foreach ($value as $item) {
                    $itemType = $item[0];
                    $itemValue = $item[1];
                    $items[] = $this->parseHeaderElement($itemType, $itemValue, $document);
                }
            }

            return new ElementArray($items, $document);
        }

        // Nothing to do with these markers. A null type is listed explicitly
        // because it compares loosely equal to the empty string.
        if (\in_array($type, ['endstream', 'obj', '', null], true)) {
            return null;
        }

        throw new \Exception('Invalid type: "'.$type.'".');
    }
331
}
332