Passed
Pull Request — master (#560)
by
unknown
08:50
created

Parser::parseTrailer()   A

Complexity

Conditions 5
Paths 5

Size

Total Lines 20
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 13
CRAP Score 5

Importance

Changes 1
Bugs 0 Features 1
Metric Value
eloc 13
dl 0
loc 20
rs 9.5222
c 1
b 0
f 1
ccs 13
cts 13
cp 1
cc 5
nc 5
nop 2
crap 5
1
<?php
2
3
/**
 * @file
 *          This file is part of the PdfParser library.
 *
 * @author  Sébastien MALOT <[email protected]>
 *
 * @date    2017-01-03
 *
 * @license LGPLv3
 *
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementBoolean;
37
use Smalot\PdfParser\Element\ElementDate;
38
use Smalot\PdfParser\Element\ElementHexa;
39
use Smalot\PdfParser\Element\ElementName;
40
use Smalot\PdfParser\Element\ElementNull;
41
use Smalot\PdfParser\Element\ElementNumeric;
42
use Smalot\PdfParser\Element\ElementString;
43
use Smalot\PdfParser\Element\ElementXRef;
44
use Smalot\PdfParser\RawData\RawDataParser;
45
46
/**
 * Class Parser
 *
 * Turns raw PDF data into a Document. The low-level reading is delegated to
 * RawDataParser; this class converts the structures it returns into Header,
 * Element and PDFObject instances.
 */
class Parser
{
    /**
     * @var Config
     */
    private $config;

    /**
     * Objects built during the current parseContent() run, indexed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * @var RawDataParser
     */
    protected $rawDataParser;

    /**
     * @param array       $cfg    settings forwarded as-is to RawDataParser
     * @param Config|null $config configuration to use; a default Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        $this->config = $config ?? new Config();
        $this->rawDataParser = new RawDataParser($cfg, $this->config);
    }

    /**
     * Returns the configuration this parser was built with.
     */
    public function getConfig(): Config
    {
        return $this->config;
    }

    /**
     * Reads a file and parses its content as a PDF.
     *
     * @param string $filename path handed to file_get_contents()
     *
     * @throws \Exception if the file cannot be read, or if parseContent() fails
     */
    public function parseFile(string $filename): Document
    {
        /*
         * The error control operator (@) is deliberately not used here, so that
         * PHP's own warning stays visible instead of the method dying silently.
         *
         * See here for an example: https://github.com/smalot/pdfparser/issues/204
         */
        $content = file_get_contents($filename);

        // file_get_contents() signals failure with false; without this check the
        // failure would only surface later as an unrelated parsing error.
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }

    /**
     * Parses raw PDF data and builds the corresponding Document.
     *
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // Create structure from raw data.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        if (isset($xref['trailer']['encrypt'])) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Create destination object.
        $document = new Document();
        $this->objects = [];

        foreach ($data as $id => $structure) {
            $this->parseObject($id, $structure, $document);
            // Drop each raw structure as soon as it has been converted.
            unset($data[$id]);
        }

        $document->setTrailer($this->parseTrailer($xref['trailer'], $document));
        $document->setObjects($this->objects);

        return $document;
    }

    /**
     * Converts the trailer structure into a Header.
     *
     * Each entry is stored under its ucfirst()'ed name: numeric values become
     * ElementNumeric, arrays are converted recursively (without a document) and
     * wrapped in an ElementArray, strings containing "_" become ElementXRef, and
     * everything else is handed to parseHeaderElement() as a "(" string.
     *
     * @param array         $structure trailer entries, name => value
     * @param Document|null $document  document the created elements are attached to
     *
     * @return Header
     *
     * @throws \Exception
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $trailer = [];

        foreach ($structure as $name => $values) {
            $name = ucfirst($name);

            if (is_numeric($values)) {
                $trailer[$name] = new ElementNumeric($values);
            } elseif (\is_array($values)) {
                $value = $this->parseTrailer($values, null);
                $trailer[$name] = new ElementArray($value, null);
            } elseif (\is_string($values) && false !== strpos($values, '_')) {
                // Only strings are searched; any other scalar falls through to
                // the generic branch below, exactly as a failed strpos() would.
                $trailer[$name] = new ElementXRef($values, $document);
            } else {
                $trailer[$name] = $this->parseHeaderElement('(', $values, $document);
            }
        }

        return new Header($trailer, $document);
    }

    /**
     * Converts one raw object structure into a PDFObject stored in $this->objects.
     *
     * Streams whose header Type equals "ObjStm" are expanded into their embedded
     * objects instead of being stored themselves.
     *
     * @param string        $id        key under which the object is registered
     * @param array         $structure parts of the object; each part is an array
     *                                 starting with [type, value], or an int
     * @param Document|null $document  document the created objects are attached to
     *
     * @throws \Exception
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            if (\is_int($part)) {
                // Integer parts carry no type/value pair; normalise them so the
                // switch below ends up in its default branch.
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $sub_type = $sub_element[0];
                        $sub_value = $sub_element[1];
                        $elements[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    $content = $part[3][0] ?? $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store this content.
                        return;
                    }
                    break;

                default:
                    // NOTE(review): this loose comparison is always true when $part is
                    // an array; it is kept unchanged to preserve existing behaviour.
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Expands the content of an "ObjStm" stream into individual objects.
     *
     * The content starts with "<id> <position>" number pairs followed by the data
     * of the embedded objects; every embedded object is registered in
     * $this->objects under the key "<id>_0".
     *
     * @param string        $content  stream content
     * @param Document|null $document document the created objects are attached to
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split xrefs and contents. If the match fails (e.g. a PCRE limit is hit)
        // there is nothing usable to extract, so no object is registered.
        if (1 !== preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match)) {
            return;
        }
        $content = $match[3];

        // Extract xrefs.
        $xrefs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        ) ?: [];
        $table = [];

        foreach ($xrefs as $xref) {
            [$objectId, $offset] = preg_split("/\s+/", trim($xref));
            $table[$offset] = $objectId;
        }

        // Order by position so each object ends where the next one starts.
        ksort($table);

        $ids = array_values($table);
        $positions = array_keys($table);

        foreach ($positions as $index => $position) {
            $id = $ids[$index].'_0';
            $next_position = $positions[$index + 1] ?? \strlen($content);
            $sub_content = substr($content, $position, (int) $next_position - (int) $position);

            $sub_header = Header::parse($sub_content, $document);
            $this->objects[$id] = PDFObject::factory($document, $sub_header, '', $this->config);
        }
    }

    /**
     * Builds a Header from a flat list of alternating name and value entries.
     *
     * Entries at even positions provide the element name (their index 1);
     * the following entry provides the type (index 0) and value (index 1).
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];
        $count = \count($structure);

        for ($position = 0; $position < $count; $position += 2) {
            // "??" keeps a dangling name (no value entry after it) from reading
            // undefined offsets; such a name still maps to a null element.
            $name = $structure[$position][1] ?? null;
            $type = $structure[$position + 1][0] ?? null;
            $value = $structure[$position + 1][1] ?? null;

            $elements[$name] = $this->parseHeaderElement($type, $value, $document);
        }

        return new Header($elements, $document);
    }

    /**
     * Converts a single typed raw value into the matching element.
     *
     * NOTE(review): static analysis reports that ElementString::parse() and
     * ElementName::parse() may also return false - confirm before relying on
     * the documented return type.
     *
     * @param string|null   $type     type marker of the raw value
     * @param string|array  $value
     * @param Document|null $document document the created element is attached to
     *
     * @return Element|Header|null
     *
     * @throws \Exception if the type is not recognised
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        switch ($type) {
            case '<<':
            case '>>':
                if (empty($value)) {
                    return null;
                }

                $header = $this->parseHeader($value, $document);
                // The returned object is not used here.
                // NOTE(review): presumably called for its side effects - confirm.
                PDFObject::factory($document, $header, null, $this->config);

                return $header;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                // A parenthesised string is tried as a date first, then as a plain string.
                if ($date = ElementDate::parse('('.$value.')', $document)) {
                    return $date;
                }

                return ElementString::parse('('.$value.')', $document);

            case '<':
                // Hexadecimal strings are decoded, then handled like "(" strings.
                return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $values = [];

                if (\is_array($value)) {
                    foreach ($value as $sub_element) {
                        $sub_type = $sub_element[0];
                        $sub_value = $sub_element[1];
                        $values[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }
                }

                return new ElementArray($values, $document);

            case 'endstream':
            case 'obj': // I don't know what it means but got my project fixed.
            case '':
                // Nothing to do with.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
}
327