Test Failed
Pull Request — master (#606)
by
unknown
02:17
created

Parser::parseHeader()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 14
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 8
c 0
b 0
f 0
nc 2
nop 2
dl 0
loc 14
ccs 5
cts 5
cp 1
crap 2
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementBoolean;
37
use Smalot\PdfParser\Element\ElementDate;
38
use Smalot\PdfParser\Element\ElementHexa;
39
use Smalot\PdfParser\Element\ElementName;
40
use Smalot\PdfParser\Element\ElementNull;
41
use Smalot\PdfParser\Element\ElementNumeric;
42
use Smalot\PdfParser\Element\ElementString;
43
use Smalot\PdfParser\Element\ElementXRef;
44
use Smalot\PdfParser\RawData\RawDataParser;
45
46
/**
47
 * Class Parser
48
 */
49
class Parser
50
{
51
    /**
52
     * @var Config
53
     */
54
    private $config;
55
56
    /**
57
     * @var PDFObject[]
58
     */
59
    protected $objects = [];
60
61
    protected $rawDataParser;
62
63 44
    /**
     * Creates the parser together with the raw-data parser it delegates to.
     *
     * @param array       $cfg    options forwarded unchanged to RawDataParser
     * @param Config|null $config parser configuration; a default Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        // The nullable type is spelled out: relying on "= null" to make the
        // parameter nullable is deprecated as of PHP 8.4.
        $this->config = $config ?: new Config();
        $this->rawDataParser = new RawDataParser($cfg, $this->config);
    }
68
69 1
    /**
     * Gives access to the configuration this parser was constructed with.
     */
    public function getConfig(): Config
    {
        return $this->config;
    }
73
74
    /**
     * Reads a PDF file and parses it into a Document.
     *
     * The error control operator (@) is deliberately not applied to the read:
     * users repeatedly reported that parseFile() died silently, so warnings
     * stay visible and failures surface as exceptions (2018/06/20, @doganoo).
     *
     * @see https://github.com/smalot/pdfparser/issues/204
     *
     * @param string $filename path handed to file_get_contents()
     *
     * @throws \Exception if the file cannot be read
     * @throws \Exception propagated from parseContent()
     */
    public function parseFile(string $filename): Document
    {
        $content = file_get_contents($filename);

        // file_get_contents() reports failure by returning false; stop here with
        // a message naming the file instead of handing false to parseContent().
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }
92
93
    /**
     * Builds a Document from the raw bytes of a PDF.
     *
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // The raw data parser yields the xref information and the object structures.
        [$xref, $rawObjects] = $this->rawDataParser->parseData($content);

        if (isset($xref['trailer']['encrypt'])) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($rawObjects)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Start from a fresh document and an empty object registry.
        $document = new Document();
        $this->objects = [];

        foreach ($rawObjects as $objectId => $structure) {
            $this->parseObject($objectId, $structure, $document);
            // The raw structure is dropped once converted (presumably to release memory early).
            unset($rawObjects[$objectId]);
        }

        $trailer = $this->parseTrailer($xref['trailer'], $document);
        $document->setTrailer($trailer);
        $document->setObjects($this->objects);

        return $document;
    }
126
127 41
    /**
     * Converts the raw trailer entries into a Header of element objects.
     *
     * Entry names are capitalised with ucfirst(). Numeric values become
     * ElementNumeric, nested arrays are converted recursively (without a
     * document) and wrapped in an ElementArray, strings containing an
     * underscore become ElementXRef, and everything else is parsed as a
     * '(' (literal string) element.
     *
     * @param array         $structure raw trailer entries keyed by name
     * @param Document|null $document  document passed on to the created elements
     *
     * @return Header
     *
     * @throws \Exception propagated from parseHeaderElement()
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $entries = [];

        foreach ($structure as $key => $raw) {
            $key = ucfirst($key);

            if (is_numeric($raw)) {
                $entries[$key] = new ElementNumeric($raw);
                continue;
            }

            if (\is_array($raw)) {
                $nested = $this->parseTrailer($raw, null);
                $entries[$key] = new ElementArray($nested, null);
                continue;
            }

            if (false !== strpos($raw, '_')) {
                $entries[$key] = new ElementXRef($raw, $document);
                continue;
            }

            $entries[$key] = $this->parseHeaderElement('(', $raw, $document);
        }

        return new Header($entries, $document);
    }
148
149 42
    /**
     * Converts one raw object structure into a PDFObject stored in $this->objects.
     *
     * Each part of the structure is read as a [type, value, ...] tuple: '[' and
     * '<<' parts (re)define the object's header, a 'stream' part supplies its
     * content, and any other part is parsed as a single header element.
     * Object streams (Type "ObjStm") are unpacked into their embedded objects
     * and are not stored themselves; "Metadata" streams are additionally handed
     * to the document for XMP extraction.
     *
     * @param string        $id        identifier the object is registered under
     * @param array         $structure raw parts of the object
     * @param Document|null $document  owning document; XMP extraction is skipped when null
     *
     * @throws \Exception propagated from parseHeader() / parseHeaderElement()
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            // A bare integer carries no type/value pair; treat it as an empty part.
            if (\is_int($part)) {
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $subElement) {
                        $elements[] = $this->parseHeaderElement($subElement[0], $subElement[1], $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    // Use the entry at index 3 when it is set, otherwise the value at index 1.
                    $content = $part[3][0] ?? $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store this content.
                        return;
                    }

                    // Attempt to parse XMP XML Metadata. $document is nullable, so
                    // guard the call instead of failing with a call on null.
                    if ($header->get('Type')->equals('Metadata') && null !== $document) {
                        $document->extractXMPMetadata($content);
                    }
                    break;

                default:
                    // Loose comparison kept on purpose: it skips a part that is the
                    // bare string 'null' and lets every array part through.
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        // Register the object unless something is already stored under this id.
        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Unpacks an object stream and registers each embedded object under "<number>_0".
     *
     * The content is expected to start with whitespace-separated
     * "<object number> <offset>" pairs followed by the objects themselves;
     * offsets are applied to the data that follows the pairs.
     *
     * @param string        $content  decoded content of the object stream
     * @param Document|null $document document passed on to Header::parse() and PDFObject::factory()
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split xrefs and contents.
        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
        $content = $match[3];

        // Extract xrefs.
        $xrefs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        // Index the object numbers by offset so they can be walked in stream order.
        $numbersByOffset = [];

        foreach ($xrefs as $xref) {
            [$objectNumber, $offset] = preg_split('/\s+/', trim($xref));
            $numbersByOffset[$offset] = $objectNumber;
        }

        ksort($numbersByOffset);

        $objectNumbers = array_values($numbersByOffset);
        $offsets = array_keys($numbersByOffset);

        foreach ($offsets as $index => $offset) {
            // An object runs up to the next offset, or to the end of the data for the last one.
            $nextOffset = $offsets[$index + 1] ?? \strlen($content);
            $subContent = substr($content, $offset, (int) $nextOffset - (int) $offset);

            $subHeader = Header::parse($subContent, $document);
            $this->objects[$objectNumbers[$index].'_0'] = PDFObject::factory($document, $subHeader, '', $this->config);
        }
    }
242 42
243 42
    /**
     * Builds a Header from a flat dictionary structure.
     *
     * $structure alternates name entries and value entries: the name is read
     * from index 1 of each even entry, and the value's type and payload from
     * indexes 0 and 1 of the entry that follows it.
     *
     * @param array         $structure flat list of raw entries
     * @param Document|null $document  document passed on to the created elements
     *
     * @throws \Exception propagated from parseHeaderElement()
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];
        $count = \count($structure);

        for ($position = 0; $position < $count; $position += 2) {
            $name = $structure[$position][1];

            // A malformed dictionary can end on a name without a value. Fall back
            // to null explicitly rather than reading an undefined offset; a null
            // type makes parseHeaderElement() return null for that name.
            $type = $structure[$position + 1][0] ?? null;
            $value = $structure[$position + 1][1] ?? null;

            $elements[$name] = $this->parseHeaderElement($type, $value, $document);
        }

        return new Header($elements, $document);
    }
261
262
    /**
     * Turns one raw (type, value) pair into the matching element object.
     *
     * Dictionaries ('<<' / '>>') become a Header, '[' becomes an ElementArray
     * whose items are parsed recursively, '<' is hex-decoded and then parsed as
     * a literal string, and 'endstream', 'obj' and the empty type yield null.
     *
     * @param string|array $value
     *
     * @return Element|Header|null
     *
     * @throws \Exception if the type is not recognised
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        if ('<<' === $type || '>>' === $type) {
            // An empty dictionary payload is normalised to an empty array.
            if (null == $value || '' == $value || false == $value) {
                $value = [];
            }

            $header = $this->parseHeader($value, $document);
            // NOTE(review): the object created here is discarded; the call is kept
            // as-is — confirm whether factory() has side effects that are relied on.
            PDFObject::factory($document, $header, null, $this->config);

            return $header;
        }

        switch ($type) {
            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                // Try the literal as a date first, then fall back to a plain string.
                $literal = '('.$value.')';

                return ElementDate::parse($literal, $document) ?: ElementString::parse($literal, $document);

            case '<':
                $decoded = ElementHexa::decode($value);

                return $this->parseHeaderElement('(', $decoded, $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $items = [];

                // Anything that is not an array yields an empty ElementArray.
                if (\is_array($value)) {
                    foreach ($value as $entry) {
                        $items[] = $this->parseHeaderElement($entry[0], $entry[1], $document);
                    }
                }

                return new ElementArray($items, $document);

            case 'endstream':
            case 'obj': // Original author's note: meaning unknown, but handling it fixed their project.
            case '':
                // Nothing to do with. A null $type also lands here (loose switch comparison).
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
333
}
334