Passed
Pull Request — master (#500)
by Konrad
02:26
created

Parser::parseHeader()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 14
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 9
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 8
dl 0
loc 14
rs 10
c 0
b 0
f 0
ccs 9
cts 9
cp 1
cc 2
nc 2
nop 2
crap 2
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Element\ElementArray;
34
use Smalot\PdfParser\Element\ElementBoolean;
35
use Smalot\PdfParser\Element\ElementDate;
36
use Smalot\PdfParser\Element\ElementHexa;
37
use Smalot\PdfParser\Element\ElementName;
38
use Smalot\PdfParser\Element\ElementNull;
39
use Smalot\PdfParser\Element\ElementNumeric;
40
use Smalot\PdfParser\Element\ElementString;
41
use Smalot\PdfParser\Element\ElementXRef;
42
use Smalot\PdfParser\RawData\RawDataParser;
43
44
/**
45
 * Class Parser
46
 */
47
class Parser
48
{
49
    /**
50
     * @var Config
51
     */
52
    private $config;
53
54
    /**
55
     * @var PDFObject[]
56
     */
57
    protected $objects = [];
58
59
    protected $rawDataParser;
60
61 39
    /**
     * Stores the configuration and creates the raw data parser that will use it.
     *
     * @param mixed       $cfg    forwarded unchanged to RawDataParser (defaults to an empty array)
     * @param Config|null $config configuration to use; a fresh Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        if (null === $config) {
            $config = new Config();
        }

        $this->config = $config;
        $this->rawDataParser = new RawDataParser($cfg, $config);
    }
66
67 1
    /**
     * Returns the configuration object held by this parser.
     */
    public function getConfig(): Config
    {
        $config = $this->config;

        return $config;
    }
71
72
    /**
     * Reads the given file and parses its content into a Document.
     *
     * History (2018/06/20, @doganoo): users complained multiple times that
     * parseFile() dies silently, so the error control operator (@) was removed
     * from the read and the failure is announced through the @throws tag below.
     * See https://github.com/smalot/pdfparser/issues/204 for an example.
     *
     * @param string $filename name handed to file_get_contents()
     *
     * @throws \Exception if the file cannot be read, or if parseContent() throws
     */
    public function parseFile(string $filename): Document
    {
        $content = file_get_contents($filename);

        // file_get_contents() reports failure by returning false. Without this
        // guard the false would be passed on to parseContent() in place of PDF
        // data, hiding the real cause of the error.
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }
90
91
    /**
     * Builds a Document from raw PDF content.
     *
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // Create structure from raw data: the first entry carries the trailer,
        // the second one the object list.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        if (isset($xref['trailer']['encrypt'])) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Create destination object and start from an empty object registry.
        $document = new Document();
        $this->objects = [];

        foreach ($data as $objectId => $objectStructure) {
            $this->parseObject($objectId, $objectStructure, $document);
            // The raw structure is dropped as soon as it has been handed over.
            unset($data[$objectId]);
        }

        $trailer = $this->parseTrailer($xref['trailer'], $document);
        $document->setTrailer($trailer);
        $document->setObjects($this->objects);

        return $document;
    }
124
125 36
    /**
     * Converts a trailer structure into a Header.
     *
     * Each entry is stored under its ucfirst()-ed name. The checks run in this
     * order: numeric values become ElementNumeric, arrays are parsed
     * recursively (without a document) and wrapped in an ElementArray, values
     * containing an underscore become ElementXRef, and everything else is
     * parsed as a '(' header element.
     *
     * @param array         $structure trailer entries indexed by name
     * @param Document|null $document  document passed on to the created elements
     *
     * @return Header
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $entries = [];

        foreach ($structure as $key => $entry) {
            $key = ucfirst($key);

            if (is_numeric($entry)) {
                $entries[$key] = new ElementNumeric($entry);
                continue;
            }

            if (\is_array($entry)) {
                $nested = $this->parseTrailer($entry, null);
                $entries[$key] = new ElementArray($nested, null);
                continue;
            }

            $entries[$key] = false !== strpos($entry, '_')
                ? new ElementXRef($entry, $document)
                : $this->parseHeaderElement('(', $entry, $document);
        }

        return new Header($entries, $document);
    }
146
147 37
    /**
     * Parses the raw structure of one object and registers the result in $this->objects.
     *
     * The parts of the structure are walked in order: '[' and '<<' parts (and
     * any other part that yields an element) replace the current header, a
     * 'stream' part provides the content. When the stream belongs to an object
     * whose header Type equals 'ObjStm', the objects embedded in the stream are
     * registered instead and nothing is stored under $id.
     *
     * @param string        $id        key under which the object is registered
     * @param array         $structure raw parts of the object
     * @param Document|null $document  document passed on to the created headers and objects
     *
     * @throws \Exception if a part carries a type parseHeaderElement() does not know
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            // Integer parts carry no type/value pair; give them a neutral shape
            // so that they end up in the default branch below.
            if (\is_int($part)) {
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $elements[] = $this->parseHeaderElement($sub_element[0], $sub_element[1], $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    $content = isset($part[3][0]) ? $part[3][0] : $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store this content.
                        return;
                    }
                    break;

                default:
                    // Loose comparison on purpose: it filters out a part that is
                    // the plain string 'null'.
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Registers every object embedded in the content of an 'ObjStm' stream.
     *
     * The content starts with "<id> <position>" number pairs followed by the
     * object data. Each embedded object is registered under "<id>_0" with an
     * empty content.
     *
     * @param string        $content  stream content: number pairs, then the object data
     * @param Document|null $document document passed on to the created headers and objects
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split xrefs and contents.
        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
        $body = $match[3];

        // Extract xrefs.
        $pairs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        // Map position => id, then order the table by position.
        $table = [];

        foreach ($pairs as $pair) {
            list($object_id, $offset) = preg_split("/\s+/", trim($pair));
            $table[$offset] = $object_id;
        }

        ksort($table);

        $ids = array_values($table);
        $offsets = array_keys($table);

        foreach ($offsets as $index => $offset) {
            // An embedded object ends where the next one starts; the last one
            // runs to the end of the data.
            $next_offset = isset($offsets[$index + 1]) ? $offsets[$index + 1] : \strlen($body);
            $sub_content = substr($body, $offset, (int) $next_offset - (int) $offset);

            $sub_header = Header::parse($sub_content, $document);
            $this->objects[$ids[$index].'_0'] = PDFObject::factory($document, $sub_header, '', $this->config);
        }
    }
234
235
    /**
     * Builds a Header from a flat structure of alternating name and value entries.
     *
     * Entries are read in pairs: the second item of the entry at an even index
     * is used as the element name, and the entry that follows supplies the
     * type (first item) and the value (second item) of the element.
     *
     * @param array         $structure flat list of alternating name and value entries
     * @param Document|null $document  document passed on to the created elements
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $parsed = [];
        $total = \count($structure);
        $index = 0;

        while ($index < $total) {
            $key = $structure[$index][1];
            $kind = $structure[$index + 1][0];
            $payload = $structure[$index + 1][1];

            $parsed[$key] = $this->parseHeaderElement($kind, $payload, $document);

            $index += 2;
        }

        return new Header($parsed, $document);
    }
253
254
    /**
     * Turns one (type, value) pair of a raw structure into an element.
     *
     * NOTE(review): the results of ElementString::parse() and
     * ElementName::parse() are returned unchanged; confirm whether they can
     * yield false, which the @return tag below does not cover.
     *
     * @param string|null   $type     type marker of the entry, e.g. '<<', '[', '(', '<', '/', 'numeric'
     * @param string|array  $value
     * @param Document|null $document document passed on to the created elements
     *
     * @return Element|Header|null null for the 'endstream', 'obj' and '' markers
     *
     * @throws \Exception if the type marker is not one of the handled ones
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        switch ($type) {
            case '<<':
            case '>>':
                $nested = $this->parseHeader($value, $document);
                // The object built here is not kept; only the header is returned.
                PDFObject::factory($document, $nested, null, $this->config);

                return $nested;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                $literal = '('.$value.')';

                // A date takes precedence; anything else is parsed as a string.
                $date = ElementDate::parse($literal, $document);
                if ($date) {
                    return $date;
                }

                return ElementString::parse($literal, $document);

            case '<':
                // Decode the hexadecimal value, then handle it like a '(' value.
                $decoded = ElementHexa::decode($value);

                return $this->parseHeaderElement('(', $decoded, $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $items = [];

                // A value that is not an array yields an empty ElementArray.
                if (!\is_array($value)) {
                    return new ElementArray($items, $document);
                }

                foreach ($value as $item) {
                    $items[] = $this->parseHeaderElement($item[0], $item[1], $document);
                }

                return new ElementArray($items, $document);

            case 'endstream':
            case 'obj': // Meaning unknown to the original author; handling it fixed a real-world file.
            case '':
                // Nothing to do with. The switch compares loosely, so a null
                // type also ends up here.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
320
}
321