Passed
Pull Request — master (#615)
by Jeffrey
02:30
created

Parser::parseFile()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 14
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 2
Bugs 0 Features 0
Metric Value
eloc 2
dl 0
loc 14
rs 10
c 2
b 0
f 0
ccs 3
cts 3
cp 1
cc 1
nc 1
nop 1
crap 1
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementBoolean;
37
use Smalot\PdfParser\Element\ElementDate;
38
use Smalot\PdfParser\Element\ElementHexa;
39
use Smalot\PdfParser\Element\ElementName;
40
use Smalot\PdfParser\Element\ElementNull;
41
use Smalot\PdfParser\Element\ElementNumeric;
42
use Smalot\PdfParser\Element\ElementString;
43
use Smalot\PdfParser\Element\ElementXRef;
44
use Smalot\PdfParser\RawData\RawDataParser;
45
46
/**
47
 * Class Parser
48
 */
49
class Parser
50
{
51
    /**
     * Parser configuration; handed to the RawDataParser and to every PDFObject
     * created by this parser.
     *
     * @var Config
     */
    private $config;

    /**
     * Objects collected while parsing the current content, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Low-level parser whose parseData() yields the xref and object structures
     * consumed by parseContent().
     *
     * @var RawDataParser
     */
    protected $rawDataParser;
62
63 46
    /**
     * @param array       $cfg    options forwarded unchanged to the RawDataParser
     * @param Config|null $config parser configuration; a default Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        // Explicit nullable type: relying on "= null" to make the parameter
        // nullable is deprecated as of PHP 8.4.
        $this->config = $config ?? new Config();
        $this->rawDataParser = new RawDataParser($cfg, $this->config);
    }
68
69 1
    /**
     * Returns the configuration in use: the one given to the constructor, or
     * the default Config the constructor created.
     */
    public function getConfig(): Config
    {
        return $this->config;
    }
73
74
    /**
     * Reads a PDF file and parses its content.
     *
     * The error control operator (@) is deliberately not applied to the read:
     * users repeatedly reported that this method failed silently (see
     * https://github.com/smalot/pdfparser/issues/204), so PHP's own warning is
     * left visible and the failure is additionally reported as an exception.
     *
     * @param string $filename path handed to file_get_contents()
     *
     * @throws \Exception if the file cannot be read
     * @throws \Exception if parseContent() rejects the content
     */
    public function parseFile(string $filename): Document
    {
        $content = file_get_contents($filename);

        // file_get_contents() returns false on failure. Report that as a read
        // error here instead of handing the false on to parseContent().
        if (false === $content) {
            throw new \Exception(sprintf('Unable to read file "%s".', $filename));
        }

        return $this->parseContent($content);
    }
92
93
    /**
     * Parses raw PDF content into a Document.
     *
     * Any objects collected by a previous call are discarded first, so the
     * returned Document only holds objects found in $content.
     *
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // Create structure from raw data.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        if (isset($xref['trailer']['encrypt'])) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Create destination object.
        $document = new Document();
        $this->objects = [];

        foreach ($data as $id => $structure) {
            $this->parseObject($id, $structure, $document);
            // Drop the raw structure once it has been converted.
            unset($data[$id]);
        }

        $document->setTrailer($this->parseTrailer($xref['trailer'], $document));
        $document->setObjects($this->objects);

        return $document;
    }
126
127 43
    /**
     * Converts the raw trailer structure into a Header of element objects.
     *
     * Each entry is mapped by the shape of its value:
     *  - numeric value         => ElementNumeric
     *  - array                 => ElementArray wrapping the recursively parsed entries
     *  - string containing '_' => ElementXRef
     *  - anything else         => parsed as a literal string via parseHeaderElement('(')
     *
     * @param array         $structure trailer entries keyed by name
     * @param Document|null $document  document the elements belong to; the
     *                                 recursive call for nested arrays passes null
     *
     * @return Header
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $trailer = [];

        foreach ($structure as $name => $values) {
            $name = ucfirst($name);

            if (is_numeric($values)) {
                $trailer[$name] = new ElementNumeric($values);
            } elseif (\is_array($values)) {
                $trailer[$name] = new ElementArray($this->parseTrailer($values, null), null);
            } elseif (false !== strpos((string) $values, '_')) {
                // The cast only affects the lookup: passing null to strpos() is
                // deprecated since PHP 8.1. The original value is still used below.
                $trailer[$name] = new ElementXRef($values, $document);
            } else {
                $trailer[$name] = $this->parseHeaderElement('(', $values, $document);
            }
        }

        return new Header($trailer, $document);
    }
148
149 44
    /**
     * Converts one raw object structure into a PDFObject and stores it in
     * $this->objects under $id.
     *
     * The structure is a list of parts, each normally a [type, value, ...] tuple:
     *  - '['      : the header becomes a list of parsed elements
     *  - '<<'     : the header becomes the parsed dictionary
     *  - 'stream' : the stream body becomes the object content; an 'ObjStm'
     *               stream is expanded into its embedded objects instead (and
     *               the container itself is not stored), a 'Metadata' stream is
     *               additionally handed to the document for XMP extraction
     *  - other    : parsed as a single element, which becomes the header when
     *               it is truthy
     *
     * An object id that is already registered is not overwritten.
     *
     * @param string        $id        object id used as key in $this->objects
     * @param array         $structure raw parts of the object
     * @param Document|null $document  document the object belongs to
     *
     * @throws \Exception if a part has a type parseHeaderElement() does not know
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            // Integer parts carry no type/value pair; normalise them so the
            // switch below falls through to the default branch harmlessly.
            if (\is_int($part)) {
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $subElement) {
                        $elements[] = $this->parseHeaderElement($subElement[0], $subElement[1], $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    $content = $part[3][0] ?? $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store this content.
                        return;
                    }

                    // Attempt to parse XMP XML Metadata. Skipped when no
                    // document was given, since there is nothing to attach it to.
                    if (null !== $document && $header->get('Type')->equals('Metadata')) {
                        $document->extractXMPMetadata($content);
                    }
                    break;

                default:
                    if ('null' !== $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Expands the body of an 'ObjStm' stream into its embedded objects and
     * registers each of them in $this->objects under "<object number>_0".
     *
     * The body starts with whitespace-separated "<object number> <offset>"
     * pairs; the remainder holds the objects, each starting at its offset
     * within that remainder and running up to the next offset (or the end).
     *
     * @param string        $content  decoded stream body
     * @param Document|null $document document the embedded objects belong to
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split xrefs and contents.
        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
        $body = $match[3];

        // Extract xrefs.
        $pairs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        // Index object numbers by offset so they can be ordered by position.
        $table = [];

        foreach ($pairs as $pair) {
            [$objectNumber, $offset] = preg_split('/\s+/', trim($pair));
            $table[$offset] = $objectNumber;
        }

        ksort($table);

        $objectNumbers = array_values($table);
        $offsets = array_keys($table);
        $bodyLength = \strlen($body);

        foreach ($offsets as $index => $offset) {
            $end = $offsets[$index + 1] ?? $bodyLength;
            $subContent = substr($body, $offset, (int) $end - (int) $offset);

            $subHeader = Header::parse($subContent, $document);
            $this->objects[$objectNumbers[$index].'_0'] = PDFObject::factory($document, $subHeader, '', $this->config);
        }
    }
239
240
    /**
     * Builds a Header from a flat dictionary structure.
     *
     * Entries alternate between a name and its value: the name is read from
     * index 1 of the even entry, the type and value from indexes 0 and 1 of
     * the following odd entry.
     *
     * @param array         $structure flat list of alternating name / value entries
     * @param Document|null $document  document the header belongs to
     *
     * @throws \Exception if a value has a type parseHeaderElement() does not know
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];
        $count = \count($structure);

        for ($position = 0; $position < $count; $position += 2) {
            $name = $structure[$position][1];

            // A trailing name without a value entry yields type and value null
            // (parsed to a null element) instead of an undefined-index warning.
            $type = $structure[$position + 1][0] ?? null;
            $value = $structure[$position + 1][1] ?? null;

            $elements[$name] = $this->parseHeaderElement($type, $value, $document);
        }

        return new Header($elements, $document);
    }
258
259
    /**
     * Converts one raw (type, value) pair into the matching element object.
     *
     * Type markers handled: '<<' / '>>' (dictionary), 'numeric', 'boolean',
     * 'null', '(' (literal string, tried as a date first), '<' (hex string,
     * decoded and then handled like '('), '/' (name), 'objref' / 'ojbref'
     * (cross reference) and '[' (array, parsed recursively). The markers
     * 'endstream', 'obj' and '' (or a null type) produce no element.
     *
     * NOTE(review): static analysis reports that ElementString::parse() and
     * ElementName::parse() may return false, which this method would pass
     * through unchanged - confirm against those classes.
     *
     * @param string|null   $type     raw type marker
     * @param string|array  $value    raw value; an array for '<<', '>>' and '['
     * @param Document|null $document document the element belongs to
     *
     * @return Element|Header|null
     *
     * @throws \Exception if the type marker is not one of the above
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        // A dictionary marker with any empty value (null, '', '0', 0, false, [])
        // stands for an empty dictionary.
        if (('<<' === $type || '>>' === $type) && empty($value)) {
            $value = [];
        }

        switch ($type) {
            case '<<':
            case '>>':
                $header = $this->parseHeader($value, $document);
                // The created object is intentionally not kept here.
                // NOTE(review): presumably called for a side effect - confirm.
                PDFObject::factory($document, $header, null, $this->config);

                return $header;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                // Dates are stored as literal strings, so try that form first.
                if ($date = ElementDate::parse('('.$value.')', $document)) {
                    return $date;
                }

                return ElementString::parse('('.$value.')', $document);

            case '<':
                return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $values = [];

                if (\is_array($value)) {
                    foreach ($value as $subElement) {
                        $values[] = $this->parseHeaderElement($subElement[0], $subElement[1], $document);
                    }
                }

                return new ElementArray($values, $document);

            case 'endstream':
            case 'obj': // I don't know what it means but got my project fixed.
            case '':
                // Nothing to do with.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
330
}
331