Passed
Push — master ( 2939df...ddf03e )
by Konrad
02:55
created

Parser::parseContent()   A

Complexity

Conditions 5
Paths 4

Size

Total Lines 26
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 13
CRAP Score 5.009

Importance

Changes 9
Bugs 1 Features 2
Metric Value
cc 5
eloc 13
c 9
b 1
f 2
nc 4
nop 1
dl 0
loc 26
ccs 13
cts 14
cp 0.9286
crap 5.009
rs 9.5222
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementBoolean;
37
use Smalot\PdfParser\Element\ElementDate;
38
use Smalot\PdfParser\Element\ElementHexa;
39
use Smalot\PdfParser\Element\ElementName;
40
use Smalot\PdfParser\Element\ElementNull;
41
use Smalot\PdfParser\Element\ElementNumeric;
42
use Smalot\PdfParser\Element\ElementString;
43
use Smalot\PdfParser\Element\ElementXRef;
44
use Smalot\PdfParser\RawData\RawDataParser;
45
46
/**
47
 * Class Parser
48
 */
49
class Parser
50
{
51
    /**
52
     * @var Config
53
     */
54
    private $config;
55
56
    /**
57
     * @var PDFObject[]
58
     */
59
    protected $objects = [];
60
61
    protected $rawDataParser;
62
63 70
    /**
     * Stores the configuration and creates the raw data parser.
     *
     * @param array       $cfg    options forwarded unchanged to RawDataParser
     * @param Config|null $config configuration to use; a default Config is created when null
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        $this->config = $config ?? new Config();
        $this->rawDataParser = new RawDataParser($cfg, $this->config);
    }
68
69 1
    /**
     * Returns the configuration object this parser was constructed with
     * (or the default one created by the constructor).
     */
    public function getConfig(): Config
    {
        return $this->config;
    }
73
74
    /**
     * Reads a PDF file and parses its content into a Document.
     *
     * @param string $filename path of the PDF file to read
     *
     * @throws \Exception if the file cannot be read, or if parseContent() fails
     */
    public function parseFile(string $filename): Document
    {
        /*
         * The error control operator (@) is deliberately not used here: users
         * repeatedly reported that parseFile() died silently, so read problems
         * must stay visible.
         *
         * See https://github.com/smalot/pdfparser/issues/204
         */
        $content = file_get_contents($filename);

        // file_get_contents() returns false on failure. Without this check the
        // false value would be coerced to an empty string and handed to
        // parseContent(), hiding the real cause of the error.
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }
93
94
    /**
     * Parses raw PDF data and builds the Document holding its objects and trailer.
     *
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // The raw data parser returns the cross-reference data and the object structures.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        // Encrypted files are rejected unless the configuration says to ignore encryption.
        $hasEncryption = isset($xref['trailer']['encrypt']);
        if ($hasEncryption && false === $this->config->getIgnoreEncryption()) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (!$data) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Start from a fresh document and an empty object registry.
        $document = new Document();
        $this->objects = [];

        foreach (array_keys($data) as $id) {
            $this->parseObject($id, $data[$id], $document);
            // Release each raw structure as soon as it has been converted.
            unset($data[$id]);
        }

        $trailer = $this->parseTrailer($xref['trailer'], $document);
        $document->setTrailer($trailer);
        $document->setObjects($this->objects);

        return $document;
    }
127
128 63
    /**
     * Converts a raw trailer structure into a Header.
     *
     * Each key is capitalised with ucfirst(). Numeric values become
     * ElementNumeric, nested arrays are parsed recursively (without a
     * document) and wrapped in an ElementArray, strings containing "_" become
     * ElementXRef, and any other value is parsed as a "(" string element.
     *
     * @param array         $structure raw trailer entries, keyed by name
     * @param Document|null $document  document passed on to the created elements
     *
     * @return Header
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $entries = [];

        foreach ($structure as $key => $raw) {
            $key = ucfirst($key);

            if (is_numeric($raw)) {
                $entries[$key] = new ElementNumeric($raw);
                continue;
            }

            if (\is_array($raw)) {
                $nested = $this->parseTrailer($raw, null);
                $entries[$key] = new ElementArray($nested, null);
                continue;
            }

            $entries[$key] = false !== strpos($raw, '_')
                ? new ElementXRef($raw, $document)
                : $this->parseHeaderElement('(', $raw, $document);
        }

        return new Header($entries, $document);
    }
149
150 64
    /**
     * Converts one raw object structure into a PDFObject and registers it in
     * $this->objects under $id.
     *
     * The parts of the structure are walked in order: "[" and "<<" parts (and,
     * by default, any other element type) replace the current header, and a
     * "stream" part provides the content. A stream whose header Type is
     * "ObjStm" is expanded into its embedded objects instead of being stored
     * itself; a stream whose header Type is "Metadata" is additionally handed
     * to the document for XMP extraction.
     *
     * @param string        $id        key under which the object is registered
     * @param array         $structure raw parts of the object, as produced by the raw data parser
     * @param Document|null $document  document the created objects belong to
     *
     * @throws \Exception if an element of an unknown type is encountered
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            // Integer entries carry no element; normalise them to an empty pair
            // so they fall through to the default branch and are ignored.
            if (\is_int($part)) {
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $elements[] = $this->parseHeaderElement($sub_element[0], $sub_element[1], $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    // Use the first entry of $part[3] when present, the raw stream otherwise.
                    $content = $part[3][0] ?? $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store the object stream itself.
                        return;
                    }

                    // Attempt to parse XMP XML Metadata. The document is nullable,
                    // so only call it when one was actually provided.
                    if (null !== $document && $header->get('Type')->equals('Metadata')) {
                        $document->extractXMPMetadata($content);
                    }
                    break;

                default:
                    // NOTE(review): when $part is an array this loose comparison with
                    // the string 'null' is always true; possibly $part[0] was intended.
                    // Kept unchanged to preserve behaviour - confirm before changing.
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Expands the content of an "ObjStm" stream into individual objects and
     * registers each of them in $this->objects under "<object id>_0".
     *
     * The content starts with pairs of integers (object id, offset) followed by
     * the object bodies; each body runs from its offset up to the next offset
     * (or the end of the content).
     *
     * @param string        $content  decoded content of the object stream
     * @param Document|null $document document the created objects belong to
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split the leading "id offset" pairs from the object bodies.
        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
        $body = $match[3];

        // Extract the individual "id offset" pairs.
        $pairs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        // Map offset => object id, then order by offset so that each body ends
        // where the next one starts.
        $table = [];

        foreach ($pairs as $pair) {
            [$objectId, $offset] = preg_split("/\s+/", trim($pair));
            $table[$offset] = $objectId;
        }

        ksort($table);

        $objectIds = array_values($table);
        $offsets = array_keys($table);

        foreach ($offsets as $index => $offset) {
            $key = $objectIds[$index].'_0';
            $nextOffset = $offsets[$index + 1] ?? \strlen($body);
            $subContent = substr($body, $offset, (int) $nextOffset - (int) $offset);

            $subHeader = Header::parse($subContent, $document);
            $this->objects[$key] = PDFObject::factory($document, $subHeader, '', $this->config);
        }
    }
240
241
    /**
     * Builds a Header from a flat list of alternating name and value entries.
     *
     * Entries are consumed two at a time: index [1] of the first entry is the
     * element name, and the second entry supplies the type ([0]) and raw
     * value ([1]) handed to parseHeaderElement().
     *
     * @param array         $structure alternating name / value entries
     * @param Document|null $document  document passed on to the created elements
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];

        for ($index = 0, $total = \count($structure); $index < $total; $index += 2) {
            $key = $structure[$index][1];
            $valueType = $structure[$index + 1][0];
            $rawValue = $structure[$index + 1][1];

            $elements[$key] = $this->parseHeaderElement($valueType, $rawValue, $document);
        }

        return new Header($elements, $document);
    }
259
260
    /**
     * Converts one raw (type, value) pair into the matching element object.
     *
     * The switch relies on PHP's loose comparison: a null $type matches the
     * empty-string case and therefore yields null.
     *
     * @param string|array $value
     *
     * @return Element|Header|null
     *
     * @throws \Exception if the type is not one of the handled types
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        switch ($type) {
            case '<<':
            case '>>':
                // A dictionary with an empty value is parsed as an empty structure.
                $dictionary = $this->parseHeader(empty($value) ? [] : $value, $document);
                // The factory result is not used here; only the header is returned.
                PDFObject::factory($document, $dictionary, null, $this->config);

                return $dictionary;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                // Try to read the string as a date first, then as a plain string.
                $date = ElementDate::parse('('.$value.')', $document);
                if ($date) {
                    return $date;
                }

                return ElementString::parse('('.$value.')', $document);

            case '<':
                // Hexadecimal strings are decoded, then handled like "(" strings.
                return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $items = [];

                // A non-array value produces an empty ElementArray.
                foreach (\is_array($value) ? $value : [] as $entry) {
                    $items[] = $this->parseHeaderElement($entry[0], $entry[1], $document);
                }

                return new ElementArray($items, $document);

            case 'endstream':
            case 'obj': // Meaning unknown to the original author; accepted because it fixed a real-world file.
            case '':
                // Nothing to do with.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
331
}
332