Parser::parseObject()   C
last analyzed

Complexity

Conditions 16
Paths 66

Size

Total Lines 88
Code Lines 53

Duplication

Lines 0
Ratio 0 %

Importance

Changes 5
Bugs 2 Features 0
Metric Value
cc 16
eloc 53
c 5
b 2
f 0
nc 66
nop 3
dl 0
loc 88
rs 5.5666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

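Commonly applied refactorings include Extract Method: moving a cohesive group of statements out of the long method into its own, well-named method.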

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementBoolean;
37
use Smalot\PdfParser\Element\ElementDate;
38
use Smalot\PdfParser\Element\ElementHexa;
39
use Smalot\PdfParser\Element\ElementName;
40
use Smalot\PdfParser\Element\ElementNull;
41
use Smalot\PdfParser\Element\ElementNumeric;
42
use Smalot\PdfParser\Element\ElementString;
43
use Smalot\PdfParser\Element\ElementXRef;
44
use Smalot\PdfParser\RawData\RawDataParser;
45
46
/**
47
 * Class Parser
48
 */
49
class Parser
50
{
51
    /**
52
     * @var Config
53
     */
54
    private $config;
55
56
    /**
57
     * @var PDFObject[]
58
     */
59
    protected $objects = [];
60
61
    protected $rawDataParser;
62
63
    /**
     * Creates the parser together with the raw-data parser it delegates to.
     *
     * @param mixed       $cfg    forwarded unchanged to the RawDataParser constructor (defaults to an empty array)
     * @param Config|null $config parser configuration; a new Config is created when none is given
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        if (null === $config) {
            $config = new Config();
        }

        $this->config = $config;
        $this->rawDataParser = new RawDataParser($cfg, $config);
    }
68
69
    /**
     * Returns the configuration this parser was constructed with.
     */
    public function getConfig(): Config
    {
        return $this->config;
    }
73
74
    /**
     * Reads a PDF file and parses its content into a Document.
     *
     * The error-control operator (@) is deliberately not applied to the read:
     * users repeatedly reported that parseFile() appeared to die silently, so
     * any warning raised while reading is left visible
     * (see https://github.com/smalot/pdfparser/issues/204).
     *
     * @param string $filename path of the PDF file to read
     *
     * @throws \Exception if the file cannot be read
     * @throws \Exception if parseContent() rejects the content
     */
    public function parseFile(string $filename): Document
    {
        $content = file_get_contents($filename);

        // file_get_contents() returns false on failure. Without this check the
        // false would be forwarded to parseContent(), hiding the real cause
        // (an unreadable file) behind an unrelated parsing error.
        if (false === $content) {
            throw new \Exception(sprintf('Unable to read file "%s".', $filename));
        }

        return $this->parseContent($content);
    }
93
94
    /**
     * Parses raw PDF content into a Document.
     *
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // The raw-data parser returns the xref information and the per-object structures.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        $hasEncryptEntry = isset($xref['trailer']['encrypt']);
        if ($hasEncryptEntry && false === $this->config->getIgnoreEncryption()) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Objects are collected on the parser while the structures are walked,
        // so start from an empty list for every parse.
        $this->objects = [];
        $document = new Document();

        foreach (array_keys($data) as $id) {
            $this->parseObject($id, $data[$id], $document);

            // Drop each raw structure as soon as it has been converted.
            unset($data[$id]);
        }

        $trailer = $this->parseTrailer($xref['trailer'], $document);
        $document->setTrailer($trailer);
        $document->setObjects($this->objects);

        return $document;
    }
127
128
    /**
     * Converts the raw trailer structure into a Header of typed elements.
     *
     * Every entry is stored under its ucfirst()-ed key and wrapped according to
     * the shape of its value:
     * - numeric values become ElementNumeric;
     * - arrays are converted recursively (without a document) and wrapped in ElementArray;
     * - strings containing "_" become ElementXRef;
     * - anything else is parsed as a "(" string element.
     *
     * @param array         $structure raw trailer entries keyed by name
     * @param Document|null $document  document handed to the created elements
     *
     * @return Header
     *
     * @throws \Exception may propagate from parseHeaderElement()
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $entries = [];

        foreach ($structure as $key => $raw) {
            $label = ucfirst($key);

            if (is_numeric($raw)) {
                $entries[$label] = new ElementNumeric($raw);
                continue;
            }

            if (\is_array($raw)) {
                // Nested structures and their wrapper are built without a document.
                $nested = $this->parseTrailer($raw, null);
                $entries[$label] = new ElementArray($nested, null);
                continue;
            }

            $entries[$label] = false !== strpos($raw, '_')
                ? new ElementXRef($raw, $document)
                : $this->parseHeaderElement('(', $raw, $document);
        }

        return new Header($entries, $document);
    }
149
150
    /**
     * Parses the raw structure of one object and registers the result in $this->objects.
     *
     * The parts of the structure are visited in order: an array part ("[") or a
     * dictionary part ("<<") sets the object's header, a "stream" part supplies
     * its content, and any other part is parsed as a single header element.
     * Two stream types are handled specially, based on the header's "Type":
     * - "ObjStm": the stream is unpacked into its embedded objects, which are
     *   registered individually; the container itself is not stored.
     * - "Metadata": the content is passed to the document for XMP extraction.
     *
     * @param string        $id        key under which the object is stored
     * @param array         $structure raw parts of the object; integer parts are treated as empty
     * @param Document|null $document  document the object belongs to
     *
     * @throws \Exception if a part has a type parseHeaderElement() does not recognise
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            // Integer parts carry no (type, value) pair; normalise them so they
            // end up in the default branch below.
            if (\is_int($part)) {
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $elements[] = $this->parseHeaderElement($sub_element[0], $sub_element[1], $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    // Use $part[3][0] when it is set, otherwise the raw value in $part[1].
                    // NOTE(review): $part[3] is presumably the decoded stream - confirm against RawDataParser.
                    $content = isset($part[3][0]) ? $part[3][0] : $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store this content.
                        return;
                    }

                    // The signature allows a null document, so guard the call.
                    if ($header->get('Type')->equals('Metadata') && null !== $document) {
                        // Attempt to parse XMP XML Metadata
                        $document->extractXMPMetadata($content);
                    }
                    break;

                default:
                    // NOTE(review): this loosely compares a string with $part itself, not
                    // with $part[0]; kept unchanged to preserve behaviour - confirm intent.
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Unpacks the content of an "ObjStm" stream and registers every embedded object.
     *
     * The content starts with whitespace-separated "<object number> <offset>"
     * pairs followed by the objects' data; each offset is relative to the start
     * of that data. Every embedded object is stored under "<object number>_0".
     *
     * @param string        $content  stream content
     * @param Document|null $document document the embedded objects belong to
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split xrefs and contents. If the match fails (for example because a PCRE
        // limit is hit) there is nothing to unpack, so stop instead of reading
        // undefined offsets of $match.
        if (1 !== preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match)) {
            return;
        }

        $body = $match[3];

        // Extract xrefs.
        $pairs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        if (false === $pairs) {
            return;
        }

        // Map offset => object number, then sort by offset so that each object
        // ends where the next one starts.
        $table = [];

        foreach ($pairs as $pair) {
            [$objectNumber, $offset] = preg_split('/\s+/', trim($pair));
            $table[$offset] = $objectNumber;
        }

        ksort($table);

        $objectNumbers = array_values($table);
        $offsets = array_keys($table);
        $bodyLength = \strlen($body);

        foreach ($offsets as $index => $offset) {
            // The last object runs to the end of the data.
            $end = isset($offsets[$index + 1]) ? $offsets[$index + 1] : $bodyLength;
            $sub_content = substr($body, $offset, (int) $end - (int) $offset);

            $sub_header = Header::parse($sub_content, $document);
            $this->objects[$objectNumbers[$index].'_0'] = PDFObject::factory($document, $sub_header, '', $this->config);
        }
    }
240
241
    /**
     * Builds a Header from a flat list of raw dictionary tokens.
     *
     * The list is read two entries at a time: the first entry of each pair
     * supplies the element name (its index 1), the second supplies the
     * element's type (index 0) and raw value (index 1).
     *
     * @param array         $structure flat list of raw tokens, read by position starting at 0
     * @param Document|null $document  document handed to the created elements
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];

        for ($keyAt = 0, $total = \count($structure); $keyAt < $total; $keyAt += 2) {
            $valueAt = $keyAt + 1;
            $name = $structure[$keyAt][1];

            $elements[$name] = $this->parseHeaderElement(
                $structure[$valueAt][0],
                $structure[$valueAt][1],
                $document
            );
        }

        return new Header($elements, $document);
    }
259
260
    /**
     * Turns one raw (type, value) token into the matching element object.
     *
     * NOTE(review): static analysis reports that ElementString::parse() and
     * ElementName::parse() may also return false, which the documented return
     * type does not cover - confirm against those classes.
     *
     * @param string|null   $type     token type ("<<", "[", "(", "numeric", ...)
     * @param string|array  $value    raw token value
     * @param Document|null $document document handed to the created element
     *
     * @return Element|Header|null null for tokens that carry no data ("endstream", "obj", "")
     *
     * @throws \Exception if the type is not recognised
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        // Dictionaries are handled up front; an empty value stands for an empty dictionary.
        if ('<<' === $type || '>>' === $type) {
            $header = $this->parseHeader(empty($value) ? [] : $value, $document);

            // The created object is not used here.
            // NOTE(review): confirm whether this call is needed for a side effect.
            PDFObject::factory($document, $header, null, $this->config);

            return $header;
        }

        // switch compares loosely, so a null type lands on the '' case below.
        switch ($type) {
            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                // Try the value as a date first and fall back to a plain string.
                $literal = '('.$value.')';
                $date = ElementDate::parse($literal, $document);

                return $date ?: ElementString::parse($literal, $document);

            case '<':
                // Hexadecimal strings are decoded, then handled like "(" strings.
                $decoded = ElementHexa::decode($value);

                return $this->parseHeaderElement('(', $decoded, $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // misspelling kept for compatibility: old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                // A non-array value yields an empty ElementArray.
                $children = \is_array($value) ? $value : [];
                $items = [];

                foreach ($children as $child) {
                    $items[] = $this->parseHeaderElement($child[0], $child[1], $document);
                }

                return new ElementArray($items, $document);

            case 'endstream':
            case 'obj': // original author's note: meaning unclear, but handling it fixed their project
            case '':
                // Nothing to do with.
                return null;
        }

        throw new \Exception('Invalid type: "'.$type.'".');
    }
331
}
332