Completed
Pull Request — master (#304)
by Jeremy
05:53 queued 03:07
created

Parser::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 1
c 0
b 0
f 0
nc 1
nop 1
dl 0
loc 3
ccs 2
cts 2
cp 1
crap 1
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Element\ElementArray;
34
use Smalot\PdfParser\Element\ElementBoolean;
35
use Smalot\PdfParser\Element\ElementDate;
36
use Smalot\PdfParser\Element\ElementHexa;
37
use Smalot\PdfParser\Element\ElementName;
38
use Smalot\PdfParser\Element\ElementNull;
39
use Smalot\PdfParser\Element\ElementNumeric;
40
use Smalot\PdfParser\Element\ElementString;
41
use Smalot\PdfParser\Element\ElementXRef;
42
use Smalot\PdfParser\RawData\RawDataParser;
43
44
/**
45
 * Class Parser
46
 */
47
class Parser
48
{
49
    /**
50
     * @var PDFObject[]
51
     */
52
    protected $objects = [];
53
54
    protected $rawDataParser;
55
56 16
    /**
     * Creates the parser together with the raw-data parser it delegates to.
     *
     * @param array $cfg configuration handed unchanged to the RawDataParser constructor
     */
    public function __construct($cfg = [])
    {
        $this->rawDataParser = new RawDataParser($cfg);
    }
60
61
    /**
     * Reads a PDF file and parses its content.
     *
     * The error control operator (@) is deliberately not applied to the read:
     * users reported that parseFile() used to die silently, so PHP's own
     * diagnostics are left visible and a failed read is turned into an
     * exception (see https://github.com/smalot/pdfparser/issues/204).
     *
     * @param string $filename path of the PDF file to read
     *
     * @return Document
     *
     * @throws \Exception if the file cannot be read
     * @throws \Exception if parseContent() rejects the content
     */
    public function parseFile($filename)
    {
        $content = file_get_contents($filename);

        // file_get_contents() signals failure by returning false; stop here
        // with a clear message instead of handing false to the content parser.
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }
83
84
    /**
     * Builds a Document from the raw content of a PDF.
     *
     * @param string $content PDF content to parse
     *
     * @return Document
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent($content)
    {
        // The raw-data parser returns the xref information and the object structures.
        list($xref, $data) = $this->rawDataParser->parseData($content);

        if (isset($xref['trailer']['encrypt'])) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Reset the object table so this document only contains its own objects.
        $this->objects = [];
        $document = new Document();

        foreach (array_keys($data) as $objectId) {
            $this->parseObject($objectId, $data[$objectId], $document);
            // The raw structure is no longer needed once it has been converted.
            unset($data[$objectId]);
        }

        $trailer = $this->parseTrailer($xref['trailer'], $document);
        $document->setTrailer($trailer);
        $document->setObjects($this->objects);

        return $document;
    }
119
120 16
    /**
     * Converts a raw trailer structure into a Header.
     *
     * Every key is capitalised with ucfirst(). Numeric values become
     * ElementNumeric, arrays are converted recursively and wrapped in an
     * ElementArray, strings containing "_" become ElementXRef, and anything
     * else is parsed as a "(" (string or date) element.
     *
     * @param array         $structure raw trailer entries keyed by name
     * @param Document|null $document  owning document; null is passed for nested arrays
     *
     * @return Header
     *
     * @throws \Exception propagated from parseHeaderElement()
     */
    protected function parseTrailer($structure, $document)
    {
        $entries = [];

        foreach ($structure as $key => $raw) {
            $key = ucfirst($key);

            if (is_numeric($raw)) {
                $entries[$key] = new ElementNumeric($raw);
                continue;
            }

            if (\is_array($raw)) {
                // Nested structures are converted without a document attached.
                $nested = $this->parseTrailer($raw, null);
                $entries[$key] = new ElementArray($nested, null);
                continue;
            }

            if (false !== strpos($raw, '_')) {
                // NOTE(review): presumably an "<object>_<generation>" reference id — confirm against RawDataParser.
                $entries[$key] = new ElementXRef($raw, $document);
                continue;
            }

            $entries[$key] = $this->parseHeaderElement('(', $raw, $document);
        }

        return new Header($entries, $document);
    }
141
142
    /**
     * Converts one raw object structure into a PDFObject and registers it in
     * $this->objects under $id.
     *
     * The parts of the structure are visited in order: "[" and "<<" parts (and
     * any other recognised element) replace the current header, a "stream"
     * part supplies the content. When the stream belongs to an object whose
     * Type is "ObjStm", the objects embedded in the stream are extracted and
     * registered individually under "<number>_0"; the method then returns
     * without registering the container object itself.
     *
     * @param string   $id
     * @param array    $structure
     * @param Document $document
     *
     * @throws \Exception propagated from parseHeader() / parseHeaderElement()
     */
    protected function parseObject($id, $structure, $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $sub_type = $sub_element[0];
                        $sub_value = $sub_element[1];
                        $elements[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    $content = isset($part[3][0]) ? $part[3][0] : $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $match = [];

                        // Split the leading "<number> <offset>" pairs from the embedded object data.
                        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
                        $content = $match[3];

                        // Extract the individual pairs.
                        $xrefs = preg_split(
                            '/(\d+\s+\d+\s*)/s',
                            $match[1],
                            -1,
                            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
                        );
                        $table = [];

                        foreach ($xrefs as $xref) {
                            // The pattern above accepts any run of whitespace between the
                            // two integers (newline, tab, several spaces), so split on
                            // whitespace rather than on a single space character.
                            list($sub_id, $offset) = preg_split('/\s+/', trim($xref));
                            $table[$offset] = $sub_id;
                        }

                        // Order the embedded objects by their offset inside the stream.
                        ksort($table);

                        $ids = array_values($table);
                        $offsets = array_keys($table);

                        foreach ($offsets as $index => $offset) {
                            $object_id = $ids[$index].'_0';
                            // An embedded object ends where the next one starts, or at the end of the data.
                            $next_offset = isset($offsets[$index + 1]) ? $offsets[$index + 1] : \strlen($content);
                            $sub_content = substr($content, $offset, (int) $next_offset - (int) $offset);

                            $sub_header = Header::parse($sub_content, $document);
                            $this->objects[$object_id] = PDFObject::factory($document, $sub_header, '');
                        }

                        // The container stream itself is not stored.
                        return;
                    }
                    break;

                default:
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content);
        }
    }
232
233
    /**
     * Converts a flat list of alternating name / value entries into a Header.
     *
     * Entries are consumed two at a time: index 1 of the first entry is used
     * as the element name, and the second entry provides the type (index 0)
     * and raw value (index 1) handed to parseHeaderElement().
     *
     * @param array    $structure
     * @param Document $document
     *
     * @return Header
     *
     * @throws \Exception
     */
    protected function parseHeader($structure, $document)
    {
        $elements = [];
        $total = \count($structure);
        $index = 0;

        while ($index < $total) {
            $key = $structure[$index][1];
            $entry_type = $structure[$index + 1][0];
            $entry_value = $structure[$index + 1][1];

            $elements[$key] = $this->parseHeaderElement($entry_type, $entry_value, $document);

            $index += 2;
        }

        return new Header($elements, $document);
    }
256
257
    /**
     * Builds the element object that corresponds to a raw (type, value) pair.
     *
     * Dictionaries are delegated to parseHeader(), arrays are converted
     * element by element, hexadecimal strings are decoded and then handled as
     * "(" strings, and "(" strings are tried as a date before falling back to
     * a plain string. The types "endstream", "obj" and "" yield null.
     *
     * @param string       $type
     * @param string|array $value
     * @param Document     $document
     *
     * @return Element|Header|null
     *
     * @throws \Exception if the type is not recognised
     */
    protected function parseHeaderElement($type, $value, $document)
    {
        switch ($type) {
            case '<<':
            case '>>':
                return $this->parseHeader($value, $document);

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                $literal = '('.$value.')';
                $date = ElementDate::parse($literal, $document);

                // Prefer the date interpretation whenever the date parser accepts the literal.
                return $date ? $date : ElementString::parse($literal, $document);

            case '<':
                $decoded = ElementHexa::decode($value, $document);

                return $this->parseHeaderElement('(', $decoded, $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // misspelling kept for compatibility (old mistake in tcpdf parser)
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $items = [];

                // A non-array value produces an empty ElementArray.
                if (\is_array($value)) {
                    foreach ($value as $entry) {
                        $items[] = $this->parseHeaderElement($entry[0], $entry[1], $document);
                    }
                }

                return new ElementArray($items, $document);

            case 'endstream':
            case 'obj': // meaning unclear to the original author; accepting it fixed a real-world file
            case '':
                // Nothing to build for these types.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
322
}
323