Completed
Push — master ( 66ad27...2fab78 )
by Konrad
12:00 queued 08:18
created

Parser::parseHeader()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 14
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 9
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 8
dl 0
loc 14
rs 10
c 0
b 0
f 0
ccs 9
cts 9
cp 1
cc 2
nc 2
nop 2
crap 2
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Element\ElementArray;
34
use Smalot\PdfParser\Element\ElementBoolean;
35
use Smalot\PdfParser\Element\ElementDate;
36
use Smalot\PdfParser\Element\ElementHexa;
37
use Smalot\PdfParser\Element\ElementName;
38
use Smalot\PdfParser\Element\ElementNull;
39
use Smalot\PdfParser\Element\ElementNumeric;
40
use Smalot\PdfParser\Element\ElementString;
41
use Smalot\PdfParser\Element\ElementXRef;
42
use Smalot\PdfParser\RawData\RawDataParser;
43
44
/**
45
 * Class Parser
46
 */
47
class Parser
48
{
49
    /**
50
     * @var PDFObject[]
51
     */
52
    protected $objects = [];
53
54
    protected $rawDataParser;
55
56 16
    /**
     * Creates the parser and the raw data parser it delegates to.
     *
     * @param array $cfg configuration handed unchanged to RawDataParser
     */
    public function __construct($cfg = [])
    {
        $rawDataParser = new RawDataParser($cfg);

        $this->rawDataParser = $rawDataParser;
    }
60
61
    /**
     * Reads a PDF file from disk and parses its content.
     *
     * @param string $filename path (or stream URL) handed to file_get_contents()
     *
     * @return Document
     *
     * @throws \Exception if the file cannot be read, or if parseContent() rejects the data
     */
    public function parseFile($filename)
    {
        /*
         * 2018/06/20 @doganoo: users repeatedly reported that parseFile()
         * died silently, so the error control operator (@) was removed from
         * this call and the method is documented with a @throws tag.
         *
         * See https://github.com/smalot/pdfparser/issues/204 for an example.
         */
        $content = file_get_contents($filename);

        // file_get_contents() signals failure by returning false; report that
        // here instead of passing a boolean on to the parser.
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }
83
84
    /**
     * Parses PDF data held in memory and returns the resulting Document.
     *
     * Any objects collected by a previous call are discarded before parsing.
     *
     * @param string $content PDF content to parse
     *
     * @return Document
     *
     * @throws \Exception if the trailer carries an "encrypt" entry (secured PDF)
     * @throws \Exception if the raw data parser returned no objects
     */
    public function parseContent($content)
    {
        // The raw data parser yields the cross-reference data and the raw object structures.
        list($xref, $rawObjects) = $this->rawDataParser->parseData($content);

        if (isset($xref['trailer']['encrypt'])) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($rawObjects)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Start from a fresh document and an empty object registry.
        $document = new Document();
        $this->objects = [];

        foreach ($rawObjects as $objectId => $rawStructure) {
            $this->parseObject($objectId, $rawStructure, $document);

            // Drop the raw entry once converted (presumably to limit memory use).
            unset($rawObjects[$objectId]);
        }

        $trailer = $this->parseTrailer($xref['trailer'], $document);
        $document->setTrailer($trailer);
        $document->setObjects($this->objects);

        return $document;
    }
119
120 16
    /**
     * Converts raw trailer entries into a Header of typed elements.
     *
     * Every key is capitalised with ucfirst(). Numeric values become
     * ElementNumeric, arrays are parsed recursively and wrapped in an
     * ElementArray, values containing an underscore become ElementXRef, and
     * anything else is handed to parseHeaderElement() as a '(' (string) value.
     *
     * @param array         $structure raw trailer entries keyed by name
     * @param Document|null $document  owning document; the recursive call passes null
     *
     * @return Header
     *
     * @throws \Exception propagated from parseHeaderElement()
     */
    protected function parseTrailer($structure, $document)
    {
        $entries = [];

        foreach ($structure as $key => $raw) {
            $key = ucfirst($key);

            if (is_numeric($raw)) {
                $entries[$key] = new ElementNumeric($raw, $document);
                continue;
            }

            if (\is_array($raw)) {
                // Nested entries are parsed without a document, and the
                // resulting Header is wrapped as-is in an ElementArray.
                $nested = $this->parseTrailer($raw, null);
                $entries[$key] = new ElementArray($nested, null);
                continue;
            }

            // An underscore marks a cross-reference; everything else is
            // treated as a literal string.
            $entries[$key] = false !== strpos($raw, '_')
                ? new ElementXRef($raw, $document)
                : $this->parseHeaderElement('(', $raw, $document);
        }

        return new Header($entries, $document);
    }
141
142
    /**
     * Converts the raw parts of one object into a PDFObject stored in $this->objects.
     *
     * The parts are visited in order. A '[' or '<<' part, or any other part
     * that yields an element, replaces the current header; a 'stream' part
     * provides the content. When a stream belongs to a header whose Type is
     * 'ObjStm', the objects embedded in the stream are registered under
     * "<number>_0" ids and the method returns without registering $id itself.
     *
     * @param string   $id        identifier under which the object is stored
     * @param array    $structure raw parts of the object; each part is indexed as [type, value, ...]
     * @param Document $document
     *
     * @throws \Exception propagated from parseHeader() / parseHeaderElement()
     */
    protected function parseObject($id, $structure, $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $sub_type = $sub_element[0];
                        $sub_value = $sub_element[1];
                        $elements[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    // Use the value at [3][0] when present, otherwise the one at [1].
                    $content = isset($part[3][0]) ? $part[3][0] : $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $match = [];

                        // Split the leading "number offset" index from the object data.
                        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
                        $content = $match[3];

                        // Extract the individual "number offset" pairs.
                        $xrefs = preg_split(
                            '/(\d+\s+\d+\s*)/s',
                            $match[1],
                            -1,
                            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
                        );
                        $table = [];

                        foreach ($xrefs as $xref) {
                            // The patterns above accept any whitespace run between
                            // the two integers (newlines, several spaces), so split
                            // on whitespace rather than on a single space.
                            list($objectNumber, $offset) = preg_split('/\s+/', trim($xref));
                            $table[$offset] = $objectNumber;
                        }

                        // Order the entries by offset so each object ends where the next begins.
                        ksort($table);

                        $ids = array_values($table);
                        $positions = array_keys($table);

                        foreach ($positions as $index => $offset) {
                            $objectId = $ids[$index].'_0';
                            $next_position = isset($positions[$index + 1]) ? $positions[$index + 1] : \strlen($content);
                            $sub_content = substr($content, $offset, $next_position - $offset);

                            $sub_header = Header::parse($sub_content, $document);
                            $this->objects[$objectId] = PDFObject::factory($document, $sub_header, '');
                        }

                        // The container stream itself is not stored as an object.
                        return;
                    }
                    break;

                default:
                    // A part may be the bare string 'null'; every other part is
                    // converted to an element.
                    if ('null' !== $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        // Do not overwrite an object already registered under this id.
        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content);
        }
    }
232
233
    /**
     * Builds a Header from a flat list of alternating name and value entries.
     *
     * Entries are consumed in pairs: index [1] of the first entry is the
     * element name, and indexes [0] and [1] of the second entry are the type
     * and raw value given to parseHeaderElement().
     *
     * @param array    $structure flat list of name/value entries
     * @param Document $document
     *
     * @return Header
     *
     * @throws \Exception
     */
    protected function parseHeader($structure, $document)
    {
        $elements = [];
        $total = \count($structure);
        $nameIndex = 0;

        while ($nameIndex < $total) {
            $valueIndex = $nameIndex + 1;
            $name = $structure[$nameIndex][1];

            $elements[$name] = $this->parseHeaderElement(
                $structure[$valueIndex][0],
                $structure[$valueIndex][1],
                $document
            );

            // Move on to the next name/value pair.
            $nameIndex += 2;
        }

        return new Header($elements, $document);
    }
256
257
    /**
     * Converts one raw (type, value) pair into the matching element object.
     *
     * @param string   $type     raw type marker, e.g. '<<', 'numeric', '(', '/', '['
     * @param mixed    $value    raw value; a list of [type, value] entries for '[' and a
     *                           name/value list for '<<' / '>>'
     * @param Document $document
     *
     * @return Element|Header|null null for 'endstream', 'obj' and '' markers; the results of
     *                             ElementString::parse() and ElementName::parse() are returned as-is
     *
     * @throws \Exception when the type marker is not recognised
     */
    protected function parseHeaderElement($type, $value, $document)
    {
        switch ($type) {
            case '<<':
            case '>>':
                return $this->parseHeader($value, $document);

            case 'numeric':
                return new ElementNumeric($value, $document);

            case 'boolean':
                return new ElementBoolean($value, $document);

            case 'null':
                return new ElementNull($value, $document);

            case '(':
                // Try a date first; a falsy result means the literal is a plain string.
                return ElementDate::parse('('.$value.')', $document)
                    ?: ElementString::parse('('.$value.')', $document);

            case '<':
                // Hexadecimal strings are decoded, then handled like literal strings.
                $decoded = ElementHexa::decode($value, $document);

                return $this->parseHeaderElement('(', $decoded, $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // misspelling kept for an old mistake in the tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $items = [];

                foreach ($value as $entry) {
                    $items[] = $this->parseHeaderElement($entry[0], $entry[1], $document);
                }

                return new ElementArray($items, $document);

            case 'endstream':
            case 'obj':
            case '':
                // These markers produce no element.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
321
}
322