Passed
Branch master (f7fac8)
by Sebastien
02:47
created

Parser::parseTrailer()   A

Complexity

Conditions 5
Paths 5

Size

Total Lines 20
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 5
eloc 13
c 1
b 0
f 1
nc 5
nop 2
dl 0
loc 20
rs 9.5222
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 * @license LGPLv3
10
 * @url     <https://github.com/smalot/pdfparser>
11
 *
12
 *  PdfParser is a pdf library written in PHP, extraction oriented.
13
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
14
 *
15
 *  This program is free software: you can redistribute it and/or modify
16
 *  it under the terms of the GNU Lesser General Public License as published by
17
 *  the Free Software Foundation, either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  This program is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU Lesser General Public License for more details.
24
 *
25
 *  You should have received a copy of the GNU Lesser General Public License
26
 *  along with this program.
27
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
28
 *
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Element\ElementArray;
34
use Smalot\PdfParser\Element\ElementBoolean;
35
use Smalot\PdfParser\Element\ElementDate;
36
use Smalot\PdfParser\Element\ElementHexa;
37
use Smalot\PdfParser\Element\ElementName;
38
use Smalot\PdfParser\Element\ElementNull;
39
use Smalot\PdfParser\Element\ElementNumeric;
40
use Smalot\PdfParser\Element\ElementString;
41
use Smalot\PdfParser\Element\ElementXRef;
42
43
/**
44
 * Class Parser
45
 *
46
 * @package Smalot\PdfParser
47
 */
48
class Parser
49
{
50
    /**
51
     * @var PDFObject[]
52
     */
53
    protected $objects = array();
54
55
    /**
     * Builds a parser instance.
     *
     * No set-up work is performed here: the object registry already starts
     * out as the empty array declared on the property.
     */
    public function __construct()
    {
        // Intentionally empty.
    }
62
63
    /**
     * Reads a PDF file and parses its content into a Document.
     *
     * 2018/06/20 @doganoo: users complained several times that parseFile()
     * dies silently, so the error control operator (@) is not applied to the
     * read and the failure is advertised through the @throws tag.
     * See https://github.com/smalot/pdfparser/issues/204 for an example.
     *
     * A failed read is additionally turned into an exception here, so that
     * `false` is never handed to parseContent() as if it were PDF data.
     *
     * @param string $filename Location handed to file_get_contents().
     *
     * @return Document
     * @throws \Exception When the file cannot be read, or when parseContent() fails.
     */
    public function parseFile($filename)
    {
        $content = file_get_contents($filename);

        // file_get_contents() signals failure with false (plus a warning).
        if ($content === false) {
            throw new \Exception('Unable to read file "' . $filename . '".');
        }

        return $this->parseContent($content);
    }
83
84
    /**
     * Parses raw PDF data into a Document.
     *
     * The low-level structure is produced by TCPDF_PARSER; every object it
     * reports is converted with parseObject() and the trailer is converted
     * with parseTrailer(). The internal object registry is reset on each call.
     *
     * @param string $content Raw PDF data; leading whitespace is stripped before parsing.
     *
     * @return Document
     * @throws \Exception When the trailer declares encryption, when no object
     *                    is found, or when the underlying parser throws.
     */
    public function parseContent($content)
    {
        // Create structure using TCPDF Parser. Whatever it prints is captured
        // by the output buffer and discarded.
        ob_start();

        try {
            // The error control operator is kept on purpose so that notices
            // raised while constructing the TCPDF parser stay suppressed.
            @$parser = new \TCPDF_PARSER(ltrim($content));
            list($xref, $data) = $parser->getParsedData();
            unset($parser);
        } catch (\Exception $e) {
            // Close the buffer opened above before propagating; otherwise a
            // failed parse would leave an extra buffering level active and
            // swallow the caller's subsequent output.
            ob_end_clean();

            throw $e;
        }

        ob_end_clean();

        if (isset($xref['trailer']['encrypt'])) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Create destination object.
        $document      = new Document();
        $this->objects = array();

        foreach ($data as $id => $structure) {
            $this->parseObject($id, $structure, $document);

            // Release each raw structure as soon as it has been converted.
            unset($data[$id]);
        }

        $document->setTrailer($this->parseTrailer($xref['trailer'], $document));
        $document->setObjects($this->objects);

        return $document;
    }
120
121
    /**
     * Converts a trailer structure into a Header.
     *
     * Every key is capitalised with ucfirst(). Values are mapped as follows:
     * numeric values become ElementNumeric, arrays are converted recursively
     * (with no document attached) and wrapped in an ElementArray, strings
     * containing an underscore become ElementXRef, and any other value is
     * treated as a literal string through parseHeaderElement().
     *
     * @param array         $structure Trailer entries keyed by name.
     * @param Document|null $document  Null for the nested (recursive) calls.
     *
     * @return Header
     */
    protected function parseTrailer($structure, $document)
    {
        $entries = array();

        foreach ($structure as $key => $raw) {
            $key = ucfirst($key);

            if (is_numeric($raw)) {
                $entries[$key] = new ElementNumeric($raw, $document);
                continue;
            }

            if (is_array($raw)) {
                // Nested structure: parsed without a document and wrapped as an array element.
                $nested        = $this->parseTrailer($raw, null);
                $entries[$key] = new ElementArray($nested, null);
                continue;
            }

            if (false !== strpos($raw, '_')) {
                // An underscore marks an object reference.
                $entries[$key] = new ElementXRef($raw, $document);
                continue;
            }

            $entries[$key] = $this->parseHeaderElement('(', $raw, $document);
        }

        return new Header($entries, $document);
    }
142
143
    /**
     * Converts one parsed object structure into a PDFObject and registers it.
     *
     * The structure is walked part by part: an array part ('[') or a
     * dictionary part ('<<') replaces the current header, a 'stream' part
     * supplies the content, and any other part is converted to a single
     * element that becomes the header when the conversion yields a value.
     *
     * When a stream belongs to a header whose Type equals 'ObjStm', the
     * objects packed inside it are registered instead and the method returns
     * without registering an object under $id.
     *
     * @param string   $id
     * @param array    $structure
     * @param Document $document
     *
     * @throws \Exception When parseHeaderElement() meets an unknown type.
     */
    protected function parseObject($id, $structure, $document)
    {
        $header  = new Header(array(), $document);
        $content = '';

        foreach ($structure as $part) {
            switch ($part[0]) {
                case '[':
                    $elements = array();

                    foreach ($part[1] as $sub_element) {
                        $sub_type   = $sub_element[0];
                        $sub_value  = $sub_element[1];
                        $elements[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    $content = isset($part[3][0]) ? $part[3][0] : $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store the container itself.
                        return;
                    }
                    break;

                default:
                    if ($part != 'null') {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        // parseHeaderElement() yields nothing for ignorable tokens.
                        if ($element) {
                            $header = new Header(array($element), $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content);
        }
    }

    /**
     * Registers every object packed inside an object stream.
     *
     * The stream content starts with pairs of integers followed by the
     * object bodies. In each pair the first integer is used to build the
     * object id (suffixed with '_0') and the second one is the offset of the
     * object inside the remaining content.
     *
     * @param string   $content  Content of the object stream.
     * @param Document $document
     */
    private function parseObjectStream($content, $document)
    {
        $match = array();

        // Split xrefs and contents.
        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
        $content = $match[3];

        // Extract xrefs.
        $xrefs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );
        $table = array();

        foreach ($xrefs as $xref) {
            // The two integers may be separated by any run of whitespace
            // (newline, tab, several spaces), not only by a single space.
            list($number, $offset) = preg_split('/\s+/', trim($xref));
            $table[$offset]        = $number;
        }

        // Order by offset so that each object ends where the next one starts.
        ksort($table);

        $numbers = array_values($table);
        $offsets = array_keys($table);

        foreach ($offsets as $index => $offset) {
            $next_offset = isset($offsets[$index + 1]) ? $offsets[$index + 1] : strlen($content);
            $sub_content = substr($content, $offset, $next_offset - $offset);

            $sub_header = Header::parse($sub_content, $document);
            $object     = PDFObject::factory($document, $sub_header, '');

            $this->objects[$numbers[$index] . '_0'] = $object;
        }
    }
234
235
    /**
     * Builds a Header from a flat list of alternating name and value tokens.
     *
     * Tokens are consumed two at a time: the second item of the even-indexed
     * token is used as the entry name, and the odd-indexed token that follows
     * provides the type and raw value handed to parseHeaderElement().
     *
     * @param array    $structure
     * @param Document $document
     *
     * @return Header
     * @throws \Exception
     */
    protected function parseHeader($structure, $document)
    {
        $elements = array();

        for ($index = 0, $total = count($structure); $index < $total; $index += 2) {
            $name = $structure[$index][1];

            $elements[$name] = $this->parseHeaderElement(
                $structure[$index + 1][0],
                $structure[$index + 1][1],
                $document
            );
        }

        return new Header($elements, $document);
    }
257
258
    /**
     * Converts a single typed token into the matching element object.
     *
     * Dictionaries are delegated to parseHeader(), arrays are converted item
     * by item, hexadecimal strings are decoded and then handled like literal
     * strings, and literal strings are tried as dates before falling back to
     * plain strings. The tokens 'endstream', 'obj' and '' produce nothing.
     *
     * NOTE(review): static analysis reports that ElementString::parse() and
     * ElementName::parse() may return false - confirm before relying on the
     * documented return type.
     *
     * @param $type
     * @param $value
     * @param $document
     *
     * @return Element|Header|null Null when the token is ignored.
     * @throws \Exception When the type is not recognised.
     */
    protected function parseHeaderElement($type, $value, $document)
    {
        switch ($type) {
            case '<<':
            case '>>':
                return $this->parseHeader($value, $document);

            case 'numeric':
                return new ElementNumeric($value, $document);

            case 'boolean':
                return new ElementBoolean($value, $document);

            case 'null':
                return new ElementNull($value, $document);

            case '(':
                // A literal string is a date when it parses as one, a plain string otherwise.
                $date = ElementDate::parse('(' . $value . ')', $document);

                if ($date) {
                    return $date;
                }

                return ElementString::parse('(' . $value . ')', $document);

            case '<':
                // Decode the hexadecimal form, then treat the result as a literal string.
                $decoded = ElementHexa::decode($value, $document);

                return $this->parseHeaderElement('(', $decoded, $document);

            case '/':
                return ElementName::parse('/' . $value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $items = array();

                foreach ($value as $sub_element) {
                    $items[] = $this->parseHeaderElement($sub_element[0], $sub_element[1], $document);
                }

                return new ElementArray($items, $document);

            case 'endstream':
            case 'obj': // Skipped as well; the original author did not document why.
            case '':
                // Nothing to do with.
                return null;

            default:
                throw new \Exception('Invalid type: "' . $type . '".');
        }
    }
320
}
321