Passed
Push — master ( f7cc41...5d3746 )
by Konrad
02:11
created

Parser::getConfig()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 0
dl 0
loc 3
rs 10
c 0
b 0
f 0
ccs 2
cts 2
cp 1
crap 1
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Element\ElementArray;
34
use Smalot\PdfParser\Element\ElementBoolean;
35
use Smalot\PdfParser\Element\ElementDate;
36
use Smalot\PdfParser\Element\ElementHexa;
37
use Smalot\PdfParser\Element\ElementName;
38
use Smalot\PdfParser\Element\ElementNull;
39
use Smalot\PdfParser\Element\ElementNumeric;
40
use Smalot\PdfParser\Element\ElementString;
41
use Smalot\PdfParser\Element\ElementXRef;
42
use Smalot\PdfParser\RawData\RawDataParser;
43
44
/**
45
 * Class Parser
46
 */
47
class Parser
48
{
49
    /**
50
     * @var Config
51
     */
52
    private $config;
53
54
    /**
55
     * @var PDFObject[]
56
     */
57
    protected $objects = [];
58
59
    protected $rawDataParser;
60
61 28
    /**
     * Creates the parser together with the raw-data parser it delegates to.
     *
     * @param array       $cfg    options handed as-is to the RawDataParser constructor
     * @param Config|null $config parser configuration; a new Config is created when none is given
     */
    public function __construct($cfg = [], Config $config = null)
    {
        $this->rawDataParser = new RawDataParser($cfg);

        if (!$config) {
            $config = new Config();
        }

        $this->config = $config;
    }
66
67
    /**
     * Gives access to the configuration this parser was built with.
     *
     * @return Config
     */
    public function getConfig()
    {
        $config = $this->config;

        return $config;
    }
74
75
    /**
     * Reads a PDF file and parses its content.
     *
     * @param string $filename location passed to file_get_contents()
     *
     * @return Document
     *
     * @throws \Exception if the file cannot be read, or if parsing its content fails
     */
    public function parseFile($filename)
    {
        /*
         * 2018/06/20 @doganoo: users have repeatedly complained that
         * parseFile() dies silently, so the error control operator (@) was
         * removed and a @throws tag was added to the PHPDoc to let callers
         * know that this method throws an exception.
         *
         * See here for an example: https://github.com/smalot/pdfparser/issues/204
         */
        $content = file_get_contents($filename);

        // file_get_contents() reports failure by returning false; stop here
        // with an explicit error instead of handing a boolean to the parser.
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }
97
98
    /**
     * Builds a Document from the raw bytes of a PDF.
     *
     * @param string $content PDF content to parse
     *
     * @return Document
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent($content)
    {
        // The raw-data parser returns a pair: cross-reference data and the object list.
        $parsed = $this->rawDataParser->parseData($content);
        list($xref, $data) = $parsed;

        $isEncrypted = isset($xref['trailer']['encrypt']);
        if ($isEncrypted) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Start from a clean slate: new document, empty object registry.
        $document = new Document();
        $this->objects = [];

        foreach ($data as $id => $structure) {
            $this->parseObject($id, $structure, $document);
            // Drop each raw structure once it has been converted.
            unset($data[$id]);
        }

        $trailer = $this->parseTrailer($xref['trailer'], $document);
        $document->setTrailer($trailer);
        $document->setObjects($this->objects);

        return $document;
    }
133
134 25
    /**
     * Converts raw trailer entries into a Header of typed elements.
     *
     * Entry names are capitalised with ucfirst(). Numeric values become
     * ElementNumeric, nested arrays are converted recursively (with a null
     * document) and wrapped in an ElementArray, values containing an
     * underscore become ElementXRef, and anything else is parsed as a
     * '(' (string) header element.
     *
     * @param array         $structure raw trailer entries keyed by name
     * @param Document|null $document  document the elements are attached to; null on nested levels
     *
     * @return Header
     */
    protected function parseTrailer($structure, $document)
    {
        $entries = [];

        foreach ($structure as $key => $raw) {
            $label = ucfirst($key);

            if (is_numeric($raw)) {
                $entries[$label] = new ElementNumeric($raw);
                continue;
            }

            if (\is_array($raw)) {
                $nested = $this->parseTrailer($raw, null);
                $entries[$label] = new ElementArray($nested, null);
                continue;
            }

            if (false !== strpos($raw, '_')) {
                $entries[$label] = new ElementXRef($raw, $document);
                continue;
            }

            $entries[$label] = $this->parseHeaderElement('(', $raw, $document);
        }

        return new Header($entries, $document);
    }
155
156
    /**
     * Converts one raw object structure into a PDFObject and registers it in $this->objects.
     *
     * The parts of the structure are walked in order: array and dictionary
     * parts replace the current header, a stream part supplies the content.
     * When the stream belongs to an object whose Type equals 'ObjStm', the
     * objects embedded in it are registered individually and the method
     * returns without registering an object under $id itself.
     *
     * @param string   $id
     * @param array    $structure
     * @param Document $document
     */
    protected function parseObject($id, $structure, $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            // Integer parts carry no type/value pair; treat them as an empty part.
            if (\is_int($part)) {
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $items = [];

                    foreach ($part[1] as $child) {
                        $childType = $child[0];
                        $childValue = $child[1];
                        $items[] = $this->parseHeaderElement($childType, $childValue, $document);
                    }

                    $header = new Header($items, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    if (isset($part[3][0])) {
                        $content = $part[3][0];
                    } else {
                        $content = $part[1];
                    }

                    if ($header->get('Type')->equals('ObjStm')) {
                        $pieces = [];

                        // Split the leading "number offset" pairs from the objects' data.
                        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $pieces);
                        $payload = $pieces[3];

                        // Isolate each "number offset" pair.
                        $pairs = preg_split(
                            '/(\d+\s+\d+\s*)/s',
                            $pieces[1],
                            -1,
                            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
                        );
                        $numbersByOffset = [];

                        foreach ($pairs as $pair) {
                            list($objectNumber, $offset) = preg_split("/\s+/", trim($pair));
                            $numbersByOffset[$offset] = $objectNumber;
                        }

                        // Order by offset so each object ends where the next one starts.
                        ksort($numbersByOffset);

                        $objectNumbers = array_values($numbersByOffset);
                        $offsets = array_keys($numbersByOffset);

                        foreach ($offsets as $index => $offset) {
                            $objectId = $objectNumbers[$index].'_0';

                            if (isset($offsets[$index + 1])) {
                                $nextOffset = $offsets[$index + 1];
                            } else {
                                $nextOffset = \strlen($payload);
                            }

                            $chunk = substr($payload, $offset, (int) $nextOffset - (int) $offset);

                            $chunkHeader = Header::parse($chunk, $document);
                            $this->objects[$objectId] = PDFObject::factory($document, $chunkHeader, '', $this->config);
                        }

                        // The stream content itself is not stored.
                        return;
                    }
                    break;

                default:
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (isset($this->objects[$id])) {
            return;
        }

        $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
    }
249
250
    /**
     * Builds a Header from a flat list of tokens.
     *
     * The list is read two entries at a time: the first entry of each pair
     * provides the element name (its index 1), the second one provides the
     * element type (index 0) and value (index 1).
     *
     * @param array    $structure
     * @param Document $document
     *
     * @return Header
     *
     * @throws \Exception
     */
    protected function parseHeader($structure, $document)
    {
        $entries = [];
        $total = \count($structure);
        $index = 0;

        while ($index < $total) {
            $keyToken = $structure[$index];
            $name = $keyToken[1];

            $valueToken = $structure[$index + 1];
            $type = $valueToken[0];
            $value = $valueToken[1];

            $entries[$name] = $this->parseHeaderElement($type, $value, $document);

            $index += 2;
        }

        return new Header($entries, $document);
    }
273
274
    /**
     * Turns one typed raw token into the matching element object.
     *
     * NOTE(review): static analysis reports that ElementString::parse() and
     * ElementName::parse() may also return false; confirm before relying on
     * the documented return type.
     *
     * @param string       $type     token type as emitted by the raw-data parser
     * @param string|array $value    token payload; an array for dictionaries and arrays
     * @param Document     $document
     *
     * @return Element|Header|null null for tokens that carry no element
     *
     * @throws \Exception if the type is not recognised
     */
    protected function parseHeaderElement($type, $value, $document)
    {
        switch ($type) {
            case '<<':
            case '>>':
                $dictionary = $this->parseHeader($value, $document);
                PDFObject::factory($document, $dictionary, null, $this->config);

                return $dictionary;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                // A literal string is tried as a date first, then as a plain string.
                $asDate = ElementDate::parse('('.$value.')', $document);

                if (!$asDate) {
                    return ElementString::parse('('.$value.')', $document);
                }

                return $asDate;

            case '<':
                // Hexadecimal strings are decoded, then handled like literal strings.
                $decoded = ElementHexa::decode($value, $document);

                return $this->parseHeaderElement('(', $decoded, $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $items = [];

                if (\is_array($value)) {
                    foreach ($value as $child) {
                        $childType = $child[0];
                        $childValue = $child[1];
                        $items[] = $this->parseHeaderElement($childType, $childValue, $document);
                    }
                }

                return new ElementArray($items, $document);

            case 'endstream':
            case 'obj': // Meaning unknown to the original author; accepting it fixed a real-world file.
            case '':
                // Nothing to build for these tokens.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
342
}
343