Passed
Pull Request — master (#493)
by
unknown
07:05
created

Parser::parseB64()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 1
eloc 2
c 1
b 0
f 1
nc 1
nop 1
dl 0
loc 4
ccs 0
cts 3
cp 0
crap 2
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Element\ElementArray;
34
use Smalot\PdfParser\Element\ElementBoolean;
35
use Smalot\PdfParser\Element\ElementDate;
36
use Smalot\PdfParser\Element\ElementHexa;
37
use Smalot\PdfParser\Element\ElementName;
38
use Smalot\PdfParser\Element\ElementNull;
39
use Smalot\PdfParser\Element\ElementNumeric;
40
use Smalot\PdfParser\Element\ElementString;
41
use Smalot\PdfParser\Element\ElementXRef;
42
use Smalot\PdfParser\RawData\RawDataParser;
43
44
/**
45
 * Class Parser
46
 */
47
class Parser
48
{
49
    /**
50
     * @var Config
51
     */
52
    private $config;
53
54
    /**
55
     * @var PDFObject[]
56
     */
57
    protected $objects = [];
58
59
    protected $rawDataParser;
60
61 37
    /**
     * Sets up the parser and the raw-data parser it delegates to.
     *
     * @param array       $cfg    options forwarded as-is to RawDataParser (empty array by default)
     * @param Config|null $config configuration to use; a new Config is created when none is given
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        if (!$config) {
            $config = new Config();
        }

        $this->config = $config;
        $this->rawDataParser = new RawDataParser($cfg, $this->config);
    }
66
67 1
    /**
     * Returns the configuration object held by this parser.
     */
    public function getConfig(): Config
    {
        return $this->config;
    }
71
72
    /**
     * Reads a PDF file from disk and parses its content into a Document.
     *
     * 2018/06/20 @doganoo: users repeatedly reported that parseFile() died
     * silently, so the error control operator (@) is deliberately not used on
     * the read and failures are announced through the @throws tag below.
     * See https://github.com/smalot/pdfparser/issues/204 for an example.
     *
     * @param string $filename path of the PDF file to read
     *
     * @throws \Exception if the file cannot be read, or if parseContent() rejects the content
     */
    public function parseFile(string $filename): Document
    {
        $content = file_get_contents($filename);

        // file_get_contents() reports failure by returning false. Without this
        // check, false would be passed on to parseContent(), which expects a
        // string, and the real cause (unreadable file) would be obscured.
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }
90
91
    /**
     * Parses a PDF supplied as a base64-encoded string instead of a file.
     *
     * @param string $base64EncodedPdf base64 representation of the PDF content
     *
     * @throws \Exception if the input is not valid base64, or if parseContent() rejects the decoded content
     */
    public function parseB64(string $base64EncodedPdf): Document
    {
        // Strict mode makes base64_decode() return false when the input
        // contains characters outside the base64 alphabet. In the default
        // mode those characters are silently discarded, so malformed input
        // would be decoded into corrupted bytes and parsed anyway.
        $decoded = base64_decode($base64EncodedPdf, true);

        if (false === $decoded) {
            throw new \Exception('Invalid base64 input: unable to decode PDF content.');
        }

        return $this->parseContent($decoded);
    }
97
98
    /**
     * Builds a Document from raw PDF content.
     *
     * The raw-data parser splits the content into cross-reference data and a
     * list of object structures. Every structure is converted through
     * parseObject(), then the trailer and the collected objects are attached
     * to the new Document.
     *
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // Create structure from raw data.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        // An "encrypt" entry in the trailer marks the file as secured.
        if (isset($xref['trailer']['encrypt'])) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Start from a clean object registry for the new document.
        $this->objects = [];
        $document = new Document();

        foreach (array_keys($data) as $id) {
            $this->parseObject($id, $data[$id], $document);
            // Drop each raw structure as soon as it has been converted.
            unset($data[$id]);
        }

        $trailer = $this->parseTrailer($xref['trailer'], $document);
        $document->setTrailer($trailer);
        $document->setObjects($this->objects);

        return $document;
    }
131
132 34
    /**
     * Converts the raw trailer structure into a Header of element objects.
     *
     * Entry names get their first letter upper-cased. Values are mapped as follows:
     *  - numeric values become ElementNumeric;
     *  - arrays are converted recursively (without a document) and wrapped in an ElementArray;
     *  - other values containing an underscore become ElementXRef;
     *  - anything else is handed to parseHeaderElement() as a '(' (string) entry.
     *
     * @param array         $structure raw trailer entries, keyed by name
     * @param Document|null $document  document passed on to the created elements and header
     *
     * @return Header
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $trailer = [];

        foreach ($structure as $rawName => $values) {
            $name = ucfirst($rawName);

            if (is_numeric($values)) {
                $element = new ElementNumeric($values);
            } elseif (\is_array($values)) {
                $nested = $this->parseTrailer($values, null);
                $element = new ElementArray($nested, null);
            } elseif (false === strpos($values, '_')) {
                $element = $this->parseHeaderElement('(', $values, $document);
            } else {
                $element = new ElementXRef($values, $document);
            }

            $trailer[$name] = $element;
        }

        return new Header($trailer, $document);
    }
153
154 35
    /**
     * Converts one raw object structure into a PDFObject and registers it in $this->objects.
     *
     * The structure is walked part by part. '[' and '<<' parts replace the
     * current header, a 'stream' part supplies the content, and any other part
     * is parsed as a single header element. When a stream belongs to a header
     * whose Type equals 'ObjStm', the stream is split into its embedded
     * objects, each registered under "<number>_0", and the container object
     * itself is not stored.
     *
     * @param string        $id        key under which the object is registered
     * @param array         $structure raw parts of the object, as produced by the raw-data parser
     * @param Document|null $document  document passed on to every header and object created here
     *
     * @throws \Exception if parseHeaderElement() meets a type it does not recognise
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            // Integer parts carry no type/value pair; give them an empty pair
            // so they end up in the default branch below.
            if (\is_int($part)) {
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $item) {
                        $elements[] = $this->parseHeaderElement($item[0], $item[1], $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    $content = $part[3][0] ?? $part[1];

                    // Ordinary streams only contribute their content.
                    if (!$header->get('Type')->equals('ObjStm')) {
                        break;
                    }

                    // Split xrefs and contents.
                    preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
                    $content = $match[3];

                    // Extract xrefs.
                    $pairs = preg_split(
                        '/(\d+\s+\d+\s*)/s',
                        $match[1],
                        -1,
                        \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
                    );

                    // Index the object numbers by position so they can be
                    // processed in the order they appear in the content.
                    $numberByOffset = [];

                    foreach ($pairs as $pair) {
                        [$objectNumber, $offset] = preg_split("/\s+/", trim($pair));
                        $numberByOffset[$offset] = $objectNumber;
                    }

                    ksort($numberByOffset);

                    $objectNumbers = array_values($numberByOffset);
                    $offsets = array_keys($numberByOffset);

                    foreach ($offsets as $index => $offset) {
                        // Each embedded object runs up to the next position,
                        // or to the end of the content for the last one.
                        $end = $offsets[$index + 1] ?? \strlen($content);
                        $chunk = substr($content, $offset, (int) $end - (int) $offset);

                        $subHeader = Header::parse($chunk, $document);
                        $embeddedId = $objectNumbers[$index].'_0';
                        $this->objects[$embeddedId] = PDFObject::factory($document, $subHeader, '', $this->config);
                    }

                    // It is not necessary to store this content.
                    return;

                default:
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        // Never overwrite an object that is already registered under this id.
        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }
241
242
    /**
     * Builds a Header from a flat list in which name entries and value entries alternate.
     *
     * Entries are consumed two at a time: index 1 of the first entry is used
     * as the element name, and the second entry supplies the type (index 0)
     * and the value (index 1) given to parseHeaderElement().
     *
     * @param array         $structure alternating name / value entries
     * @param Document|null $document  document passed on to the created elements and header
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];
        $total = \count($structure);
        $index = 0;

        while ($index < $total) {
            $name = $structure[$index][1];
            $valueEntry = $structure[$index + 1];
            $type = $valueEntry[0];
            $value = $valueEntry[1];

            $elements[$name] = $this->parseHeaderElement($type, $value, $document);

            $index += 2;
        }

        return new Header($elements, $document);
    }
260
261
    /**
     * Converts one raw (type, value) pair into the matching element object.
     *
     * @param string|null   $type     type marker of the entry ('<<', '[', '(', '/', 'numeric', ...)
     * @param string|array  $value    raw value; an array for '<<', '>>' and '[' entries
     * @param Document|null $document document passed on to the created element
     *
     * @return Element|Header|null null for 'endstream', 'obj' and empty types
     *
     * @throws \Exception if the type marker is not recognised
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        switch ($type) {
            case '<<':
            case '>>':
                $header = $this->parseHeader($value, $document);
                // The factory result is not kept; only the header is returned.
                PDFObject::factory($document, $header, null, $this->config);

                return $header;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                $wrapped = '('.$value.')';

                // Try to read the string as a date first, then as a plain string.
                $date = ElementDate::parse($wrapped, $document);
                if ($date) {
                    return $date;
                }

                return ElementString::parse($wrapped, $document);

            case '<':
                // Hexadecimal strings are decoded, then handled like '(' strings.
                return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $items = [];

                // Non-array values produce an empty ElementArray.
                if (\is_array($value)) {
                    foreach ($value as $entry) {
                        $items[] = $this->parseHeaderElement($entry[0], $entry[1], $document);
                    }
                }

                return new ElementArray($items, $document);

            case 'endstream':
            case 'obj': // meaning unknown to the original author; accepted because it fixed parsing in their project
            case '':
                // Nothing to do with.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
327
}
328