Test Failed
Pull Request — master (#560)
by
unknown
07:05
created

Parser::getConfig()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 0
dl 0
loc 3
rs 10
c 0
b 0
f 0
ccs 2
cts 2
cp 1
crap 1
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementBoolean;
37
use Smalot\PdfParser\Element\ElementDate;
38
use Smalot\PdfParser\Element\ElementHexa;
39
use Smalot\PdfParser\Element\ElementName;
40
use Smalot\PdfParser\Element\ElementNull;
41
use Smalot\PdfParser\Element\ElementNumeric;
42
use Smalot\PdfParser\Element\ElementString;
43
use Smalot\PdfParser\Element\ElementXRef;
44
use Smalot\PdfParser\RawData\RawDataParser;
45
46
/**
47
 * Class Parser
48
 */
49
class Parser
50
{
51
    /**
     * Configuration handed to the raw data parser and to every object created while parsing.
     *
     * @var Config
     */
    private $config;

    /**
     * Objects collected during the current parse, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Low-level parser that turns raw PDF content into xref data and object structures.
     *
     * @var RawDataParser
     */
    protected $rawDataParser;
62
63 41
    /**
     * Sets up the parser together with the raw data parser it delegates to.
     *
     * @param array       $cfg    settings forwarded as-is to RawDataParser
     * @param Config|null $config configuration to use; a fresh Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        if (null === $config) {
            $config = new Config();
        }

        $this->config = $config;
        $this->rawDataParser = new RawDataParser($cfg, $config);
    }
68
69 1
    /**
     * Gives access to the configuration this parser was built with.
     */
    public function getConfig(): Config
    {
        return $this->config;
    }
73
74
    /**
     * Reads a PDF file and parses it into a Document.
     *
     * The error control operator (@) is deliberately not applied to the read,
     * so that a failing call is never silent; see
     * https://github.com/smalot/pdfparser/issues/204 for the background.
     *
     * @param string $filename path of the PDF file to read
     *
     * @throws \Exception if the file cannot be read or its content cannot be parsed
     */
    public function parseFile(string $filename): Document
    {
        $content = file_get_contents($filename);

        // file_get_contents() only emits a warning and returns false on failure.
        // Turn that into the exception the docblock promises instead of passing
        // a non-string on to parseContent().
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }
92
93
    /**
     * Parses raw PDF content and assembles the resulting Document.
     *
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // The raw data parser yields the xref information and the object structures.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        if (isset($xref['trailer']['encrypt'])) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        $document = new Document();

        // Start from a clean slate so objects of a previous parse do not leak in.
        $this->objects = [];

        foreach ($data as $id => $structure) {
            $this->parseObject($id, $structure, $document);

            // Release each raw structure as soon as it has been converted.
            unset($data[$id]);
        }

        $trailer = $this->parseTrailer($xref['trailer'], $document);
        $document->setTrailer($trailer);
        $document->setObjects($this->objects);

        return $document;
    }
126
127 38
    /**
     * Converts the raw trailer entries into a Header of elements.
     *
     * Each entry name gets its first letter upper-cased. Numeric values become
     * ElementNumeric, nested arrays are converted recursively (without a
     * document) and wrapped in an ElementArray, strings containing an
     * underscore become ElementXRef, and anything else is handed to
     * parseHeaderElement() as a '(' value.
     *
     * @param array         $structure raw trailer entries keyed by name
     * @param Document|null $document  document passed to the created elements
     *
     * @return Header
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $elements = [];

        foreach ($structure as $key => $raw) {
            $key = ucfirst($key);

            if (is_numeric($raw)) {
                $elements[$key] = new ElementNumeric($raw);
                continue;
            }

            if (\is_array($raw)) {
                $nested = $this->parseTrailer($raw, null);
                $elements[$key] = new ElementArray($nested, null);
                continue;
            }

            if (false !== strpos($raw, '_')) {
                $elements[$key] = new ElementXRef($raw, $document);
                continue;
            }

            $elements[$key] = $this->parseHeaderElement('(', $raw, $document);
        }

        return new Header($elements, $document);
    }
148
149 39
    /**
     * Builds the PDFObject for one raw object structure and stores it in $this->objects.
     *
     * The structure is walked part by part: an array ('[') or dictionary ('<<')
     * part replaces the current header, a 'stream' part supplies the content,
     * and any other part is parsed as a single header element. A stream whose
     * header Type equals "ObjStm" is not stored itself; every object embedded
     * in it is registered under "<number>_0" and the method returns at once.
     *
     * @param string        $id        key under which the object is stored
     * @param array         $structure parts of the object, each a [type, value, ...] entry or an int
     * @param Document|null $document  document passed to the created headers and objects
     *
     * @throws \Exception if a part has a type parseHeaderElement() does not recognise
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            if (\is_int($part)) {
                // A bare integer has no type/value pair; use an empty one instead.
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $items = [];

                    foreach ($part[1] as $item) {
                        $items[] = $this->parseHeaderElement($item[0], $item[1], $document);
                    }

                    $header = new Header($items, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    // Use the entry at [3][0] when it is set, otherwise fall back to [1].
                    $content = $part[3][0] ?? $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $matches = [];

                        // Separate the leading "number offset" pairs from the object data.
                        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $matches);
                        $content = $matches[3];

                        // Break the pair list into one "number offset" string per object.
                        $pairs = preg_split(
                            '/(\d+\s+\d+\s*)/s',
                            $matches[1],
                            -1,
                            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
                        );

                        // Index the object numbers by offset so they can be ordered by position.
                        $numbersByOffset = [];

                        foreach ($pairs as $pair) {
                            [$number, $offset] = preg_split("/\s+/", trim($pair));
                            $numbersByOffset[$offset] = $number;
                        }

                        ksort($numbersByOffset);

                        $numbers = array_values($numbersByOffset);
                        $offsets = array_keys($numbersByOffset);

                        foreach ($offsets as $index => $offset) {
                            // An embedded object ends where the next one starts, or at the end of the data.
                            $end = $offsets[$index + 1] ?? \strlen($content);
                            $chunk = substr($content, $offset, (int) $end - (int) $offset);

                            $embeddedHeader = Header::parse($chunk, $document);
                            $embedded = PDFObject::factory($document, $embeddedHeader, '', $this->config);
                            $this->objects[$numbers[$index].'_0'] = $embedded;
                        }

                        // It is not necessary to store this content.

                        return;
                    }
                    break;

                default:
                    // A part that loosely equals the string 'null' contributes nothing.
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        // Never overwrite an object already registered under this id.
        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }
236
237
    /**
     * Converts a flat dictionary structure into a Header.
     *
     * The structure is read two entries at a time: index 1 of the first entry
     * is used as the key, and the following entry supplies the type (index 0)
     * and value (index 1) of the element stored under that key.
     *
     * @param array         $structure alternating name and value entries
     * @param Document|null $document  document passed to the created elements
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];
        $total = \count($structure);
        $index = 0;

        while ($index < $total) {
            $key = $structure[$index][1];
            $elementType = $structure[$index + 1][0];
            $elementValue = $structure[$index + 1][1];

            $elements[$key] = $this->parseHeaderElement($elementType, $elementValue, $document);

            $index += 2;
        }

        return new Header($elements, $document);
    }
255
256
    /**
     * Converts one raw typed value into the matching element object.
     *
     * @param string|null   $type     raw type marker, e.g. '<<', '[', '(', '<', '/', 'numeric', 'boolean', 'null', 'objref'
     * @param string|array  $value    raw value belonging to the marker
     * @param Document|null $document document passed to the created elements
     *
     * @return Element|Header|null null for markers that carry no value ('endstream', 'obj', '')
     *
     * @throws \Exception if the type is not recognised
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        // parseHeader() requires an array, so an empty dictionary value is replaced by [].
        $valueIsEmpty = null == $value || '' == $value || false == $value;
        if (('<<' === $type || '>>' === $type) && $valueIsEmpty) {
            $value = [];
        }

        // switch compares loosely, so a null $type ends up in the '' case below.
        switch ($type) {
            case '<<':
            case '>>':
                $header = $this->parseHeader($value, $document);
                // The returned object is intentionally unused; only the header is handed back.
                PDFObject::factory($document, $header, null, $this->config);

                return $header;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                // A literal string is tried as a date first, then as a plain string.
                if ($date = ElementDate::parse('('.$value.')', $document)) {
                    return $date;
                }

                return ElementString::parse('('.$value.')', $document);

            case '<':
                // Hexadecimal strings are decoded, then handled like literal strings.
                return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $values = [];

                // A non-array value yields an empty ElementArray.
                if (\is_array($value)) {
                    foreach ($value as $sub_element) {
                        $sub_type = $sub_element[0];
                        $sub_value = $sub_element[1];
                        $values[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }
                }

                return new ElementArray($values, $document);

            case 'endstream':
            case 'obj': // I don't know what it means but got my project fixed.
            case '':
                // Nothing to do with.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
327
}
328