Issues (82)

src/Smalot/PdfParser/Parser.php (1 issue)

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementBoolean;
37
use Smalot\PdfParser\Element\ElementDate;
38
use Smalot\PdfParser\Element\ElementHexa;
39
use Smalot\PdfParser\Element\ElementName;
40
use Smalot\PdfParser\Element\ElementNull;
41
use Smalot\PdfParser\Element\ElementNumeric;
42
use Smalot\PdfParser\Element\ElementString;
43
use Smalot\PdfParser\Element\ElementXRef;
44
use Smalot\PdfParser\RawData\RawDataParser;
45
46
/**
47
 * Class Parser
48
 */
49
class Parser
50
{
51
    /**
52
     * @var Config
53
     */
54
    private $config;
55
56
    /**
57
     * @var PDFObject[]
58
     */
59
    protected $objects = [];
60
61
    /**
     * Low-level parser used by parseContent() to turn raw PDF data into
     * the xref table and the object structures.
     *
     * @var RawDataParser
     */
    protected $rawDataParser;
62
63 71
    /**
     * Sets up the parser and its raw data parser.
     *
     * @param mixed       $cfg    settings forwarded unchanged to the RawDataParser (defaults to an empty array)
     * @param Config|null $config configuration to use; a fresh Config is created when none is given
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        if (null === $config) {
            $config = new Config();
        }

        $this->config = $config;
        $this->rawDataParser = new RawDataParser($cfg, $config);
    }
68
69 1
    /**
     * Returns the configuration this parser was created with.
     */
    public function getConfig(): Config
    {
        return $this->config;
    }
73
74
    /**
     * Reads a PDF file and parses its content.
     *
     * @param string $filename path of the file to read
     *
     * @throws \Exception if the file cannot be read
     * @throws \Exception if parseContent() rejects the content
     */
    public function parseFile(string $filename): Document
    {
        /*
         * 2018/06/20 @doganoo: users complained several times that
         * parseFile() died silently, so the error control operator (@)
         * was removed from this call and the failure is advertised through
         * the @throws tag instead.
         *
         * See here for an example: https://github.com/smalot/pdfparser/issues/204
         */
        $content = file_get_contents($filename);

        // file_get_contents() reports failure by returning false. Handing that
        // to parseContent(string) would hide the real cause (an unreadable
        // file) behind implicit type juggling, so fail explicitly here.
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }
93
94
    /**
     * Builds a Document from raw PDF data.
     *
     * Resets the parser's object list, parses every object found by the raw
     * data parser, then attaches the trailer and the objects to the document.
     *
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // The raw data parser returns the xref table and the object structures.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        $hasEncryptEntry = isset($xref['trailer']['encrypt']);

        if ($hasEncryptEntry && false === $this->config->getIgnoreEncryption()) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Start from a clean slate: a new document and an empty object list.
        $document = new Document();
        $this->objects = [];

        foreach ($data as $objectId => $objectStructure) {
            $this->parseObject($objectId, $objectStructure, $document);

            // Drop each raw structure as soon as it has been parsed.
            unset($data[$objectId]);
        }

        $trailer = $this->parseTrailer($xref['trailer'], $document);
        $document->setTrailer($trailer);
        $document->setObjects($this->objects);

        return $document;
    }
127
128 64
    /**
     * Converts the raw trailer structure into a Header.
     *
     * Each key is capitalised with ucfirst(). Numeric values become
     * ElementNumeric, arrays are parsed recursively (without a document) and
     * wrapped in an ElementArray, strings containing "_" become ElementXRef,
     * and every other value is parsed as a "(" string element.
     *
     * @param array         $structure raw trailer entries keyed by name
     * @param Document|null $document  document the created elements are bound to
     *
     * @return Header
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $entries = [];

        foreach ($structure as $key => $raw) {
            $key = ucfirst($key);

            if (is_numeric($raw)) {
                $entries[$key] = new ElementNumeric($raw);
                continue;
            }

            if (\is_array($raw)) {
                // Nested structures are parsed on their own, detached from the document.
                $nested = $this->parseTrailer($raw, null);
                $entries[$key] = new ElementArray($nested, null);
                continue;
            }

            if (false !== strpos($raw, '_')) {
                $entries[$key] = new ElementXRef($raw, $document);
                continue;
            }

            $entries[$key] = $this->parseHeaderElement('(', $raw, $document);
        }

        return new Header($entries, $document);
    }
149
150 65
    /**
     * Parses the raw structure of one object and stores the result in $this->objects.
     *
     * The parts of the structure are visited in order: "[" and "<<" parts
     * (and any other element that parseHeaderElement() accepts) replace the
     * current header, while a "stream" part provides the content. An object
     * stream (Type "ObjStm") is expanded into its embedded objects and the
     * container itself is not stored. A "Metadata" stream is additionally
     * handed to the document for XMP extraction.
     *
     * @param string        $id        identifier under which the object is stored
     * @param array         $structure raw parts of the object, as produced by the raw data parser
     * @param Document|null $document  document the created objects are bound to
     *
     * @throws \Exception if a part has a type parseHeaderElement() does not know
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            // Integer parts carry no element; normalise them to an empty pair.
            if (\is_int($part)) {
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $elements[] = $this->parseHeaderElement($sub_element[0], $sub_element[1], $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    // Prefer the entry at [3][0] when present, otherwise fall back to [1].
                    $content = $part[3][0] ?? $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store this content.
                        return;
                    }

                    if ($header->get('Type')->equals('Metadata')) {
                        // Attempt to parse XMP XML Metadata
                        $document->extractXMPMetadata($content);
                    }
                    break;

                default:
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Expands the content of an object stream into individual objects.
     *
     * The content starts with whitespace-separated integer pairs; the first
     * number of each pair is used as the object number and the second as the
     * offset of that object inside the text that follows the pairs. Every
     * embedded object is stored in $this->objects under "<number>_0".
     *
     * @param mixed         $content  content of the object stream
     * @param Document|null $document document the created objects are bound to
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split xrefs and contents.
        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
        $body = $match[3];

        // Extract xrefs.
        $pairs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        $numberByOffset = [];

        foreach ($pairs as $pair) {
            list($objectNumber, $offset) = preg_split("/\s+/", trim($pair));
            $numberByOffset[$offset] = $objectNumber;
        }

        // Order by offset so each object ends where the next one starts.
        ksort($numberByOffset);

        $objectNumbers = array_values($numberByOffset);
        $offsets = array_keys($numberByOffset);

        foreach ($offsets as $index => $offset) {
            $objectId = $objectNumbers[$index].'_0';

            // The last object runs up to the end of the body.
            $nextOffset = $offsets[$index + 1] ?? \strlen($body);
            $subContent = substr($body, $offset, (int) $nextOffset - (int) $offset);

            $subHeader = Header::parse($subContent, $document);
            $this->objects[$objectId] = PDFObject::factory($document, $subHeader, '', $this->config);
        }
    }
240
241
    /**
     * Builds a Header from a flat list of alternating name and value entries.
     *
     * Entries are consumed two at a time: index 1 of the first entry is the
     * element name, and the second entry supplies the type (index 0) and the
     * raw value (index 1) passed to parseHeaderElement().
     *
     * @param array         $structure flat list of raw entries
     * @param Document|null $document  document the created elements are bound to
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];
        $total = \count($structure);
        $index = 0;

        while ($index < $total) {
            $key = $structure[$index][1];
            $kind = $structure[$index + 1][0];
            $raw = $structure[$index + 1][1];

            $elements[$key] = $this->parseHeaderElement($kind, $raw, $document);

            // Move on to the next name/value pair.
            $index += 2;
        }

        return new Header($elements, $document);
    }
259
260
    /**
     * Converts one raw (type, value) pair into the matching element object.
     *
     * @param string|null   $type     type marker of the raw entry ('<<', '[', '(', '<', '/', 'numeric', ...)
     * @param string|array  $value    raw value that belongs to the marker
     * @param Document|null $document document the created element is bound to
     *
     * @return Element|Header|null null for the markers 'endstream', 'obj' and ''
     *
     * @throws \Exception if the type marker is not recognised
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        $isDictionary = '<<' === $type || '>>' === $type;

        // A dictionary marker with an empty value is parsed as an empty dictionary.
        if ($isDictionary && (null == $value || '' == $value || false == $value)) {
            $value = [];
        }

        // The switch compares loosely, so a null $type ends up in the '' case.
        switch ($type) {
            case '<<':
            case '>>':
                $dictionary = $this->parseHeader($value, $document);

                // The created object is discarded here.
                // NOTE(review): presumably called for a side effect - confirm against PDFObject::factory().
                PDFObject::factory($document, $dictionary, null, $this->config);

                return $dictionary;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                $literal = '('.$value.')';

                // Try a date first and fall back to a plain string.
                return ElementDate::parse($literal, $document) ?: ElementString::parse($literal, $document);

            case '<':
                // Hexadecimal strings are decoded, then handled like literal strings.
                $decoded = ElementHexa::decode($value);

                return $this->parseHeaderElement('(', $decoded, $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $items = [];

                // Anything but an array results in an empty ElementArray.
                foreach (\is_array($value) ? $value : [] as $entry) {
                    $items[] = $this->parseHeaderElement($entry[0], $entry[1], $document);
                }

                return new ElementArray($items, $document);

            case 'endstream':
            case 'obj': // I don't know what it means but got my project fixed.
            case '':
                // Nothing to do with.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
331
}
332