Test Failed
Pull Request — master (#606)
by
unknown
02:17
created

src/Smalot/PdfParser/Parser.php (1 issue)

Labels
Severity
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementBoolean;
37
use Smalot\PdfParser\Element\ElementDate;
38
use Smalot\PdfParser\Element\ElementHexa;
39
use Smalot\PdfParser\Element\ElementName;
40
use Smalot\PdfParser\Element\ElementNull;
41
use Smalot\PdfParser\Element\ElementNumeric;
42
use Smalot\PdfParser\Element\ElementString;
43
use Smalot\PdfParser\Element\ElementXRef;
44
use Smalot\PdfParser\RawData\RawDataParser;
45
46
/**
 * Class Parser
 *
 * Converts raw PDF bytes into a Document. The low-level structure (xref
 * table and object list) is produced by RawDataParser; this class turns that
 * structure into Header, Element and PDFObject instances.
 */
class Parser
{
    /**
     * @var Config
     */
    private $config;

    /**
     * Objects built during the current parseContent() run, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * @var RawDataParser
     */
    protected $rawDataParser;

    /**
     * @param array       $cfg    passed through unchanged to RawDataParser (defaults to an empty array)
     * @param Config|null $config configuration to use; a fresh Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        $this->config = $config ?? new Config();
        $this->rawDataParser = new RawDataParser($cfg, $this->config);
    }

    /**
     * Returns the configuration this parser was built with.
     */
    public function getConfig(): Config
    {
        return $this->config;
    }

    /**
     * Reads a PDF file from disk and parses its content.
     *
     * The error control operator (@) is deliberately not applied to the read:
     * users repeatedly reported that parseFile() died silently, so PHP's own
     * warning stays visible and a failed read is reported as an exception.
     *
     * @see https://github.com/smalot/pdfparser/issues/204
     *
     * @throws \Exception if the file cannot be read or its content cannot be parsed
     */
    public function parseFile(string $filename): Document
    {
        $content = file_get_contents($filename);

        // file_get_contents() signals failure with false, which must not be
        // handed to parseContent() as if it were (empty) PDF data.
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }

    /**
     * Parses PDF content held in memory.
     *
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // Create structure from raw data.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        if (isset($xref['trailer']['encrypt'])) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Create destination object and start from a clean object list, so a
        // parser instance can be reused for several documents.
        $document = new Document();
        $this->objects = [];

        foreach ($data as $id => $structure) {
            $this->parseObject($id, $structure, $document);
            // Drop the raw structure as soon as it has been converted.
            unset($data[$id]);
        }

        $document->setTrailer($this->parseTrailer($xref['trailer'], $document));
        $document->setObjects($this->objects);

        return $document;
    }

    /**
     * Converts a raw trailer structure into a Header.
     *
     * Each entry is mapped by the shape of its value, tested in this order:
     * numeric values become ElementNumeric, arrays are converted recursively
     * (without a document) and wrapped in an ElementArray, strings containing
     * an underscore become ElementXRef, and anything else is parsed as a
     * literal string element.
     *
     * @param array         $structure raw trailer entries, name => value
     * @param Document|null $document  document the created elements are bound to
     *
     * @return Header
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $trailer = [];

        foreach ($structure as $name => $values) {
            // Entries are stored under their capitalised name.
            $name = ucfirst($name);

            if (is_numeric($values)) {
                $trailer[$name] = new ElementNumeric($values);
            } elseif (\is_array($values)) {
                $value = $this->parseTrailer($values, null);
                $trailer[$name] = new ElementArray($value, null);
            } elseif (false !== strpos($values, '_')) {
                $trailer[$name] = new ElementXRef($values, $document);
            } else {
                $trailer[$name] = $this->parseHeaderElement('(', $values, $document);
            }
        }

        return new Header($trailer, $document);
    }

    /**
     * Converts the raw structure of one object and stores the result in
     * $this->objects under $id.
     *
     * Object streams (Type "ObjStm") are expanded into their contained
     * objects instead of being stored themselves.
     *
     * @param string        $id        object id used as key in $this->objects
     * @param array         $structure raw parts of the object, each expected to be a [type, value, ...] array
     * @param Document|null $document  document the created objects are bound to
     *
     * @return void
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            // Integer parts carry no type/value pair; treat them as empty.
            if (\is_int($part)) {
                $part = [null, null];
            }

            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $sub_type = $sub_element[0];
                        $sub_value = $sub_element[1];
                        $elements[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    // Prefer the entry at [3][0] when present, else the raw stream value.
                    $content = $part[3][0] ?? $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store this content.
                        return;
                    }

                    // $document is nullable, so only extract metadata when
                    // there is a document to receive it.
                    if (null !== $document && $header->get('Type')->equals('Metadata')) {
                        // Attempt to parse XMP XML Metadata
                        $document->extractXMPMetadata($content);
                    }
                    break;

                default:
                    if ('null' !== $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        // Do not overwrite an object that was already registered under this id.
        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Expands an object stream into its contained objects and registers each
     * of them in $this->objects under "<object number>_0".
     *
     * The stream starts with a run of "<object number> <offset>" pairs; the
     * remainder is the concatenated object data the offsets point into.
     *
     * @param string        $content  decoded content of the object stream
     * @param Document|null $document document the created objects are bound to
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split xrefs and contents.
        if (1 !== preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match)) {
            // Nothing usable could be extracted (e.g. PCRE limit reached).
            return;
        }
        $content = $match[3];

        // Extract xrefs.
        $xrefs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );
        $table = [];

        foreach ($xrefs ?: [] as $xref) {
            [$object_number, $offset] = preg_split("/\s+/", trim($xref));
            $table[$offset] = $object_number;
        }

        // Order entries by offset so each object ends where the next begins.
        ksort($table);

        $ids = array_values($table);
        $positions = array_keys($table);

        foreach ($positions as $index => $position) {
            $object_id = $ids[$index].'_0';
            // The last object runs to the end of the content.
            $next_position = $positions[$index + 1] ?? \strlen($content);
            $sub_content = substr($content, $position, (int) $next_position - (int) $position);

            $sub_header = Header::parse($sub_content, $document);
            $this->objects[$object_id] = PDFObject::factory($document, $sub_header, '', $this->config);
        }
    }

    /**
     * Builds a Header from a flat list of alternating name and value parts.
     *
     * @param array         $structure parts at even indexes hold the name in [1];
     *                                 the following part holds [type, value]
     * @param Document|null $document  document the created elements are bound to
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];
        $count = \count($structure);

        for ($position = 0; $position < $count; $position += 2) {
            $name = $structure[$position][1];
            // A trailing name without a value part yields a null type/value
            // instead of reading an undefined offset.
            $type = $structure[$position + 1][0] ?? null;
            $value = $structure[$position + 1][1] ?? null;

            $elements[$name] = $this->parseHeaderElement($type, $value, $document);
        }

        return new Header($elements, $document);
    }

    /**
     * Converts one raw [type, value] pair into the matching element object.
     *
     * @param string|null   $type     raw type marker ('<<', '[', '(', '<', '/', 'numeric', ...)
     * @param string|array  $value
     * @param Document|null $document document the created element is bound to
     *
     * @return Element|Header|null null for markers that carry no element
     *
     * @throws \Exception if the type marker is not recognised
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        // An empty dictionary must still be iterable by parseHeader().
        if (('<<' === $type || '>>' === $type) && empty($value)) {
            $value = [];
        }

        // The switch intentionally keeps loose comparison: a null $type has
        // to match the '' case and return null rather than throw.
        switch ($type) {
            case '<<':
            case '>>':
                $header = $this->parseHeader($value, $document);
                // NOTE(review): the created object is discarded; kept because
                // the factory call may have side effects - confirm.
                PDFObject::factory($document, $header, null, $this->config);

                return $header;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                // Try the more specific date form first, then fall back to a plain string.
                $date = ElementDate::parse('('.$value.')', $document);
                if ($date) {
                    return $date;
                }

                return ElementString::parse('('.$value.')', $document);

            case '<':
                // Hexadecimal strings are decoded, then handled like literal strings.
                return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $values = [];

                if (\is_array($value)) {
                    foreach ($value as $sub_element) {
                        $sub_type = $sub_element[0];
                        $sub_value = $sub_element[1];
                        $values[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }
                }

                return new ElementArray($values, $document);

            case 'endstream':
            case 'obj': // I don't know what it means but got my project fixed.
            case '':
                // Nothing to do with.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
}
334