Passed
Push — master ( 416ff0...c2c117 )
by Konrad
04:12 queued 02:09
created

RawDataParser::getObjectHeaderPattern()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 4
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
use Smalot\PdfParser\Config;
45
46
class RawDataParser
47
{
48
    /**
49
     * @var \Smalot\PdfParser\Config
50
     */
51
    private $config;
52
53
    /**
54
     * Configuration array.
55
     */
56
    protected $cfg = [
57
        // if `true` ignore filter decoding errors
58
        'ignore_filter_decoding_errors' => true,
59
        // if `true` ignore missing filter decoding errors
60
        'ignore_missing_filter_decoders' => true,
61
    ];
62
63
    protected $filterHelper;
64
    protected $objects;
65
66
    /**
     * Create a parser, merging the given options over the defaults.
     *
     * @param array       $cfg    Option overrides merged on top of the defaults in $this->cfg, default is []
     * @param Config|null $config Parser configuration; a new Config instance is created when none is given
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        // merge given array with default values (keys supplied by the caller win)
        $this->cfg = array_merge($this->cfg, $cfg);

        $this->filterHelper = new FilterHelper();
        // explicit null check: the nullable type is spelled out because implicit
        // nullability through a `= null` default is deprecated in recent PHP versions
        $this->config = $config ?? new Config();
    }
77
78
    /**
     * Decode the specified stream.
     *
     * The stream dictionary is scanned for /Length (used to cut the stream to its
     * declared size) and /Filter (a single name or an array of names). Filters
     * known to the filter helper are applied in order; unknown ones are returned
     * to the caller untouched.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data, handed to getObjectVal() when resolving the /Filter value
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching `ignore_*` option is disabled
     */
    protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream): array
    {
        // nothing to decode
        if ('' === $stream) {
            return ['', []];
        }

        // get stream length and filters
        $slength = \strlen($stream);
        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name objects followed by a value are of interest
            if ('/' !== $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $next = $sdic[$k + 1];

            if ('Length' === $v[1]) {
                if ('numeric' === $next[0]) {
                    // get declared stream length
                    $declength = (int) $next[1];
                    // a negative length is ignored: substr() would otherwise cut from the end
                    if ($declength >= 0 && $declength < $slength) {
                        $stream = substr($stream, 0, $declength);
                        $slength = $declength;
                    }
                }
            } elseif ('Filter' === $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                $objtype = $objval[0] ?? null;
                if ('/' === $objtype) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' === $objtype) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' === $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $availableFilters = $this->filterHelper->getAvailableFilters();
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters, true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with "~" is governed by the missing-decoder option,
                // any other message (including an empty one) by the decoding-error option
                $isMissingDecoder = '' !== $emsg && '~' === $emsg[0];
                $ignored = $isMissingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignored) {
                    // keep the original exception reachable for debugging
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
146
147
    /**
     * Decode the Cross-Reference section (classic table followed by a trailer).
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found after the table
     */
    protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
    {
        // step over the keyword 'xref' (4 characters) and the white space following it
        $tableStart = $startxref + 4;
        $offset = $tableStart + strspn($pdfData, $this->config->getPdfWhitespaces(), $tableStart);

        // walk through cross-reference entries and subsection headers
        $obj_num = 0;
        while (true) {
            $found = preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $entry, \PREG_OFFSET_CAPTURE, $offset);
            if (1 !== $found) {
                break;
            }
            if ($entry[0][1] != $offset) {
                // the next match is not contiguous: we are on another section
                break;
            }
            $offset += \strlen($entry[0][0]);

            switch ($entry[3][0]) {
                case 'n':
                    // in-use entry, indexed as [object number]_[generation number]
                    $index = $obj_num.'_'.(int) ($entry[2][0]);
                    // the first offset seen for an object is kept
                    if (!isset($xref['xref'][$index])) {
                        $xref['xref'][$index] = (int) ($entry[1][0]);
                    }
                    ++$obj_num;
                    break;

                case 'f':
                    // free entry: only advances the object counter
                    ++$obj_num;
                    break;

                default:
                    // subsection header: its first number is the starting object number
                    $obj_num = (int) ($entry[1][0]);
                    break;
            }
        }

        // get trailer data
        if (1 !== preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailerMatch, \PREG_OFFSET_CAPTURE, $offset)) {
            throw new Exception('Unable to find trailer');
        }
        $trailer_data = $trailerMatch[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $m) > 0) {
                $xref['trailer']['size'] = (int) ($m[1]);
            }

            // indirect references, stored as [object number]_[generation number]
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailer_data, $m) > 0) {
                    $xref['trailer'][$key] = (int) ($m[1]).'_'.(int) ($m[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $m) > 0) {
                $xref['trailer']['id'] = [$m[1], $m[2]];
            }
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $m) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) ($m[1]), $xref);
        }

        return $xref;
    }
223
224
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream object at $startxref, collects the relevant dictionary
     * entries (/Type, /Index, /Prev, /W, /DecodeParms and, when no trailer has
     * been collected yet, the trailer keys), reverses the PNG predictor if one
     * is declared, and converts the rows into xref entries.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $index_first = 0;
        $prevxref = null;
        // widths default to 0 so that a dictionary without /W cannot trigger
        // undefined-offset access further down
        $wb = [0, 0, 0];

        $sarr = $xrefcrs[0][1] ?? null;
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            // every entry of interest is a name object followed by its value
            if ('/' != $v[0] || !isset($sarr[$k + 1])) {
                continue;
            }
            $next = $sarr[$k + 1];

            switch ($v[1]) {
                case 'Type':
                    if ('/' == $next[0] && 'XRef' == $next[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // first object number in the subsection
                    $index_first = (int) ($next[1][0][1] ?? 0);
                    break;

                case 'Prev':
                    if ('numeric' == $next[0]) {
                        // get previous xref offset
                        $prevxref = (int) $next[1];
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of the corresponding field
                    for ($c = 0; $c < 3; ++$c) {
                        $wb[$c] = (int) ($next[1][$c][1] ?? 0);
                    }
                    break;

                case 'DecodeParms':
                    if (!isset($next[1]) || !\is_array($next[1])) {
                        break;
                    }
                    $decpar = $next[1];
                    foreach ($decpar as $kdc => $vdc) {
                        $param = $decpar[$kdc + 1] ?? null;
                        if ('/' != $vdc[0] || null === $param || 'numeric' != $param[0]) {
                            continue;
                        }
                        if ('Columns' == $vdc[1]) {
                            $columns = (int) $param[1];
                        } elseif ('Predictor' == $vdc[1]) {
                            $predictor = (int) $param[1];
                        }
                    }
                    break;

                default:
                    if (!$filltrailer) {
                        break;
                    }
                    if ('Size' == $v[1] && 'numeric' == $next[0]) {
                        $xref['trailer']['size'] = $next[1];
                    } elseif (\in_array($v[1], ['Root', 'Info', 'Encrypt'], true) && 'objref' == $next[0]) {
                        $xref['trailer'][strtolower($v[1])] = $next[1];
                    } elseif ('ID' == $v[1]) {
                        $xref['trailer']['id'] = [
                            $next[1][0][1] ?? null,
                            $next[1][1][1] ?? null,
                        ];
                    }
                    break;
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (!\is_array($bytes)) {
                $bytes = [];
            }

            if (null !== $predictor) {
                // number of bytes in a row: one predictor tag byte plus the data columns
                $rowlen = $columns + 1;
                // array_chunk() rejects a size below 1, so a bogus /Columns yields no rows
                $rows = $rowlen > 0 ? array_chunk($bytes, $rowlen) : [];

                // initialize decoded array
                $ddata = [];
                // initialize first row with zeros
                $prev_row = $rowlen > 0 ? array_fill(0, $rowlen, 0) : [];
                // for each row apply PNG unpredictor
                foreach ($rows as $k => $row) {
                    // initialize new row
                    $ddata[$k] = [];
                    // get PNG predictor value from the row's tag byte
                    $rowPredictor = 10 + $row[0];
                    // for each byte on the row
                    for ($i = 1; $i <= $columns; ++$i) {
                        // new index
                        $j = $i - 1;
                        // a truncated last row is read as if padded with zeros
                        $raw = $row[$i] ?? 0;
                        $row_up = $prev_row[$j];
                        if (1 == $i) {
                            $row_left = 0;
                            $row_upleft = 0;
                        } else {
                            // PNG filters are defined on the already reconstructed
                            // left neighbour, not on the raw (still filtered) byte
                            $row_left = $ddata[$k][$j - 1];
                            $row_upleft = $prev_row[$j - 1];
                        }

                        switch ($rowPredictor) {
                            case 10:  // PNG prediction (on encoding, PNG None on all rows)
                                $ddata[$k][$j] = $raw;
                                break;

                            case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                                $ddata[$k][$j] = (($raw + $row_left) & 0xff);
                                break;

                            case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                                $ddata[$k][$j] = (($raw + $row_up) & 0xff);
                                break;

                            case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                                // integer division: a float operand to `&` is an implicit lossy conversion
                                $ddata[$k][$j] = (($raw + intdiv($row_left + $row_up, 2)) & 0xff);
                                break;

                            case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                                // initial estimate
                                $p = $row_left + $row_up - $row_upleft;
                                // distances
                                $pa = abs($p - $row_left);
                                $pb = abs($p - $row_up);
                                $pc = abs($p - $row_upleft);
                                // nearest neighbour wins; ties resolve left, then up, then upper-left
                                if ($pa <= $pb && $pa <= $pc) {
                                    $nearest = $row_left;
                                } elseif ($pb <= $pc) {
                                    $nearest = $row_up;
                                } else {
                                    $nearest = $row_upleft;
                                }
                                $ddata[$k][$j] = (($raw + $nearest) & 0xff);
                                break;

                            default:  // PNG prediction (on encoding, PNG optimum)
                                throw new Exception('Unknown PNG predictor: '.$rowPredictor);
                        }
                    }
                    $prev_row = $ddata[$k];
                } // end for each row
            } else {
                // number of bytes in a row
                $rowlen = (int) array_sum($wb);
                // split the rows; array_chunk() rejects a size below 1
                $ddata = $rowlen > 0 ? array_chunk($bytes, $rowlen) : [];
            }

            $sdata = [];
            // for every row
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column (most significant byte first)
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = $index_first;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
475
476 33
    /**
     * Build the regular expression that matches an indirect object header,
     * i.e. "[object number] [generation number] obj".
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return string Pattern including its `/` delimiters
     */
    protected function getObjectHeaderPattern(array $objRefs): string
    {
        // the tokens are separated by the configured white space expression
        // (consider all whitespace character, PDF specifications)
        $whitespace = $this->config->getPdfWhitespacesRegex();

        return sprintf('/%s%s%s%sobj/', $objRefs[0], $whitespace, $objRefs[1], $whitespace);
    }
481
482 33
    /**
     * Length of an indirect object header such as "4 0 obj", counting one
     * character for each of the two separators.
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return int Number of characters in the header
     */
    protected function getObjectHeaderLen(array $objRefs): int
    {
        // 2 whitespaces + strlen("obj") = 5
        $fixedPart = 2 + \strlen('obj');

        return \strlen($objRefs[0]) + \strlen($objRefs[1]) + $fixedPart;
    }
488
489
    /**
     * Get content of indirect object.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Cross-reference data, forwarded to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data; ['null', 'null', offset] when no matching header is found at the offset
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array
    {
        // split "[object number]_[generation number]" into its two parts
        $objRefArr = explode('_', $objRef);
        if (2 !== \count($objRefArr)) {
            throw new Exception('Invalid object reference for $obj.');
        }

        // move to the expected header position: ignore whitespace characters at
        // offset, then leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        // check if we are in position
        $objHeaderLen = $this->getObjectHeaderLen($objRefArr);
        $headerCandidate = substr($pdfData, $offset, $objHeaderLen);
        if (0 == preg_match($this->getObjectHeaderPattern($objRefArr), $headerCandidate)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // starting position of object content
        $offset += $objHeaderLen;

        // read elements until 'endobj' is reached or the parser stops advancing
        $objContentArr = [];
        while (true) {
            $oldOffset = $offset;
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // a stream directly preceded by a dictionary is decoded using that dictionary
            $previous = $objContentArr[\count($objContentArr) - 1] ?? null;
            if ($decoding && ('stream' === $element[0]) && ('<<' === ($previous[0] ?? null))) {
                $element[3] = $this->decodeStream($pdfData, $xref, $previous[1], $element[1]);
            }
            $objContentArr[] = $element;

            if (('endobj' === $element[0]) || ($offset === $oldOffset)) {
                break;
            }
        }

        // remove closing delimiter
        array_pop($objContentArr);

        // return raw object content
        return $objContentArr;
    }
553
554
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Resolved objects are remembered in $this->objects and returned from there
     * on later calls. Anything that is not an 'objref', or whose reference is
     * unknown, is returned unchanged.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Lookup from object reference to object offset
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws Exception
     */
    protected function getObjectVal(string $pdfData, $xref, array $obj): array
    {
        if ('objref' != $obj[0]) {
            // plain value, nothing to resolve
            return $obj;
        }

        $reference = $obj[1];
        if (isset($this->objects[$reference])) {
            // this object has been already parsed
            return $this->objects[$reference];
        }

        // NOTE(review): offsets are looked up directly under $xref[<reference>], while the
        // xref decoders of this class store them under $xref['xref'] - confirm which shape callers pass
        if (!isset($xref[$reference])) {
            return $obj;
        }

        // parse new object (streams are left undecoded)
        $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);

        return $this->objects[$reference];
    }
581
582
    /**
583
     * Get object type, raw value and offset to next object
584
     *
585
     * @param int $offset Object offset
586
     *
587
     * @return array containing object type, raw value and offset to next object
588
     */
589 34
    protected function getRawObject(string $pdfData, int $offset = 0): array
590
    {
591 34
        $objtype = ''; // object type to be returned
592 34
        $objval = ''; // object value to be returned
593
594
        // skip initial white space chars
595 34
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
596
597
        // get first char
598 34
        $char = $pdfData[$offset];
599
        // get object type
600 34
        switch ($char) {
601 34
            case '%':  // \x25 PERCENT SIGN
602
                    // skip comment and search for next token
603
                    $next = strcspn($pdfData, "\r\n", $offset);
604
                    if ($next > 0) {
605
                        $offset += $next;
606
607
                        return $this->getRawObject($pdfData, $offset);
608
                    }
609
                    break;
610
611 34
            case '/':  // \x2F SOLIDUS
612
                    // name object
613 34
                    $objtype = $char;
614 34
                    ++$offset;
615 34
                    $pregResult = preg_match(
616 34
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
617 34
                        substr($pdfData, $offset, 256),
618
                        $matches
619
                    );
620 34
                    if (1 == $pregResult) {
621 34
                        $objval = $matches[1]; // unescaped value
622 34
                        $offset += \strlen($objval);
623
                    }
624 34
                    break;
625
626 34
            case '(':   // \x28 LEFT PARENTHESIS
627 34
            case ')':  // \x29 RIGHT PARENTHESIS
628
                    // literal string object
629 31
                    $objtype = $char;
630 31
                    ++$offset;
631 31
                    $strpos = $offset;
632 31
                    if ('(' == $char) {
633 31
                        $open_bracket = 1;
634 31
                        while ($open_bracket > 0) {
635 31
                            if (!isset($pdfData[$strpos])) {
636
                                break;
637
                            }
638 31
                            $ch = $pdfData[$strpos];
639 31
                            switch ($ch) {
640 31
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
641
                                        // skip next character
642 16
                                        ++$strpos;
643 16
                                        break;
644
645 31
                                case '(':  // LEFT PARENHESIS (28h)
646
                                        ++$open_bracket;
647
                                        break;
648
649 31
                                case ')':  // RIGHT PARENTHESIS (29h)
650 31
                                        --$open_bracket;
651 31
                                        break;
652
                            }
653 31
                            ++$strpos;
654
                        }
655 31
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
656 31
                        $offset = $strpos;
657
                    }
658 31
                    break;
659
660 34
            case '[':   // \x5B LEFT SQUARE BRACKET
661 34
            case ']':  // \x5D RIGHT SQUARE BRACKET
662
                // array object
663 33
                $objtype = $char;
664 33
                ++$offset;
665 33
                if ('[' == $char) {
666
                    // get array content
667 33
                    $objval = [];
668
                    do {
669 33
                        $oldOffset = $offset;
670
                        // get element
671 33
                        $element = $this->getRawObject($pdfData, $offset);
672 33
                        $offset = $element[2];
673 33
                        $objval[] = $element;
674 33
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
675
                    // remove closing delimiter
676 33
                    array_pop($objval);
677
                }
678 33
                break;
679
680 34
            case '<':  // \x3C LESS-THAN SIGN
681 34
            case '>':  // \x3E GREATER-THAN SIGN
682 34
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
683
                    // dictionary object
684 34
                    $objtype = $char.$char;
685 34
                    $offset += 2;
686 34
                    if ('<' == $char) {
687
                        // get array content
688 34
                        $objval = [];
689
                        do {
690 34
                            $oldOffset = $offset;
691
                            // get element
692 34
                            $element = $this->getRawObject($pdfData, $offset);
693 34
                            $offset = $element[2];
694 34
                            $objval[] = $element;
695 34
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
696
                        // remove closing delimiter
697 34
                        array_pop($objval);
698
                    }
699
                } else {
700
                    // hexadecimal string object
701 12
                    $objtype = $char;
702 12
                    ++$offset;
703 12
                    $pregResult = preg_match(
704 12
                        '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
705 12
                        substr($pdfData, $offset),
706
                        $matches
707
                    );
708 12
                    if (('<' == $char) && 1 == $pregResult) {
709
                        // remove white space characters
710 12
                        $objval = strtr($matches[1], $this->config->getPdfWhitespaces(), '');
711 12
                        $offset += \strlen($matches[0]);
712
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
713
                        $offset = $endpos + 1;
714
                    }
715
                }
716 34
                break;
717
718
            default:
719 34
                if ('endobj' == substr($pdfData, $offset, 6)) {
720
                    // indirect object
721 33
                    $objtype = 'endobj';
722 33
                    $offset += 6;
723 34
                } elseif ('null' == substr($pdfData, $offset, 4)) {
724
                    // null object
725 3
                    $objtype = 'null';
726 3
                    $offset += 4;
727 3
                    $objval = 'null';
728 34
                } elseif ('true' == substr($pdfData, $offset, 4)) {
729
                    // boolean true object
730 11
                    $objtype = 'boolean';
731 11
                    $offset += 4;
732 11
                    $objval = 'true';
733 34
                } elseif ('false' == substr($pdfData, $offset, 5)) {
734
                    // boolean false object
735 2
                    $objtype = 'boolean';
736 2
                    $offset += 5;
737 2
                    $objval = 'false';
738 34
                } elseif ('stream' == substr($pdfData, $offset, 6)) {
739
                    // start stream object
740 33
                    $objtype = 'stream';
741 33
                    $offset += 6;
742 33
                    if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
743 33
                        $offset += \strlen($matches[0]);
744 33
                        $pregResult = preg_match(
745 33
                            '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
746 33
                            substr($pdfData, $offset),
747
                            $matches,
748 33
                            \PREG_OFFSET_CAPTURE
749
                        );
750 33
                        if (1 == $pregResult) {
751 33
                            $objval = substr($pdfData, $offset, $matches[0][1]);
752 33
                            $offset += $matches[1][1];
753
                        }
754
                    }
755 34
                } elseif ('endstream' == substr($pdfData, $offset, 9)) {
756
                    // end stream object
757 33
                    $objtype = 'endstream';
758 33
                    $offset += 9;
759 34
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
760
                    // indirect object reference
761 33
                    $objtype = 'objref';
762 33
                    $offset += \strlen($matches[0]);
763 33
                    $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
764 34
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
765
                    // object start
766 5
                    $objtype = 'obj';
767 5
                    $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
768 5
                    $offset += \strlen($matches[0]);
769 34
                } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
770
                    // numeric object
771 33
                    $objtype = 'numeric';
772 33
                    $objval = substr($pdfData, $offset, $numlen);
773 33
                    $offset += $numlen;
774
                }
775 34
                break;
776
        }
777
778 34
        return [$objtype, $objval, $offset];
779
    }
780
781
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the xref section is resolved in this order:
     *  - $offset is 0: the value of the last "startxref ... %%EOF" marker in the data is used;
     *  - $offset points exactly at the "xref" keyword: $offset is used;
     *  - an object header ("N G obj") is found when searching from $offset: $offset is
     *    used and treated as the start of a cross-reference stream object;
     *  - otherwise: the value of the first "startxref ... %%EOF" marker found when
     *    searching from $offset is used.
     *
     * @param string $pdfData PDF data
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';
        $pdfLength = \strlen($pdfData);

        // An offset past the end of the data can never be resolved. Bail out with
        // the documented exception instead of letting strpos() raise a ValueError
        // (PHP >= 8.0) for an offset that is not contained in the string.
        if ($offset > $pdfLength) {
            throw new Exception('Unable to find startxref');
        }

        if (0 === $offset) {
            // find last startxref
            $found = preg_match_all($startxrefPattern, $pdfData, $allStartxref, \PREG_SET_ORDER);
            // preg_match_all() yields a non-integer on failure; treat that like "no match"
            if (!\is_int($found) || $found < 1) {
                throw new Exception('Unable to find startxref');
            }
            $lastStartxref = end($allStartxref);
            $startxref = (int) $lastStartxref[1];
        } elseif (strpos($pdfData, 'xref', $offset) === $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objHeader, 0, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $nextStartxref, 0, $offset)) {
            // startxref found; it is captured into its own variable so that the
            // object-header probe above cannot overwrite the result
            $startxref = (int) $nextStartxref[1];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > $pdfLength) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position; the strict comparison keeps a "not found" result
        // (false) from being mistaken for position 0
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
846
847
    /**
     * Parses raw PDF data into its cross-reference data and indirect objects.
     *
     * Everything in front of the first "%PDF-" marker is discarded before parsing.
     *
     * @param string $data PDF data to parse
     *
     * @return array two-element array: the xref/trailer data followed by the
     *               parsed document objects, keyed like the entries of the xref table
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData(string $data): array
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the PDF header; anything before it is not part of the document
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }
        $pdfData = substr($data, $headerPos);

        // xref and trailer data drive the object lookup below
        $xref = $this->getXrefData($pdfData);

        $objects = [];
        foreach ($xref['xref'] as $objKey => $objOffset) {
            // only entries with a positive offset that were not decoded yet are parsed
            if (isset($objects[$objKey]) || !($objOffset > 0)) {
                continue;
            }

            $objects[$objKey] = $this->getIndirectObject($pdfData, $xref, $objKey, $objOffset, true);
        }

        return [$xref, $objects];
    }
884
}
885