Passed
Push — master ( 416ff0...c2c117 )
by Konrad
04:12 queued 02:09
created

RawDataParser::decodeStream()   D

Complexity

Conditions 21
Paths 51

Size

Total Lines 56
Code Lines 32

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 26
CRAP Score 22.8507

Importance

Changes 2
Bugs 1 Features 1
Metric Value
cc 21
eloc 32
c 2
b 1
f 1
nc 51
nop 4
dl 0
loc 56
ccs 26
cts 31
cp 0.8387
crap 22.8507
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method.

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
use Smalot\PdfParser\Config;
45
46
class RawDataParser
47
{
48
    /**
49
     * @var \Smalot\PdfParser\Config
50
     */
51
    private $config;
52
53
    /**
54
     * Configuration array.
55
     */
56
    protected $cfg = [
57
        // if `true` ignore filter decoding errors
58
        'ignore_filter_decoding_errors' => true,
59
        // if `true` ignore missing filter decoding errors
60
        'ignore_missing_filter_decoders' => true,
61
    ];
62
63
    protected $filterHelper;
64
    protected $objects;
65
66
    /**
     * Set up the parser: overlay the given options on the defaults and create the collaborators.
     *
     * @param array       $cfg    Configuration array, default is []; its entries override those in $this->cfg
     * @param Config|null $config Configuration object; a new Config is created when none is given
     */
    public function __construct($cfg = [], Config $config = null)
    {
        // use the supplied Config instance, otherwise fall back to a fresh one
        $this->config = null === $config ? new Config() : $config;
        $this->filterHelper = new FilterHelper();

        // merge given array with default values (given values win)
        $this->cfg = array_merge($this->cfg, $cfg);
    }
77
78
    /**
     * Decode the specified stream.
     *
     * Reads the /Length and /Filter entries of the stream dictionary, cuts the stream down to the
     * declared length (when that is shorter than the raw data) and then applies, in order, every
     * filter that the filter helper reports as available.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data, handed to getObjectVal() when resolving the /Filter value
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching "ignore_..." option in $this->cfg is disabled;
     *                   the original exception is attached as "previous"
     */
    protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream): array
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only a name object that is followed by a value is of interest here
            if ('/' !== $v[0] || !isset($sdic[($k + 1)])) {
                continue;
            }
            $next = $sdic[($k + 1)];

            if ('Length' === $v[1]) {
                if ('numeric' === $next[0]) {
                    // get declared stream length; a negative value is ignored because
                    // substr() would otherwise trim bytes from the end of the stream
                    $declength = (int) ($next[1]);
                    if ($declength >= 0 && $declength < $slength) {
                        $stream = substr($stream, 0, $declength);
                        $slength = $declength;
                    }
                }
            } elseif ('Filter' === $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                if ('/' === $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' === $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' === $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $availableFilters = $this->filterHelper->getAvailableFilters();
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters, true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with '~' is governed by 'ignore_missing_filter_decoders',
                // every other message (including an empty one) by 'ignore_filter_decoding_errors'
                $startsWithTilde = '' !== $emsg && '~' === $emsg[0];
                $ignored = $startsWithTilde
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignored) {
                    // keep the original exception reachable via getPrevious()
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
146
147
    /**
     * Decode the Cross-Reference section
     *
     * Walks the entries that directly follow the 'xref' keyword, then reads the trailer dictionary.
     * Offsets and trailer values that are already present in $xref are never overwritten. When the
     * trailer contains a /Prev entry the result of getXrefData() for that offset is returned.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer is found after the entries
     */
    protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
    {
        // step over the word 'xref' (4 bytes) and the white space chars that follow it
        $startxref += 4;
        $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref);

        // search for cross-reference entries or subsection
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';
        $obj_num = 0;
        while (preg_match($entryPattern, $pdfData, $entry, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($entry[0][1] != $offset) {
                // the match is not contiguous: we are on another section
                break;
            }
            $offset += \strlen($entry[0][0]);

            $flag = $entry[3][0];
            if ('n' != $flag && 'f' != $flag) {
                // no in-use/free flag: the first number is the object number (index) to continue from
                $obj_num = (int) ($entry[1][0]);
                continue;
            }

            if ('n' == $flag) {
                // unique object index: [object number]_[generation number]
                $index = $obj_num.'_'.(int) ($entry[2][0]);
                // store object offset position unless the object already exists
                if (!isset($xref['xref'][$index])) {
                    $xref['xref'][$index] = (int) ($entry[1][0]);
                }
            }
            ++$obj_num;
        }

        // get trailer data
        $trailerFound = preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailer, \PREG_OFFSET_CAPTURE, $offset) > 0;
        if (!$trailerFound) {
            throw new Exception('Unable to find trailer');
        }
        $trailer_data = $trailer[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = [];

            // parse trailer_data
            if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $found) > 0) {
                $xref['trailer']['size'] = (int) ($found[1]);
            }

            // entries holding an indirect reference are stored as [object number]_[generation number]
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailer_data, $found) > 0) {
                    $xref['trailer'][$key] = (int) ($found[1]).'_'.(int) ($found[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $found) > 0) {
                $xref['trailer']['id'] = [$found[1], $found[2]];
            }
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $found) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) ($found[1]), $xref);
        }

        return $xref;
    }
223
224
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream object at $startxref, collects /Type, /Index, /Prev, /W and /DecodeParms from
     * its dictionary (plus the trailer keys when no trailer has been stored yet), splits the decoded
     * stream into rows of three big-endian fields and records them in $xref['xref'].
     *
     * Notes on the handling of the stream data:
     *  - every [first object number, entry count] pair of /Index is honoured, not only the first one;
     *  - /Predictor 1 means "no prediction", so such streams are read as plain rows;
     *  - decoding is skipped when /W is missing or describes a row of zero bytes.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version: an already filled trailer is left untouched
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $prevxref = null;
        $subsections = []; // list of [first object number, number of entries] taken from /Index
        $wb = []; // byte width of the three fields of a row, taken from /W

        $sarr = $xrefcrs[0][1] ?? null;
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            // every entry of interest is a name object followed by its value
            if ('/' != $v[0] || !isset($sarr[($k + 1)])) {
                continue;
            }
            $next = $sarr[($k + 1)];

            switch ($v[1]) {
                case 'Type':
                    if ('/' == $next[0] && 'XRef' == $next[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // pairs of: first object number in the subsection, number of entries
                    $subsections = [];
                    $indexValues = \is_array($next[1]) ? array_values($next[1]) : [];
                    for ($p = 0, $total = \count($indexValues); $p < $total; $p += 2) {
                        $subsections[] = [
                            (int) ($indexValues[$p][1] ?? 0),
                            (int) ($indexValues[($p + 1)][1] ?? 0),
                        ];
                    }
                    break;

                case 'Prev':
                    if ('numeric' == $next[0]) {
                        // get previous xref offset
                        $prevxref = (int) ($next[1]);
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of the corresponding field
                    $wb = [
                        (int) ($next[1][0][1] ?? 0),
                        (int) ($next[1][1][1] ?? 0),
                        (int) ($next[1][2][1] ?? 0),
                    ];
                    break;

                case 'DecodeParms':
                    $decpar = (isset($next[1]) && \is_array($next[1])) ? $next[1] : [];
                    foreach ($decpar as $kdc => $vdc) {
                        if ('/' != $vdc[0] || !isset($decpar[($kdc + 1)]) || 'numeric' != $decpar[($kdc + 1)][0]) {
                            continue;
                        }
                        if ('Columns' == $vdc[1]) {
                            $columns = (int) ($decpar[($kdc + 1)][1]);
                        } elseif ('Predictor' == $vdc[1]) {
                            $predictor = (int) ($decpar[($kdc + 1)][1]);
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && 'numeric' == $next[0]) {
                        $xref['trailer']['size'] = $next[1];
                    }
                    break;

                case 'Root':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['root'] = $next[1];
                    }
                    break;

                case 'Info':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['info'] = $next[1];
                    }
                    break;

                case 'Encrypt':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['encrypt'] = $next[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [];
                        $xref['trailer']['id'][0] = $next[1][0][1];
                        $xref['trailer']['id'][1] = $next[1][1][1];
                    }
                    break;
            }
        }

        // decode data; without usable field widths the rows cannot be split, so nothing is decoded
        $rowWidth = (int) array_sum($wb);
        if ($valid_crs && isset($xrefcrs[1][3][0]) && $rowWidth > 0) {
            if (null !== $predictor && 1 !== $predictor) {
                // rows carry a leading predictor tag byte and have to be reconstructed first
                $ddata = $this->reversePngPrediction($xrefcrs[1][3][0], $columns);
            } else {
                // convert the stream into an array of integers and split the rows
                $ddata = array_chunk(unpack('C*', $xrefcrs[1][3][0]), $rowWidth);
            }

            $sdata = [];

            // for every row
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column (most significant byte first)
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = 0;
            $section = 0; // next /Index subsection to enter
            $leftInSection = 0; // entries still expected in the current subsection
            foreach ($sdata as $row) {
                // move on to the next declared subsection once the current one is used up;
                // when no (further) subsection is declared the numbering simply continues
                while (0 === $leftInSection && isset($subsections[$section])) {
                    $obj_num = $subsections[$section][0];
                    $leftInSection = max(0, $subsections[$section][1]);
                    ++$section;
                }

                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }

                ++$obj_num;
                if ($leftInSection > 0) {
                    --$leftInSection;
                }
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Undo the PNG row prediction of a cross-reference stream.
     *
     * Every row is made of one predictor tag byte followed by $columns data bytes. "Left" always
     * refers to the already reconstructed byte of the same row, "up" to the reconstructed previous row.
     *
     * @param string $stream  Stream bytes after the stream filters have been applied
     * @param int    $columns Number of data bytes per row
     *
     * @return array list of rows, each a list of reconstructed byte values
     *
     * @throws Exception if unknown PNG predictor detected
     */
    private function reversePngPrediction(string $stream, int $columns): array
    {
        // number of bytes in a row (tag byte + data bytes)
        $rowlen = ($columns + 1);
        // convert the stream into an array of integers and split the rows
        $rows = array_chunk(unpack('C*', $stream), $rowlen);

        $ddata = [];
        // the row above the first one counts as all zeros
        $prev_row = array_fill(0, $rowlen, 0);

        foreach ($rows as $k => $row) {
            $ddata[$k] = [];
            // get PNG predictor value
            $predictor = (10 + $row[0]);

            // for each byte on the row
            for ($i = 1; $i <= $columns; ++$i) {
                $j = ($i - 1); // index in the reconstructed row
                $row_up = $prev_row[$j];
                if (1 == $i) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    // the reconstructed neighbour, not the still encoded byte
                    $row_left = $ddata[$k][($j - 1)];
                    $row_upleft = $prev_row[($j - 1)];
                }

                switch ($predictor) {
                    case 10:  // PNG prediction (on encoding, PNG None on all rows)
                        $ddata[$k][$j] = $row[$i];
                        break;

                    case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                        $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff);
                        break;

                    case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                        $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff);
                        break;

                    case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                        // integer division keeps the value an int for the bitwise AND
                        $ddata[$k][$j] = (($row[$i] + intdiv($row_left + $row_up, 2)) & 0xff);
                        break;

                    case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                        // initial estimate
                        $p = ($row_left + $row_up - $row_upleft);
                        // distances
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        // pick the neighbour with the minimum distance (ties: left, up, upper left)
                        if ($pa <= $pb && $pa <= $pc) {
                            $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff);
                        } elseif ($pb <= $pc) {
                            $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff);
                        } else {
                            $ddata[$k][$j] = (($row[$i] + $row_upleft) & 0xff);
                        }
                        break;

                    default:  // PNG prediction (on encoding, PNG optimum)
                        throw new Exception('Unknown PNG predictor: '.$predictor);
                }
            }
            $prev_row = $ddata[$k];
        } // end for each row

        return $ddata;
    }
475
476 33
    /**
     * Build the regular expression for an object header "[object number] [generation number] obj".
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     */
    protected function getObjectHeaderPattern(array $objRefs): string
    {
        // the separator between the tokens is the whitespace pattern supplied by the Config
        $pieces = [
            '/',
            $objRefs[0],
            $this->config->getPdfWhitespacesRegex(),
            $objRefs[1],
            $this->config->getPdfWhitespacesRegex(),
            'obj',
            '/',
        ];

        return implode('', $pieces);
    }
481
482 33
    /**
     * Byte length of an object header such as "4 0 obj", counting one separator byte between tokens.
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     */
    protected function getObjectHeaderLen(array $objRefs): int
    {
        // "4 0 obj": 2 whitespaces + strlen("obj") = 5
        $fixedPart = 2 + \strlen('obj');

        return $fixedPart + \strlen($objRefs[0]) + \strlen($objRefs[1]);
    }
488
489
    /**
     * Get content of indirect object.
     *
     * Verifies that the header "[object number] [generation number] obj" sits at $offset (after white
     * space and leading zeros), then collects raw elements until 'endobj' is read or no progress is made.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Cross-reference data, passed on to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data; ['null', 'null', offset] when the header is not found
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array
    {
        // $objRef has the form "[object number]_[generation number]"
        $objRefArr = explode('_', $objRef);
        if (2 !== \count($objRefArr)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $objHeaderLen = $this->getObjectHeaderLen($objRefArr);

        // check if we are in position: ignore whitespace characters at offset
        // and leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);
        $headerCandidate = substr($pdfData, $offset, $objHeaderLen);
        if (0 == preg_match($this->getObjectHeaderPattern($objRefArr), $headerCandidate)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // get content, starting right behind the header
        $offset += $objHeaderLen;
        $objContentArr = [];
        do {
            $oldOffset = $offset;

            // get element
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // decode stream using stream's dictionary information (the element read just before it)
            if ($decoding && 'stream' === $element[0]) {
                $previous = $objContentArr[(\count($objContentArr) - 1)] ?? null;
                if (isset($previous[0]) && '<<' === $previous[0]) {
                    $element[3] = $this->decodeStream($pdfData, $xref, $previous[1], $element[1]);
                }
            }

            $objContentArr[] = $element;
        } while (('endobj' !== $element[0]) && ($offset !== $oldOffset));

        // remove closing delimiter
        array_pop($objContentArr);

        // return raw object content
        return $objContentArr;
    }
553
554
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Anything that is not an 'objref', or whose reference can neither be found in the cache
     * ($this->objects) nor in $xref, is returned unchanged.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Lookup of object offsets, keyed by object reference
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws Exception
     */
    protected function getObjectVal(string $pdfData, $xref, array $obj): array
    {
        if ('objref' != $obj[0]) {
            // direct value: nothing to resolve
            return $obj;
        }

        $reference = $obj[1];

        if (isset($this->objects[$reference])) {
            // this object has been already parsed
            return $this->objects[$reference];
        }

        // NOTE(review): decodeXref() stores offsets under $xref['xref'][...], while this lookup reads
        // $xref[...] directly - confirm which shape callers pass in
        if (isset($xref[$reference])) {
            // parse new object (streams are left undecoded) and remember it
            $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);

            return $this->objects[$reference];
        }

        return $obj;
    }
581
582
    /**
583
     * Get object type, raw value and offset to next object
584
     *
585
     * @param int $offset Object offset
586
     *
587
     * @return array containing object type, raw value and offset to next object
588
     */
589 34
    protected function getRawObject(string $pdfData, int $offset = 0): array
590
    {
591 34
        $objtype = ''; // object type to be returned
592 34
        $objval = ''; // object value to be returned
593
594
        // skip initial white space chars
595 34
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
596
597
        // get first char
598 34
        $char = $pdfData[$offset];
599
        // get object type
600 34
        switch ($char) {
601 34
            case '%':  // \x25 PERCENT SIGN
602
                    // skip comment and search for next token
603
                    $next = strcspn($pdfData, "\r\n", $offset);
604
                    if ($next > 0) {
605
                        $offset += $next;
606
607
                        return $this->getRawObject($pdfData, $offset);
608
                    }
609
                    break;
610
611 34
            case '/':  // \x2F SOLIDUS
612
                    // name object
613 34
                    $objtype = $char;
614 34
                    ++$offset;
615 34
                    $pregResult = preg_match(
616 34
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
617 34
                        substr($pdfData, $offset, 256),
618
                        $matches
619
                    );
620 34
                    if (1 == $pregResult) {
621 34
                        $objval = $matches[1]; // unescaped value
622 34
                        $offset += \strlen($objval);
623
                    }
624 34
                    break;
625
626 34
            case '(':   // \x28 LEFT PARENTHESIS
627 34
            case ')':  // \x29 RIGHT PARENTHESIS
628
                    // literal string object
629 31
                    $objtype = $char;
630 31
                    ++$offset;
631 31
                    $strpos = $offset;
632 31
                    if ('(' == $char) {
633 31
                        $open_bracket = 1;
634 31
                        while ($open_bracket > 0) {
635 31
                            if (!isset($pdfData[$strpos])) {
636
                                break;
637
                            }
638 31
                            $ch = $pdfData[$strpos];
639 31
                            switch ($ch) {
640 31
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
641
                                        // skip next character
642 16
                                        ++$strpos;
643 16
                                        break;
644
645 31
                                case '(':  // LEFT PARENHESIS (28h)
646
                                        ++$open_bracket;
647
                                        break;
648
649 31
                                case ')':  // RIGHT PARENTHESIS (29h)
650 31
                                        --$open_bracket;
651 31
                                        break;
652
                            }
653 31
                            ++$strpos;
654
                        }
655 31
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
656 31
                        $offset = $strpos;
657
                    }
658 31
                    break;
659
660 34
            case '[':   // \x5B LEFT SQUARE BRACKET
661 34
            case ']':  // \x5D RIGHT SQUARE BRACKET
662
                // array object
663 33
                $objtype = $char;
664 33
                ++$offset;
665 33
                if ('[' == $char) {
666
                    // get array content
667 33
                    $objval = [];
668
                    do {
669 33
                        $oldOffset = $offset;
670
                        // get element
671 33
                        $element = $this->getRawObject($pdfData, $offset);
672 33
                        $offset = $element[2];
673 33
                        $objval[] = $element;
674 33
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
675
                    // remove closing delimiter
676 33
                    array_pop($objval);
677
                }
678 33
                break;
679
680 34
            case '<':  // \x3C LESS-THAN SIGN
681 34
            case '>':  // \x3E GREATER-THAN SIGN
682 34
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
683
                    // dictionary object
684 34
                    $objtype = $char.$char;
685 34
                    $offset += 2;
686 34
                    if ('<' == $char) {
687
                        // get array content
688 34
                        $objval = [];
689
                        do {
690 34
                            $oldOffset = $offset;
691
                            // get element
692 34
                            $element = $this->getRawObject($pdfData, $offset);
693 34
                            $offset = $element[2];
694 34
                            $objval[] = $element;
695 34
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
696
                        // remove closing delimiter
697 34
                        array_pop($objval);
698
                    }
699
                } else {
700
                    // hexadecimal string object
701 12
                    $objtype = $char;
702 12
                    ++$offset;
703 12
                    $pregResult = preg_match(
704 12
                        '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
705 12
                        substr($pdfData, $offset),
706
                        $matches
707
                    );
708 12
                    if (('<' == $char) && 1 == $pregResult) {
709
                        // remove white space characters
710 12
                        $objval = strtr($matches[1], $this->config->getPdfWhitespaces(), '');
711 12
                        $offset += \strlen($matches[0]);
712
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
713
                        $offset = $endpos + 1;
714
                    }
715
                }
716 34
                break;
717
718
            default:
719 34
                if ('endobj' == substr($pdfData, $offset, 6)) {
720
                    // indirect object
721 33
                    $objtype = 'endobj';
722 33
                    $offset += 6;
723 34
                } elseif ('null' == substr($pdfData, $offset, 4)) {
724
                    // null object
725 3
                    $objtype = 'null';
726 3
                    $offset += 4;
727 3
                    $objval = 'null';
728 34
                } elseif ('true' == substr($pdfData, $offset, 4)) {
729
                    // boolean true object
730 11
                    $objtype = 'boolean';
731 11
                    $offset += 4;
732 11
                    $objval = 'true';
733 34
                } elseif ('false' == substr($pdfData, $offset, 5)) {
734
                    // boolean false object
735 2
                    $objtype = 'boolean';
736 2
                    $offset += 5;
737 2
                    $objval = 'false';
738 34
                } elseif ('stream' == substr($pdfData, $offset, 6)) {
739
                    // start stream object
740 33
                    $objtype = 'stream';
741 33
                    $offset += 6;
742 33
                    if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
743 33
                        $offset += \strlen($matches[0]);
744 33
                        $pregResult = preg_match(
745 33
                            '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
746 33
                            substr($pdfData, $offset),
747
                            $matches,
748 33
                            \PREG_OFFSET_CAPTURE
749
                        );
750 33
                        if (1 == $pregResult) {
751 33
                            $objval = substr($pdfData, $offset, $matches[0][1]);
752 33
                            $offset += $matches[1][1];
753
                        }
754
                    }
755 34
                } elseif ('endstream' == substr($pdfData, $offset, 9)) {
756
                    // end stream object
757 33
                    $objtype = 'endstream';
758 33
                    $offset += 9;
759 34
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
760
                    // indirect object reference
761 33
                    $objtype = 'objref';
762 33
                    $offset += \strlen($matches[0]);
763 33
                    $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
764 34
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
765
                    // object start
766 5
                    $objtype = 'obj';
767 5
                    $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
768 5
                    $offset += \strlen($matches[0]);
769 34
                } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
770
                    // numeric object
771 33
                    $objtype = 'numeric';
772 33
                    $objval = substr($pdfData, $offset, $numlen);
773 33
                    $offset += $numlen;
774
                }
775 34
                break;
776
        }
777
778 34
        return [$objtype, $objval, $offset];
779
    }
780
781
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the cross-reference section is determined as follows:
     *  - $offset is 0: the value of the last "startxref ... %%EOF" marker is used;
     *  - "xref" is located exactly at $offset: $offset is used as is;
     *  - an "N N obj" header is found at or after $offset: $offset is used as is
     *    (cross-reference stream object);
     *  - otherwise the first "startxref ... %%EOF" marker at or after $offset is used.
     *
     * @param string $pdfData PDF data
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 === $offset) {
            // find last startxref
            $pregResult = preg_match_all(
                $startxrefPattern,
                $pdfData,
                $startxrefMatches,
                \PREG_SET_ORDER,
                $offset
            );
            // preg_match_all() yields false on error and 0 when nothing matched
            if (false === $pregResult || 0 === $pregResult) {
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($startxrefMatches);
            // the capture is a numeric string; it is used as a byte offset below
            $startxref = (int) $lastMatch[1];
        } elseif (strpos($pdfData, 'xref', $offset) === $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found; a dedicated match array is used here because
            // preg_match() resets its match array even when it does not match,
            // so the result must not be shared with the "obj" check above
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position; strict comparison so that a "not found" result
        // (false) is never mistaken for position 0
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
846
847
    /**
     * Parses PDF data and returns extracted data as array.
     *
     * Everything in front of the first "%PDF-" marker is dropped before the
     * cross-reference data is read and the referenced objects are decoded.
     *
     * @param string $data PDF data to parse
     *
     * @return array two-element list: the xref data, followed by the array of parsed PDF document objects
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData(string $data): array
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the pdf header
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }

        // PDF content string, starting at the header
        $pdfData = substr($data, $headerPos);

        // xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // decode every object that has a positive offset and was not decoded yet
        $objects = [];
        foreach ($xref['xref'] as $obj => $offset) {
            if (isset($objects[$obj]) || !($offset > 0)) {
                continue;
            }
            $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true);
        }

        return [$xref, $objects];
    }
884
}
885