RawDataParser::decodeXref()   D
last analyzed

Complexity

Conditions 16
Paths 200

Size

Total Lines 67
Code Lines 39

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 36
CRAP Score 16.005

Importance

Changes 3
Bugs 1 Features 1
Metric Value
cc 16
eloc 39
c 3
b 1
f 1
nc 200
nop 4
dl 0
loc 67
ccs 36
cts 37
cp 0.973
crap 16.005
rs 4.7333

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 *
19
 * @date    2020-01-06
20
 *
21
 * @license LGPLv3
22
 *
23
 * @url     <https://github.com/smalot/pdfparser>
24
 *
25
 *  PdfParser is a pdf library written in PHP, extraction oriented.
26
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
27
 *
28
 *  This program is free software: you can redistribute it and/or modify
29
 *  it under the terms of the GNU Lesser General Public License as published by
30
 *  the Free Software Foundation, either version 3 of the License, or
31
 *  (at your option) any later version.
32
 *
33
 *  This program is distributed in the hope that it will be useful,
34
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
35
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
36
 *  GNU Lesser General Public License for more details.
37
 *
38
 *  You should have received a copy of the GNU Lesser General Public License
39
 *  along with this program.
40
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
41
 */
42
43
namespace Smalot\PdfParser\RawData;
44
45
use Smalot\PdfParser\Config;
46
use Smalot\PdfParser\Exception\EmptyPdfException;
47
use Smalot\PdfParser\Exception\MissingPdfHeaderException;
48
49
class RawDataParser
50
{
51
    /**
52
     * @var Config
53
     */
54
    private $config;
55
56
    /**
57
     * Configuration array.
58
     *
59
     * @var array<string,bool>
60
     */
61
    protected $cfg = [
62
        // if `true` ignore filter decoding errors
63
        'ignore_filter_decoding_errors' => true,
64
        // if `true` ignore missing filter decoding errors
65
        'ignore_missing_filter_decoders' => true,
66
    ];
67
68
    protected $filterHelper;
69
    protected $objects;
70
71
    /**
     * Set up the parser: merge the option overrides, create the filter helper and pick the configuration.
     *
     * @param array       $cfg    Option overrides merged on top of the default entries of $this->cfg, default is []
     * @param Config|null $config Configuration object to use; a new Config is created when none is given
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        // caller-supplied keys take precedence over the defaults declared on the property
        $this->cfg = array_merge($this->cfg, $cfg);

        $this->filterHelper = new FilterHelper();

        // fall back to a default configuration when the caller did not provide one
        if (null === $config) {
            $config = new Config();
        }
        $this->config = $config;
    }
82
83
    /**
     * Decode the specified stream.
     *
     * Reads /Length and /Filter from the stream dictionary, shortens the stream to the declared
     * length when that is smaller than the raw data, and then runs the stream through every
     * listed filter that the filter helper reports as available. Filters that are not available
     * are returned untouched so the caller can see what is still pending.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data, handed to getObjectVal() when resolving the /Filter value
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws \Exception if a filter fails and the matching "ignore_*" option is disabled;
     *                    the original exception is attached as the previous exception
     */
    protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream): array
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name entries that are followed by a value are of interest
            if ('/' != $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $value = $sdic[$k + 1];

            if (('Length' == $v[1]) && ('numeric' == $value[0])) {
                // get declared stream length and drop anything beyond it
                $declength = (int) $value[1];
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $value);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters(), true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
            } catch (\Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with '~' is handled as a missing-decoder error, anything else
                // as a decoding error; the emptiness check avoids reading offset 0 of an empty string
                $missingDecoder = ('' !== $emsg) && ('~' == $emsg[0]);
                $ignore = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // keep the original exception reachable through getPrevious()
                    throw new \Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
151
152
    /**
     * Decode the Cross-Reference section
     *
     * Reads the table entries that start right after the 'xref' keyword, then the trailer
     * dictionary that follows them, and finally follows /Prev (if present and non-zero)
     * through getXrefData().
     *
     * @param string     $pdfData        PDF data
     * @param int        $startxref      Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array      $xref           Previous xref array (if any)
     * @param array<int> $visitedOffsets Array of visited offsets to prevent infinite loops
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if no trailer can be found after the table entries
     */
    protected function decodeXref(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
    {
        $startxref += 4; // 4 is the length of the word 'xref'
        // skip initial white space chars
        $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref);

        [$xref, $offset] = $this->collectXrefTableEntries($pdfData, $offset, $xref);

        // get trailer data
        if (!(preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0)) {
            throw new \Exception('Unable to find trailer');
        }
        $trailer_data = $matches[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version: an already filled trailer is never overwritten
            $xref['trailer'] = $this->extractTrailerEntries($trailer_data);
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
            $prevOffset = (int) $matches[1];
            if (0 != $prevOffset) {
                // get previous xref
                $xref = $this->getXrefData($pdfData, $prevOffset, $xref, $visitedOffsets);
            }
        }

        return $xref;
    }

    /**
     * Read consecutive cross-reference subsection headers and entries starting at $offset.
     *
     * @param string $pdfData PDF data
     * @param int    $offset  Position of the first subsection header
     * @param array  $xref    Xref array to extend; existing entries are kept
     *
     * @return array two elements: the extended xref array and the offset right after the last entry read
     */
    private function collectXrefTableEntries(string $pdfData, int $offset, array $xref): array
    {
        // initialize object number
        $obj_num = 0;
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';

        // search for cross-reference entries or subsection
        while (preg_match($entryPattern, $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($matches[0][1] != $offset) {
                // the next match is not adjacent: we are on another section
                break;
            }
            $offset += \strlen($matches[0][0]);

            $entryType = $matches[3][0];
            if ('n' == $entryType) {
                // create unique object index: [object number]_[generation number]
                $index = $obj_num.'_'.(int) $matches[2][0];
                // check if object already exist
                if (!isset($xref['xref'][$index])) {
                    // store object offset position
                    $xref['xref'][$index] = (int) $matches[1][0];
                }
                ++$obj_num;
            } elseif ('f' == $entryType) {
                // free entry: nothing is stored, but it still consumes an object number
                ++$obj_num;
            } else {
                // subsection header: first number is the object number (index) of the next entry
                $obj_num = (int) $matches[1][0];
            }
        }

        return [$xref, $offset];
    }

    /**
     * Extract the Size, Root, Encrypt, Info and ID entries from the raw text of a trailer dictionary.
     *
     * @param string $trailerData Text found between '<<' and '>>' of the trailer
     *
     * @return array trailer entries keyed by 'size', 'root', 'encrypt', 'info' and 'id'; absent entries are omitted
     */
    private function extractTrailerEntries(string $trailerData): array
    {
        $trailer = [];

        if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $matches) > 0) {
            $trailer['size'] = (int) $matches[1];
        }

        // indirect references are stored as "[object number]_[generation number]"
        foreach (['Root' => 'root', 'Encrypt' => 'encrypt', 'Info' => 'info'] as $name => $key) {
            if (preg_match('/'.$name.'[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $matches) > 0) {
                $trailer[$key] = (int) $matches[1].'_'.(int) $matches[2];
            }
        }

        if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $matches) > 0) {
            $trailer['id'] = [$matches[1], $matches[2]];
        }

        return $trailer;
    }
232
233
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream object at $startxref, collects /Type, /Index, /Prev, /W and /DecodeParms
     * (plus the trailer entries when no trailer has been stored yet) from its dictionary, turns the
     * decoded stream into rows of three fields and registers the referenced objects in
     * $xref['xref']. If /Prev is present the previous section is loaded through getXrefData().
     *
     * @param string     $pdfData        PDF data
     * @param int        $startxref      Offset at which the xref section starts
     * @param array      $xref           Previous xref array (if any)
     * @param array<int> $visitedOffsets Array of visited offsets to prevent infinite loops
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version: an already filled trailer is never overwritten
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $prevxref = null;
        // pairs of [first object number, number of objects]; stays empty when /Index is absent
        $index_blocks = [];
        // number of bytes (in the decoded stream) of each of the three fields; zero until /W is read
        $wb = [0, 0, 0];

        $sarr = $xrefcrs[0][1] ?? null;
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            // every entry handled below is a name followed by its value
            if ('/' != $v[0] || !isset($sarr[$k + 1])) {
                continue;
            }
            $value = $sarr[$k + 1];

            switch ($v[1]) {
                case 'Type':
                    if ('/' == $value[0] && 'XRef' == $value[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // list of: first object number in the subsection / number of objects
                    $index_blocks = [];
                    $pairs = \is_array($value[1]) ? $value[1] : [];
                    for ($m = 0, $pairCount = \count($pairs); $m + 1 < $pairCount; $m += 2) {
                        $first = (int) ($pairs[$m][1] ?? 0);
                        $count = (int) ($pairs[$m + 1][1] ?? 0);
                        // a subsection without objects would never be left again while numbering rows
                        if ($count > 0) {
                            $index_blocks[] = [$first, $count];
                        }
                    }
                    break;

                case 'Prev':
                    if ('numeric' == $value[0]) {
                        // get previous xref offset
                        $prevxref = (int) $value[1];
                    }
                    break;

                case 'W':
                    // missing widths are treated as zero
                    $wb[0] = (int) ($value[1][0][1] ?? 0);
                    $wb[1] = (int) ($value[1][1][1] ?? 0);
                    $wb[2] = (int) ($value[1][2][1] ?? 0);
                    break;

                case 'DecodeParms':
                    if (!isset($value[1]) || !\is_array($value[1])) {
                        break;
                    }
                    $decpar = $value[1];
                    foreach ($decpar as $kdc => $vdc) {
                        if ('/' != $vdc[0] || !isset($decpar[$kdc + 1]) || 'numeric' != $decpar[$kdc + 1][0]) {
                            continue;
                        }
                        if ('Columns' == $vdc[1]) {
                            $columns = (int) $decpar[$kdc + 1][1];
                        } elseif ('Predictor' == $vdc[1]) {
                            $predictor = (int) $decpar[$kdc + 1][1];
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && 'numeric' == $value[0]) {
                        $xref['trailer']['size'] = $value[1];
                    }
                    break;

                case 'Root':
                    if ($filltrailer && 'objref' == $value[0]) {
                        $xref['trailer']['root'] = $value[1];
                    }
                    break;

                case 'Info':
                    if ($filltrailer && 'objref' == $value[0]) {
                        $xref['trailer']['info'] = $value[1];
                    }
                    break;

                case 'Encrypt':
                    if ($filltrailer && 'objref' == $value[0]) {
                        $xref['trailer']['encrypt'] = $value[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [
                            $value[1][0][1] ?? null,
                            $value[1][1][1] ?? null,
                        ];
                    }
                    break;
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            if (null !== $predictor) {
                // NOTE(review): any /Predictor value selects the PNG path; the value itself is not inspected
                $ddata = $this->reversePngPrediction($xrefcrs[1][3][0], $columns);
            } else {
                // number of bytes in a row
                $rowlen = (int) array_sum($wb);
                $ddata = [];
                if (0 < $rowlen) {
                    // convert the stream into an array of integers
                    $bytes = unpack('C*', $xrefcrs[1][3][0]);
                    if (false !== $bytes) {
                        // split the rows
                        $ddata = array_chunk($bytes, $rowlen);
                    }
                }
            }

            $sdata = [];

            // for every row
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column: fields are stored most significant byte first
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            // start with the first object number of the first /Index entry, or 0 without /Index
            $obj_num = [] !== $index_blocks ? $index_blocks[0][0] : 0;
            foreach ($sdata as $row) {
                if (1 == $row[0]) {
                    // (n) objects that are in use but are not compressed
                    // create unique object index: [object number]_[generation number]
                    $index = $obj_num.'_'.$row[2];
                    // check if object already exist
                    if (!isset($xref['xref'][$index])) {
                        // store object offset position
                        $xref['xref'][$index] = $row[1];
                    }
                } elseif (2 == $row[0]) {
                    // compressed objects
                    // $row[1] = object number of the object stream in which this object is stored
                    // $row[2] = index of this object within the object stream
                    $index = $row[1].'_0_'.$row[2];
                    $xref['xref'][$index] = -1;
                }
                // type 0 (linked list of free objects) and unknown types store nothing

                ++$obj_num;
                if ([] !== $index_blocks) {
                    // reduce the number of remaining objects
                    --$index_blocks[0][1];
                    if ($index_blocks[0][1] <= 0) {
                        // remove the actual used /Index entry
                        array_shift($index_blocks);
                        if ([] !== $index_blocks) {
                            // load the first object number of the following /Index entry
                            $obj_num = $index_blocks[0][0];
                        }
                    }
                }
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref, $visitedOffsets);
        }

        return $xref;
    }

    /**
     * Undo PNG row prediction on a decoded cross-reference stream.
     *
     * Every row consists of one tag byte followed by $columns data bytes; the tag byte plus 10
     * selects the predictor (10 = None, 11 = Sub, 12 = Up, 13 = Average, 14 = Paeth).
     *
     * @param string $stream  Decoded stream bytes
     * @param int    $columns Number of data bytes per row (the /Columns value)
     *
     * @return array<int, array<int, int>> one list of $columns byte values per row
     *
     * @throws \Exception if unknown PNG predictor detected
     */
    private function reversePngPrediction(string $stream, int $columns): array
    {
        // number of bytes in a row
        $rowlen = $columns + 1;
        // convert the stream into an array of integers
        $bytes = unpack('C*', $stream);
        if (false === $bytes || $rowlen < 1) {
            // nothing usable: array_chunk() cannot work with these values
            return [];
        }

        $ddata = [];
        // initialize first row with zeros
        $prev_row = array_fill(0, $rowlen, 0);

        // for each row apply PNG unpredictor
        foreach (array_chunk($bytes, $rowlen) as $k => $row) {
            $ddata[$k] = [];
            // get PNG predictor value
            $rowPredictor = 10 + $row[0];

            // for each byte on the row
            for ($i = 1; $i <= $columns; ++$i) {
                // new index
                $j = $i - 1;
                // a short final row is padded with zeros
                $current = $row[$i] ?? 0;
                $row_up = $prev_row[$j];
                if (1 == $i) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    $row_left = $row[$i - 1] ?? 0;
                    $row_upleft = $prev_row[$j - 1];
                }

                switch ($rowPredictor) {
                    case 10:  // PNG prediction (on encoding, PNG None on all rows)
                        $ddata[$k][$j] = $current;
                        break;

                    case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                        $ddata[$k][$j] = (($current + $row_left) & 0xFF);
                        break;

                    case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                        $ddata[$k][$j] = (($current + $row_up) & 0xFF);
                        break;

                    case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                        // integer division keeps the operand of the bitwise AND an integer
                        $ddata[$k][$j] = (($current + intdiv($row_left + $row_up, 2)) & 0xFF);
                        break;

                    case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                        // initial estimate
                        $p = ($row_left + $row_up - $row_upleft);
                        // distances
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        $pmin = min($pa, $pb, $pc);
                        // use the neighbour with the minimum distance; ties resolve left, up, upper-left
                        if ($pmin == $pa) {
                            $ddata[$k][$j] = (($current + $row_left) & 0xFF);
                        } elseif ($pmin == $pb) {
                            $ddata[$k][$j] = (($current + $row_up) & 0xFF);
                        } else {
                            $ddata[$k][$j] = (($current + $row_upleft) & 0xFF);
                        }
                        break;

                    default:  // PNG prediction (on encoding, PNG optimum)
                        throw new \Exception('Unknown PNG predictor: '.$rowPredictor);
                }
            }
            $prev_row = $ddata[$k];
        } // end for each row

        return $ddata;
    }
512
513
    /**
     * Build the regular expression that matches an object header such as "4 0 obj".
     *
     * @param array $objRefs Two elements: object number and generation number
     *
     * @return string PCRE pattern, delimited by '/'
     */
    protected function getObjectHeaderPattern(array $objRefs): string
    {
        // consider all whitespace character (PDF specifications)
        $whitespace = $this->config->getPdfWhitespacesRegex();

        // the reference parts come from parsed PDF data: quote them so that an unexpected
        // character cannot break the pattern or be interpreted as regex syntax
        $objectNumber = preg_quote((string) $objRefs[0], '/');
        $generationNumber = preg_quote((string) $objRefs[1], '/');

        return '/'.$objectNumber.$whitespace.$generationNumber.$whitespace.'obj/';
    }
518
519
    /**
     * Compute the length of an object header such as "4 0 obj".
     *
     * @param array $objRefs Two elements: object number and generation number
     *
     * @return int length of both numbers plus two whitespaces plus the word "obj"
     */
    protected function getObjectHeaderLen(array $objRefs): int
    {
        $objectNumberLen = \strlen($objRefs[0]);
        $generationNumberLen = \strlen($objRefs[1]);

        // 2 whitespaces + strlen("obj") = 5
        return $objectNumberLen + $generationNumberLen + 5;
    }
525
526
    /**
     * Get content of indirect object.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Cross-reference data, passed on to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data
     *
     * @throws \Exception if invalid object reference found
     */
    protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array
    {
        // split the reference into [object number, generation number]
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new \Exception('Invalid object reference for $obj.');
        }

        // length of the expected "[object number] [generation number] obj" header
        $headerLength = $this->getObjectHeaderLen($refParts);

        // move to the header: ignore whitespace characters, then leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        $headerPattern = $this->getObjectHeaderPattern($refParts);
        $headerCandidate = substr($pdfData, $offset, $headerLength);
        if (0 == preg_match($headerPattern, $headerCandidate)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // the object content starts right after the header
        $offset += $headerLength;

        $elements = [];
        $dictionary = null; // set while the previously read element was a '<<' dictionary
        do {
            $previousOffset = $offset;

            // get element; the preceding dictionary (if any) is handed over for stream parsing
            $element = $this->getRawObject($pdfData, $offset, null != $dictionary ? $dictionary[1] : null);
            $offset = $element[2];

            // decode stream using stream's dictionary information
            if ($decoding && ('stream' === $element[0]) && null != $dictionary) {
                $element[3] = $this->decodeStream($pdfData, $xref, $dictionary[1], $element[1]);
            }

            $elements[] = $element;
            $dictionary = isset($element[0]) && '<<' === $element[0] ? $element : null;

            // stop at 'endobj', or when reading an element did not move the offset
        } while (('endobj' !== $element[0]) && ($offset !== $previousOffset));

        // remove closing delimiter
        array_pop($elements);

        return $elements;
    }
592
593
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Anything that is not an 'objref', or an 'objref' that can be found neither in the
     * cache of parsed objects nor in $xref, is returned unchanged.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Lookup from object reference to object offset
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws \Exception
     */
    protected function getObjectVal(string $pdfData, $xref, array $obj): array
    {
        if ('objref' != $obj[0]) {
            // direct value: nothing to resolve
            return $obj;
        }

        // reference to indirect object
        $reference = $obj[1];

        if (isset($this->objects[$reference])) {
            // this object has been already parsed
            return $this->objects[$reference];
        }

        // NOTE(review): decodeXref() stores offsets under $xref['xref'][...], while this lookup reads
        // $xref[...] directly - confirm which shape callers pass in here
        if (!isset($xref[$reference])) {
            return $obj;
        }

        // parse new object and remember it for later lookups
        $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);

        return $this->objects[$reference];
    }
620
621
    /**
     * Get object type, raw value and offset to next object.
     *
     * Leading characters listed by Config::getPdfWhitespaces() are skipped, then the
     * token found at the resulting position is classified. The returned type is one of:
     * '/', '(', ')', '[', ']', '<<', '>>', '<', '>', 'endobj', 'null', 'boolean',
     * 'stream', 'endstream', 'objref', 'obj', 'numeric', or '' when nothing is recognized
     * (including when the end of the data has been reached).
     *
     * For '[' and '<<' the value is a list of nested results of this method; for every
     * other type it is a string.
     *
     * @param string     $pdfData   raw PDF data
     * @param int        $offset    Object offset
     * @param array|null $headerDic obj header's dictionary, parsed by getRawObject. Used for stream parsing optimization
     *
     * @return array containing object type, raw value and offset to next object
     */
    protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
    {
        $objtype = ''; // object type to be returned
        $objval = ''; // object value to be returned

        // skip initial white space chars
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);

        // End of data: there is no character to classify. Return the "unknown" result
        // instead of reading an uninitialized string offset.
        if (!isset($pdfData[$offset])) {
            return [$objtype, $objval, $offset];
        }

        // get first char
        $char = $pdfData[$offset];

        // get object type
        switch ($char) {
            case '%':  // \x25 PERCENT SIGN
                // skip comment and search for next token
                $next = strcspn($pdfData, "\r\n", $offset);
                if ($next > 0) {
                    $offset += $next;

                    // keep the header dictionary so a stream following the comment
                    // still benefits from the Length hint
                    return $this->getRawObject($pdfData, $offset, $headerDic);
                }
                break;

            case '/':  // \x2F SOLIDUS
                // name object: read up to the next whitespace or delimiter (at most 256 bytes)
                $objtype = $char;
                ++$offset;
                $span = strcspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20\n\t\r\v\f\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25", $offset, 256);
                if ($span > 0) {
                    $objval = substr($pdfData, $offset, $span); // unescaped value
                    $offset += $span;
                }
                break;

            case '(':   // \x28 LEFT PARENTHESIS
            case ')':  // \x29 RIGHT PARENTHESIS
                // literal string object
                $objtype = $char;
                ++$offset;
                if ('(' === $char) {
                    $strpos = $offset;
                    $depth = 1; // number of currently unbalanced opening parentheses
                    while ($depth > 0 && isset($pdfData[$strpos])) {
                        switch ($pdfData[$strpos]) {
                            case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
                                // skip next character
                                ++$strpos;
                                break;

                            case '(':  // LEFT PARENTHESIS (28h)
                                ++$depth;
                                break;

                            case ')':  // RIGHT PARENTHESIS (29h)
                                --$depth;
                                break;
                        }
                        ++$strpos;
                    }

                    if ($depth > 0) {
                        // Unterminated string: the data ended before the closing parenthesis.
                        // Take everything that is left and never report an offset beyond the data.
                        $objval = (string) substr($pdfData, $offset);
                        $offset = \strlen($pdfData);
                    } else {
                        // $strpos points just behind the closing parenthesis
                        $objval = substr($pdfData, $offset, $strpos - $offset - 1);
                        $offset = $strpos;
                    }
                }
                break;

            case '[':   // \x5B LEFT SQUARE BRACKET
            case ']':  // \x5D RIGHT SQUARE BRACKET
                // array object
                $objtype = $char;
                ++$offset;
                if ('[' === $char) {
                    // get array content
                    $objval = [];
                    do {
                        $oldOffset = $offset;
                        // get element
                        $element = $this->getRawObject($pdfData, $offset);
                        $offset = $element[2];
                        $objval[] = $element;
                        // stop on the closing delimiter, or when no progress was made
                    } while ((']' !== $element[0]) && ($offset !== $oldOffset));
                    // remove closing delimiter
                    array_pop($objval);
                }
                break;

            case '<':  // \x3C LESS-THAN SIGN
            case '>':  // \x3E GREATER-THAN SIGN
                if (isset($pdfData[$offset + 1]) && ($pdfData[$offset + 1] === $char)) {
                    // dictionary object
                    $objtype = $char.$char;
                    $offset += 2;
                    if ('<' === $char) {
                        // get dictionary content
                        $objval = [];
                        do {
                            $oldOffset = $offset;
                            // get element
                            $element = $this->getRawObject($pdfData, $offset);
                            $offset = $element[2];
                            $objval[] = $element;
                            // stop on the closing delimiter, or when no progress was made
                        } while (('>>' !== $element[0]) && ($offset !== $oldOffset));
                        // remove closing delimiter
                        array_pop($objval);
                    }
                } else {
                    // hexadecimal string object
                    $objtype = $char;
                    ++$offset;

                    $span = strspn($pdfData, "0123456789abcdefABCDEF\x09\x0a\x0c\x0d\x20", $offset);
                    $dataToCheck = $pdfData[$offset + $span] ?? null;
                    if ('<' === $char && $span > 0 && '>' === $dataToCheck) {
                        // Remove white space characters. str_replace() is used on purpose:
                        // strtr() with an empty replacement string leaves the input untouched.
                        $objval = str_replace(
                            str_split($this->config->getPdfWhitespaces()),
                            '',
                            substr($pdfData, $offset, $span)
                        );
                        $offset += $span + 1;
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
                        // not a clean hex string: skip behind the next '>'
                        $offset = $endpos + 1;
                    }
                }
                break;

            default:
                if ('endobj' === substr($pdfData, $offset, 6)) {
                    // indirect object
                    $objtype = 'endobj';
                    $offset += 6;
                } elseif ('null' === substr($pdfData, $offset, 4)) {
                    // null object
                    $objtype = 'null';
                    $offset += 4;
                    $objval = 'null';
                } elseif ('true' === substr($pdfData, $offset, 4)) {
                    // boolean true object
                    $objtype = 'boolean';
                    $offset += 4;
                    $objval = 'true';
                } elseif ('false' === substr($pdfData, $offset, 5)) {
                    // boolean false object
                    $objtype = 'boolean';
                    $offset += 5;
                    $objval = 'false';
                } elseif ('stream' === substr($pdfData, $offset, 6)) {
                    // start stream object
                    $objtype = 'stream';
                    $offset += 6;
                    if (1 === preg_match('/^( *[\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
                        $offset += \strlen($matches[0]);

                        // we get stream length here to later help preg_match test less data
                        $streamLen = (int) $this->getHeaderValue($headerDic, 'Length', 'numeric', 0);
                        $skip = false === $this->config->getRetainImageContent()
                            && 'XObject' == $this->getHeaderValue($headerDic, 'Type', '/')
                            && 'Image' == $this->getHeaderValue($headerDic, 'Subtype', '/');

                        $endstreamPattern = '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU';

                        // a negative Length is meaningless; never start searching before the stream data
                        $searchFrom = $offset + max(0, $streamLen);
                        $pregResult = preg_match($endstreamPattern, $pdfData, $matches, \PREG_OFFSET_CAPTURE, $searchFrom);

                        if (1 !== $pregResult && $searchFrom > $offset) {
                            // The declared Length points behind the last "endstream" (or behind
                            // the data). Fall back to a search from the start of the stream
                            // instead of dropping its content.
                            $pregResult = preg_match($endstreamPattern, $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset);
                        }

                        if (1 === $pregResult) {
                            $objval = $skip ? '' : substr($pdfData, $offset, $matches[0][1] - $offset);
                            // continue at the "endstream" keyword itself
                            $offset = $matches[1][1];
                        }
                    }
                } elseif ('endstream' === substr($pdfData, $offset, 9)) {
                    // end stream object
                    $objtype = 'endstream';
                    $offset += 9;
                } elseif (1 === preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
                    // indirect object reference
                    $objtype = 'objref';
                    $offset += \strlen($matches[0]);
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
                } elseif (1 === preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
                    // object start
                    $objtype = 'obj';
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
                    $offset += \strlen($matches[0]);
                } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
                    // numeric object
                    $objtype = 'numeric';
                    $objval = substr($pdfData, $offset, $numlen);
                    $offset += $numlen;
                }
                break;
        }

        return [$objtype, $objval, $offset];
    }
821
822
    /**
     * Get value of an object header's section (obj << YYY >> part ).
     *
     * It is similar to Header::get('...')->getContent(), the only difference is it can be used during the parsing process,
     * when no Smalot\PdfParser\Header objects are created yet.
     *
     * The dictionary is expected in the form returned by RawDataParser::getRawObject: a list of
     * [type, value, offset] entries. The first entry of type '/' whose value is identical to $key
     * and which has a following entry decides the result: the following entry's value is returned
     * if its type is identical to $type, otherwise $default is returned.
     *
     * @param array|null        $headerDic header's dictionary, or null if there is none
     * @param string            $key       header's section name
     * @param string            $type      type of the section (i.e. 'numeric', '/', '<<', etc.)
     * @param string|array|null $default   default value for header's section
     *
     * @return string|array|null value of obj header's section, or default value if none found, or its type doesn't match $type param
     */
    private function getHeaderValue(?array $headerDic, string $key, string $type, $default = '')
    {
        if (null === $headerDic) {
            return $default;
        }

        foreach ($headerDic as $i => $val) {
            // a section name is a complete [type, value, offset] entry of type '/'
            $isSectionName = \is_array($val) && 3 === \count($val) && '/' === $val[0];

            // strict comparison: names like '1e1' and '10' must not be treated as equal
            if (!$isSectionName || $val[1] !== $key || !isset($headerDic[$i + 1])) {
                continue;
            }

            // the entry right behind the name is its value
            $candidate = $headerDic[$i + 1];
            $isSectionValue = \is_array($candidate) && \count($candidate) > 1;

            return $isSectionValue && $type === $candidate[0]
                ? $candidate[1]
                : $default;
        }

        return $default;
    }
863
864
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the xref section is determined as follows:
     *  - $offset is 0: the value of the last "startxref" entry of the document is used,
     *  - the (whitespace-bumped) $offset points at the "xref" keyword: that position is used,
     *  - an "N N obj" header is found from the bumped offset on: the bumped offset is used
     *    (treated as Cross-Reference Stream object),
     *  - otherwise: the value of the first "startxref" entry found from $offset on is used.
     *
     * If "xref" is only found at that position after converting "\r\n" to "\n", the array
     * ['Unix' => true] is returned so the caller can normalize the line endings and retry.
     *
     * @param string     $pdfData        raw PDF data
     * @param int        $offset         xref offset (if known)
     * @param array      $xref           previous xref array (if any)
     * @param array<int> $visitedOffsets array of visited offsets to prevent infinite loops
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if it was unable to find startxref
     * @throws \Exception if it was unable to find xref
     */
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [], array $visitedOffsets = []): array
    {
        // Check for circular references to prevent infinite loops
        if (\in_array($offset, $visitedOffsets, true)) {
            // We've already processed this offset, skip to avoid infinite loop
            return $xref;
        }

        // Track this offset as visited
        $visitedOffsets[] = $offset;

        // Tells whether the "xref" keyword starts exactly at $position. Only the four bytes
        // at that position are compared, so "not found" can not be confused with position 0
        // and the rest of the document is not scanned.
        $isXrefAt = static function (string $data, int $position): bool {
            return $position >= 0 && 'xref' === substr($data, $position, 4);
        };

        // If the $offset is currently pointed at whitespace, bump it
        // forward until it isn't; affects loosely targetted offsets
        // for the 'xref' keyword
        // See: https://github.com/smalot/pdfparser/issues/673
        $bumpOffset = $offset;
        while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) {
            ++$bumpOffset;
        }

        // Find all startxref tables from this $offset forward
        $startxrefPreg = preg_match_all(
            '/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
            $pdfData,
            $startxrefMatches,
            \PREG_SET_ORDER,
            $offset
        );

        // a failed match (non-integer result) is handled like "nothing found"
        if (!\is_int($startxrefPreg) || 0 === $startxrefPreg) {
            // No startxref tables were found
            throw new \Exception('Unable to find startxref');
        }

        if (0 === $offset) {
            // Use the last startxref in the document
            $startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1];
        } elseif ($isXrefAt($pdfData, $bumpOffset)) {
            // Already pointing at the xref table
            $startxref = $bumpOffset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) {
            // Cross-Reference Stream object
            // NOTE(review): the pattern is not anchored, so it matches an object header anywhere
            // behind $bumpOffset, not only directly at it - confirm this is intended.
            $startxref = $bumpOffset;
        } else {
            // Use the next startxref from this $offset
            $startxref = (int) $startxrefMatches[0][1];
        }

        if ($startxref > \strlen($pdfData)) {
            throw new \Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position
        if ($isXrefAt($pdfData, $startxref)) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets);
        } else {
            // Check if the $pdfData might have the wrong line-endings
            $pdfDataUnix = str_replace("\r\n", "\n", $pdfData);
            if ($startxref < \strlen($pdfDataUnix) && $isXrefAt($pdfDataUnix, $startxref)) {
                // Return Unix-line-ending flag
                $xref = ['Unix' => true];
            } else {
                // Cross-Reference Stream
                $xref = $this->decodeXrefStream($pdfData, $startxref, $xref, $visitedOffsets);
            }
        }

        if (empty($xref)) {
            throw new \Exception('Unable to find xref');
        }

        return $xref;
    }
946 68
947
    /**
     * Parses PDF data and returns extracted data as array.
     *
     * Everything in front of the first `%PDF-` marker is discarded before parsing. If the
     * xref lookup reports that the offsets only fit Unix line-endings, "\r\n" is replaced
     * by "\n" in the data and the lookup is repeated.
     *
     * @param string $data PDF data to parse
     *
     * @return array array of parsed PDF document objects: [xref data, objects indexed by xref key]
     *
     * @throws EmptyPdfException if empty PDF data given
     * @throws MissingPdfHeaderException if PDF data missing `%PDF-` header
     */
    public function parseData(string $data): array
    {
        if (empty($data)) {
            throw new EmptyPdfException('Empty PDF data given.');
        }

        // find the pdf header starting position
        $trimpos = strpos($data, '%PDF-');
        if (false === $trimpos) {
            throw new MissingPdfHeaderException('Invalid PDF data: Missing `%PDF-` header.');
        }

        // get PDF content string
        $pdfData = $trimpos > 0 ? substr($data, $trimpos) : $data;

        // get xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // If we found Unix line-endings
        if (isset($xref['Unix'])) {
            $pdfData = str_replace("\r\n", "\n", $pdfData);
            $xref = $this->getXrefData($pdfData);
        }

        // parse all document objects
        $objects = [];
        // tolerate xref data without an 'xref' section instead of iterating over null
        foreach ($xref['xref'] ?? [] as $obj => $offset) {
            // decode objects with positive offset only; keys are unique, so every
            // object is visited exactly once
            if ($offset > 0) {
                $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true);
            }
        }

        return [$xref, $objects];
    }
990
}
991