RawDataParser::decodeStream()   D
last analyzed

Complexity

Conditions 21
Paths 51

Size

Total Lines 56
Code Lines 32

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 1 Features 1
Metric Value
cc 21
eloc 32
c 2
b 1
f 1
nc 51
nop 4
dl 0
loc 56
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 *
19
 * @date    2020-01-06
20
 *
21
 * @license LGPLv3
22
 *
23
 * @url     <https://github.com/smalot/pdfparser>
24
 *
25
 *  PdfParser is a pdf library written in PHP, extraction oriented.
26
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
27
 *
28
 *  This program is free software: you can redistribute it and/or modify
29
 *  it under the terms of the GNU Lesser General Public License as published by
30
 *  the Free Software Foundation, either version 3 of the License, or
31
 *  (at your option) any later version.
32
 *
33
 *  This program is distributed in the hope that it will be useful,
34
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
35
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
36
 *  GNU Lesser General Public License for more details.
37
 *
38
 *  You should have received a copy of the GNU Lesser General Public License
39
 *  along with this program.
40
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
41
 */
42
43
namespace Smalot\PdfParser\RawData;
44
45
use Smalot\PdfParser\Config;
46
use Smalot\PdfParser\Exception\EmptyPdfException;
47
use Smalot\PdfParser\Exception\MissingPdfHeaderException;
48
49
class RawDataParser
50
{
51
    /**
52
     * @var Config
53
     */
54
    private $config;
55
56
    /**
57
     * Configuration array.
58
     *
59
     * @var array<string,bool>
60
     */
61
    protected $cfg = [
62
        // if `true` ignore filter decoding errors
63
        'ignore_filter_decoding_errors' => true,
64
        // if `true` ignore missing filter decoding errors
65
        'ignore_missing_filter_decoders' => true,
66
    ];
67
68
    protected $filterHelper;
69
    protected $objects;
70
71
    /**
72
     * @param array $cfg Configuration array, default is []
73
     */
74
    public function __construct($cfg = [], ?Config $config = null)
    {
        // values passed by the caller override the defaults declared on $this->cfg
        $this->cfg = array_merge($this->cfg, $cfg);

        $this->filterHelper = new FilterHelper();

        // fall back to a default configuration object when none was handed in
        if (!$config) {
            $config = new Config();
        }
        $this->config = $config;
    }
82
83
    /**
84
     * Decode the specified stream.
85
     *
86
     * @param string $pdfData PDF data
87
     * @param array  $xref    Xref data, handed to getObjectVal() when the /Filter value is resolved
     * @param array  $sdic    Stream's dictionary array
88
     * @param string $stream  Stream to decode
89
     *
90
     * @return array containing decoded stream data and remaining filters
91
     *
92
     * @throws \Exception
93
     */
94
    protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream): array
    {
        // Result shape: [decoded stream, names of the filters that FilterHelper does not list as available].
        // $xref is only handed to getObjectVal() when the /Filter value has to be resolved.
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        // $sdic is a flat list in which a name entry ('/') is followed by its value entry
        $filters = [];
        foreach ($sdic as $k => $v) {
            if ('/' != $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $value = $sdic[$k + 1];

            if (('Length' == $v[1]) && ('numeric' == $value[0])) {
                // trim the raw stream to its declared length; a negative value is ignored
                // because substr() would otherwise cut bytes off the END of the stream
                $declength = (int) $value[1];
                if ((0 <= $declength) && ($declength < $slength)) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // the filter value may be an indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $value);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters: keep the name entries only
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // apply the filters in the order in which they were declared
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters(), true)) {
                // report the filter back to the caller instead of failing
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
            } catch (\Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with '~' is treated as "decoder missing", anything else as
                // "decoding failed"; the empty-string check avoids an invalid string offset
                $missingDecoder = ('' !== $emsg) && ('~' === $emsg[0]);
                $ignored = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignored) {
                    // same message as before, but the original exception stays attached
                    throw new \Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
151
152
    /**
153
     * Decode the Cross-Reference section
154
     *
155
     * @param string $pdfData   PDF data
156
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
157
     * @param array  $xref      Previous xref array (if any)
158
     *
159
     * @return array containing xref and trailer data
160
     *
161
     * @throws \Exception
162
     */
163
    protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
    {
        // jump over the 'xref' keyword (4 bytes) and the white space behind it
        $startxref += 4;
        $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref);

        // object number described by the next 'n'/'f' entry
        $obj_num = 0;
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';
        while (preg_match($entryPattern, $pdfData, $entry, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($entry[0][1] != $offset) {
                // the next match is not adjacent: we left the entry list
                break;
            }
            $offset += \strlen($entry[0][0]);

            $kind = $entry[3][0];
            if (('n' != $kind) && ('f' != $kind)) {
                // no n/f marker: the first number is taken as the new object number
                $obj_num = (int) $entry[1][0];
                continue;
            }

            if ('n' == $kind) {
                // key format: [object number]_[generation number]; an already stored offset is kept
                $index = $obj_num.'_'.(int) $entry[2][0];
                if (!isset($xref['xref'][$index])) {
                    $xref['xref'][$index] = (int) $entry[1][0];
                }
            }
            ++$obj_num;
        }

        // the trailer dictionary is mandatory
        if (1 !== preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailerMatch, \PREG_OFFSET_CAPTURE, $offset)) {
            throw new \Exception('Unable to find trailer');
        }
        $trailer_data = $trailerMatch[1][0];

        if (empty($xref['trailer'])) {
            // no trailer collected so far: take the values from this one
            $xref['trailer'] = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $m) > 0) {
                $xref['trailer']['size'] = (int) $m[1];
            }

            // indirect references, stored as "[object number]_[generation number]"
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailer_data, $m) > 0) {
                    $xref['trailer'][$key] = (int) $m[1].'_'.(int) $m[2];
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $m) > 0) {
                $xref['trailer']['id'] = [$m[1], $m[2]];
            }
        }

        // continue with the previous xref section when /Prev holds a non-zero offset
        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $m) > 0) {
            $offset = (int) $m[1];
            if (0 != $offset) {
                $xref = $this->getXrefData($pdfData, $offset, $xref);
            }
        }

        return $xref;
    }
231
232
    /**
233
     * Decode the Cross-Reference Stream section
234
     *
235
     * @param string $pdfData   PDF data
236
     * @param int    $startxref Offset at which the xref section starts
237
     * @param array  $xref      Previous xref array (if any)
238
     *
239
     * @return array containing xref and trailer data
240
     *
241
     * @throws \Exception if unknown PNG predictor detected
242
     */
243
    protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
    {
        // read the cross-reference stream object located at $startxref
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // only the first trailer that is met gets stored: fill it when none was collected so far
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;    // becomes true once "/Type /XRef" has been seen
        $columns = 0;          // /DecodeParms /Columns
        $predictor = null;     // /DecodeParms /Predictor
        $prevxref = null;      // /Prev offset
        $index_blocks = [];    // /Index pairs: [first object number, number of objects]
        $wb = [0, 0, 0];       // /W: byte width of the three fields of an entry

        $sarr = $xrefcrs[0][1] ?? null;
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        // the dictionary is a flat list: a name entry ('/') is followed by its value entry
        foreach ($sarr as $k => $v) {
            if (('/' !== $v[0]) || !isset($sarr[$k + 1])) {
                continue;
            }
            $name = $v[1];
            $next = $sarr[$k + 1];

            if (('Type' === $name) && ('/' === $next[0]) && ('XRef' === $next[1])) {
                $valid_crs = true;
            } elseif ('Index' === $name) {
                $index_blocks = [];
                $pairs = \is_array($next[1]) ? $next[1] : [];
                for ($m = 0, $total = \count($pairs); $m < $total; $m += 2) {
                    // a dangling first-object number (odd list) gets the count 0
                    $index_blocks[] = [(int) ($pairs[$m][1] ?? 0), (int) ($pairs[$m + 1][1] ?? 0)];
                }
            } elseif (('Prev' === $name) && ('numeric' === $next[0])) {
                // offset of the previous xref section
                $prevxref = (int) $next[1];
            } elseif ('W' === $name) {
                // missing or negative widths count as 0
                for ($c = 0; $c < 3; ++$c) {
                    $wb[$c] = max(0, (int) ($next[1][$c][1] ?? 0));
                }
            } elseif (('DecodeParms' === $name) && isset($next[1])) {
                $decpar = \is_array($next[1]) ? $next[1] : [];
                foreach ($decpar as $kdc => $vdc) {
                    if (('/' !== $vdc[0]) || !isset($decpar[$kdc + 1]) || ('numeric' !== $decpar[$kdc + 1][0])) {
                        continue;
                    }
                    if ('Columns' === $vdc[1]) {
                        // a negative value would produce an invalid chunk size further down
                        $columns = max(0, (int) $decpar[$kdc + 1][1]);
                    } elseif ('Predictor' === $vdc[1]) {
                        $predictor = (int) $decpar[$kdc + 1][1];
                    }
                }
            } elseif ($filltrailer) {
                if (('Size' === $name) && ('numeric' === $next[0])) {
                    $xref['trailer']['size'] = $next[1];
                } elseif (('Root' === $name) && ('objref' === $next[0])) {
                    $xref['trailer']['root'] = $next[1];
                } elseif (('Info' === $name) && ('objref' === $next[0])) {
                    $xref['trailer']['info'] = $next[1];
                } elseif (('Encrypt' === $name) && ('objref' === $next[0])) {
                    $xref['trailer']['encrypt'] = $next[1];
                } elseif (('ID' === $name) && isset($next[1][0][1], $next[1][1][1])) {
                    // stored only when both parts of the identifier are present
                    $xref['trailer']['id'] = [$next[1][0][1], $next[1][1][1]];
                }
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            // convert the decoded stream into a list of byte values
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (false === $bytes) {
                $bytes = [];
            }

            if (null !== $predictor) {
                $ddata = $this->decodePngPredictedRows($bytes, $columns);
            } else {
                // number of bytes in a row; with a zero width there is nothing to read
                $rowlen = array_sum($wb);
                $ddata = (0 < $rowlen) ? array_chunk($bytes, $rowlen) : [];
            }

            // combine the bytes of every row into the three big-endian fields of an entry
            $sdata = [];
            foreach ($ddata as $k => $row) {
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // no type field in the row: default type is 1
                    $sdata[$k][0] = 1;
                }
                $i = 0; // position inside the row
                for ($c = 0; $c < 3; ++$c) {
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref: numbering starts at the first /Index block, or at 0 without /Index
            $obj_num = $index_blocks ? $index_blocks[0][0] : 0;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 1:  // (n) objects that are in use but are not compressed
                        // key format: [object number]_[generation number]; an existing entry is kept
                        $index = $obj_num.'_'.$row[2];
                        if (!isset($xref['xref'][$index])) {
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // (f) free objects and unknown types add nothing
                        break;
                }
                ++$obj_num;

                if ($index_blocks) {
                    // one object less remains in the current /Index block
                    --$index_blocks[0][1];
                    if (0 == $index_blocks[0][1]) {
                        // block used up: continue with the first object number of the next block;
                        // once no block is left, the numbering simply keeps counting upwards
                        array_shift($index_blocks);
                        if ($index_blocks) {
                            $obj_num = $index_blocks[0][0];
                        }
                    }
                }
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Reverse the PNG row predictor on the bytes of a cross-reference stream.
     *
     * Every row consists of one filter-type byte followed by $columns data bytes.
     * One byte per pixel is assumed, so "left" is the previous reconstructed byte of the row.
     *
     * @param array<int> $bytes   byte values of the decoded stream
     * @param int        $columns number of data bytes per row (>= 0)
     *
     * @return array<int, array<int>> reconstructed rows without the filter-type byte
     *
     * @throws \Exception if unknown PNG predictor detected
     */
    private function decodePngPredictedRows(array $bytes, int $columns): array
    {
        $rowlen = $columns + 1;
        $rows = array_chunk($bytes, $rowlen);

        $decoded = [];
        // the row above the first row counts as all zeros
        $prev_row = array_fill(0, $rowlen, 0);
        foreach ($rows as $k => $row) {
            $decoded[$k] = [];
            // the first byte of the row selects the PNG filter (0..4 => 10..14)
            $filter = 10 + $row[0];
            for ($i = 1; $i <= $columns; ++$i) {
                $j = $i - 1;
                // a short last row is padded with zeros
                $current = $row[$i] ?? 0;
                $row_up = $prev_row[$j] ?? 0;
                if (1 == $i) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    // the left neighbour is the RECONSTRUCTED byte, not the encoded one
                    $row_left = $decoded[$k][$j - 1];
                    $row_upleft = $prev_row[$j - 1] ?? 0;
                }

                switch ($filter) {
                    case 10:  // PNG None
                        $decoded[$k][$j] = $current;
                        break;

                    case 11:  // PNG Sub
                        $decoded[$k][$j] = (($current + $row_left) & 0xFF);
                        break;

                    case 12:  // PNG Up
                        $decoded[$k][$j] = (($current + $row_up) & 0xFF);
                        break;

                    case 13:  // PNG Average (integer division, no float involved)
                        $decoded[$k][$j] = (($current + intdiv($row_left + $row_up, 2)) & 0xFF);
                        break;

                    case 14:  // PNG Paeth
                        // initial estimate and its distance to the three neighbours
                        $p = $row_left + $row_up - $row_upleft;
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        // nearest neighbour wins; ties are resolved in the order left, up, upper left
                        if (($pa <= $pb) && ($pa <= $pc)) {
                            $nearest = $row_left;
                        } elseif ($pb <= $pc) {
                            $nearest = $row_up;
                        } else {
                            $nearest = $row_upleft;
                        }
                        $decoded[$k][$j] = (($current + $nearest) & 0xFF);
                        break;

                    default:
                        throw new \Exception('Unknown PNG predictor: '.$filter);
                }
            }
            $prev_row = $decoded[$k];
        }

        return $decoded;
    }
510
511
    /**
     * Build the regular expression used to recognise an object header.
     *
     * @param array $objRefs object number at index 0, generation number at index 1
     *
     * @return string pattern of the form /<number><separator><generation><separator>obj/
     */
    protected function getObjectHeaderPattern(array $objRefs): string
    {
        // the config supplies the regex fragment placed between the three tokens
        $separator = $this->config->getPdfWhitespacesRegex();

        return '/'.implode($separator, [$objRefs[0], $objRefs[1], 'obj']).'/';
    }
516
517
    /**
     * Length of an object header such as "4 0 obj".
     *
     * @param array $objRefs object number at index 0, generation number at index 1
     *
     * @return int number of characters of both numbers plus 5
     */
    protected function getObjectHeaderLen(array $objRefs): int
    {
        // two separating white space characters + strlen("obj")
        $fixedPart = 2 + \strlen('obj');

        return \strlen($objRefs[0]) + \strlen($objRefs[1]) + $fixedPart;
    }
523
524
    /**
525
     * Get content of indirect object.
526
     *
527
     * @param string $pdfData  PDF data
528
     * @param array  $xref     Xref data, passed on to decodeStream() when a stream is decoded
     * @param string $objRef   Object number and generation number separated by underscore character
529
     * @param int    $offset   Object offset
530
     * @param bool   $decoding If true decode streams
531
     *
532
     * @return array containing object data
533
     *
534
     * @throws \Exception if invalid object reference found
535
     */
536
    protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array
    {
        // a reference has the form "[object number]_[generation number]"
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new \Exception('Invalid object reference for $obj.');
        }
        $headerLength = $this->getObjectHeaderLen($refParts);

        // move past white space and past leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        // the object header has to be found right here, otherwise the reference
        // is answered with the null object
        $candidate = substr($pdfData, $offset, $headerLength);
        if (0 == preg_match($this->getObjectHeaderPattern($refParts), $candidate)) {
            return ['null', 'null', $offset];
        }

        // the object content starts behind the header
        $offset += $headerLength;
        $elements = [];
        // dictionary element ('<<') read in the previous round, if any
        $dictionary = null;
        do {
            $startedAt = $offset;

            // a preceding dictionary is handed over to the element parser
            $headerDic = null != $dictionary ? $dictionary[1] : null;
            $element = $this->getRawObject($pdfData, $offset, $headerDic);
            $offset = $element[2];

            // a stream is decoded with the dictionary that directly precedes it
            if ($decoding && ('stream' === $element[0]) && null != $dictionary) {
                $element[3] = $this->decodeStream($pdfData, $xref, $dictionary[1], $element[1]);
            }
            $elements[] = $element;

            $dictionary = (isset($element[0]) && '<<' === $element[0]) ? $element : null;
            // stop at 'endobj' or as soon as the parser no longer advances
        } while (('endobj' !== $element[0]) && ($offset !== $startedAt));

        // the element read last is dropped
        array_pop($elements);

        return $elements;
    }
590
591
    /**
592
     * Get the content of object, resolving indirect object reference if necessary.
593
     *
594
     * @param string $pdfData PDF data
595
     * @param array  $xref    Xref data in which the offset of a referenced object is looked up
     * @param array  $obj     Object value
596
     *
597
     * @return array containing object data
598
     *
599
     * @throws \Exception
600
     */
601
    protected function getObjectVal(string $pdfData, $xref, array $obj): array
    {
        // everything that is not an indirect reference is returned untouched
        if ('objref' != $obj[0]) {
            return $obj;
        }

        $reference = $obj[1];

        // objects parsed earlier are served from the cache
        if (isset($this->objects[$reference])) {
            return $this->objects[$reference];
        }

        // NOTE(review): decodeXref() stores offsets under $xref['xref'], while this lookup reads
        // the top level of $xref — confirm which shape callers are expected to pass
        if (isset($xref[$reference])) {
            // parse the object without decoding its streams and remember the result
            $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);

            return $this->objects[$reference];
        }

        // no offset known for the reference: hand the reference itself back
        return $obj;
    }
618
619
    /**
620
     * Get object type, raw value and offset to next object
621
     *
622
     * @param string     $pdfData   PDF data
     * @param int        $offset    Object offset
623
     * @param array|null $headerDic obj header's dictionary, parsed by getRawObject. Used for stream parsing optimization
624
     *
625
     * @return array containing object type, raw value and offset to next object
626
     */
627
    protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
628
    {
629
        $objtype = ''; // object type to be returned
630
        $objval = ''; // object value to be returned
631
632
        // skip initial white space chars
633
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
634
635
        // get first char
636
        $char = $pdfData[$offset];
637
        // get object type
638
        switch ($char) {
639
            case '%':  // \x25 PERCENT SIGN
640
                // skip comment and search for next token
641
                $next = strcspn($pdfData, "\r\n", $offset);
642
                if ($next > 0) {
643
                    $offset += $next;
644
645
                    return $this->getRawObject($pdfData, $offset);
646
                }
647
                break;
648
649
            case '/':  // \x2F SOLIDUS
650
                // name object
651
                $objtype = $char;
652
                ++$offset;
653
                $span = strcspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20\n\t\r\v\f\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25", $offset, 256);
654
                if ($span > 0) {
655
                    $objval = substr($pdfData, $offset, $span); // unescaped value
656
                    $offset += $span;
657
                }
658
                break;
659
660
            case '(':   // \x28 LEFT PARENTHESIS
661
            case ')':  // \x29 RIGHT PARENTHESIS
662
                // literal string object
663
                $objtype = $char;
664
                ++$offset;
665
                $strpos = $offset;
666
                if ('(' == $char) {
667
                    $open_bracket = 1;
668
                    while ($open_bracket > 0) {
669
                        if (!isset($pdfData[$strpos])) {
670
                            break;
671
                        }
672
                        $ch = $pdfData[$strpos];
673
                        switch ($ch) {
674
                            case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
675
                                // skip next character
676
                                ++$strpos;
677
                                break;
678
679
                            case '(':  // LEFT PARENHESIS (28h)
680
                                ++$open_bracket;
681
                                break;
682
683
                            case ')':  // RIGHT PARENTHESIS (29h)
684
                                --$open_bracket;
685
                                break;
686
                        }
687
                        ++$strpos;
688
                    }
689
                    $objval = substr($pdfData, $offset, $strpos - $offset - 1);
690
                    $offset = $strpos;
691
                }
692
                break;
693
694
            case '[':   // \x5B LEFT SQUARE BRACKET
695
            case ']':  // \x5D RIGHT SQUARE BRACKET
696
                // array object
697
                $objtype = $char;
698
                ++$offset;
699
                if ('[' == $char) {
700
                    // get array content
701
                    $objval = [];
702
                    do {
703
                        $oldOffset = $offset;
704
                        // get element
705
                        $element = $this->getRawObject($pdfData, $offset);
706
                        $offset = $element[2];
707
                        $objval[] = $element;
708
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
709
                    // remove closing delimiter
710
                    array_pop($objval);
711
                }
712
                break;
713
714
            case '<':  // \x3C LESS-THAN SIGN
715
            case '>':  // \x3E GREATER-THAN SIGN
716
                if (isset($pdfData[$offset + 1]) && ($pdfData[$offset + 1] == $char)) {
717
                    // dictionary object
718
                    $objtype = $char.$char;
719
                    $offset += 2;
720
                    if ('<' == $char) {
721
                        // get array content
722
                        $objval = [];
723
                        do {
724
                            $oldOffset = $offset;
725
                            // get element
726
                            $element = $this->getRawObject($pdfData, $offset);
727
                            $offset = $element[2];
728
                            $objval[] = $element;
729
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
730
                        // remove closing delimiter
731
                        array_pop($objval);
732
                    }
733
                } else {
734
                    // hexadecimal string object
735
                    $objtype = $char;
736
                    ++$offset;
737
738
                    $span = strspn($pdfData, "0123456789abcdefABCDEF\x09\x0a\x0c\x0d\x20", $offset);
739
                    $dataToCheck = $pdfData[$offset + $span] ?? null;
740
                    if ('<' == $char && $span > 0 && '>' == $dataToCheck) {
741
                        // remove white space characters
742
                        $objval = strtr(substr($pdfData, $offset, $span), $this->config->getPdfWhitespaces(), '');
743
                        $offset += $span + 1;
744
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
745
                        $offset = $endpos + 1;
746
                    }
747
                }
748
                break;
749
750
            default:
751
                if ('endobj' == substr($pdfData, $offset, 6)) {
752
                    // indirect object
753
                    $objtype = 'endobj';
754
                    $offset += 6;
755
                } elseif ('null' == substr($pdfData, $offset, 4)) {
756
                    // null object
757
                    $objtype = 'null';
758
                    $offset += 4;
759
                    $objval = 'null';
760
                } elseif ('true' == substr($pdfData, $offset, 4)) {
761
                    // boolean true object
762
                    $objtype = 'boolean';
763
                    $offset += 4;
764
                    $objval = 'true';
765
                } elseif ('false' == substr($pdfData, $offset, 5)) {
766
                    // boolean false object
767
                    $objtype = 'boolean';
768
                    $offset += 5;
769
                    $objval = 'false';
770
                } elseif ('stream' == substr($pdfData, $offset, 6)) {
771
                    // start stream object
772
                    $objtype = 'stream';
773
                    $offset += 6;
774
                    if (1 == preg_match('/^( *[\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
775
                        $offset += \strlen($matches[0]);
776
777
                        // we get stream length here to later help preg_match test less data
778
                        $streamLen = (int) $this->getHeaderValue($headerDic, 'Length', 'numeric', 0);
779
                        $skip = false === $this->config->getRetainImageContent() && 'XObject' == $this->getHeaderValue($headerDic, 'Type', '/') && 'Image' == $this->getHeaderValue($headerDic, 'Subtype', '/');
780
781
                        $pregResult = preg_match(
782
                            '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
783
                            $pdfData,
784
                            $matches,
785
                            \PREG_OFFSET_CAPTURE,
786
                            $offset + $streamLen
787
                        );
788
789
                        if (1 == $pregResult) {
790
                            $objval = $skip ? '' : substr($pdfData, $offset, $matches[0][1] - $offset);
791
                            $offset = $matches[1][1];
792
                        }
793
                    }
794
                } elseif ('endstream' == substr($pdfData, $offset, 9)) {
795
                    // end stream object
796
                    $objtype = 'endstream';
797
                    $offset += 9;
798
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
799
                    // indirect object reference
800
                    $objtype = 'objref';
801
                    $offset += \strlen($matches[0]);
802
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
803
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
804
                    // object start
805
                    $objtype = 'obj';
806
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
807
                    $offset += \strlen($matches[0]);
808
                } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
809
                    // numeric object
810
                    $objtype = 'numeric';
811
                    $objval = substr($pdfData, $offset, $numlen);
812
                    $offset += $numlen;
813
                }
814
                break;
815
        }
816
817
        return [$objtype, $objval, $offset];
818
    }
819
820
    /**
821
     * Get value of an object header's section (obj << YYY >> part ).
822
     *
823
     * It is similar to Header::get('...')->getContent(), the only difference is it can be used during the parsing process,
824
     * when no Smalot\PdfParser\Header objects are created yet.
825
     *
826
     * @param string            $key     header's section name
827
     * @param string            $type    type of the section (i.e. 'numeric', '/', '<<', etc.)
828
     * @param string|array|null $default default value for header's section
829
     *
830
     * @return string|array|null value of obj header's section, or default value if none found, or its type doesn't match $type param
831
     */
832
    private function getHeaderValue(?array $headerDic, string $key, string $type, $default = '')
    {
        // Without a dictionary there is nothing to search in.
        if (!\is_array($headerDic)) {
            return $default;
        }

        /*
         * $headerDic is the flat element list of a dictionary, as returned by
         * RawDataParser::getRawObject. Walk it looking for a name element
         * (type '/') whose value equals the requested key; the element that
         * directly follows it is treated as that section's value.
         */
        foreach ($headerDic as $index => $entry) {
            // A section name is a 3-item element whose type marker is '/'.
            if (!\is_array($entry) || 3 != \count($entry) || '/' != $entry[0]) {
                continue;
            }

            // Not the requested section, or there is no following element to read.
            if ($entry[1] != $key || !isset($headerDic[$index + 1])) {
                continue;
            }

            // First matching section decides the result: hand back its value
            // only when it is a well-formed element of the expected type.
            $candidate = $headerDic[$index + 1];
            if (\is_array($candidate) && 1 < \count($candidate) && $type == $candidate[0]) {
                return $candidate[1];
            }

            return $default;
        }

        return $default;
    }
861
862
    /**
863
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
864
     *
865
     * @param int   $offset xref offset (if known)
866
     * @param array $xref   previous xref array (if any)
867
     *
868
     * @return array containing xref and trailer data
869
     *
870
     * @throws \Exception if it was unable to find startxref
871
     * @throws \Exception if it was unable to find xref
872
     */
873
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
    {
        // If the $offset is currently pointing at whitespace, bump it
        // forward until it isn't; this tolerates loosely targeted offsets
        // for the 'xref' keyword.
        // See: https://github.com/smalot/pdfparser/issues/673
        $bumpOffset = $offset;
        while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) {
            ++$bumpOffset;
        }

        // Find all startxref tables from this $offset forward
        $startxrefPreg = preg_match_all(
            '/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
            $pdfData,
            $startxrefMatches,
            \PREG_SET_ORDER,
            $offset
        );

        // preg_match_all() reports a failure with a non-integer result; the cast
        // folds that case and "zero matches" into the same, explicit check.
        if (0 === (int) $startxrefPreg) {
            // No startxref tables were found
            throw new \Exception('Unable to find startxref');
        }

        // Note: strpos() returns false when 'xref' is absent. All position checks
        // below are strict so that false can never be mistaken for offset 0.
        if (0 === $offset) {
            // Use the last startxref in the document
            $startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1];
        } elseif (strpos($pdfData, 'xref', $bumpOffset) === $bumpOffset) {
            // Already pointing at the xref table
            $startxref = $bumpOffset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) {
            // Cross-Reference Stream object
            $startxref = $bumpOffset;
        } else {
            // Use the next startxref from this $offset
            $startxref = (int) $startxrefMatches[0][1];
        }

        if ($startxref > \strlen($pdfData)) {
            throw new \Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Check if the $pdfData might have the wrong line-endings:
            // probe the same position in a copy with "\r\n" collapsed to "\n"
            $pdfDataUnix = str_replace("\r\n", "\n", $pdfData);
            if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) === $startxref) {
                // Return Unix-line-ending flag
                $xref = ['Unix' => true];
            } else {
                // Cross-Reference Stream
                $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
            }
        }

        if (empty($xref)) {
            throw new \Exception('Unable to find xref');
        }

        return $xref;
    }
935
936
    /**
937
     * Parses PDF data and returns extracted data as array.
938
     *
939
     * @param string $data PDF data to parse
940
     *
941
     * @return array array of parsed PDF document objects
942
     *
943
     * @throws EmptyPdfException if empty PDF data given
944
     * @throws MissingPdfHeaderException if PDF data missing `%PDF-` header
945
     */
946
    public function parseData(string $data): array
    {
        if (empty($data)) {
            throw new EmptyPdfException('Empty PDF data given.');
        }

        // Locate the PDF header marker; it is mandatory.
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new MissingPdfHeaderException('Invalid PDF data: Missing `%PDF-` header.');
        }

        // Drop anything that precedes the header so the content starts at `%PDF-`.
        $pdfData = 0 === $headerPos ? $data : substr($data, $headerPos);

        // Read xref and trailer data.
        $xref = $this->getXrefData($pdfData);

        // getXrefData() signalled that the xref only lines up after converting
        // "\r\n" to "\n": normalise the content and read the xref data again.
        if (isset($xref['Unix'])) {
            $pdfData = str_replace("\r\n", "\n", $pdfData);
            $xref = $this->getXrefData($pdfData);
        }

        // Decode every object that is listed with a positive offset.
        $objects = [];
        foreach ($xref['xref'] as $objKey => $objOffset) {
            if (isset($objects[$objKey]) || !($objOffset > 0)) {
                continue;
            }

            $objects[$objKey] = $this->getIndirectObject($pdfData, $xref, $objKey, $objOffset, true);
        }

        return [$xref, $objects];
    }
979
}
980