RawDataParser::getRawObject()   F
last analyzed

Complexity

Conditions 43
Paths 43

Size

Total Lines 191
Code Lines 132

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 126
CRAP Score 43.007

Importance

Changes 6
Bugs 2 Features 1
Metric Value
cc 43
eloc 132
c 6
b 2
f 1
nc 43
nop 3
dl 0
loc 191
ccs 126
cts 128
cp 0.9844
crap 43.007
rs 3.3333

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 *
19
 * @date    2020-01-06
20
 *
21
 * @license LGPLv3
22
 *
23
 * @url     <https://github.com/smalot/pdfparser>
24
 *
25
 *  PdfParser is a pdf library written in PHP, extraction oriented.
26
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
27
 *
28
 *  This program is free software: you can redistribute it and/or modify
29
 *  it under the terms of the GNU Lesser General Public License as published by
30
 *  the Free Software Foundation, either version 3 of the License, or
31
 *  (at your option) any later version.
32
 *
33
 *  This program is distributed in the hope that it will be useful,
34
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
35
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
36
 *  GNU Lesser General Public License for more details.
37
 *
38
 *  You should have received a copy of the GNU Lesser General Public License
39
 *  along with this program.
40
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
41
 */
42
43
namespace Smalot\PdfParser\RawData;
44
45
use Smalot\PdfParser\Config;
46
47
class RawDataParser
48
{
49
    /**
50
     * @var Config
51
     */
52
    private $config;
53
54
    /**
55
     * Configuration array.
56
     *
57
     * @var array<string,bool>
58
     */
59
    protected $cfg = [
60
        // if `true` ignore filter decoding errors
61
        'ignore_filter_decoding_errors' => true,
62
        // if `true` ignore missing filter decoding errors
63
        'ignore_missing_filter_decoders' => true,
64
    ];
65
66
    /**
     * Helper that lists the available stream filters and decodes them.
     *
     * @var FilterHelper
     */
    protected $filterHelper;
67
    /**
     * Indirect objects already parsed by getObjectVal(), keyed by object reference.
     */
    protected $objects;
68
69
    /**
     * Set up the parser options, the filter helper and the configuration.
     *
     * @param array       $cfg    Options overriding the defaults declared on the $cfg property
     * @param Config|null $config Parser configuration; a new Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        // values passed by the caller win over the defaults already stored in the property
        $this->cfg = array_merge($this->cfg, $cfg);

        $this->filterHelper = new FilterHelper();

        if ($config) {
            $this->config = $config;
        } else {
            $this->config = new Config();
        }
    }
80
81
    /**
     * Decode the specified stream.
     *
     * The stream is first cut to the /Length declared in its dictionary (when
     * that is shorter than the data received); afterwards every filter named
     * under /Filter is applied in the order it is listed.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Xref data, handed to getObjectVal() to resolve the /Filter value
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws \Exception if a filter fails and the matching "ignore_*" option is disabled
     */
    protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream): array
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name entries that are followed by a value are of interest
            if ('/' !== $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $value = $sdic[$k + 1];

            if ('Length' === $v[1]) {
                if ('numeric' === $value[0]) {
                    // get declared stream length
                    $declength = (int) $value[1];
                    // a negative length would make substr() cut bytes from the end of the stream
                    if (0 <= $declength && $declength < $slength) {
                        $stream = substr($stream, 0, $declength);
                        $slength = $declength;
                    }
                }
            } elseif ('Filter' === $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $value);
                if ('/' === $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' === $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' === $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        $availableFilters = $this->filterHelper->getAvailableFilters();
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters, true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
            } catch (\Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with '~' is governed by 'ignore_missing_filter_decoders',
                // any other message (including an empty one) by 'ignore_filter_decoding_errors'
                $missingDecoder = '' !== $emsg && '~' === $emsg[0];
                $ignored = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignored) {
                    // keep the original exception reachable through getPrevious()
                    throw new \Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
149
150
    /**
     * Decode a cross-reference table and the trailer that follows it.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset of the 'xref' keyword that opens the section
     * @param array  $xref      Xref data collected so far (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if no trailer is found after the table
     */
    protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
    {
        // step over the word 'xref' (4 bytes) and the white space behind it
        $startxref += 4;
        $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref);

        // walk through subsection headers and entries for as long as they follow each other directly
        $obj_num = 0;
        while (
            preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0
            && $matches[0][1] == $offset
        ) {
            $offset += \strlen($matches[0][0]);

            switch ($matches[3][0]) {
                case 'n':
                    // object in use, indexed as [object number]_[generation number];
                    // an entry that is already known is never overwritten
                    $index = $obj_num.'_'.(int) $matches[2][0];
                    if (!isset($xref['xref'][$index])) {
                        $xref['xref'][$index] = (int) $matches[1][0];
                    }
                    ++$obj_num;
                    break;

                case 'f':
                    // free entry: it only consumes an object number
                    ++$obj_num;
                    break;

                default:
                    // subsection header: its first number is the next object number
                    $obj_num = (int) $matches[1][0];
                    break;
            }
        }

        // get trailer data
        if (!(preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0)) {
            throw new \Exception('Unable to find trailer');
        }
        $trailer_data = $matches[1][0];

        // only the first trailer that gets decoded is kept
        if (empty($xref['trailer'])) {
            $trailer = [];
            if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $found) > 0) {
                $trailer['size'] = (int) $found[1];
            }
            if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $found) > 0) {
                $trailer['root'] = (int) $found[1].'_'.(int) $found[2];
            }
            if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $found) > 0) {
                $trailer['encrypt'] = (int) $found[1].'_'.(int) $found[2];
            }
            if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $found) > 0) {
                $trailer['info'] = (int) $found[1].'_'.(int) $found[2];
            }
            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $found) > 0) {
                $trailer['id'] = [$found[1], $found[2]];
            }
            $xref['trailer'] = $trailer;
        }

        // follow the link to the previous xref section, if there is one
        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $found) > 0) {
            $xref = $this->getXrefData($pdfData, (int) $found[1], $xref);
        }

        return $xref;
    }
226
227
    /**
     * Decode the Cross-Reference Stream section
     *
     * The stream dictionary supplies the field widths (/W), the object number
     * ranges (/Index), the PNG predictor parameters (/DecodeParms), the link to
     * the previous section (/Prev) and - when no trailer has been read yet -
     * the trailer entries. The stream rows are then turned into xref entries.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $prevxref = null;
        // list of [first object number, number of objects] taken from /Index
        $index_blocks = [];
        // number of bytes (in the decoded stream) of each of the three fields
        $wb = [0, 0, 0];

        $sarr = $xrefcrs[0][1] ?? null;
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            // every entry of interest is a name followed by its value
            if ('/' != $v[0] || !isset($sarr[$k + 1])) {
                continue;
            }
            $value = $sarr[$k + 1];
            $payload = $value[1] ?? null;

            switch ($v[1]) {
                case 'Type':
                    if ('/' == $value[0] && 'XRef' == $payload) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    $index_blocks = [];
                    $entries = \is_array($payload) ? array_values($payload) : [];
                    // read complete pairs only; a dangling last number is ignored
                    for ($m = 0, $last = \count($entries) - 1; $m < $last; $m += 2) {
                        $first = (int) ($entries[$m][1] ?? 0);
                        $count = (int) ($entries[$m + 1][1] ?? 0);
                        // a block without objects would never be used up, so it is dropped
                        if ($count > 0) {
                            $index_blocks[] = [$first, $count];
                        }
                    }
                    break;

                case 'Prev':
                    if ('numeric' == $value[0]) {
                        // get previous xref offset
                        $prevxref = (int) $payload;
                    }
                    break;

                case 'W':
                    for ($c = 0; $c < 3; ++$c) {
                        $wb[$c] = (int) ($payload[$c][1] ?? 0);
                    }
                    break;

                case 'DecodeParms':
                    $decpar = \is_array($payload) ? $payload : [];
                    foreach ($decpar as $kdc => $vdc) {
                        if ('/' != $vdc[0] || !isset($decpar[$kdc + 1]) || 'numeric' != $decpar[$kdc + 1][0]) {
                            continue;
                        }
                        if ('Columns' == $vdc[1]) {
                            $columns = (int) $decpar[$kdc + 1][1];
                        } elseif ('Predictor' == $vdc[1]) {
                            $predictor = (int) $decpar[$kdc + 1][1];
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && 'numeric' == $value[0]) {
                        $xref['trailer']['size'] = $payload;
                    }
                    break;

                case 'Root':
                case 'Info':
                case 'Encrypt':
                    if ($filltrailer && 'objref' == $value[0]) {
                        $xref['trailer'][strtolower($v[1])] = $payload;
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [
                            $payload[0][1] ?? null,
                            $payload[1][1] ?? null,
                        ];
                    }
                    break;
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            // convert the stream into a 0-based list of byte values
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            $bytes = false === $bytes ? [] : array_values($bytes);

            // number of bytes that make up one decoded row
            $rowlen = array_sum($wb);
            if ($rowlen <= 0) {
                // rows without any field (missing or all-zero /W) carry no entries
                $ddata = [];
            } elseif (null !== $predictor && $predictor >= 10) {
                // predictors 10 and above are the PNG ones, each row starts with a tag byte
                $ddata = $this->decodeXrefStreamPngRows($bytes, $columns);
            } else {
                // no predictor (or a non PNG one): rows are stored as they are
                $ddata = array_chunk($bytes, $rowlen);
            }

            // assemble the three big-endian fields of every row
            $sdata = [];
            foreach ($ddata as $k => $row) {
                $fields = [0, 0, 0];
                if (0 === $wb[0]) {
                    // default type field
                    $fields[0] = 1;
                }
                $i = 0; // count bytes in the row
                for ($c = 0; $c < 3; ++$c) {
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $fields[$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
                $sdata[$k] = $fields;
            }

            // fill xref, starting with the first object number of the first /Index entry
            $obj_num = [] !== $index_blocks ? $index_blocks[0][0] : 0;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // an object that is already known is never overwritten
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // 0 = (f) free objects, anything else = null objects
                        break;
                }

                ++$obj_num;
                if ([] !== $index_blocks) {
                    // reduce the number of remaining objects of the current /Index entry
                    --$index_blocks[0][1];
                    if ($index_blocks[0][1] <= 0) {
                        // entry used up: continue with the first object number of the next one;
                        // once no entry is left, object numbers simply keep counting up
                        array_shift($index_blocks);
                        if ([] !== $index_blocks) {
                            $obj_num = $index_blocks[0][0];
                        }
                    }
                }
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Undo the PNG predictor on the rows of a cross-reference stream.
     *
     * Every row consists of one tag byte selecting the PNG filter type followed
     * by $columns data bytes. One byte per pixel is assumed, so the "left"
     * neighbour is the previously reconstructed byte of the same row.
     *
     * @param array<int> $bytes   Stream content as a 0-based list of byte values
     * @param int        $columns Number of data bytes per row (/Columns)
     *
     * @return array<int, array<int>> reconstructed rows, without their tag byte
     *
     * @throws \Exception if unknown PNG predictor detected
     */
    private function decodeXrefStreamPngRows(array $bytes, int $columns): array
    {
        // number of bytes in a row: the tag byte plus the data bytes
        $rowlen = $columns + 1;
        if ($rowlen < 1) {
            return [];
        }

        $rows = [];
        // the row above the first one counts as all zeros
        $prev_row = $columns > 0 ? array_fill(0, $columns, 0) : [];

        foreach (array_chunk($bytes, $rowlen) as $k => $row) {
            // get PNG predictor value
            $algorithm = 10 + $row[0];
            $decoded = [];

            for ($i = 1; $i <= $columns; ++$i) {
                $j = $i - 1;
                // a truncated last row is padded with zeros
                $current = $row[$i] ?? 0;
                $row_up = $prev_row[$j] ?? 0;
                if (0 === $j) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    // the neighbours are reconstructed bytes, never encoded ones
                    $row_left = $decoded[$j - 1];
                    $row_upleft = $prev_row[$j - 1] ?? 0;
                }

                switch ($algorithm) {
                    case 10:  // PNG prediction (on encoding, PNG None on all rows)
                        $value = $current;
                        break;

                    case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                        $value = $current + $row_left;
                        break;

                    case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                        $value = $current + $row_up;
                        break;

                    case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                        // integer division: the average is rounded down
                        $value = $current + intdiv($row_left + $row_up, 2);
                        break;

                    case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                        // initial estimate
                        $p = $row_left + $row_up - $row_upleft;
                        // distances
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        // nearest neighbour wins, ties are resolved in the order left, up, upper left
                        if ($pa <= $pb && $pa <= $pc) {
                            $value = $current + $row_left;
                        } elseif ($pb <= $pc) {
                            $value = $current + $row_up;
                        } else {
                            $value = $current + $row_upleft;
                        }
                        break;

                    default:  // PNG prediction (on encoding, PNG optimum)
                        throw new \Exception('Unknown PNG predictor: '.$algorithm);
                }

                $decoded[$j] = $value & 0xFF;
            }

            $rows[$k] = $decoded;
            $prev_row = $decoded;
        }

        return $rows;
    }
502
503 65
    /**
     * Build the regular expression that matches the header of an indirect
     * object, i.e. "[object number] [generation number] obj".
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return string regular expression delimited by '/'
     */
    protected function getObjectHeaderPattern(array $objRefs): string
    {
        // consider all whitespace character (PDF specifications)
        $whitespace = $this->config->getPdfWhitespacesRegex();

        // both parts are taken from the document, so they are quoted before
        // they become part of the expression
        $objectNumber = preg_quote((string) $objRefs[0], '/');
        $generationNumber = preg_quote((string) $objRefs[1], '/');

        return '/'.$objectNumber.$whitespace.$generationNumber.$whitespace.'obj/';
    }
508
509 65
    /**
     * Compute the length of an object header such as "4 0 obj".
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return int length of both numbers plus two separators and the word "obj"
     */
    protected function getObjectHeaderLen(array $objRefs): int
    {
        $numbersLength = \strlen($objRefs[0]) + \strlen($objRefs[1]);

        // one whitespace after each number (2) + strlen("obj") (3) = 5
        $fixedLength = 5;

        return $fixedLength + $numbersLength;
    }
515
516
    /**
     * Get content of indirect object.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Xref data, handed on to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data
     *
     * @throws \Exception if invalid object reference found
     */
    protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array
    {
        // split the reference into [object number, generation number]
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new \Exception('Invalid object reference for $obj.');
        }
        $headerLength = $this->getObjectHeaderLen($refParts);

        // move to the object number: skip whitespace first, then its leading zeros
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        // check that "[object number] [generation number] obj" really starts here
        if (!preg_match($this->getObjectHeaderPattern($refParts), substr($pdfData, $offset, $headerLength))) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // the object content starts right behind the header
        $offset += $headerLength;

        $elements = [];
        // previously read element if it was a dictionary ('<<'), null otherwise
        $dictionary = null;
        while (true) {
            $startedAt = $offset;

            $element = $this->getRawObject($pdfData, $offset, null !== $dictionary ? $dictionary[1] : null);
            $offset = $element[2];

            // decode stream using stream's dictionary information
            if ($decoding && null !== $dictionary && 'stream' === $element[0]) {
                $element[3] = $this->decodeStream($pdfData, $xref, $dictionary[1], $element[1]);
            }

            $elements[] = $element;
            $dictionary = (isset($element[0]) && '<<' === $element[0]) ? $element : null;

            // stop at the closing keyword, or when no progress was made
            if ('endobj' === $element[0] || $offset === $startedAt) {
                break;
            }
        }

        // the element read last (the closing delimiter) is not part of the content
        array_pop($elements);

        return $elements;
    }
582
583
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Map from object reference to object offset
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws \Exception
     */
    protected function getObjectVal(string $pdfData, $xref, array $obj): array
    {
        // anything that is not a reference to an indirect object is returned as it is
        if ('objref' != $obj[0]) {
            return $obj;
        }

        $reference = $obj[1];

        // this object has been already parsed
        if (isset($this->objects[$reference])) {
            return $this->objects[$reference];
        }

        // NOTE(review): decodeXref() and decodeXrefStream() store offsets under
        // $xref['xref'], whereas this lookup reads $xref[<reference>] directly -
        // confirm which of the two shapes callers are expected to pass.
        if (!isset($xref[$reference])) {
            return $obj;
        }

        // parse new object (without decoding its streams) and remember it
        $parsed = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);
        $this->objects[$reference] = $parsed;

        return $parsed;
    }
610
611
    /**
612
     * Get object type, raw value and offset to next object
613
     *
614
     * @param int        $offset    Object offset
615
     * @param array|null $headerDic obj header's dictionary, parsed by getRawObject. Used for stream parsing optimization
616
     *
617
     * @return array containing object type, raw value and offset to next object
618
     */
619 66
    protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
    {
        // Reads exactly one raw object (token) starting at $offset and returns the
        // triple [type, value, offset just past the consumed data]. Arrays and
        // dictionaries are read recursively: their value is the list of element
        // triples, without the closing delimiter.
        $objtype = ''; // object type to be returned
        $objval = ''; // object value to be returned

        // skip initial white space chars
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);

        // Nothing left to read (truncated data or only trailing white space).
        // Return the "empty" triple instead of reading an undefined string offset;
        // callers looping over elements stop because the offset did not advance.
        if (!isset($pdfData[$offset])) {
            return [$objtype, $objval, $offset];
        }

        // get first char
        $char = $pdfData[$offset];

        // get object type
        switch ($char) {
            case '%':  // \x25 PERCENT SIGN
                // skip comment and search for next token
                $next = strcspn($pdfData, "\r\n", $offset);
                if ($next > 0) {
                    $offset += $next;

                    // keep the header dictionary: the token after the comment
                    // may be the stream that needs it
                    return $this->getRawObject($pdfData, $offset, $headerDic);
                }
                break;

            case '/':  // \x2F SOLIDUS
                // name object: runs until the next white space or delimiter,
                // looking at most 256 bytes ahead
                $objtype = $char;
                ++$offset;
                $span = strcspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20\n\t\r\v\f\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25", $offset, 256);
                if ($span > 0) {
                    $objval = substr($pdfData, $offset, $span); // unescaped value
                    $offset += $span;
                }
                break;

            case '(':   // \x28 LEFT PARENTHESIS
            case ')':  // \x29 RIGHT PARENTHESIS
                // literal string object
                $objtype = $char;
                ++$offset;
                $strpos = $offset;
                if ('(' === $char) {
                    // scan up to the matching closing parenthesis: nested pairs
                    // are counted, a backslash hides the character after it
                    $open_bracket = 1;
                    while ($open_bracket > 0) {
                        if (!isset($pdfData[$strpos])) {
                            // unterminated string: stop at the end of the data
                            break;
                        }
                        $ch = $pdfData[$strpos];
                        switch ($ch) {
                            case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
                                // skip next character
                                ++$strpos;
                                break;

                            case '(':  // LEFT PARENTHESIS (28h)
                                ++$open_bracket;
                                break;

                            case ')':  // RIGHT PARENTHESIS (29h)
                                --$open_bracket;
                                break;
                        }
                        ++$strpos;
                    }
                    // value excludes the closing parenthesis consumed by the loop
                    $objval = substr($pdfData, $offset, $strpos - $offset - 1);
                    $offset = $strpos;
                }
                break;

            case '[':   // \x5B LEFT SQUARE BRACKET
            case ']':  // \x5D RIGHT SQUARE BRACKET
                // array object
                $objtype = $char;
                ++$offset;
                if ('[' === $char) {
                    // get array content
                    $objval = [];
                    do {
                        $oldOffset = $offset;
                        // get element
                        $element = $this->getRawObject($pdfData, $offset);
                        $offset = $element[2];
                        $objval[] = $element;
                        // stop on the closing bracket, or when no progress was
                        // made (prevents an endless loop on unparsable data)
                    } while ((']' !== $element[0]) && ($offset !== $oldOffset));
                    // remove closing delimiter
                    array_pop($objval);
                }
                break;

            case '<':  // \x3C LESS-THAN SIGN
            case '>':  // \x3E GREATER-THAN SIGN
                if (isset($pdfData[$offset + 1]) && ($pdfData[$offset + 1] === $char)) {
                    // dictionary object
                    $objtype = $char.$char;
                    $offset += 2;
                    if ('<' === $char) {
                        // get dictionary content (flat list of keys and values)
                        $objval = [];
                        do {
                            $oldOffset = $offset;
                            // get element
                            $element = $this->getRawObject($pdfData, $offset);
                            $offset = $element[2];
                            $objval[] = $element;
                        } while (('>>' !== $element[0]) && ($offset !== $oldOffset));
                        // remove closing delimiter
                        array_pop($objval);
                    }
                } else {
                    // hexadecimal string object
                    $objtype = $char;
                    ++$offset;

                    $span = strspn($pdfData, "0123456789abcdefABCDEF\x09\x0a\x0c\x0d\x20", $offset);
                    $dataToCheck = $pdfData[$offset + $span] ?? null;
                    if ('<' === $char && $span > 0 && '>' === $dataToCheck) {
                        // Remove white space characters. strtr() with an empty
                        // replacement string changes nothing, so the bytes the
                        // mask above lets through are stripped explicitly.
                        $objval = str_replace(
                            ["\x09", "\x0a", "\x0c", "\x0d", "\x20"],
                            '',
                            substr($pdfData, $offset, $span)
                        );
                        $offset += $span + 1;
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
                        // not a well-formed hex string: jump past the next '>'
                        $offset = $endpos + 1;
                    }
                }
                break;

            default:
                if ('endobj' === substr($pdfData, $offset, 6)) {
                    // indirect object
                    $objtype = 'endobj';
                    $offset += 6;
                } elseif ('null' === substr($pdfData, $offset, 4)) {
                    // null object
                    $objtype = 'null';
                    $offset += 4;
                    $objval = 'null';
                } elseif ('true' === substr($pdfData, $offset, 4)) {
                    // boolean true object
                    $objtype = 'boolean';
                    $offset += 4;
                    $objval = 'true';
                } elseif ('false' === substr($pdfData, $offset, 5)) {
                    // boolean false object
                    $objtype = 'boolean';
                    $offset += 5;
                    $objval = 'false';
                } elseif ('stream' === substr($pdfData, $offset, 6)) {
                    // start stream object
                    $objtype = 'stream';
                    $offset += 6;
                    // the keyword must be followed by an end-of-line marker
                    if (1 === preg_match('/^( *[\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
                        $offset += \strlen($matches[0]);

                        // we get stream length here to later help preg_match test less data
                        $streamLen = (int) $this->getHeaderValue($headerDic, 'Length', 'numeric', 0);
                        // image XObject content is dropped unless the config asks to retain it
                        $skip = false === $this->config->getRetainImageContent()
                            && 'XObject' === $this->getHeaderValue($headerDic, 'Type', '/')
                            && 'Image' === $this->getHeaderValue($headerDic, 'Subtype', '/');

                        // search for the end marker, starting after the declared length
                        $pregResult = preg_match(
                            '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
                            $pdfData,
                            $matches,
                            \PREG_OFFSET_CAPTURE,
                            $offset + $streamLen
                        );

                        if (1 === $pregResult) {
                            $objval = $skip ? '' : substr($pdfData, $offset, $matches[0][1] - $offset);
                            // leave the offset on the 'endstream' keyword so that
                            // the next call returns it as its own token
                            $offset = $matches[1][1];
                        }
                    }
                } elseif ('endstream' === substr($pdfData, $offset, 9)) {
                    // end stream object
                    $objtype = 'endstream';
                    $offset += 9;
                } elseif (1 === preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
                    // indirect object reference
                    $objtype = 'objref';
                    $offset += \strlen($matches[0]);
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
                } elseif (1 === preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
                    // object start
                    $objtype = 'obj';
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
                    $offset += \strlen($matches[0]);
                } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
                    // numeric object
                    $objtype = 'numeric';
                    $objval = substr($pdfData, $offset, $numlen);
                    $offset += $numlen;
                }
                break;
        }

        return [$objtype, $objval, $offset];
    }
811
812
    /**
     * Get value of an object header's section (obj << YYY >> part ).
     *
     * It is similar to Header::get('...')->getContent(), the only difference is it can be used during the parsing process,
     * when no Smalot\PdfParser\Header objects are created yet.
     *
     * @param array|null        $headerDic header dictionary as returned by getRawObject (list of element triples), or null
     * @param string            $key       header's section name
     * @param string            $type      type of the section (i.e. 'numeric', '/', '<<', etc.)
     * @param string|array|null $default   default value for header's section
     *
     * @return string|array|null value of obj header's section, or default value if none found, or its type doesn't match $type param
     */
    private function getHeaderValue(?array $headerDic, string $key, string $type, $default = '')
    {
        if (null === $headerDic) {
            return $default;
        }

        /*
         * It receives the dictionary of header fields, as it is returned by RawDataParser::getRawObject,
         * and iterates over it, searching for a section of type '/' with the requested key.
         * If such a section is found, it takes its value (the next object in the dictionary)
         * and returns it if it matches the requested type, or the default value otherwise.
         *
         * Comparisons are strict on purpose: with loose comparison two different
         * numeric-looking names (e.g. '1e1' and '10') would be treated as equal.
         */
        foreach ($headerDic as $i => $val) {
            $isSectionName = \is_array($val) && 3 === \count($val) && '/' === $val[0];
            if (!$isSectionName || $val[1] !== $key || !isset($headerDic[$i + 1])) {
                continue;
            }

            // only the first occurrence of the key is considered
            $candidate = $headerDic[$i + 1];
            $isSectionValue = \is_array($candidate) && 1 < \count($candidate);

            return $isSectionValue && $type === $candidate[0]
                ? $candidate[1]
                : $default;
        }

        return $default;
    }
853
854
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * @param string $pdfData raw PDF data
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data, or ['Unix' => true] when the
     *               xref keyword is only found at the expected position after
     *               converting "\r\n" line endings to "\n"
     *
     * @throws \Exception if it was unable to find startxref
     * @throws \Exception if it was unable to find xref
     */
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
    {
        // If the $offset is currently pointed at whitespace, bump it
        // forward until it isn't; affects loosely targeted offsets
        // for the 'xref' keyword
        // See: https://github.com/smalot/pdfparser/issues/673
        $bumpOffset = $offset;
        while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) {
            ++$bumpOffset;
        }

        // Find all startxref tables from this $offset forward
        $startxrefPreg = preg_match_all(
            '/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
            $pdfData,
            $startxrefMatches,
            \PREG_SET_ORDER,
            $offset
        );

        // A failed call (non-integer result) is handled like "no match"
        if (!\is_int($startxrefPreg) || $startxrefPreg < 1) {
            // No startxref tables were found
            throw new \Exception('Unable to find startxref');
        }

        // strpos() results are compared strictly so that "not found" (false)
        // can never be mistaken for position 0
        if (0 === $offset) {
            // Use the last startxref in the document
            $startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1];
        } elseif (strpos($pdfData, 'xref', $bumpOffset) === $bumpOffset) {
            // Already pointing at the xref table
            $startxref = $bumpOffset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) {
            // Cross-Reference Stream object
            // NOTE(review): the pattern is not anchored, so it matches an object
            // header anywhere after $bumpOffset, not only directly at it - confirm
            // this is intended before tightening it.
            $startxref = $bumpOffset;
        } else {
            // Use the next startxref from this $offset
            $startxref = (int) $startxrefMatches[0][1];
        }

        if ($startxref > \strlen($pdfData)) {
            throw new \Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Check if the $pdfData might have the wrong line-endings
            $pdfDataUnix = str_replace("\r\n", "\n", $pdfData);
            if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) === $startxref) {
                // Return Unix-line-ending flag
                $xref = ['Unix' => true];
            } else {
                // Cross-Reference Stream
                $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
            }
        }

        if (empty($xref)) {
            throw new \Exception('Unable to find xref');
        }

        return $xref;
    }
927
928
    /**
     * Parses PDF data and returns extracted data as array.
     *
     * Everything before the first "%PDF-" marker is discarded before parsing.
     *
     * @param string $data PDF data to parse
     *
     * @return array two elements: the xref/trailer data and the array of parsed PDF document objects
     *
     * @throws \Exception if empty PDF data given
     * @throws \Exception if PDF data missing %PDF header
     */
    public function parseData(string $data): array
    {
        // compare against the empty string: empty() would also reject the string '0'
        if ('' === $data) {
            throw new \Exception('Empty PDF data given.');
        }

        // find the pdf header starting position
        $trimpos = strpos($data, '%PDF-');
        if (false === $trimpos) {
            throw new \Exception('Invalid PDF data: missing %PDF header.');
        }

        // get PDF content string
        $pdfData = $trimpos > 0 ? substr($data, $trimpos) : $data;

        // get xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // If we found Unix line-endings
        if (isset($xref['Unix'])) {
            // the xref offsets only fit the data with "\r\n" collapsed to "\n",
            // so normalize it and read the xref data again
            $pdfData = str_replace("\r\n", "\n", $pdfData);
            $xref = $this->getXrefData($pdfData);
        }

        // parse all document objects
        $objects = [];
        // tolerate xref data without an 'xref' table: no objects are decoded then
        foreach ($xref['xref'] ?? [] as $obj => $offset) {
            if (!isset($objects[$obj]) && ($offset > 0)) {
                // decode objects with positive offset
                $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true);
            }
        }

        return [$xref, $objects];
    }
971
}
972