Passed
Pull Request — master (#500)
by
unknown
02:02
created

RawDataParser::decodeXrefStream()   F

Complexity

Conditions 83
Paths 1632

Size

Total Lines 258
Code Lines 163

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 122
CRAP Score 127.8321

Importance

Changes 4
Bugs 1 Features 1
Metric Value
cc 83
eloc 163
c 4
b 1
f 1
nc 1632
nop 3
dl 0
loc 258
ccs 122
cts 150
cp 0.8133
crap 127.8321
rs 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
use Smalot\PdfParser\Config;
45
46
class RawDataParser
47
{
48
    /**
49
     * @var \Smalot\PdfParser\Config
50
     */
51
    private $config;
52
53
    /**
54
     * Configuration array.
55
     */
56
    protected $cfg = [
57
        // if `true` ignore filter decoding errors
58
        'ignore_filter_decoding_errors' => true,
59
        // if `true` ignore missing filter decoding errors
60
        'ignore_missing_filter_decoders' => true,
61
    ];
62
63
    protected $filterHelper;
64
    protected $objects;
65
66
    /**
     * Create a parser, layering caller-supplied options over the built-in defaults.
     *
     * @param array       $cfg    Option overrides merged over the defaults in $this->cfg, default is []
     * @param Config|null $config Parser configuration; a fresh Config is created when none is passed
     */
    public function __construct($cfg = [], Config $config = null)
    {
        // keys present in $cfg replace the defaults of the same name
        $mergedOptions = array_merge($this->cfg, $cfg);
        $this->cfg = $mergedOptions;

        $this->filterHelper = new FilterHelper();

        if ($config) {
            $this->config = $config;
        } else {
            $this->config = new Config();
        }
    }
77
78
    /**
     * Decode the specified stream.
     *
     * The stream is first cut down to the /Length declared in its dictionary (only when that
     * is shorter than the raw data). Every filter named by /Filter is then applied in order.
     * Filters that the filter helper does not report as available are not applied; their names
     * are returned so the caller can see what is still encoded.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Xref data, handed to getObjectVal() to resolve the /Filter value
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching `ignore_*` option is disabled;
     *                   the original exception is attached as "previous"
     */
    protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream): array
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name objects that are followed by a value are of interest
            if ('/' != $v[0] || !isset($sdic[($k + 1)])) {
                continue;
            }
            $value = $sdic[($k + 1)];

            if (('Length' == $v[1]) && ('numeric' == $value[0])) {
                // get declared stream length
                $declength = (int) ($value[1]);
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $value);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters())) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // A message starting with '~' is governed by `ignore_missing_filter_decoders`,
                // anything else by `ignore_filter_decoding_errors`. The emptiness check avoids
                // reading offset 0 of an empty message.
                $isMissingDecoder = ('' !== $emsg) && ('~' == $emsg[0]);
                $ignore = $isMissingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // keep the original exception reachable through getPrevious()
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
146
147
    /**
     * Decode the Cross-Reference section
     *
     * Walks the entry lines that follow the 'xref' keyword, then reads the trailer
     * dictionary found after them. When the trailer carries a /Prev offset, the older
     * section is loaded through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found after the entries
     */
    protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
    {
        // 4 is the length of the word 'xref'; white space after it is skipped as well
        $tableStart = $startxref + 4;
        $offset = $tableStart + strspn($pdfData, $this->config->getPdfWhitespaces(), $tableStart);

        // running object number, reset by every subsection header
        $obj_num = 0;
        while (preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $line, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($line[0][1] != $offset) {
                // the next match is not contiguous: we are on another section
                break;
            }
            $offset += \strlen($line[0][0]);

            switch ($line[3][0]) {
                case 'n':
                    // in-use entry, keyed as [object number]_[generation number]
                    $index = $obj_num.'_'.(int) ($line[2][0]);
                    if (!isset($xref['xref'][$index])) {
                        // first definition wins: store object offset position
                        $xref['xref'][$index] = (int) ($line[1][0]);
                    }
                    ++$obj_num;
                    break;

                case 'f':
                    // free entry: nothing stored, but it still occupies an object number
                    ++$obj_num;
                    break;

                default:
                    // subsection header: first number is the starting object number
                    $obj_num = (int) ($line[1][0]);
                    break;
            }
        }

        // get trailer data
        if (!(preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $found, \PREG_OFFSET_CAPTURE, $offset) > 0)) {
            throw new Exception('Unable to find trailer');
        }
        $trailer_data = $found[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $trailer = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $m) > 0) {
                $trailer['size'] = (int) ($m[1]);
            }

            // indirect references, stored as [object number]_[generation number]
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailer_data, $m) > 0) {
                    $trailer[$key] = (int) ($m[1]).'_'.(int) ($m[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $m) > 0) {
                $trailer['id'] = [$m[1], $m[2]];
            }

            $xref['trailer'] = $trailer;
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $m) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) ($m[1]), $xref);
        }

        return $xref;
    }
223
224
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream object at $startxref, collects the entries of its dictionary,
     * undoes the PNG predictor when /DecodeParms declares one, splits the bytes into
     * the three fields sized by /W and records every entry in $xref['xref'].
     * When the dictionary carries a /Prev offset the older section is loaded through
     * getXrefData().
     *
     * Nothing is decoded unless the dictionary is of /Type /XRef, a /W entry was found
     * and a decoded stream is present.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version: the trailer is filled once, by the first section decoded
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $sarr = $xrefcrs[0][1] ?? null;
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        $dict = $this->readXrefStreamDictionary($sarr, $filltrailer);
        if ($filltrailer) {
            $xref['trailer'] = $dict['trailer'];
        }

        // decode data
        $wb = $dict['wb'];
        if ($dict['valid'] && (3 === \count($wb)) && isset($xrefcrs[1][3][0])) {
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (!\is_array($bytes)) {
                $bytes = [];
            }

            if (null !== $dict['predictor']) {
                $ddata = $this->undoPngPrediction($bytes, $dict['columns']);
            } else {
                // number of bytes in a row; a zero width would make array_chunk() fail
                $rowlen = (int) array_sum($wb);
                $ddata = ($rowlen > 0) ? array_chunk($bytes, $rowlen) : [];
            }

            $sdata = $this->unpackXrefFields($ddata, $wb);

            // fill xref
            $index_blocks = $dict['index_blocks'];
            // load the first object number of the first /Index entry, or start at 0 without /Index
            $obj_num = ([] !== $index_blocks) ? $index_blocks[0][0] : 0;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;

                if ([] !== $index_blocks) {
                    // reduce the number of remaining objects
                    --$index_blocks[0][1];
                    if (0 == $index_blocks[0][1]) {
                        // remove the actual used /Index entry
                        array_shift($index_blocks);
                        if ([] !== $index_blocks) {
                            // load the first object number of the following /Index entry
                            $obj_num = $index_blocks[0][0];
                        }
                    }
                }
            }
        } // end decoding data

        if (null !== $dict['prevxref']) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $dict['prevxref'], $xref);
        }

        return $xref;
    }

    /**
     * Collect the entries of a cross-reference stream dictionary that decoding needs.
     *
     * @param array $sarr        Flat list of parsed dictionary elements (name followed by its value)
     * @param bool  $filltrailer Whether the trailer keys (Size, Root, Info, Encrypt, ID) are collected
     *
     * @return array with keys `valid` (bool, /Type is /XRef), `columns` (int), `predictor` (int|null),
     *               `wb` (three field widths, or [] when /W is absent), `index_blocks`
     *               (list of [first object number, count]), `prevxref` (int|null) and `trailer` (array)
     */
    private function readXrefStreamDictionary(array $sarr, bool $filltrailer): array
    {
        $dict = [
            'valid' => false,
            'columns' => 0,
            'predictor' => null,
            'wb' => [],
            'index_blocks' => [],
            'prevxref' => null,
            'trailer' => [],
        ];

        foreach ($sarr as $k => $v) {
            // every entry of interest is a name object followed by a value
            if ('/' != $v[0] || !isset($sarr[($k + 1)])) {
                continue;
            }
            $value = $sarr[($k + 1)];

            switch ($v[1]) {
                case 'Type':
                    if (('/' == $value[0]) && ('XRef' == $value[1])) {
                        $dict['valid'] = true;
                    }
                    break;

                case 'Index':
                    // list of: first object number in the subsection / number of objects
                    $dict['index_blocks'] = [];
                    $pairs = \is_array($value[1]) ? $value[1] : [];
                    // count once; a trailing unpaired number is ignored
                    for ($m = 0, $n = \count($pairs); ($m + 1) < $n; $m += 2) {
                        $dict['index_blocks'][] = [(int) ($pairs[$m][1]), (int) ($pairs[$m + 1][1])];
                    }
                    break;

                case 'Prev':
                    if ('numeric' == $value[0]) {
                        // get previous xref offset
                        $dict['prevxref'] = (int) ($value[1]);
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of the corresponding field
                    $dict['wb'] = [
                        (int) ($value[1][0][1] ?? 0),
                        (int) ($value[1][1][1] ?? 0),
                        (int) ($value[1][2][1] ?? 0),
                    ];
                    break;

                case 'DecodeParms':
                    if (!isset($value[1]) || !\is_array($value[1])) {
                        break;
                    }
                    $decpar = $value[1];
                    foreach ($decpar as $kdc => $vdc) {
                        if ('/' != $vdc[0] || !isset($decpar[($kdc + 1)]) || 'numeric' != $decpar[($kdc + 1)][0]) {
                            continue;
                        }
                        if ('Columns' == $vdc[1]) {
                            $dict['columns'] = (int) ($decpar[($kdc + 1)][1]);
                        } elseif ('Predictor' == $vdc[1]) {
                            $dict['predictor'] = (int) ($decpar[($kdc + 1)][1]);
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && ('numeric' == $value[0])) {
                        $dict['trailer']['size'] = $value[1];
                    }
                    break;

                case 'Root':
                    if ($filltrailer && ('objref' == $value[0])) {
                        $dict['trailer']['root'] = $value[1];
                    }
                    break;

                case 'Info':
                    if ($filltrailer && ('objref' == $value[0])) {
                        $dict['trailer']['info'] = $value[1];
                    }
                    break;

                case 'Encrypt':
                    if ($filltrailer && ('objref' == $value[0])) {
                        $dict['trailer']['encrypt'] = $value[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $dict['trailer']['id'] = [
                            $value[1][0][1] ?? null,
                            $value[1][1][1] ?? null,
                        ];
                    }
                    break;
            }
        }

        return $dict;
    }

    /**
     * Undo PNG prediction on a byte list.
     *
     * Each row is one tag byte (0-4, selecting None/Sub/Up/Average/Paeth) followed by
     * $columns data bytes. "Left" is the byte already reconstructed in the current row,
     * "up" and "up-left" come from the previously reconstructed row; all are 0 where
     * no such byte exists.
     *
     * @param array $bytes   Stream bytes as integers
     * @param int   $columns Number of data bytes per row
     *
     * @return array list of rows, each a 0-indexed list of reconstructed bytes
     *
     * @throws Exception if unknown PNG predictor detected
     */
    private function undoPngPrediction(array $bytes, int $columns): array
    {
        // number of bytes in a row (never below 1 so that array_chunk() accepts it)
        $rowlen = max(1, $columns + 1);
        $ddata = [];
        // initialize first row with zeros
        $prev_row = array_fill(0, $rowlen, 0);

        foreach (array_chunk($bytes, $rowlen) as $k => $row) {
            $ddata[$k] = [];
            // get PNG predictor value
            $predictor = (10 + $row[0]);

            for ($i = 1; $i <= $columns; ++$i) {
                $j = ($i - 1);
                // a truncated final row contributes 0 for its missing bytes
                $cur = $row[$i] ?? 0;
                $row_up = $prev_row[$j];
                if (1 == $i) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    // the reconstructed neighbour, not the still-encoded $row[$i - 1]
                    $row_left = $ddata[$k][($j - 1)];
                    $row_upleft = $prev_row[($j - 1)];
                }

                switch ($predictor) {
                    case 10:  // PNG prediction (on encoding, PNG None on all rows)
                        $ddata[$k][$j] = $cur;
                        break;

                    case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                        $ddata[$k][$j] = (($cur + $row_left) & 0xff);
                        break;

                    case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                        $ddata[$k][$j] = (($cur + $row_up) & 0xff);
                        break;

                    case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                        // integer division keeps the operand of & an int
                        $ddata[$k][$j] = (($cur + intdiv($row_left + $row_up, 2)) & 0xff);
                        break;

                    case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                        // initial estimate
                        $p = ($row_left + $row_up - $row_upleft);
                        // distances
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        // nearest neighbour, ties resolved in the order left, up, up-left
                        if (($pa <= $pb) && ($pa <= $pc)) {
                            $nearest = $row_left;
                        } elseif ($pb <= $pc) {
                            $nearest = $row_up;
                        } else {
                            $nearest = $row_upleft;
                        }
                        $ddata[$k][$j] = (($cur + $nearest) & 0xff);
                        break;

                    default:  // PNG prediction (on encoding, PNG optimum)
                        throw new Exception('Unknown PNG predictor: '.$predictor);
                }
            }
            $prev_row = $ddata[$k];
        }

        return $ddata;
    }

    /**
     * Combine the bytes of every row into the three big-endian fields sized by /W.
     *
     * @param array $ddata List of rows, each a 0-indexed list of bytes
     * @param array $wb    The three field widths in bytes
     *
     * @return array list of [type, field 2, field 3]; the type defaults to 1 when its width is 0
     */
    private function unpackXrefFields(array $ddata, array $wb): array
    {
        $sdata = [];
        foreach ($ddata as $k => $row) {
            $sdata[$k] = [0, 0, 0];
            if (0 == $wb[0]) {
                // default type field
                $sdata[$k][0] = 1;
            }
            $i = 0; // count bytes in the row
            for ($c = 0; $c < 3; ++$c) {
                for ($b = 0; $b < $wb[$c]; ++$b) {
                    if (isset($row[$i])) {
                        // most significant byte first
                        $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                    }
                    ++$i;
                }
            }
        }

        return $sdata;
    }
494
495 35
    /**
     * Build the regular expression matching an object header such as "4 0 obj".
     *
     * @param array $objRefs Object number at index 0 and generation number at index 1
     *
     * @return string delimited pattern: both numbers and the word 'obj', each separated by
     *                the whitespace expression supplied by the configuration
     */
    protected function getObjectHeaderPattern(array $objRefs): string
    {
        // consider all whitespace character (PDF specifications)
        $parts = [
            '/',
            $objRefs[0],
            $this->config->getPdfWhitespacesRegex(),
            $objRefs[1],
            $this->config->getPdfWhitespacesRegex(),
            'obj',
            '/',
        ];

        return implode('', $parts);
    }
500
501 35
    /**
     * Length of an object header written with single separators, e.g. "4 0 obj".
     *
     * @param array $objRefs Object number at index 0 and generation number at index 1
     *
     * @return int number of bytes in "<object number> <generation number> obj"
     */
    protected function getObjectHeaderLen(array $objRefs): int
    {
        // two separators plus strlen("obj") make up the fixed part
        $fixedPart = 2 + \strlen('obj');
        $objectNumberLen = \strlen($objRefs[0]);
        $generationLen = \strlen($objRefs[1]);

        return $fixedPart + $objectNumberLen + $generationLen;
    }
507
508
    /**
     * Get content of indirect object.
     *
     * Checks that the header "[object number] [generation number] obj" sits at $offset
     * (after white space and leading zeros), then reads raw elements until 'endobj' is
     * met or the read position stops advancing.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Xref data, passed on to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data; ['null', 'null', $offset] when no header is found
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array
    {
        // split "[object number]_[generation number]"
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $headerLen = $this->getObjectHeaderLen($refParts);

        // ignore whitespace characters at offset, then leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        $candidate = substr($pdfData, $offset, $headerLen);
        if (!preg_match($this->getObjectHeaderPattern($refParts), $candidate)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // starting position of object content
        $offset += $headerLen;

        $contents = [];
        $previous = null; // element read just before the current one
        while (true) {
            $before = $offset;
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // a stream is decoded with the dictionary that immediately precedes it
            $followsDictionary = isset($previous[0]) && ('<<' === $previous[0]);
            if ($decoding && ('stream' === $element[0]) && $followsDictionary) {
                $element[3] = $this->decodeStream($pdfData, $xref, $previous[1], $element[1]);
            }

            $contents[] = $element;
            $previous = $element;

            // stop at the closing keyword, or when the position did not advance
            if (('endobj' === $element[0]) || ($offset === $before)) {
                break;
            }
        }

        // remove closing delimiter
        array_pop($contents);

        return $contents;
    }
572
573
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Anything that is not an 'objref' is returned untouched. A reference is answered
     * from $this->objects when already parsed; otherwise it is parsed from the offset
     * stored under $xref[<reference>] and remembered. A reference found in neither place
     * is returned untouched as well.
     *
     * NOTE(review): other methods in this class store offsets under $xref['xref'][...],
     * while this lookup reads $xref[...] directly - confirm which shape callers pass here.
     *
     * @param string $pdfData PDF data
     * @param mixed  $xref    Lookup from object reference to offset (see note above)
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws Exception
     */
    protected function getObjectVal(string $pdfData, $xref, array $obj): array
    {
        if ('objref' != $obj[0]) {
            // direct value, nothing to resolve
            return $obj;
        }

        $ref = $obj[1];

        if (isset($this->objects[$ref])) {
            // this object has been already parsed
            return $this->objects[$ref];
        }

        if (!isset($xref[$ref])) {
            // unknown reference: hand it back as is
            return $obj;
        }

        // parse new object and remember it
        $this->objects[$ref] = $this->getIndirectObject($pdfData, $xref, $ref, $xref[$ref], false);

        return $this->objects[$ref];
    }
600
601
    /**
602
     * Get object type, raw value and offset to next object
603
     *
604
     * @param int $offset Object offset
605
     *
606
     * @return array containing object type, raw value and offset to next object
607
     */
608 36
    protected function getRawObject(string $pdfData, int $offset = 0): array
609
    {
610 36
        $objtype = ''; // object type to be returned
611 36
        $objval = ''; // object value to be returned
612
613
        // skip initial white space chars
614 36
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
615
616
        // get first char
617 36
        $char = $pdfData[$offset];
618
        // get object type
619 36
        switch ($char) {
620 36
            case '%':  // \x25 PERCENT SIGN
621
                    // skip comment and search for next token
622 1
                    $next = strcspn($pdfData, "\r\n", $offset);
623 1
                    if ($next > 0) {
624 1
                        $offset += $next;
625
626 1
                        return $this->getRawObject($pdfData, $offset);
627
                    }
628
                    break;
629
630 36
            case '/':  // \x2F SOLIDUS
631
                    // name object
632 36
                    $objtype = $char;
633 36
                    ++$offset;
634 36
                    $pregResult = preg_match(
635 36
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
636 36
                        substr($pdfData, $offset, 256),
637
                        $matches
638
                    );
639 36
                    if (1 == $pregResult) {
640 36
                        $objval = $matches[1]; // unescaped value
641 36
                        $offset += \strlen($objval);
642
                    }
643 36
                    break;
644
645 36
            case '(':   // \x28 LEFT PARENTHESIS
646 36
            case ')':  // \x29 RIGHT PARENTHESIS
647
                    // literal string object
648 33
                    $objtype = $char;
649 33
                    ++$offset;
650 33
                    $strpos = $offset;
651 33
                    if ('(' == $char) {
652 33
                        $open_bracket = 1;
653 33
                        while ($open_bracket > 0) {
654 33
                            if (!isset($pdfData[$strpos])) {
655
                                break;
656
                            }
657 33
                            $ch = $pdfData[$strpos];
658 33
                            switch ($ch) {
659 33
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
660
                                        // skip next character
661 17
                                        ++$strpos;
662 17
                                        break;
663
664 33
                                case '(':  // LEFT PARENHESIS (28h)
665
                                        ++$open_bracket;
666
                                        break;
667
668 33
                                case ')':  // RIGHT PARENTHESIS (29h)
669 33
                                        --$open_bracket;
670 33
                                        break;
671
                            }
672 33
                            ++$strpos;
673
                        }
674 33
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
675 33
                        $offset = $strpos;
676
                    }
677 33
                    break;
678
679 36
            case '[':   // \x5B LEFT SQUARE BRACKET
680 36
            case ']':  // \x5D RIGHT SQUARE BRACKET
681
                // array object
682 35
                $objtype = $char;
683 35
                ++$offset;
684 35
                if ('[' == $char) {
685
                    // get array content
686 35
                    $objval = [];
687
                    do {
688 35
                        $oldOffset = $offset;
689
                        // get element
690 35
                        $element = $this->getRawObject($pdfData, $offset);
691 35
                        $offset = $element[2];
692 35
                        $objval[] = $element;
693 35
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
694
                    // remove closing delimiter
695 35
                    array_pop($objval);
696
                }
697 35
                break;
698
699 36
            case '<':  // \x3C LESS-THAN SIGN
700 36
            case '>':  // \x3E GREATER-THAN SIGN
701 36
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
702
                    // dictionary object
703 36
                    $objtype = $char.$char;
704 36
                    $offset += 2;
705 36
                    if ('<' == $char) {
706
                        // get array content
707 36
                        $objval = [];
708
                        do {
709 36
                            $oldOffset = $offset;
710
                            // get element
711 36
                            $element = $this->getRawObject($pdfData, $offset);
712 36
                            $offset = $element[2];
713 36
                            $objval[] = $element;
714 36
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
715
                        // remove closing delimiter
716 36
                        array_pop($objval);
717
                    }
718
                } else {
719
                    // hexadecimal string object
720 13
                    $objtype = $char;
721 13
                    ++$offset;
722 13
                    $pregResult = preg_match(
723 13
                        '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
724 13
                        substr($pdfData, $offset),
725
                        $matches
726
                    );
727 13
                    if (('<' == $char) && 1 == $pregResult) {
728
                        // remove white space characters
729 13
                        $objval = strtr($matches[1], $this->config->getPdfWhitespaces(), '');
730 13
                        $offset += \strlen($matches[0]);
731
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
732
                        $offset = $endpos + 1;
733
                    }
734
                }
735 36
                break;
736
737
            default:
738 36
                if ('endobj' == substr($pdfData, $offset, 6)) {
739
                    // indirect object
740 35
                    $objtype = 'endobj';
741 35
                    $offset += 6;
742 36
                } elseif ('null' == substr($pdfData, $offset, 4)) {
743
                    // null object
744 3
                    $objtype = 'null';
745 3
                    $offset += 4;
746 3
                    $objval = 'null';
747 36
                } elseif ('true' == substr($pdfData, $offset, 4)) {
748
                    // boolean true object
749 11
                    $objtype = 'boolean';
750 11
                    $offset += 4;
751 11
                    $objval = 'true';
752 36
                } elseif ('false' == substr($pdfData, $offset, 5)) {
753
                    // boolean false object
754 2
                    $objtype = 'boolean';
755 2
                    $offset += 5;
756 2
                    $objval = 'false';
757 36
                } elseif ('stream' == substr($pdfData, $offset, 6)) {
758
                    // start stream object
759 35
                    $objtype = 'stream';
760 35
                    $offset += 6;
761 35
                    if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
762 35
                        $offset += \strlen($matches[0]);
763 35
                        $pregResult = preg_match(
764 35
                            '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
765 35
                            substr($pdfData, $offset),
766
                            $matches,
767 35
                            \PREG_OFFSET_CAPTURE
768
                        );
769 35
                        if (1 == $pregResult) {
770 35
                            $objval = substr($pdfData, $offset, $matches[0][1]);
771 35
                            $offset += $matches[1][1];
772
                        }
773
                    }
774 36
                } elseif ('endstream' == substr($pdfData, $offset, 9)) {
775
                    // end stream object
776 35
                    $objtype = 'endstream';
777 35
                    $offset += 9;
778 36
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
779
                    // indirect object reference
780 35
                    $objtype = 'objref';
781 35
                    $offset += \strlen($matches[0]);
782 35
                    $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
783 36
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
784
                    // object start
785 6
                    $objtype = 'obj';
786 6
                    $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
787 6
                    $offset += \strlen($matches[0]);
788 36
                } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
789
                    // numeric object
790 35
                    $objtype = 'numeric';
791 35
                    $objval = substr($pdfData, $offset, $numlen);
792 35
                    $offset += $numlen;
793
                }
794 36
                break;
795
        }
796
797 36
        return [$objtype, $objval, $offset];
798
    }
799
800
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * With an $offset of 0 the last "startxref ... %%EOF" marker in the document
     * is used to locate the cross-reference data. With a non-zero $offset the
     * data is expected at that position: either a classic "xref" table, an
     * indirect object (cross-reference stream), or, as a fallback, the next
     * "startxref" marker found after $offset.
     *
     * @param string $pdfData PDF data, starting at the %PDF header
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';
        $pdfLength = \strlen($pdfData);

        if (0 === $offset) {
            // find last startxref
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $allMatches, \PREG_SET_ORDER);
            if (!\is_int($pregResult) || $pregResult < 1) {
                // no marker found, or the regex engine reported a failure
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($allMatches);
            $startxref = (int) $lastMatch[1];
        } elseif ($offset > $pdfLength) {
            // An offset past the end of the data (e.g. a corrupted pointer) would make
            // strpos() raise a ValueError on PHP 8; report it as a regular parser error.
            throw new Exception('Unable to find xref (PDF corrupted?)');
        } elseif (strpos($pdfData, 'xref', $offset) === $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found; a dedicated match array is used because preg_match()
            // overwrites its $matches argument even when it does not match.
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > $pdfLength) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position
        $xrefPos = strpos($pdfData, 'xref', $startxref);
        // A failed lookup combined with a $startxref of 0 has always been routed to
        // decodeXref() (false == 0 under the former loose comparison); that routing
        // is kept, but spelled out explicitly.
        if ($xrefPos === $startxref || (false === $xrefPos && 0 === $startxref)) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
865
866
    /**
     * Parses PDF data and returns extracted data as array.
     *
     * Everything before the first "%PDF-" marker is discarded, the
     * cross-reference data is loaded, and every object listed there with a
     * positive offset is decoded.
     *
     * @param string $data PDF data to parse
     *
     * @return array two-element list: the xref/trailer data and the array of parsed PDF document objects
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData(string $data): array
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the PDF header; anything in front of it is not part of the document
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }

        // PDF content string, starting at the header
        $pdfData = substr($data, $headerPos);

        // xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // decode every referenced object that has a positive offset
        $objects = [];
        foreach ($xref['xref'] as $obj => $offset) {
            if (isset($objects[$obj]) || !($offset > 0)) {
                continue;
            }

            $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true);
        }

        return [$xref, $objects];
    }
903
}
904