Passed
Pull Request — master (#615)
by Jeffrey
02:30
created

RawDataParser::parseData()   B

Complexity

Conditions 7
Paths 8

Size

Total Lines 26
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 10
CRAP Score 7.2269

Importance

Changes 3
Bugs 1 Features 0
Metric Value
cc 7
eloc 11
c 3
b 1
f 0
nc 8
nop 1
dl 0
loc 26
ccs 10
cts 12
cp 0.8333
crap 7.2269
rs 8.8333
1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 *
19
 * @date    2020-01-06
20
 *
21
 * @license LGPLv3
22
 *
23
 * @url     <https://github.com/smalot/pdfparser>
24
 *
25
 *  PdfParser is a pdf library written in PHP, extraction oriented.
26
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
27
 *
28
 *  This program is free software: you can redistribute it and/or modify
29
 *  it under the terms of the GNU Lesser General Public License as published by
30
 *  the Free Software Foundation, either version 3 of the License, or
31
 *  (at your option) any later version.
32
 *
33
 *  This program is distributed in the hope that it will be useful,
34
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
35
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
36
 *  GNU Lesser General Public License for more details.
37
 *
38
 *  You should have received a copy of the GNU Lesser General Public License
39
 *  along with this program.
40
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
41
 */
42
43
namespace Smalot\PdfParser\RawData;
44
45
use Smalot\PdfParser\Config;
46
47
class RawDataParser
48
{
49
    /**
50
     * @var \Smalot\PdfParser\Config
51
     */
52
    private $config;
53
54
    /**
55
     * Configuration array.
56
     */
57
    protected $cfg = [
58
        // if `true` ignore filter decoding errors
59
        'ignore_filter_decoding_errors' => true,
60
        // if `true` ignore missing filter decoding errors
61
        'ignore_missing_filter_decoders' => true,
62
    ];
63
64
    protected $filterHelper;
65
    protected $objects;
66
67
    /**
     * Create a parser with its own filter helper and configuration.
     *
     * @param array       $cfg    Overrides for the default configuration array ($this->cfg), default is []
     * @param Config|null $config Parser configuration; a fresh Config is created when none is given
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        // merge given array with default values (keys present in $cfg win)
        $this->cfg = array_merge($this->cfg, $cfg);

        $this->filterHelper = new FilterHelper();
        // the parameter is explicitly nullable: relying on "= null" alone is deprecated since PHP 8.4
        $this->config = $config ?? new Config();
    }
78
79
    /**
     * Decode the specified stream.
     *
     * Reads /Length and /Filter from the stream dictionary, truncates the stream
     * to the declared length when that is shorter than the raw data, and then
     * applies, in declaration order, every filter the filter helper knows.
     * Filters without a decoder are reported back untouched.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data, handed to getObjectVal() to resolve an indirect /Filter value
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws \Exception if a filter fails and the matching "ignore_*" option is disabled
     */
    protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream): array
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name objects followed by a value are of interest
            if ('/' != $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $next = $sdic[$k + 1];

            if ('Length' == $v[1]) {
                if ('numeric' == $next[0]) {
                    // get declared stream length
                    $declength = (int) $next[1];
                    if ($declength < $slength) {
                        $stream = substr($stream, 0, $declength);
                        $slength = $declength;
                    }
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                // a resolved indirect object is a list of elements: the filter definition is its first element
                if (isset($objval[0]) && \is_array($objval[0])) {
                    $objval = $objval[0];
                }
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        // both values are loop-invariant, so fetch them once
        $availableFilters = $this->filterHelper->getAvailableFilters();
        $memoryLimit = $this->config->getDecodeMemoryLimit();
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters, true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream, $memoryLimit);
            } catch (\Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with "~" is governed by the "missing decoder" option,
                // anything else (including an empty message) by the "decoding error" option
                $missingDecoder = '' !== $emsg && '~' == $emsg[0];
                $ignored = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignored) {
                    // keep the original exception reachable through getPrevious()
                    throw new \Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
147
148
    /**
     * Decode the Cross-Reference section
     *
     * Walks the classic xref table entry by entry, then reads the trailer that
     * follows it. Entries and trailer values collected earlier take precedence:
     * an object index or a trailer that is already present is never overwritten.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if no trailer follows the table
     */
    protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
    {
        // jump over the word 'xref' (4 bytes) and the white space chars behind it
        $startxref += 4;
        $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref);

        // either a subsection header ("first count") or an entry ("offset generation n|f")
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';

        $objNum = 0;
        while (preg_match($entryPattern, $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($matches[0][1] != $offset) {
                // the next match is not adjacent: we are on another section
                break;
            }
            $offset += \strlen($matches[0][0]);

            $flag = $matches[3][0];
            if ('n' != $flag && 'f' != $flag) {
                // subsection header: its first number is the object number to continue with
                $objNum = (int) $matches[1][0];
                continue;
            }

            if ('n' == $flag) {
                // unique object index: [object number]_[generation number]
                $index = $objNum.'_'.(int) $matches[2][0];
                if (!isset($xref['xref'][$index])) {
                    // first sighting of this object: remember its offset
                    $xref['xref'][$index] = (int) $matches[1][0];
                }
            }
            // in-use and free entries both consume one object number
            ++$objNum;
        }

        // get trailer data
        if (!(preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0)) {
            throw new \Exception('Unable to find trailer');
        }
        $trailerData = $matches[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $trailer = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
                $trailer['size'] = (int) $found[1];
            }

            // indirect references are stored as "[object number]_[generation number]"
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailerData, $found) > 0) {
                    $trailer[$key] = (int) $found[1].'_'.(int) $found[2];
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $found) > 0) {
                $trailer['id'] = [$found[1], $found[2]];
            }

            $xref['trailer'] = $trailer;
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) $found[1], $xref);
        }

        return $xref;
    }
224
225
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream dictionary (/Type, /Index, /Prev, /W, /DecodeParms and,
     * when no trailer has been collected yet, the trailer keys), reverses the
     * PNG row prediction if a /Predictor is declared, and registers every
     * decoded entry in $xref['xref'].
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer: fill it when none was collected before
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        // list of [first object number, number of objects] pairs taken from /Index (null: no /Index)
        $index_blocks = null;
        // offset of the previous xref section taken from /Prev (null: no /Prev)
        $prevxref = null;
        // number of bytes (in the decoded stream) of the three entry fields
        $wb = [0, 0, 0];

        $sarr = $xrefcrs[0][1];
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            // every entry of interest is a name object followed by its value
            if ('/' != $v[0] || !isset($sarr[$k + 1])) {
                continue;
            }
            $next = $sarr[$k + 1];

            switch ($v[1]) {
                case 'Type':
                    if ('/' == $next[0] && 'XRef' == $next[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // initialize list for: first object number in the subsection / number of objects
                    $index_blocks = [];
                    $indexValues = \is_array($next[1]) ? $next[1] : [];
                    // count once; an unpaired trailing value is skipped
                    for ($m = 0, $total = \count($indexValues); $m + 1 < $total; $m += 2) {
                        $index_blocks[] = [(int) $indexValues[$m][1], (int) $indexValues[$m + 1][1]];
                    }
                    break;

                case 'Prev':
                    if ('numeric' == $next[0]) {
                        // get previous xref offset
                        $prevxref = (int) $next[1];
                    }
                    break;

                case 'W':
                    // missing widths count as zero instead of raising undefined-offset warnings
                    $wb = [
                        (int) ($next[1][0][1] ?? 0),
                        (int) ($next[1][1][1] ?? 0),
                        (int) ($next[1][2][1] ?? 0),
                    ];
                    break;

                case 'DecodeParms':
                    if (!isset($next[1]) || !\is_array($next[1])) {
                        break;
                    }
                    $decpar = $next[1];
                    foreach ($decpar as $kdc => $vdc) {
                        if ('/' != $vdc[0] || !isset($decpar[$kdc + 1]) || 'numeric' != $decpar[$kdc + 1][0]) {
                            continue;
                        }
                        if ('Columns' == $vdc[1]) {
                            $columns = (int) $decpar[$kdc + 1][1];
                        } elseif ('Predictor' == $vdc[1]) {
                            $predictor = (int) $decpar[$kdc + 1][1];
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && 'numeric' == $next[0]) {
                        $xref['trailer']['size'] = $next[1];
                    }
                    break;

                case 'Root':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['root'] = $next[1];
                    }
                    break;

                case 'Info':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['info'] = $next[1];
                    }
                    break;

                case 'Encrypt':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['encrypt'] = $next[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [];
                        $xref['trailer']['id'][0] = $next[1][0][1];
                        $xref['trailer']['id'][1] = $next[1][1][1];
                    }
                    break;
            }
        }

        // an /Index without a single complete pair carries no information
        if ([] === $index_blocks) {
            $index_blocks = null;
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            // convert the stream into an array of integers (unpack() reports failure with false)
            $bytes = unpack('C*', $xrefcrs[1][3][0]) ?: [];

            if (null !== $predictor) {
                // split into rows and apply the PNG unpredictor
                $ddata = $this->reversePngRowPrediction($bytes, $columns);
            } else {
                // number of bytes in a row; array_sum() may return a float, array_chunk() needs an int >= 1
                $rowlen = (int) array_sum($wb);
                $ddata = $rowlen > 0 ? array_chunk($bytes, $rowlen) : [];
            }

            $sdata = [];

            // for every row
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column (big-endian)
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref: start at the first object number of the first /Index entry, or at 0 without /Index
            $obj_num = null !== $index_blocks ? $index_blocks[0][0] : 0;

            foreach ($sdata as $row) {
                if (1 === $row[0]) {
                    // (n) objects that are in use but are not compressed
                    // create unique object index: [object number]_[generation number]
                    $index = $obj_num.'_'.$row[2];
                    // check if object already exist
                    if (!isset($xref['xref'][$index])) {
                        // store object offset position
                        $xref['xref'][$index] = $row[1];
                    }
                } elseif (2 === $row[0]) {
                    // compressed objects
                    // $row[1] = object number of the object stream in which this object is stored
                    // $row[2] = index of this object within the object stream
                    $index = $row[1].'_0_'.$row[2];
                    $xref['xref'][$index] = -1;
                }
                // type 0 (free objects) and unknown types (null objects) register nothing

                ++$obj_num;
                if (null !== $index_blocks) {
                    // reduce the number of remaining objects
                    --$index_blocks[0][1];
                    if (0 == $index_blocks[0][1]) {
                        // remove the actual used /Index entry
                        array_shift($index_blocks);
                        if (0 < \count($index_blocks)) {
                            // load the first object number of the following /Index entry
                            $obj_num = $index_blocks[0][0];
                        } else {
                            // no more entries: stop tracking to avoid actions on an empty array
                            $index_blocks = null;
                        }
                    }
                }
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Reverse the PNG row prediction of a cross-reference stream.
     *
     * Each row consists of one filter-type byte followed by $columns data bytes.
     * "Left" and "upper left" neighbours are taken from already reconstructed
     * data, as the PNG filter definitions require; one byte per pixel is assumed.
     *
     * @param array<int> $bytes   Stream bytes as integers
     * @param int        $columns Number of data bytes per row
     *
     * @return array<int, array<int>> reconstructed rows, without the filter-type byte
     *
     * @throws \Exception if unknown PNG predictor detected
     */
    private function reversePngRowPrediction(array $bytes, int $columns): array
    {
        // number of bytes in a row
        $rowlen = $columns + 1;
        if ($rowlen < 1) {
            // array_chunk() cannot split into rows shorter than one byte
            return [];
        }

        $decoded = [];
        // the row above the first one counts as all zeros
        $prevRow = array_fill(0, $rowlen, 0);

        foreach (array_chunk($bytes, $rowlen) as $k => $row) {
            $decoded[$k] = [];
            // get PNG predictor value
            $predictor = 10 + $row[0];

            for ($i = 1; $i <= $columns; ++$i) {
                $j = $i - 1;
                // bytes missing from a truncated last row count as zero
                $raw = $row[$i] ?? 0;
                $up = $prevRow[$j];
                // neighbours before the start of a row are zero
                $left = $j > 0 ? $decoded[$k][$j - 1] : 0;
                $upLeft = $j > 0 ? $prevRow[$j - 1] : 0;

                switch ($predictor) {
                    case 10:  // PNG prediction (on encoding, PNG None on all rows)
                        $value = $raw;
                        break;

                    case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                        $value = $raw + $left;
                        break;

                    case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                        $value = $raw + $up;
                        break;

                    case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                        // integer floor of the mean, avoids an implicit float-to-int conversion
                        $value = $raw + (($left + $up) >> 1);
                        break;

                    case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                        // initial estimate
                        $p = $left + $up - $upLeft;
                        // distances
                        $pa = abs($p - $left);
                        $pb = abs($p - $up);
                        $pc = abs($p - $upLeft);
                        // nearest neighbour, ties resolved in the order left, up, upper left
                        if ($pa <= $pb && $pa <= $pc) {
                            $value = $raw + $left;
                        } elseif ($pb <= $pc) {
                            $value = $raw + $up;
                        } else {
                            $value = $raw + $upLeft;
                        }
                        break;

                    default:  // PNG prediction (on encoding, PNG optimum)
                        throw new \Exception('Unknown PNG predictor: '.$predictor);
                }

                $decoded[$k][$j] = $value & 0xFF;
            }

            $prevRow = $decoded[$k];
        }

        return $decoded;
    }
495
496 43
    /**
     * Build the regular expression that matches an indirect object header
     * ("[object number] [generation number] obj").
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return string PCRE pattern, delimited by "/"
     */
    protected function getObjectHeaderPattern(array $objRefs): string
    {
        // consider all whitespace character (PDF specifications)
        $whitespace = $this->config->getPdfWhitespacesRegex();

        // both parts originate from the document: escape them so they cannot alter or break the pattern
        $objectNumber = preg_quote((string) $objRefs[0], '/');
        $generation = preg_quote((string) $objRefs[1], '/');

        return '/'.$objectNumber.$whitespace.$generation.$whitespace.'obj/';
    }
501
502 43
    /**
     * Length in bytes of an indirect object header such as "4 0 obj".
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return int length of both numbers plus two separator characters and the keyword "obj"
     */
    protected function getObjectHeaderLen(array $objRefs): int
    {
        // two single-character separators + the three letters of "obj"
        $fixedPart = 2 + \strlen('obj');

        return \strlen($objRefs[0]) + \strlen($objRefs[1]) + $fixedPart;
    }
508
509
    /**
     * Get content of indirect object.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Cross-reference data, forwarded to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data
     *
     * @throws \Exception if invalid object reference found
     */
    protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array
    {
        // the expected header reads "[object number] [generation number] obj"
        $objRefArr = explode('_', $objRef);
        if (2 !== \count($objRefArr)) {
            throw new \Exception('Invalid object reference for $obj.');
        }
        $objHeaderLen = $this->getObjectHeaderLen($objRefArr);

        // move to the header: skip whitespace characters, then leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        $headerCandidate = substr($pdfData, $offset, $objHeaderLen);
        if (0 == preg_match($this->getObjectHeaderPattern($objRefArr), $headerCandidate)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // starting position of object content
        $offset += $objHeaderLen;

        $objContentArr = [];
        // dictionary element read in the previous round, if any (it describes a stream that follows)
        $header = null;
        while (true) {
            $oldOffset = $offset;

            // get element
            $headerDic = null != $header ? $header[1] : null;
            $element = $this->getRawObject($pdfData, $offset, $headerDic);
            $offset = $element[2];

            // decode stream using stream's dictionary information
            if ($decoding && ('stream' === $element[0]) && null != $header) {
                $element[3] = $this->decodeStream($pdfData, $xref, $header[1], $element[1]);
            }
            $objContentArr[] = $element;

            if (isset($element[0]) && '<<' === $element[0]) {
                $header = $element;
            } else {
                $header = null;
            }

            // stop at the closing keyword, or when the parser makes no progress
            if ('endobj' === $element[0] || $offset === $oldOffset) {
                break;
            }
        }

        // remove closing delimiter
        array_pop($objContentArr);

        // raw object content
        return $objContentArr;
    }
575
576
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Anything that is not an 'objref', and any reference whose offset is not
     * known, is returned unchanged. A resolved reference yields what
     * getIndirectObject() returns and is cached in $this->objects.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data: object offsets are looked up in $xref['xref'],
     *                        with a flat "[object number]_[generation number]" => offset map as fallback
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws \Exception
     */
    protected function getObjectVal(string $pdfData, $xref, array $obj): array
    {
        if ('objref' != $obj[0]) {
            return $obj;
        }

        // reference to indirect object
        $ref = $obj[1];
        if (isset($this->objects[$ref])) {
            // this object has been already parsed
            return $this->objects[$ref];
        }

        // decodeXref() and decodeXrefStream() store offsets under the 'xref' key;
        // the flat lookup is kept for callers that pass the offset map directly
        $objOffset = $xref['xref'][$ref] ?? $xref[$ref] ?? null;
        if (null === $objOffset) {
            return $obj;
        }

        // parse new object
        $this->objects[$ref] = $this->getIndirectObject($pdfData, $xref, $ref, (int) $objOffset, false);

        return $this->objects[$ref];
    }
603
604
    /**
605
     * Get object type, raw value and offset to next object
606
     *
607
     * @param string     $pdfData   PDF data
     * @param int        $offset    Object offset
608
     * @param array|null $headerDic obj header's dictionary, parsed by getRawObject. Used for stream parsing optimization
609
     *
610
     * @return array containing object type, raw value and offset to next object
611
     */
612 44
    protected function getRawObject(string $pdfData, int $offset = 0, array $headerDic = null): array
613
    {
614 44
        $objtype = ''; // object type to be returned
615 44
        $objval = ''; // object value to be returned
616
617
        // skip initial white space chars
618 44
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
619
620
        // get first char
621 44
        $char = $pdfData[$offset];
622
        // get object type
623 44
        switch ($char) {
624 44
            case '%':  // \x25 PERCENT SIGN
625
                // skip comment and search for next token
626 1
                $next = strcspn($pdfData, "\r\n", $offset);
627 1
                if ($next > 0) {
628 1
                    $offset += $next;
629
630 1
                    return $this->getRawObject($pdfData, $offset);
631
                }
632
                break;
633
634 44
            case '/':  // \x2F SOLIDUS
635
                // name object
636 44
                $objtype = $char;
637 44
                ++$offset;
638 44
                $span = strcspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20\n\t\r\v\f\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25", $offset, 256);
639 44
                if ($span > 0) {
640 44
                    $objval = substr($pdfData, $offset, $span); // unescaped value
641 44
                    $offset += $span;
642
                }
643 44
                break;
644
645 44
            case '(':   // \x28 LEFT PARENTHESIS
646 44
            case ')':  // \x29 RIGHT PARENTHESIS
647
                // literal string object
648 39
                $objtype = $char;
649 39
                ++$offset;
650 39
                $strpos = $offset;
651 39
                if ('(' == $char) {
652 39
                    $open_bracket = 1;
653 39
                    while ($open_bracket > 0) {
654 39
                        if (!isset($pdfData[$strpos])) {
655
                            break;
656
                        }
657 39
                        $ch = $pdfData[$strpos];
658 39
                        switch ($ch) {
659 39
                            case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
660
                                // skip next character
661 20
                                ++$strpos;
662 20
                                break;
663
664 39
                            case '(':  // LEFT PARENHESIS (28h)
665
                                ++$open_bracket;
666
                                break;
667
668 39
                            case ')':  // RIGHT PARENTHESIS (29h)
669 39
                                --$open_bracket;
670 39
                                break;
671
                        }
672 39
                        ++$strpos;
673
                    }
674 39
                    $objval = substr($pdfData, $offset, $strpos - $offset - 1);
675 39
                    $offset = $strpos;
676
                }
677 39
                break;
678
679 44
            case '[':   // \x5B LEFT SQUARE BRACKET
680 44
            case ']':  // \x5D RIGHT SQUARE BRACKET
681
                // array object
682 43
                $objtype = $char;
683 43
                ++$offset;
684 43
                if ('[' == $char) {
685
                    // get array content
686 43
                    $objval = [];
687
                    do {
688 43
                        $oldOffset = $offset;
689
                        // get element
690 43
                        $element = $this->getRawObject($pdfData, $offset);
691 43
                        $offset = $element[2];
692 43
                        $objval[] = $element;
693 43
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
694
                    // remove closing delimiter
695 43
                    array_pop($objval);
696
                }
697 43
                break;
698
699 44
            case '<':  // \x3C LESS-THAN SIGN
700 44
            case '>':  // \x3E GREATER-THAN SIGN
701 44
                if (isset($pdfData[$offset + 1]) && ($pdfData[$offset + 1] == $char)) {
702
                    // dictionary object
703 44
                    $objtype = $char.$char;
704 44
                    $offset += 2;
705 44
                    if ('<' == $char) {
706
                        // get array content
707 44
                        $objval = [];
708
                        do {
709 44
                            $oldOffset = $offset;
710
                            // get element
711 44
                            $element = $this->getRawObject($pdfData, $offset);
712 44
                            $offset = $element[2];
713 44
                            $objval[] = $element;
714 44
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
715
                        // remove closing delimiter
716 44
                        array_pop($objval);
717
                    }
718
                } else {
719
                    // hexadecimal string object
720 20
                    $objtype = $char;
721 20
                    ++$offset;
722
723 20
                    $span = strspn($pdfData, "0123456789abcdefABCDEF\x09\x0a\x0c\x0d\x20", $offset);
724 20
                    $dataToCheck = $pdfData[$offset + $span] ?? null;
725 20
                    if ('<' == $char && $span > 0 && '>' == $dataToCheck) {
726
                        // remove white space characters
727 20
                        $objval = strtr(substr($pdfData, $offset, $span), $this->config->getPdfWhitespaces(), '');
728 20
                        $offset += $span + 1;
729
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
730
                        $offset = $endpos + 1;
731
                    }
732
                }
733 44
                break;
734
735
            default:
736 44
                if ('endobj' == substr($pdfData, $offset, 6)) {
737
                    // indirect object
738 43
                    $objtype = 'endobj';
739 43
                    $offset += 6;
740 44
                } elseif ('null' == substr($pdfData, $offset, 4)) {
741
                    // null object
742 3
                    $objtype = 'null';
743 3
                    $offset += 4;
744 3
                    $objval = 'null';
745 44
                } elseif ('true' == substr($pdfData, $offset, 4)) {
746
                    // boolean true object
747 17
                    $objtype = 'boolean';
748 17
                    $offset += 4;
749 17
                    $objval = 'true';
750 44
                } elseif ('false' == substr($pdfData, $offset, 5)) {
751
                    // boolean false object
752 3
                    $objtype = 'boolean';
753 3
                    $offset += 5;
754 3
                    $objval = 'false';
755 44
                } elseif ('stream' == substr($pdfData, $offset, 6)) {
756
                    // start stream object
757 43
                    $objtype = 'stream';
758 43
                    $offset += 6;
759 43
                    if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
760 43
                        $offset += \strlen($matches[0]);
761
762
                        // we get stream length here to later help preg_match test less data
763 43
                        $streamLen = (int) $this->getHeaderValue($headerDic, 'Length', 'numeric', 0);
764 43
                        $skip = false === $this->config->getRetainImageContent() && 'XObject' == $this->getHeaderValue($headerDic, 'Type', '/') && 'Image' == $this->getHeaderValue($headerDic, 'Subtype', '/');
765
766 43
                        $pregResult = preg_match(
767 43
                            '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
768
                            $pdfData,
769
                            $matches,
770 43
                            \PREG_OFFSET_CAPTURE,
771 43
                            $offset + $streamLen
772
                        );
773
774 43
                        if (1 == $pregResult) {
775 43
                            $objval = $skip ? '' : substr($pdfData, $offset, $matches[0][1] - $offset);
776 43
                            $offset = $matches[1][1];
777
                        }
778
                    }
779 44
                } elseif ('endstream' == substr($pdfData, $offset, 9)) {
780
                    // end stream object
781 43
                    $objtype = 'endstream';
782 43
                    $offset += 9;
783 44
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
784
                    // indirect object reference
785 43
                    $objtype = 'objref';
786 43
                    $offset += \strlen($matches[0]);
787 43
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
788 44
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
789
                    // object start
790 10
                    $objtype = 'obj';
791 10
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
792 10
                    $offset += \strlen($matches[0]);
793 44
                } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
794
                    // numeric object
795 43
                    $objtype = 'numeric';
796 43
                    $objval = substr($pdfData, $offset, $numlen);
797 43
                    $offset += $numlen;
798
                }
799 44
                break;
800
        }
801
802 44
        return [$objtype, $objval, $offset];
803
    }
804
805
    /**
     * Get value of an object header's section (obj << YYY >> part ).
     *
     * It is similar to Header::get('...')->getContent(), the only difference is it can be used during the parsing process,
     * when no Smalot\PdfParser\Header objects are created yet.
     *
     * The dictionary is scanned front to back. The first entry that is a 3-element array of type '/' whose value
     * is identical to $key, and that is followed by another entry, decides the result: the value of that following
     * entry is returned if its type is identical to $type, otherwise $default is returned.
     *
     * @param array|null        $headerDic dictionary of header fields, as returned by RawDataParser::getRawObject
     * @param string            $key       header's section name
     * @param string            $type      type of the section (i.e. 'numeric', '/', '<<', etc.)
     * @param string|array|null $default   default value for header's section
     *
     * @return string|array|null value of obj header's section, or default value if none found, or its type doesn't match $type param
     */
    private function getHeaderValue(?array $headerDic, string $key, string $type, $default = '')
    {
        // No header dictionary available: nothing to look up.
        if (null === $headerDic) {
            return $default;
        }

        /*
         * It receives the dictionary of header fields, as it is returned by RawDataParser::getRawObject,
         * and iterates over it, searching for a section of type '/' with the requested key.
         * If such a section is found, it tries to retrieve its value (the next object in the dictionary),
         * returning it if it matches the requested type, or the default value otherwise.
         */
        foreach ($headerDic as $i => $val) {
            // Strict comparisons on purpose: with "==" two numeric-like strings such as
            // '10' and '1e1' would be compared as numbers and wrongly treated as equal.
            $isSectionName = \is_array($val) && 3 === \count($val) && '/' === $val[0];
            if (!$isSectionName || $val[1] !== $key || !isset($headerDic[$i + 1])) {
                continue;
            }

            $candidate = $headerDic[$i + 1];
            $isSectionValue = \is_array($candidate) && 1 < \count($candidate);

            return $isSectionValue && $type === $candidate[0]
                ? $candidate[1]
                : $default;
        }

        return $default;
    }
846
847
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the cross-reference data is determined as follows:
     *  - $offset is 0: the number given by the last "startxref ... %%EOF" section of the document is used;
     *  - the 'xref' keyword is located exactly at $offset: $offset is used;
     *  - an "N N obj" header is found at or after $offset: $offset is used (cross-reference stream);
     *  - otherwise the number given by the first "startxref ... %%EOF" section at or after $offset is used.
     *
     * The data at the resulting position is then handed to decodeXref() if it starts with the 'xref' keyword,
     * or to decodeXrefStream() otherwise.
     *
     * @param string $pdfData PDF data
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if it was unable to find startxref
     * @throws \Exception if it was unable to find xref
     */
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';
        $pdfLength = \strlen($pdfData);

        // An offset behind the end of the data can never lead to a startxref section. Bail out early,
        // because strpos() throws a ValueError for such offsets as of PHP 8.
        if ($offset > $pdfLength) {
            throw new \Exception('Unable to find startxref');
        }

        // Every regex below writes into its own match array. Sharing one array is unsafe, because
        // preg_match() overwrites it even when the pattern does not match.
        if (0 === $offset) {
            // find last startxref
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $allStartxrefs, \PREG_SET_ORDER);
            // no match (0) and PCRE failure (false) are both treated as "not found"
            if (!$pregResult) {
                throw new \Exception('Unable to find startxref');
            }
            $lastStartxref = array_pop($allStartxrefs);
            $startxref = (int) $lastStartxref[1];
        } elseif (strpos($pdfData, 'xref', $offset) === $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found; with PREG_OFFSET_CAPTURE each group is a [text, position] pair
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new \Exception('Unable to find startxref');
        }

        if ($startxref > $pdfLength) {
            throw new \Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position; $startxref is an int here, so a failed strpos() (false)
        // can no longer be mistaken for a hit at position 0
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new \Exception('Unable to find xref');
        }

        return $xref;
    }
913
914
    /**
     * Parses PDF data and returns extracted data as array.
     *
     * Everything in front of the first '%PDF-' marker is discarded before the cross-reference data
     * is read and the referenced objects are decoded.
     *
     * @param string $data PDF data to parse
     *
     * @return array two-element list: the xref/trailer data and the array of parsed PDF document objects
     *
     * @throws \Exception if empty PDF data given
     * @throws \Exception if PDF data missing %PDF header
     */
    public function parseData(string $data): array
    {
        if (empty($data)) {
            throw new \Exception('Empty PDF data given.');
        }

        // locate the beginning of the PDF header
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new \Exception('Invalid PDF data: missing %PDF header.');
        }

        // strip leading bytes that precede the header, if there are any
        $pdfData = $data;
        if ($headerPos > 0) {
            $pdfData = substr($data, $headerPos);
        }

        // read xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // decode every object the xref table points to
        $objects = [];
        foreach ($xref['xref'] as $objId => $objOffset) {
            // only objects with a positive offset that were not decoded yet are of interest
            if (isset($objects[$objId]) || !($objOffset > 0)) {
                continue;
            }

            $objects[$objId] = $this->getIndirectObject($pdfData, $xref, $objId, $objOffset, true);
        }

        return [$xref, $objects];
    }
951
}
952