Passed
Push — master ( 2939df...ddf03e )
by Konrad
02:55
created

RawDataParser::__construct()   A

Complexity

Conditions 2
Paths 1

Size

Total Lines 7
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 2
eloc 3
nc 1
nop 2
dl 0
loc 7
ccs 4
cts 4
cp 1
crap 2
rs 10
c 1
b 0
f 1
1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 *
19
 * @date    2020-01-06
20
 *
21
 * @license LGPLv3
22
 *
23
 * @url     <https://github.com/smalot/pdfparser>
24
 *
25
 *  PdfParser is a pdf library written in PHP, extraction oriented.
26
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
27
 *
28
 *  This program is free software: you can redistribute it and/or modify
29
 *  it under the terms of the GNU Lesser General Public License as published by
30
 *  the Free Software Foundation, either version 3 of the License, or
31
 *  (at your option) any later version.
32
 *
33
 *  This program is distributed in the hope that it will be useful,
34
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
35
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
36
 *  GNU Lesser General Public License for more details.
37
 *
38
 *  You should have received a copy of the GNU Lesser General Public License
39
 *  along with this program.
40
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
41
 */
42
43
namespace Smalot\PdfParser\RawData;
44
45
use Smalot\PdfParser\Config;
46
47
class RawDataParser
48
{
49
    /**
50
     * Parser-wide settings object. The methods of this class read the PDF
     * whitespace definitions and the decode memory limit from it.
     *
     * @var Config
51
     */
52
    private $config;
53
54
    /**
55
     * Configuration array.
56
     *
     * The defaults below are merged with the array handed to the constructor
     * and are consulted when a stream filter fails to decode.
     *
57
     * @var array<string,bool>
58
     */
59
    protected $cfg = [
60
        // if `true` ignore filter decoding errors
61
        'ignore_filter_decoding_errors' => true,
62
        // if `true` ignore missing filter decoders
        // (reported by exceptions whose message starts with `~`)
63
        'ignore_missing_filter_decoders' => true,
64
    ];
65
66
    // Helper that lists the available stream filters and decodes them;
    // instantiated in the constructor.
    protected $filterHelper;
67
    // Cache of already parsed indirect objects, keyed by object reference
    // (see getObjectVal()). NOTE(review): never initialised explicitly, so it
    // starts out as null.
    protected $objects;
68
69
    /**
     * Set up the parser with its settings and helper objects.
     *
     * @param array       $cfg    Configuration array, default is []. Its entries override
     *                            the defaults declared in $this->cfg.
     * @param Config|null $config Parser configuration; a fresh Config instance is created
     *                            when none is given
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        if (null === $config) {
            $config = new Config();
        }

        $this->config = $config;
        $this->filterHelper = new FilterHelper();

        // given values take precedence over the default values
        $this->cfg = array_merge($this->cfg, $cfg);
    }
80
81
    /**
     * Decode the specified stream.
     *
     * The stream is first truncated to the length declared in its dictionary (when
     * that length is shorter than the raw data) and is then passed through every
     * filter named by the dictionary's /Filter entry, in order.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data, used to resolve an indirect /Filter value
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws \Exception if a filter fails and the matching "ignore" setting is disabled;
     *                    the original exception is attached as the previous exception
     */
    protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream): array
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name objects that are followed by a value are of interest
            if ('/' != $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $next = $sdic[$k + 1];

            if ('Length' == $v[1] && 'numeric' == $next[0]) {
                // get declared stream length
                $declength = (int) $next[1];
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters())) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
            } catch (\Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with '~' signals a missing decoder; the emptiness
                // check avoids reading offset 0 of an empty message
                $missingDecoder = '' !== $emsg && '~' == $emsg[0];
                $ignore = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // keep the original exception reachable via getPrevious()
                    throw new \Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
149
150
    /**
     * Decode a Cross-Reference section given as a plain table, plus its trailer.
     *
     * Offsets of in-use ("n") entries are stored under $xref['xref'], keyed by
     * "[object number]_[generation number]"; entries that are already present are
     * never overwritten. Trailer values are only stored while $xref['trailer'] is
     * still empty. A /Prev entry is followed through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if no trailer can be found after the table
     */
    protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
    {
        // step over the word 'xref' (4 characters) and the white space behind it
        $startxref += 4;
        $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref);

        // walk over cross-reference entries and subsection headers
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';
        $objNum = 0;
        while (preg_match($entryPattern, $pdfData, $entry, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($entry[0][1] != $offset) {
                // the next match is not adjacent: we are on another section
                break;
            }
            $offset += \strlen($entry[0][0]);

            $entryType = $entry[3][0];
            if ('n' == $entryType) {
                // unique object index: [object number]_[generation number]
                $index = $objNum.'_'.(int) $entry[2][0];
                if (!isset($xref['xref'][$index])) {
                    // remember the offset of an object that is not known yet
                    $xref['xref'][$index] = (int) $entry[1][0];
                }
                ++$objNum;
            } elseif ('f' == $entryType) {
                // free entry: it only consumes an object number
                ++$objNum;
            } else {
                // subsection header: first object number of the subsection
                $objNum = (int) $entry[1][0];
            }
        }

        // the trailer dictionary has to follow
        $hasTrailer = preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailerMatch, \PREG_OFFSET_CAPTURE, $offset) > 0;
        if (!$hasTrailer) {
            throw new \Exception('Unable to find trailer');
        }
        $trailerData = $trailerMatch[1][0];

        if (empty($xref['trailer'])) {
            // keep only the values of the last updated version
            $xref['trailer'] = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
                $xref['trailer']['size'] = (int) $found[1];
            }

            // entries holding an indirect reference ("[number] [generation] R")
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailerData, $found) > 0) {
                    $xref['trailer'][$key] = (int) $found[1].'_'.(int) $found[2];
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $found) > 0) {
                $xref['trailer']['id'] = [$found[1], $found[2]];
            }
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
            // continue with the previous xref section
            $xref = $this->getXrefData($pdfData, (int) $found[1], $xref);
        }

        return $xref;
    }
226
227
    /**
     * Decode the Cross-Reference Stream section.
     *
     * Reads the stream object at $startxref, collects the relevant entries of its
     * dictionary (/Type, /Index, /Prev, /W, /DecodeParms and, while no trailer is
     * known yet, the trailer entries), reverses the PNG predictor if one is
     * declared and finally turns the decoded rows into xref entries. A /Prev entry
     * is followed through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // trailer values are taken from the last updated version only
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $prevxref = null;
        $index_blocks = null;
        // byte widths of the three fields of an entry; all zero when /W is missing
        $wb = [0, 0, 0];

        $sarr = $xrefcrs[0][1] ?? null;
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            // every entry of interest is a name object
            if ('/' != $v[0]) {
                continue;
            }
            $next = $sarr[$k + 1] ?? null;

            switch ($v[1]) {
                case 'Type':
                    if (null !== $next && '/' == $next[0] && 'XRef' == $next[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    if (null !== $next) {
                        // list of pairs: first object number in the subsection / number of objects
                        $index_blocks = [];
                        $entries = $next[1];
                        for ($m = 0, $entryCount = \count($entries); $m < $entryCount; $m += 2) {
                            // a trailing first-object-number without a count covers one object
                            $index_blocks[] = [$entries[$m][1], $entries[$m + 1][1] ?? 1];
                        }
                    }
                    break;

                case 'Prev':
                    if (null !== $next && 'numeric' == $next[0]) {
                        // get previous xref offset
                        $prevxref = (int) $next[1];
                    }
                    break;

                case 'W':
                    if (null !== $next) {
                        // number of bytes (in the decoded stream) of the corresponding field
                        $wb = [
                            (int) ($next[1][0][1] ?? 0),
                            (int) ($next[1][1][1] ?? 0),
                            (int) ($next[1][2][1] ?? 0),
                        ];
                    }
                    break;

                case 'DecodeParms':
                    if (isset($next[1])) {
                        $decpar = $next[1];
                        foreach ($decpar as $kdc => $vdc) {
                            if (
                                '/' != $vdc[0]
                                || !isset($decpar[$kdc + 1])
                                || 'numeric' != $decpar[$kdc + 1][0]
                            ) {
                                continue;
                            }
                            if ('Columns' == $vdc[1]) {
                                $columns = (int) $decpar[$kdc + 1][1];
                            } elseif ('Predictor' == $vdc[1]) {
                                $predictor = (int) $decpar[$kdc + 1][1];
                            }
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && null !== $next && 'numeric' == $next[0]) {
                        $xref['trailer']['size'] = $next[1];
                    }
                    break;

                case 'Root':
                    if ($filltrailer && null !== $next && 'objref' == $next[0]) {
                        $xref['trailer']['root'] = $next[1];
                    }
                    break;

                case 'Info':
                    if ($filltrailer && null !== $next && 'objref' == $next[0]) {
                        $xref['trailer']['info'] = $next[1];
                    }
                    break;

                case 'Encrypt':
                    if ($filltrailer && null !== $next && 'objref' == $next[0]) {
                        $xref['trailer']['encrypt'] = $next[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer && null !== $next) {
                        $xref['trailer']['id'] = [$next[1][0][1], $next[1][1][1]];
                    }
                    break;
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (false === $bytes) {
                // nothing usable could be read from the stream
                $bytes = [];
            }

            $ddata = [];
            if (null !== $predictor) {
                // number of bytes in a row: one predictor byte followed by the data bytes
                $rowlen = $columns + 1;
                if ($rowlen > 0) {
                    // the row above the first one consists of zeros
                    $prev_row = array_fill(0, $rowlen, 0);

                    // for each row apply PNG unpredictor
                    foreach (array_chunk($bytes, $rowlen) as $k => $row) {
                        $ddata[$k] = [];
                        // the first byte of a row selects the PNG predictor
                        $rowPredictor = 10 + $row[0];

                        for ($i = 1; $i <= $columns; ++$i) {
                            $j = $i - 1;
                            // a truncated last row is padded with zeros
                            $raw = $row[$i] ?? 0;
                            $row_up = $prev_row[$j];
                            if (1 == $i) {
                                $row_left = 0;
                                $row_upleft = 0;
                            } else {
                                // PNG filters refer to the already reconstructed
                                // byte on the left, not to the encoded one
                                $row_left = $ddata[$k][$j - 1];
                                $row_upleft = $prev_row[$j - 1];
                            }

                            switch ($rowPredictor) {
                                case 10:  // PNG prediction (on encoding, PNG None on all rows)
                                    $ddata[$k][$j] = $raw;
                                    break;

                                case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                                    $ddata[$k][$j] = (($raw + $row_left) & 0xFF);
                                    break;

                                case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                                    $ddata[$k][$j] = (($raw + $row_up) & 0xFF);
                                    break;

                                case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                                    // integer division: the average is rounded down
                                    $ddata[$k][$j] = (($raw + intdiv($row_left + $row_up, 2)) & 0xFF);
                                    break;

                                case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                                    // initial estimate
                                    $p = ($row_left + $row_up - $row_upleft);
                                    // distances
                                    $pa = abs($p - $row_left);
                                    $pb = abs($p - $row_up);
                                    $pc = abs($p - $row_upleft);
                                    $pmin = min($pa, $pb, $pc);
                                    // use the neighbour with the minimum distance;
                                    // ties are resolved in the order left, up, upper left
                                    if ($pmin == $pa) {
                                        $ddata[$k][$j] = (($raw + $row_left) & 0xFF);
                                    } elseif ($pmin == $pb) {
                                        $ddata[$k][$j] = (($raw + $row_up) & 0xFF);
                                    } else {
                                        $ddata[$k][$j] = (($raw + $row_upleft) & 0xFF);
                                    }
                                    break;

                                default:  // PNG prediction (on encoding, PNG optimum)
                                    throw new \Exception('Unknown PNG predictor: '.$rowPredictor);
                            }
                        }
                        $prev_row = $ddata[$k];
                    } // end for each row
                }
            } else {
                // number of bytes in a row
                $rowlen = (int) array_sum($wb);
                // without a usable /W entry there is no row layout to split by
                if ($rowlen > 0) {
                    $ddata = array_chunk($bytes, $rowlen);
                }
            }

            $sdata = [];

            // for every row
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column (most significant byte first)
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            if (empty($index_blocks)) {
                // an absent or empty /Index means: number the objects from 0
                $index_blocks = null;
            }
            // load the first object number of the first /Index entry
            $obj_num = null !== $index_blocks ? $index_blocks[0][0] : 0;

            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;

                if (null !== $index_blocks) {
                    // reduce the number of remaining objects
                    --$index_blocks[0][1];
                    if (0 == $index_blocks[0][1]) {
                        // remove the actual used /Index entry
                        array_shift($index_blocks);
                        if (0 < \count($index_blocks)) {
                            // load the first object number of the following /Index entry
                            $obj_num = $index_blocks[0][0];
                        } else {
                            // no more entries: keep counting without /Index
                            $index_blocks = null;
                        }
                    }
                }
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
497
498 64
    /**
     * Build the regular expression that matches the header of an indirect object
     * ("[object number] [generation number] obj").
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return string regular expression including its delimiters
     */
    protected function getObjectHeaderPattern(array $objRefs): string
    {
        // consider all whitespace character (PDF specifications)
        $whitespace = $this->config->getPdfWhitespacesRegex();

        // the reference parts are data, not pattern syntax: escape them
        $objectNumber = preg_quote((string) $objRefs[0], '/');
        $generationNumber = preg_quote((string) $objRefs[1], '/');

        return '/'.$objectNumber.$whitespace.$generationNumber.$whitespace.'obj/';
    }
503
504 64
    /**
     * Compute the length of an indirect object header such as "4 0 obj".
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return int length of both numbers plus two whitespace characters and the word "obj"
     */
    protected function getObjectHeaderLen(array $objRefs): int
    {
        $objectNumberLen = \strlen($objRefs[0]);
        $generationNumberLen = \strlen($objRefs[1]);
        // one whitespace after each number
        $whitespaceLen = 2;
        $keywordLen = \strlen('obj');

        return $whitespaceLen + $keywordLen + $objectNumberLen + $generationNumberLen;
    }
510
511
    /**
     * Get content of indirect object.
     *
     * The object header is expected at $offset, after optional whitespace and
     * leading zeros. When it is not found there, the reference is treated as a
     * reference to the null object. Otherwise the elements of the object are read
     * one after another until "endobj" is reached or no further progress is made.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Cross-reference data, handed on to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data
     *
     * @throws \Exception if invalid object reference found
     */
    protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array
    {
        // the header to look for is "[object number] [generation number] obj"
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new \Exception('Invalid object reference for $obj.');
        }
        $headerLength = $this->getObjectHeaderLen($refParts);

        // skip whitespace characters and leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        // check if we are in position
        $headerPattern = $this->getObjectHeaderPattern($refParts);
        $headerCandidate = substr($pdfData, $offset, $headerLength);
        if (0 == preg_match($headerPattern, $headerCandidate)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // the object content starts right behind the header
        $offset += $headerLength;
        $elements = [];
        // dictionary element read in the previous round, if any
        $dictionary = null;

        while (true) {
            $previousOffset = $offset;

            // read the next element; a preceding dictionary is handed on to getRawObject()
            $element = $this->getRawObject($pdfData, $offset, null !== $dictionary ? $dictionary[1] : null);
            $offset = $element[2];

            // decode stream using stream's dictionary information
            if ($decoding && null !== $dictionary && 'stream' === $element[0]) {
                $element[3] = $this->decodeStream($pdfData, $xref, $dictionary[1], $element[1]);
            }

            $elements[] = $element;
            $dictionary = (isset($element[0]) && '<<' === $element[0]) ? $element : null;

            // stop at the end of the object or when the parser is stuck
            if ('endobj' === $element[0] || $offset === $previousOffset) {
                break;
            }
        }

        // remove closing delimiter
        array_pop($elements);

        return $elements;
    }
577
578
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Resolved objects are cached in $this->objects, keyed by their reference. A
     * reference without a known offset is returned unchanged.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data; object offsets are read from its
     *                        'xref' entry (a flat reference => offset map is accepted too)
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws \Exception
     */
    protected function getObjectVal(string $pdfData, $xref, array $obj): array
    {
        if ('objref' != $obj[0]) {
            // direct object: nothing to resolve
            return $obj;
        }

        $ref = $obj[1];
        if (isset($this->objects[$ref])) {
            // this object has been already parsed
            return $this->objects[$ref];
        }

        // decodeXref() and decodeXrefStream() store offsets below the 'xref' key;
        // the lookup on the top level is kept for callers passing a flat map
        $offset = $xref['xref'][$ref] ?? $xref[$ref] ?? null;
        if (null === $offset) {
            return $obj;
        }

        // parse new object
        $this->objects[$ref] = $this->getIndirectObject($pdfData, $xref, $ref, $offset, false);

        return $this->objects[$ref];
    }
605
606
    /**
607
     * Get object type, raw value and offset to next object
608
     *
609
     * @param int        $offset    Object offset
610
     * @param array|null $headerDic obj header's dictionary, parsed by getRawObject. Used for stream parsing optimization
611
     *
612
     * @return array containing object type, raw value and offset to next object
613
     */
614 65
    protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
615
    {
616 65
        $objtype = ''; // object type to be returned
617 65
        $objval = ''; // object value to be returned
618
619
        // skip initial white space chars
620 65
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
621
622
        // get first char
623 65
        $char = $pdfData[$offset];
624
        // get object type
625
        switch ($char) {
626 65
            case '%':  // \x25 PERCENT SIGN
627
                // skip comment and search for next token
628 3
                $next = strcspn($pdfData, "\r\n", $offset);
629 3
                if ($next > 0) {
630 3
                    $offset += $next;
631
632 3
                    return $this->getRawObject($pdfData, $offset);
633
                }
634
                break;
635
636 65
            case '/':  // \x2F SOLIDUS
637
                // name object
638 65
                $objtype = $char;
639 65
                ++$offset;
640 65
                $span = strcspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20\n\t\r\v\f\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25", $offset, 256);
641 65
                if ($span > 0) {
642 65
                    $objval = substr($pdfData, $offset, $span); // unescaped value
643 65
                    $offset += $span;
644
                }
645 65
                break;
646
647 65
            case '(':   // \x28 LEFT PARENTHESIS
648 65
            case ')':  // \x29 RIGHT PARENTHESIS
649
                // literal string object
650 60
                $objtype = $char;
651 60
                ++$offset;
652 60
                $strpos = $offset;
653 60
                if ('(' == $char) {
654 60
                    $open_bracket = 1;
655 60
                    while ($open_bracket > 0) {
656 60
                        if (!isset($pdfData[$strpos])) {
657
                            break;
658
                        }
659 60
                        $ch = $pdfData[$strpos];
660
                        switch ($ch) {
661 60
                            case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
662
                                // skip next character
663 29
                                ++$strpos;
664 29
                                break;
665
666 60
                            case '(':  // LEFT PARENHESIS (28h)
667 2
                                ++$open_bracket;
668 2
                                break;
669
670 60
                            case ')':  // RIGHT PARENTHESIS (29h)
671 60
                                --$open_bracket;
672 60
                                break;
673
                        }
674 60
                        ++$strpos;
675
                    }
676 60
                    $objval = substr($pdfData, $offset, $strpos - $offset - 1);
677 60
                    $offset = $strpos;
678
                }
679 60
                break;
680
681 65
            case '[':   // \x5B LEFT SQUARE BRACKET
682 65
            case ']':  // \x5D RIGHT SQUARE BRACKET
683
                // array object
684 64
                $objtype = $char;
685 64
                ++$offset;
686 64
                if ('[' == $char) {
687
                    // get array content
688 64
                    $objval = [];
689
                    do {
690 64
                        $oldOffset = $offset;
691
                        // get element
692 64
                        $element = $this->getRawObject($pdfData, $offset);
693 64
                        $offset = $element[2];
694 64
                        $objval[] = $element;
695 64
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
696
                    // remove closing delimiter
697 64
                    array_pop($objval);
698
                }
699 64
                break;
700
701 65
            case '<':  // \x3C LESS-THAN SIGN
702 65
            case '>':  // \x3E GREATER-THAN SIGN
703 65
                if (isset($pdfData[$offset + 1]) && ($pdfData[$offset + 1] == $char)) {
704
                    // dictionary object
705 65
                    $objtype = $char.$char;
706 65
                    $offset += 2;
707 65
                    if ('<' == $char) {
708
                        // get array content
709 65
                        $objval = [];
710
                        do {
711 65
                            $oldOffset = $offset;
712
                            // get element
713 65
                            $element = $this->getRawObject($pdfData, $offset);
714 65
                            $offset = $element[2];
715 65
                            $objval[] = $element;
716 65
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
717
                        // remove closing delimiter
718 65
                        array_pop($objval);
719
                    }
720
                } else {
721
                    // hexadecimal string object
722 31
                    $objtype = $char;
723 31
                    ++$offset;
724
725 31
                    $span = strspn($pdfData, "0123456789abcdefABCDEF\x09\x0a\x0c\x0d\x20", $offset);
726 31
                    $dataToCheck = $pdfData[$offset + $span] ?? null;
727 31
                    if ('<' == $char && $span > 0 && '>' == $dataToCheck) {
728
                        // remove white space characters
729 31
                        $objval = strtr(substr($pdfData, $offset, $span), $this->config->getPdfWhitespaces(), '');
730 31
                        $offset += $span + 1;
731 2
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
732 2
                        $offset = $endpos + 1;
733
                    }
734
                }
735 65
                break;
736
737
            default:
738 65
                if ('endobj' == substr($pdfData, $offset, 6)) {
739
                    // indirect object
740 64
                    $objtype = 'endobj';
741 64
                    $offset += 6;
742 65
                } elseif ('null' == substr($pdfData, $offset, 4)) {
743
                    // null object
744 11
                    $objtype = 'null';
745 11
                    $offset += 4;
746 11
                    $objval = 'null';
747 65
                } elseif ('true' == substr($pdfData, $offset, 4)) {
748
                    // boolean true object
749 31
                    $objtype = 'boolean';
750 31
                    $offset += 4;
751 31
                    $objval = 'true';
752 65
                } elseif ('false' == substr($pdfData, $offset, 5)) {
753
                    // boolean false object
754 5
                    $objtype = 'boolean';
755 5
                    $offset += 5;
756 5
                    $objval = 'false';
757 65
                } elseif ('stream' == substr($pdfData, $offset, 6)) {
758
                    // start stream object
759 65
                    $objtype = 'stream';
760 65
                    $offset += 6;
761 65
                    if (1 == preg_match('/^( *[\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
762 65
                        $offset += \strlen($matches[0]);
763
764
                        // we get stream length here to later help preg_match test less data
765 65
                        $streamLen = (int) $this->getHeaderValue($headerDic, 'Length', 'numeric', 0);
766 65
                        $skip = false === $this->config->getRetainImageContent() && 'XObject' == $this->getHeaderValue($headerDic, 'Type', '/') && 'Image' == $this->getHeaderValue($headerDic, 'Subtype', '/');
767
768 65
                        $pregResult = preg_match(
769 65
                            '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
770 65
                            $pdfData,
771 65
                            $matches,
772 65
                            \PREG_OFFSET_CAPTURE,
773 65
                            $offset + $streamLen
774 65
                        );
775
776 65
                        if (1 == $pregResult) {
777 65
                            $objval = $skip ? '' : substr($pdfData, $offset, $matches[0][1] - $offset);
778 65
                            $offset = $matches[1][1];
779
                        }
780
                    }
781 65
                } elseif ('endstream' == substr($pdfData, $offset, 9)) {
782
                    // end stream object
783 64
                    $objtype = 'endstream';
784 64
                    $offset += 9;
785 65
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
786
                    // indirect object reference
787 64
                    $objtype = 'objref';
788 64
                    $offset += \strlen($matches[0]);
789 64
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
790 65
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
791
                    // object start
792 14
                    $objtype = 'obj';
793 14
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
794 14
                    $offset += \strlen($matches[0]);
795 65
                } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
796
                    // numeric object
797 64
                    $objtype = 'numeric';
798 64
                    $objval = substr($pdfData, $offset, $numlen);
799 64
                    $offset += $numlen;
800
                }
801 65
                break;
802
        }
803
804 65
        return [$objtype, $objval, $offset];
805
    }
806
807
    /**
     * Get value of an object header's section (obj << YYY >> part ).
     *
     * It is similar to Header::get('...')->getContent(), the only difference is it can be used during the parsing process,
     * when no Smalot\PdfParser\Header objects are created yet.
     *
     * The dictionary is scanned for the first name element (type '/') whose value is exactly $key;
     * the element directly following it is treated as that section's value.
     *
     * @param array|null        $headerDic list of raw elements, as returned by RawDataParser::getRawObject, or null if there is no header
     * @param string            $key       header's section name
     * @param string            $type      type of the section (i.e. 'numeric', '/', '<<', etc.)
     * @param string|array|null $default   default value for header's section
     *
     * @return string|array|null value of obj header's section, or default value if none found, or its type doesn't match $type param
     */
    private function getHeaderValue(?array $headerDic, string $key, string $type, $default = '')
    {
        if (null === $headerDic) {
            return $default;
        }

        /*
         * It receives the dictionary of header fields, as it is returned by RawDataParser::getRawObject,
         * and iterates over it, searching for a section of type '/' with the requested key.
         * If such a section is found, it tries to read its value (the next object in the dictionary),
         * returning it if it matches the requested type, or the default value otherwise.
         */
        foreach ($headerDic as $i => $val) {
            $isSectionName = \is_array($val) && 3 === \count($val) && '/' === $val[0];

            // Strict comparison on purpose: with `==` two numeric strings are compared
            // as numbers, so e.g. the key '10' would wrongly match a name '1e1'.
            if (!$isSectionName || $val[1] !== $key || !isset($headerDic[$i + 1])) {
                continue;
            }

            $section = $headerDic[$i + 1];
            $isSectionValue = \is_array($section) && \count($section) > 1;

            return $isSectionValue && $type === $section[0]
                ? $section[1]
                : $default;
        }

        return $default;
    }
848
849
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the xref data is determined as follows:
     *  - $offset is 0: the number given by the last "startxref ... %%EOF" section of the document is used,
     *  - $pdfData has the keyword "xref" exactly at $offset: $offset is used,
     *  - an "N N obj" header is found at or after $offset: $offset is used,
     *  - otherwise the number given by the first "startxref ... %%EOF" section at or after $offset is used.
     *
     * If the keyword "xref" is only found at that position after converting "\r\n" line endings to "\n",
     * no decoding takes place and ['Unix' => true] is returned instead, so the caller can convert the data
     * and call this method again.
     *
     * @param string $pdfData PDF data
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data, or ['Unix' => true] (see above)
     *
     * @throws \Exception if it was unable to find startxref
     * @throws \Exception if it was unable to find xref
     */
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';
        $pdfLength = \strlen($pdfData);

        // An offset behind the end of the data can never point at xref data. Fail with the
        // documented exception here, because strpos() below throws a ValueError for such an
        // offset on PHP 8.
        if ($offset > $pdfLength) {
            throw new \Exception('Unable to find xref (PDF corrupted?)');
        }

        if (0 === $offset) {
            // find last startxref
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $allMatches, \PREG_SET_ORDER, $offset);
            if (false === $pregResult || 0 === $pregResult) {
                throw new \Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($allMatches);
            $startxref = (int) $lastMatch[1];
        } elseif (strpos($pdfData, 'xref', $offset) === $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found; it has its own match array, because preg_match() overwrites
            // (and on failure empties) the array it is given, so sharing one array with the
            // "obj" check above would lose this result
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new \Exception('Unable to find startxref');
        }

        if ($startxref > $pdfLength) {
            throw new \Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position (strict comparison, so "not found" is never mistaken for position 0)
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Check if the $pdfData might have the wrong line-endings
            $pdfDataUnix = str_replace("\r\n", "\n", $pdfData);
            if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) === $startxref) {
                // Return Unix-line-ending flag
                $xref = ['Unix' => true];
            } else {
                // Cross-Reference Stream
                $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
            }
        }

        if (empty($xref)) {
            throw new \Exception('Unable to find xref');
        }

        return $xref;
    }
922
923
    /**
     * Parses PDF data and returns extracted data as array.
     *
     * Everything in front of the first "%PDF-" marker is dropped before parsing. If the xref lookup
     * reports the 'Unix' flag, "\r\n" line endings are converted to "\n" and the lookup is repeated
     * on the converted data.
     *
     * @param string $data PDF data to parse
     *
     * @return array two elements: the xref/trailer data, followed by the array of parsed PDF document objects (keyed like the xref entries)
     *
     * @throws \Exception if empty PDF data given
     * @throws \Exception if PDF data missing %PDF header
     */
    public function parseData(string $data): array
    {
        if (empty($data)) {
            throw new \Exception('Empty PDF data given.');
        }

        // locate the pdf header
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new \Exception('Invalid PDF data: missing %PDF header.');
        }

        // PDF content string, without anything preceding the header
        $pdfData = $data;
        if ($headerPos > 0) {
            $pdfData = substr($data, $headerPos);
        }

        // xref and trailer data
        $xref = $this->getXrefData($pdfData);

        if (isset($xref['Unix'])) {
            // xref was only found with Unix line-endings: convert the data and look it up again
            $pdfData = str_replace("\r\n", "\n", $pdfData);
            $xref = $this->getXrefData($pdfData);
        }

        // parse all document objects
        $objects = [];
        foreach ($xref['xref'] as $obj => $offset) {
            // only objects with a positive offset, which are not decoded yet, are decoded
            if (isset($objects[$obj]) || !($offset > 0)) {
                continue;
            }

            $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true);
        }

        return [$xref, $objects];
    }
966
}
967