Test Failed
Pull Request — master (#667)
by
unknown
02:48
created

RawDataParser::parseEncryptionInfo()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 11
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
cc 2
eloc 7
nc 2
nop 3
dl 0
loc 11
ccs 0
cts 0
cp 0
crap 6
rs 10
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 *
19
 * @date    2020-01-06
20
 *
21
 * @license LGPLv3
22
 *
23
 * @url     <https://github.com/smalot/pdfparser>
24
 *
25
 *  PdfParser is a pdf library written in PHP, extraction oriented.
26
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
27
 *
28
 *  This program is free software: you can redistribute it and/or modify
29
 *  it under the terms of the GNU Lesser General Public License as published by
30
 *  the Free Software Foundation, either version 3 of the License, or
31
 *  (at your option) any later version.
32
 *
33
 *  This program is distributed in the hope that it will be useful,
34
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
35
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
36
 *  GNU Lesser General Public License for more details.
37
 *
38
 *  You should have received a copy of the GNU Lesser General Public License
39
 *  along with this program.
40
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
41
 */
42
43
namespace Smalot\PdfParser\RawData;
44
45
use Smalot\PdfParser\Config;
46
47
class RawDataParser
48
{
49
    /**
50
     * @var \Smalot\PdfParser\Config
51
     */
52
    private $config;
53
54
    /**
55
     * Configuration array.
56
     */
57
    protected $cfg = [
58
        // if `true` ignore filter decoding errors
59
        'ignore_filter_decoding_errors' => true,
60
        // if `true` ignore missing filter decoding errors
61
        'ignore_missing_filter_decoders' => true,
62
    ];
63
64
    protected $filterHelper;
65
    protected $objects;
66
    /**
67
     * @var \Smalot\PdfParser\Encryption\Info or null
68
     */
69
    protected $encryptionInfo = null;
70 45
    /**
71
     * @var \Smalot\PdfParser\Encryption\Stream or null
72
     */
73 45
    protected $decryptionHelper = null;
74
75 45
    /**
     * Set up the parser with its option array, a filter helper and a configuration object.
     *
     * @param array       $cfg    Options that override the defaults declared in $this->cfg
     * @param Config|null $config Configuration to use; a new Config is created when none is given
     */
    public function __construct($cfg = [], Config $config = null)
    {
        // values supplied by the caller win over the built-in defaults
        $this->cfg = array_merge($this->cfg, $cfg);

        $this->filterHelper = new FilterHelper();

        if ($config) {
            $this->config = $config;
        } else {
            $this->config = new Config();
        }
    }
86
87
    /**
     * Decode the specified stream.
     *
     * Reads /Length and /Filter from the stream dictionary, truncates the stream to the
     * declared length, decrypts it when a decryption helper is set and finally applies
     * every filter the filter helper reports as available.
     *
     * @param string     $pdfData   PDF data
     * @param array      $xref      Cross-reference data, forwarded to getObjectVal() to resolve an indirect /Filter
     * @param array      $sdic      Stream's dictionary array
     * @param string     $stream    Stream to decode
     * @param array|null $objRefArr Object number and generation number of the object owning the stream;
     *                              required when a decryption helper is set
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws \Exception if decryption is needed but $objRefArr is missing, or if a filter fails
     *                    and the matching "ignore" option in $this->cfg is disabled
     */
    protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream, array $objRefArr = null): array
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }
        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name entries that are followed by a value are of interest
            if ('/' !== $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $next = $sdic[$k + 1];
            if ('Length' === $v[1]) {
                if ('numeric' === $next[0]) {
                    // get declared stream length
                    $declength = (int) $next[1];
                    // a negative length would make substr() cut bytes from the end, so it is ignored
                    if ($declength >= 0 && $declength < $slength) {
                        $stream = substr($stream, 0, $declength);
                        $slength = $declength;
                    }
                }
            } elseif ('Filter' === $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                if (isset($objval[0]) && \is_array($objval[0])) {
                    // a resolved indirect object is a list of elements; the filter value is its first element
                    $objval = $objval[0];
                }
                $objvalType = $objval[0] ?? null;
                if ('/' === $objvalType) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' === $objvalType) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' === $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        if (null !== $this->decryptionHelper) {
            if (null === $objRefArr) {
                throw new \Exception('Logic error: $objRefArr not passed to decodeStream()');
            }
            list($num, $gen) = $objRefArr;
            $stream = $this->decryptionHelper->decrypt($stream, $num, $gen);
        }

        // decode the stream
        $remaining_filters = [];
        // the list of available filters does not change while decoding, so fetch it once
        $availableFilters = $this->filterHelper->getAvailableFilters();
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters, true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }
            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
            } catch (\Exception $e) {
                $emsg = $e->getMessage();
                // messages starting with '~' signal a missing decoder; everything else (including
                // an empty message) is treated as a decoding error
                $missingDecoder = '' !== $emsg && '~' === $emsg[0];
                $ignore = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // keep the original exception attached so its stack trace is not lost
                    throw new \Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
164
165 32
    /**
     * Read a classic cross-reference table and the trailer dictionary that follows it.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Cross-reference data collected so far (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if no trailer is found after the table
     */
    protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
    {
        // step over the 4 characters of the word 'xref' ...
        $startxref += 4;
        // ... and over the white space that follows it
        $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref);

        // walk through subsection headers and entries; $obj_num is the number of the next entry
        $obj_num = 0;
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';
        while (preg_match($entryPattern, $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($matches[0][1] != $offset) {
                // the next match is not adjacent: the table has ended
                break;
            }
            $offset += \strlen($matches[0][0]);

            switch ($matches[3][0]) {
                case 'n':
                    // in-use entry, keyed by [object number]_[generation number]
                    $index = $obj_num.'_'.(int) $matches[2][0];
                    if (!isset($xref['xref'][$index])) {
                        // first definition wins: store the object offset
                        $xref['xref'][$index] = (int) $matches[1][0];
                    }
                    ++$obj_num;
                    break;

                case 'f':
                    // free entry: only advances the object counter
                    ++$obj_num;
                    break;

                default:
                    // subsection header: first number is the starting object number
                    $obj_num = (int) $matches[1][0];
                    break;
            }
        }

        // the trailer is mandatory
        if (!(preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0)) {
            throw new \Exception('Unable to find trailer');
        }
        $trailer_data = $matches[1][0];

        if (empty($xref['trailer'])) {
            // only the most recently written trailer is kept
            $trailer = [];
            if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $found) > 0) {
                $trailer['size'] = (int) $found[1];
            }
            // entries holding an indirect reference, stored as [object number]_[generation number]
            $refPatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($refPatterns as $key => $pattern) {
                if (preg_match($pattern, $trailer_data, $found) > 0) {
                    $trailer[$key] = (int) $found[1].'_'.(int) $found[2];
                }
            }
            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $found) > 0) {
                $trailer['id'] = [$found[1], $found[2]];
            }
            $xref['trailer'] = $trailer;
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $found) > 0) {
            // continue with the previous cross-reference section
            $xref = $this->getXrefData($pdfData, (int) $found[1], $xref);
        }

        return $xref;
    }
241 9
242
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream dictionary (/Type, /Index, /Prev, /W, /DecodeParms and, for the first
     * section parsed, the trailer entries), undoes the PNG predictor when one is declared,
     * splits the data into rows of three fields and stores the resulting entries in $xref.
     * The stream data is only decoded when /Type is /XRef and /W supplies three field widths.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);
        // trailer values are only taken from the first section that is parsed (the last updated version)
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }
        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $prevxref = null;
        // list of [first object number, number of objects] pairs taken from /Index
        $index_blocks = [];
        // number of bytes of each of the three fields, taken from /W
        $wb = [];
        $sarr = $xrefcrs[0][1] ?? null;
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            // every entry handled below is a name followed by its value
            if ('/' !== $v[0] || !isset($sarr[$k + 1])) {
                continue;
            }
            $name = $v[1];
            $next = $sarr[$k + 1];

            if ('Type' === $name && '/' === $next[0] && 'XRef' === $next[1]) {
                $valid_crs = true;
            } elseif ('Index' === $name) {
                $index_blocks = [];
                $indexValues = \is_array($next[1]) ? array_values($next[1]) : [];
                // values come in pairs; a trailing value without partner is ignored
                for ($m = 0, $last = \count($indexValues) - 1; $m < $last; $m += 2) {
                    $index_blocks[] = [(int) $indexValues[$m][1], (int) $indexValues[$m + 1][1]];
                }
            } elseif ('Prev' === $name) {
                if ('numeric' === $next[0]) {
                    // get previous xref offset
                    $prevxref = (int) $next[1];
                }
            } elseif ('W' === $name) {
                // number of bytes (in the decoded stream) of the corresponding field
                if (\is_array($next[1]) && isset($next[1][0][1], $next[1][1][1], $next[1][2][1])) {
                    $wb = [(int) $next[1][0][1], (int) $next[1][1][1], (int) $next[1][2][1]];
                }
            } elseif ('DecodeParms' === $name) {
                $decpar = $next[1] ?? null;
                if (\is_array($decpar)) {
                    foreach ($decpar as $kdc => $vdc) {
                        if ('/' !== $vdc[0] || !isset($decpar[$kdc + 1]) || 'numeric' !== $decpar[$kdc + 1][0]) {
                            continue;
                        }
                        if ('Columns' === $vdc[1]) {
                            $columns = (int) $decpar[$kdc + 1][1];
                        } elseif ('Predictor' === $vdc[1]) {
                            $predictor = (int) $decpar[$kdc + 1][1];
                        }
                    }
                }
            } elseif ($filltrailer) {
                if ('Size' === $name) {
                    if ('numeric' === $next[0]) {
                        $xref['trailer']['size'] = $next[1];
                    }
                } elseif ('Root' === $name) {
                    if ('objref' === $next[0]) {
                        $xref['trailer']['root'] = $next[1];
                    }
                } elseif ('Info' === $name) {
                    if ('objref' === $next[0]) {
                        $xref['trailer']['info'] = $next[1];
                    }
                } elseif ('Encrypt' === $name) {
                    if ('objref' === $next[0]) {
                        $xref['trailer']['encrypt'] = $next[1];
                    }
                } elseif ('ID' === $name) {
                    if (\is_array($next[1]) && isset($next[1][0][1], $next[1][1][1])) {
                        $xref['trailer']['id'] = [$next[1][0][1], $next[1][1][1]];
                    }
                }
            }
        }

        // decode data; without the three /W widths the rows cannot be interpreted
        if ($valid_crs && isset($xrefcrs[1][3][0]) && \is_string($xrefcrs[1][3][0]) && 3 === \count($wb)) {
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (false === $bytes) {
                // nothing could be unpacked: treat the stream as empty
                $bytes = [];
            }

            if (null !== $predictor) {
                // number of bytes in a row: one predictor byte plus the data columns
                $rowlen = max(1, $columns + 1);
                // split the rows
                $rows = array_chunk($bytes, $rowlen);
                // initialize decoded array
                $ddata = [];
                // the row above the first row is all zeros
                $prev_row = array_fill(0, $rowlen, 0);
                // for each row apply PNG unpredictor
                foreach ($rows as $k => $row) {
                    // initialize new row
                    $ddata[$k] = [];
                    // each row starts with its own PNG filter type
                    $rowPredictor = (10 + $row[0]);
                    // for each byte on the row
                    for ($i = 1; $i <= $columns; ++$i) {
                        // new index
                        $j = ($i - 1);
                        // a truncated last row is padded with zeros
                        $current = $row[$i] ?? 0;
                        $row_up = $prev_row[$j];
                        if (1 == $i) {
                            $row_left = 0;
                            $row_upleft = 0;
                        } else {
                            // PNG reconstruction works on the already reconstructed byte to the left
                            $row_left = $ddata[$k][$j - 1];
                            $row_upleft = $prev_row[$j - 1];
                        }
                        switch ($rowPredictor) {
                            case 10:  // PNG prediction (on encoding, PNG None on all rows)
                                $ddata[$k][$j] = $current;
                                break;

                            case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                                $ddata[$k][$j] = (($current + $row_left) & 0xFF);
                                break;

                            case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                                $ddata[$k][$j] = (($current + $row_up) & 0xFF);
                                break;

                            case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                                // integer division: the average is rounded down and stays an int
                                $ddata[$k][$j] = (($current + intdiv($row_left + $row_up, 2)) & 0xFF);
                                break;

                            case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                                // initial estimate
                                $p = ($row_left + $row_up - $row_upleft);
                                // distances
                                $pa = abs($p - $row_left);
                                $pb = abs($p - $row_up);
                                $pc = abs($p - $row_upleft);
                                $pmin = min($pa, $pb, $pc);
                                // pick the nearest neighbour; ties are resolved in the order left, up, upper left
                                if ($pmin === $pa) {
                                    $ddata[$k][$j] = (($current + $row_left) & 0xFF);
                                } elseif ($pmin === $pb) {
                                    $ddata[$k][$j] = (($current + $row_up) & 0xFF);
                                } else {
                                    $ddata[$k][$j] = (($current + $row_upleft) & 0xFF);
                                }
                                break;

                            default:  // PNG prediction (on encoding, PNG optimum)
                                throw new \Exception('Unknown PNG predictor: '.$rowPredictor);
                        }
                    }
                    $prev_row = $ddata[$k];
                } // end for each row
            } else {
                // number of bytes in a row
                $rowlen = array_sum($wb);
                // split the rows; widths summing to zero describe no data at all
                $ddata = $rowlen > 0 ? array_chunk($bytes, $rowlen) : [];
            }

            $sdata = [];

            // for every row
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column (most significant byte first)
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref, starting at the first object number of the first /Index entry (if any)
            $obj_num = isset($index_blocks[0]) ? $index_blocks[0][0] : 0;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
                if (isset($index_blocks[0])) {
                    // reduce the number of remaining objects
                    --$index_blocks[0][1];
                    if (0 == $index_blocks[0][1]) {
                        // remove the actual used /Index entry
                        array_shift($index_blocks);
                        if (isset($index_blocks[0])) {
                            // load the first object number of the following /Index entry
                            $obj_num = $index_blocks[0][0];
                        }
                    }
                }
            }
        } // end decoding data
        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
512
513
    /**
     * Build the regular expression that matches an object header such as "4 0 obj".
     *
     * @param array $objRefs The two parts of the object reference, used in the order index 0, index 1
     *
     * @return string regular expression including its delimiters
     */
    protected function getObjectHeaderPattern(array $objRefs): string
    {
        // the parts of the header may be separated by any white space character (PDF specifications)
        $separator = $this->config->getPdfWhitespacesRegex();
        $parts = [$objRefs[0], $objRefs[1], 'obj'];

        return '/'.implode($separator, $parts).'/';
    }
518
519
    /**
     * Compute the length of an object header such as "4 0 obj".
     *
     * @param array $objRefs The two parts of the object reference (index 0 and index 1)
     *
     * @return int number of characters taken by the header
     */
    protected function getObjectHeaderLen(array $objRefs): int
    {
        // two separating white spaces plus the three letters of "obj"
        $fixedPart = 2 + \strlen('obj');
        $numbersPart = \strlen($objRefs[0]) + \strlen($objRefs[1]);

        return $fixedPart + $numbersPart;
    }
525
526
    /**
     * Get content of indirect object.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Cross-reference data, handed on to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data; ['null', 'null', $offset] when no matching
     *               object header is found at the offset
     *
     * @throws \Exception if invalid object reference found
     */
    protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array
    {
        // the expected header is "[object number] [generation number] obj"
        $objRefArr = DataHelper::decodeRef($objRef);
        $objHeaderLen = $this->getObjectHeaderLen($objRefArr);

        // position on the header: skip white space, then leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        $headerPattern = $this->getObjectHeaderPattern($objRefArr);
        $headerCandidate = substr($pdfData, $offset, $objHeaderLen);
        if (0 == preg_match($headerPattern, $headerCandidate)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // the object content starts right after the header
        $offset += $objHeaderLen;
        $objContentArr = [];
        // the dictionary element read in the previous iteration, if that element was a dictionary
        $header = null;
        while (true) {
            $oldOffset = $offset;

            // read the next element; a preceding dictionary is passed along for stream parsing
            $headerDic = null !== $header ? $header[1] : null;
            $element = $this->getRawObject($pdfData, $offset, $headerDic);
            $offset = $element[2];

            // a stream is decoded with the information of the dictionary in front of it
            if ($decoding && null !== $header && 'stream' === $element[0]) {
                $element[3] = $this->decodeStream($pdfData, $xref, $header[1], $element[1], $objRefArr);
            }

            $objContentArr[] = $element;
            $header = '<<' === ($element[0] ?? null) ? $element : null;

            // stop at the closing keyword or when no progress is made any more
            if ('endobj' === $element[0] || $offset === $oldOffset) {
                break;
            }
        }

        // the element read last (normally the closing delimiter) is not part of the content
        array_pop($objContentArr);

        return $objContentArr;
    }
589 41
590
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Resolved objects are cached in $this->objects. A reference whose offset is unknown
     * is returned unchanged.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data; offsets are looked up under the 'xref' key
     *                        (as filled by decodeXref() and decodeXrefStream()) and, for backward
     *                        compatibility, directly under the object reference
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws \Exception
     */
    protected function getObjectVal(string $pdfData, $xref, array $obj): array
    {
        if ('objref' !== $obj[0]) {
            // not a reference: the value is already final
            return $obj;
        }

        $ref = $obj[1];
        if (isset($this->objects[$ref])) {
            // this object has been already parsed
            return $this->objects[$ref];
        }

        // find the offset of the referenced object
        $offset = null;
        if (\is_array($xref)) {
            $offset = $xref['xref'][$ref] ?? $xref[$ref] ?? null;
        }
        // negative offsets mark entries without a position in the file and cannot be parsed here
        if (null === $offset || !is_numeric($offset) || $offset < 0) {
            return $obj;
        }

        // parse new object
        $this->objects[$ref] = $this->getIndirectObject($pdfData, $xref, $ref, (int) $offset, false);

        return $this->objects[$ref];
    }
617
618 42
    /**
     * Get object type, raw value and offset to next object.
     *
     * Leading PDF whitespace is skipped, then the first remaining character decides
     * which kind of object is read: comment, name, literal string, array, dictionary,
     * hexadecimal string, or one of the keyword/numeric forms (endobj, null, true,
     * false, stream, endstream, "N G R" reference, "N G obj" header, number).
     *
     * Literal strings are returned with backslash escapes (\b \t \n \f \r) decoded and
     * escaped line breaks removed. NOTE(review): octal escapes (\ddd) are not decoded
     * here; the digits are copied through unchanged.
     *
     * @param string     $pdfData   PDF data
     * @param int        $offset    Object offset
     * @param array|null $headerDic obj header's dictionary, parsed by getRawObject. Used for stream parsing optimization
     *
     * @return array containing object type, raw value and offset to next object (in that order).
     *               Type and value are empty strings when nothing could be recognised at $offset.
     */
    protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
    {
        $objtype = ''; // object type to be returned
        $objval = ''; // object value to be returned

        // skip initial white space chars
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);

        // End of data reached: report "nothing found" instead of reading past the
        // end of the string (which raises an "Uninitialized string offset" warning).
        if (!isset($pdfData[$offset])) {
            return [$objtype, $objval, $offset];
        }

        // get first char
        $char = $pdfData[$offset];
        // get object type
        switch ($char) {
            case '%':  // \x25 PERCENT SIGN
                // skip comment and search for next token
                $next = strcspn($pdfData, "\r\n", $offset);
                if ($next > 0) {
                    $offset += $next;

                    return $this->getRawObject($pdfData, $offset);
                }
                break;

            case '/':  // \x2F SOLIDUS
                // name object
                $objtype = $char;
                ++$offset;
                // a name ends at the first whitespace or delimiter character (at most 256 bytes are scanned)
                $span = strcspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20\n\t\r\v\f\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25", $offset, 256);
                if ($span > 0) {
                    $objval = substr($pdfData, $offset, $span); // unescaped value
                    $offset += $span;
                }
                break;

            case '(':  // \x28 LEFT PARENTHESIS
            case ')':  // \x29 RIGHT PARENTHESIS
                // literal string object
                $objtype = $char;
                ++$offset;
                $strpos = $offset;
                if ('(' === $char) {
                    $open_bracket = 1; // nesting depth of unescaped parentheses
                    $quoting = false;  // true while the previous char was an unescaped backslash
                    $objval = '';
                    // 0: keep the current char, 1: drop it,
                    // -1: drop it and also drop the next char if that one is "\n"
                    $skip = 0;
                    while ($open_bracket > 0) {
                        if (!isset($pdfData[$strpos])) {
                            // unterminated string: stop at end of data
                            break;
                        }
                        $ch = $pdfData[$strpos];

                        // An escaped "\r" was seen on the previous char: swallow a directly following "\n" too
                        if (-1 === $skip && "\n" === $ch) {
                            $skip = 1;
                        } else {
                            $skip = 0;
                        }

                        if (!$quoting) {
                            switch ($ch) {
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
                                    $quoting = true;
                                    $skip = 1;
                                    break;

                                case '(':  // LEFT PARENTHESIS (28h)
                                    ++$open_bracket;
                                    break;

                                case ')':  // RIGHT PARENTHESIS (29h)
                                    --$open_bracket;
                                    if (0 === $open_bracket) {
                                        // End of string; the closing parenthesis is not part of the value
                                        $skip = 1;
                                    }
                                    break;
                            }
                        } else {
                            // Second half of a backslash-quoted sequence
                            $quoting = false;
                            switch ($ch) {
                                case 'b': // results in 0x08
                                    $ch = "\x08";
                                    break;
                                case 't': // results in 0x09
                                    $ch = "\t";
                                    break;
                                case 'n': // results in 0x0a
                                    $ch = "\n";
                                    break;
                                case 'f': // results in 0x0c
                                    $ch = "\f";
                                    break;
                                case 'r': // results in 0x0d
                                    $ch = "\r";
                                    break;

                                // Escaped line breaks are dropped from the value
                                case "\n": // 0x0a
                                    $skip = 1;
                                    break;
                                case "\r": // 0x0d
                                    // variable skip; also skip \n if present
                                    $skip = -1;
                                    break;
                            }
                        }

                        if (0 === $skip) {
                            $objval .= $ch;
                        }
                        ++$strpos;
                    }
                    $offset = $strpos;
                }
                break;

            case '[':  // \x5B LEFT SQUARE BRACKET
            case ']':  // \x5D RIGHT SQUARE BRACKET
                // array object
                $objtype = $char;
                ++$offset;
                if ('[' === $char) {
                    // get array content
                    $objval = [];
                    do {
                        $oldOffset = $offset;
                        // get element
                        $element = $this->getRawObject($pdfData, $offset);
                        $offset = $element[2];
                        $objval[] = $element;
                        // stop on the closing bracket, or when no progress was made (avoids an endless loop)
                    } while ((']' !== $element[0]) && ($offset !== $oldOffset));
                    // remove closing delimiter
                    array_pop($objval);
                }
                break;

            case '<':  // \x3C LESS-THAN SIGN
            case '>':  // \x3E GREATER-THAN SIGN
                if (isset($pdfData[$offset + 1]) && ($pdfData[$offset + 1] === $char)) {
                    // dictionary object
                    $objtype = $char.$char;
                    $offset += 2;
                    if ('<' === $char) {
                        // get dictionary content
                        $objval = [];
                        do {
                            $oldOffset = $offset;
                            // get element
                            $element = $this->getRawObject($pdfData, $offset);
                            $offset = $element[2];
                            $objval[] = $element;
                            // stop on the closing delimiter, or when no progress was made
                        } while (('>>' !== $element[0]) && ($offset !== $oldOffset));
                        // remove closing delimiter
                        array_pop($objval);
                    }
                } else {
                    // hexadecimal string object
                    $objtype = $char;
                    ++$offset;

                    $span = strspn($pdfData, "0123456789abcdefABCDEF\x09\x0a\x0c\x0d\x20", $offset);
                    $dataToCheck = $pdfData[$offset + $span] ?? null;
                    if ('<' === $char && $span > 0 && '>' === $dataToCheck) {
                        // remove white space characters
                        // (strtr() with an empty replacement string leaves its input untouched,
                        // so the characters have to be removed with str_replace())
                        $objval = str_replace(
                            str_split($this->config->getPdfWhitespaces()),
                            '',
                            substr($pdfData, $offset, $span)
                        );
                        $offset += $span + 1;
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
                        // not a clean hex string: skip past the next '>' and return an empty value
                        $offset = $endpos + 1;
                    }
                }
                break;

            default:
                if ('endobj' === substr($pdfData, $offset, 6)) {
                    // indirect object
                    $objtype = 'endobj';
                    $offset += 6;
                } elseif ('null' === substr($pdfData, $offset, 4)) {
                    // null object
                    $objtype = 'null';
                    $offset += 4;
                    $objval = 'null';
                } elseif ('true' === substr($pdfData, $offset, 4)) {
                    // boolean true object
                    $objtype = 'boolean';
                    $offset += 4;
                    $objval = 'true';
                } elseif ('false' === substr($pdfData, $offset, 5)) {
                    // boolean false object
                    $objtype = 'boolean';
                    $offset += 5;
                    $objval = 'false';
                } elseif ('stream' === substr($pdfData, $offset, 6)) {
                    // start stream object
                    $objtype = 'stream';
                    $offset += 6;
                    if (1 === preg_match('/^( *[\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
                        $offset += \strlen($matches[0]);

                        // we get stream length here to later help preg_match test less data
                        $streamLen = (int) self::getHeaderValue($headerDic, 'Length', 'numeric', 0);
                        // image payloads are dropped unless the configuration asks to retain them
                        $skipContent = false === $this->config->getRetainImageContent()
                            && 'XObject' == self::getHeaderValue($headerDic, 'Type', '/')
                            && 'Image' == self::getHeaderValue($headerDic, 'Subtype', '/');

                        $pregResult = preg_match(
                            '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
                            $pdfData,
                            $matches,
                            \PREG_OFFSET_CAPTURE,
                            $offset + $streamLen
                        );

                        if (1 === $pregResult) {
                            $objval = $skipContent ? '' : substr($pdfData, $offset, $matches[0][1] - $offset);
                            // continue right at the "endstream" keyword
                            $offset = $matches[1][1];
                        }
                    }
                } elseif ('endstream' === substr($pdfData, $offset, 9)) {
                    // end stream object
                    $objtype = 'endstream';
                    $offset += 9;
                } elseif (1 === preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
                    // indirect object reference
                    $objtype = 'objref';
                    $offset += \strlen($matches[0]);
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
                } elseif (1 === preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
                    // object start
                    $objtype = 'obj';
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
                    $offset += \strlen($matches[0]);
                } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
                    // numeric object
                    $objtype = 'numeric';
                    $objval = substr($pdfData, $offset, $numlen);
                    $offset += $numlen;
                }
                break;
        }

        return [$objtype, $objval, $offset];
    }
869
870
    /**
     * Get value of an object header's section (obj << YYY >> part ).
     *
     * It is similar to Header::get('...')->getContent(), the only difference is it can be used during the parsing process,
     * when no Smalot\PdfParser\Header objects are created yet.
     *
     * It receives the dictionary of header fields, as it is returned by RawDataParser::getRawObject,
     * and iterates over it, searching for a section of type '/' with the requested key.
     * If such a section is found, it tries to read its value (the next object in the dictionary),
     * returning it if it matches the requested type, or the default value otherwise.
     * Only the first section with a matching name is considered.
     *
     * All comparisons are strict, so numeric-looking names such as '1e1' and '10' are
     * never treated as the same key.
     *
     * @param array|null        $headerDic header's dictionary, or null if there is none
     * @param string            $key       header's section name
     * @param string            $type      type of the section (i.e. 'numeric', '/', '<<', etc.)
     * @param string|array|null $default   default value for header's section
     *
     * @return string|array|null value of obj header's section, or default value if none found, or its type doesn't match $type param
     */
    public static function getHeaderValue(?array $headerDic, string $key, string $type, $default = '')
    {
        if (null === $headerDic) {
            return $default;
        }

        foreach ($headerDic as $i => $val) {
            // a section name is a raw object triple [type, value, offset] whose type is '/'
            $isSectionName = \is_array($val) && 3 === \count($val) && '/' === ($val[0] ?? null);
            if (!$isSectionName || ($val[1] ?? null) !== $key || !isset($headerDic[$i + 1])) {
                continue;
            }

            // the element following the name holds the section's value
            $candidate = $headerDic[$i + 1];
            $isSectionValue = \is_array($candidate) && 1 < \count($candidate);

            return $isSectionValue && $type === $candidate[0]
                ? $candidate[1]
                : $default;
        }

        return $default;
    }
911
912
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * With $offset 0 the last "startxref ... %%EOF" marker of the document is used to
     * locate the xref section. With a non-zero $offset the section is expected either
     * directly at $offset ("xref" keyword), in an object found at or after $offset
     * (cross-reference stream), or behind the next "startxref" marker.
     *
     * If the "xref" keyword is only found at the expected position after converting
     * "\r\n" to "\n", ['Unix' => true] is returned instead of xref data so the caller can
     * normalise the line endings and call this method again.
     *
     * @param string $pdfData PDF data
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if it was unable to find startxref
     * @throws \Exception if it was unable to find xref
     */
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        // True when the "xref" keyword starts exactly at $pos. Comparing the four bytes at
        // $pos avoids scanning the rest of the document and cannot mistake "not found"
        // (strpos() returning false) for position 0.
        $hasXrefAt = static function (string $data, int $pos): bool {
            return $pos >= 0 && 'xref' === substr($data, $pos, 4);
        };

        if (0 === $offset) {
            // find last startxref
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $allMatches, \PREG_SET_ORDER, $offset);
            if (!$pregResult) {
                // no match (0) or regex failure (false)
                throw new \Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($allMatches);
            $startxref = (int) $lastMatch[1];
        } elseif ($hasXrefAt($pdfData, $offset)) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (preg_match($startxrefPattern, $pdfData, $startxrefMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found; it uses its own match array so the failed "obj" search
            // above cannot wipe the captured position
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new \Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new \Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position
        if ($hasXrefAt($pdfData, $startxref)) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Check if the $pdfData might have the wrong line-endings
            $pdfDataUnix = str_replace("\r\n", "\n", $pdfData);
            if ($hasXrefAt($pdfDataUnix, $startxref)) {
                // Return Unix-line-ending flag
                $xref = ['Unix' => true];
            } else {
                // Cross-Reference Stream
                $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
            }
        }

        if (empty($xref)) {
            throw new \Exception('Unable to find xref');
        }

        return $xref;
    }
985
986
    /**
     * Prepare decryption for an encrypted document.
     *
     * Parses the encryption dictionary referenced by $encryptRef and stores the result in
     * $this->encryptionInfo, derives the file key from it, and stores the matching stream
     * decryption helper in $this->decryptionHelper. Nothing is returned.
     *
     * NOTE(review): no owner/user password is passed to the key generation here.
     *
     * @param string $pdfData    PDF data
     * @param array  $xref       xref array
     * @param string $encryptRef Object number and generation number separated by underscore character
     *
     * @throws \Exception if invalid object reference found
     */
    protected function setupDecryption(string $pdfData, array $xref, string $encryptRef)
    {
        $info = $this->parseEncryptionInfo($pdfData, $xref, $encryptRef);
        $this->encryptionInfo = $info;

        //# $ownerPassword, $userPassword
        $key = \Smalot\PdfParser\Encryption\FileKey::generate($info);
        $algorithm = $info->getEncAlgorithm();

        $this->decryptionHelper = \Smalot\PdfParser\Encryption\Stream::make($algorithm, $key);
    }
1005
1006
    /**
     * Get content of encryption metadata.
     *
     * Reads the indirect object referenced by $objRef and wraps it, together with the
     * trailer's file id (an empty array when the trailer has none), in an Encryption\Info object.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    xref array
     * @param string $objRef  Object number and generation number separated by underscore character
     *
     * @return \Smalot\PdfParser\Encryption\Info
     *
     * @throws \Exception if invalid object reference found
     */
    protected function parseEncryptionInfo(string $pdfData, array $xref, string $objRef): \Smalot\PdfParser\Encryption\Info
    {
        $fileIdArr = $xref['trailer']['id'] ?? [];

        // position of the encryption dictionary object inside $pdfData
        $objOffset = $xref['xref'][$objRef];
        $encryptArr = $this->getIndirectObject($pdfData, $xref, $objRef, $objOffset, true);

        return new \Smalot\PdfParser\Encryption\Info($encryptArr, $fileIdArr);
    }
1029
1030
    /**
     * Parses PDF data and returns extracted data as array.
     *
     * Everything in front of the "%PDF-" header is discarded, the xref/trailer data is
     * read (a second time with "\r\n" converted to "\n" if the first pass asks for it),
     * decryption is set up when the trailer references an encryption object that is
     * present in the xref table, and finally every object with a positive offset is decoded.
     *
     * @param string $data PDF data to parse
     *
     * @return array array with two entries: the xref data and the parsed PDF document objects
     *
     * @throws \Exception if empty PDF data given
     * @throws \Exception if PDF data missing %PDF header
     */
    public function parseData(string $data): array
    {
        if (empty($data)) {
            throw new \Exception('Empty PDF data given.');
        }

        // find the pdf header starting position
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new \Exception('Invalid PDF data: missing %PDF header.');
        }

        // get PDF content string
        $pdfData = $data;
        if ($headerPos > 0) {
            $pdfData = substr($data, $headerPos);
        }

        // get xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // If we found Unix line-endings
        if (isset($xref['Unix'])) {
            $pdfData = str_replace("\r\n", "\n", $pdfData);
            $xref = $this->getXrefData($pdfData);
        }

        // Pre-parse encryption object
        $encryptRef = $xref['trailer']['encrypt'] ?? null;
        if (null !== $encryptRef && isset($xref['xref'][$encryptRef])) {
            $this->setupDecryption($pdfData, $xref, $encryptRef);
        }

        // parse all document objects
        $objects = [];
        foreach ($xref['xref'] as $objRef => $objOffset) {
            // only objects with a positive offset that were not decoded yet are handled
            if (isset($objects[$objRef]) || !($objOffset > 0)) {
                continue;
            }
            $objects[$objRef] = $this->getIndirectObject($pdfData, $xref, $objRef, $objOffset, true);
        }

        return [$xref, $objects];
    }
1081
}
1082