Test Failed
Pull Request — master (#590)
by
unknown
02:53
created

RawDataParser::getHeaderValue()   B

Complexity

Conditions 11
Paths 9

Size

Total Lines 11
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 11

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 11
eloc 6
c 2
b 0
f 0
nc 9
nop 4
dl 0
loc 11
ccs 4
cts 4
cp 1
crap 11
rs 7.3166

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 *
19
 * @date    2020-01-06
20
 *
21
 * @license LGPLv3
22
 *
23
 * @url     <https://github.com/smalot/pdfparser>
24
 *
25
 *  PdfParser is a pdf library written in PHP, extraction oriented.
26
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
27
 *
28
 *  This program is free software: you can redistribute it and/or modify
29
 *  it under the terms of the GNU Lesser General Public License as published by
30
 *  the Free Software Foundation, either version 3 of the License, or
31
 *  (at your option) any later version.
32
 *
33
 *  This program is distributed in the hope that it will be useful,
34
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
35
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
36
 *  GNU Lesser General Public License for more details.
37
 *
38
 *  You should have received a copy of the GNU Lesser General Public License
39
 *  along with this program.
40
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
41
 */
42
43
namespace Smalot\PdfParser\RawData;
44
45
use Smalot\PdfParser\Config;
46
47
class RawDataParser
48
{
49
    /**
50
     * @var \Smalot\PdfParser\Config
51
     */
52
    private $config;
53
54
    /**
55
     * Configuration array.
56
     */
57
    protected $cfg = [
58
        // if `true` ignore filter decoding errors
59
        'ignore_filter_decoding_errors' => true,
60
        // if `true` ignore missing filter decoding errors
61
        'ignore_missing_filter_decoders' => true,
62
    ];
63
64
    protected $filterHelper;
65
    protected $objects;
66
67
    /**
     * Build a parser with optional overrides for the default settings.
     *
     * @param array       $cfg    Configuration array merged over the defaults in $this->cfg, default is []
     * @param Config|null $config Library configuration; a fresh Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        // merge given array with default values
        $this->cfg = array_merge($this->cfg, $cfg);

        $this->filterHelper = new FilterHelper();
        // explicit nullable type + null-coalescing: implicit "= null" nullability is deprecated in PHP 8.4
        $this->config = $config ?? new Config();
    }
78
79
    /**
     * Decode the specified stream.
     *
     * The stream dictionary is scanned for /Length (used to truncate the raw
     * stream to its declared size) and /Filter (a single name or an array of
     * names). Filters reported as available by the filter helper are applied in
     * order; all other filter names are returned to the caller untouched.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data, passed to getObjectVal() to resolve the /Filter value
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws \Exception if a filter fails and the matching "ignore_*" option is disabled
     */
    protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream): array
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            if ('/' != $v[0] || !isset($sdic[$k + 1])) {
                // only name objects followed by a value are of interest here
                continue;
            }

            if (('Length' == $v[1]) && ('numeric' == $sdic[$k + 1][0])) {
                // get declared stream length
                $declength = (int) $sdic[$k + 1][1];
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $sdic[$k + 1]);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters())) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
            } catch (\Exception $e) {
                $emsg = $e->getMessage();
                // A leading '~' selects the "missing filter decoder" option, anything else the
                // "decoding error" option. The empty-string guard avoids an
                // "uninitialized string offset" warning for exceptions without a message.
                $isMissingDecoder = ('' !== $emsg) && ('~' == $emsg[0]);
                $ignored = $isMissingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignored) {
                    // keep the original exception as "previous" so its trace is not lost
                    throw new \Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
147
148
    /**
     * Decode the Cross-Reference section
     *
     * Reads the classic (table style) xref section that begins at $startxref,
     * then the trailer dictionary that follows it. If the trailer carries a
     * /Prev entry, the referenced xref is loaded through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if no trailer can be found after the xref entries
     */
    protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
    {
        // step over the word 'xref' (4 characters) and the white space behind it
        $afterKeyword = $startxref + 4;
        $offset = $afterKeyword + strspn($pdfData, $this->config->getPdfWhitespaces(), $afterKeyword);

        // walk over cross-reference entries and subsection headers
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';
        $objNum = 0;
        while (preg_match($entryPattern, $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($matches[0][1] != $offset) {
                // the next match is not adjacent: we are on another section
                break;
            }
            $offset += \strlen($matches[0][0]);

            switch ($matches[3][0]) {
                case 'n':
                    // in-use entry, unique index is [object number]_[generation number]
                    $index = $objNum.'_'.((int) $matches[2][0]);
                    if (!isset($xref['xref'][$index])) {
                        // first definition wins: store object offset position
                        $xref['xref'][$index] = (int) $matches[1][0];
                    }
                    ++$objNum;
                    break;

                case 'f':
                    // free entry
                    ++$objNum;
                    break;

                default:
                    // subsection header: first number is the starting object number
                    $objNum = (int) $matches[1][0];
            }
        }

        // get trailer data
        $trailerFound = preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0;
        if (!$trailerFound) {
            throw new \Exception('Unable to find trailer');
        }
        $trailerData = $matches[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $trailer = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
                $trailer['size'] = (int) $found[1];
            }

            // entries holding an indirect reference, stored as [object number]_[generation number]
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailerData, $found) > 0) {
                    $trailer[$key] = ((int) $found[1]).'_'.((int) $found[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $found) > 0) {
                $trailer['id'] = [$found[1], $found[2]];
            }

            $xref['trailer'] = $trailer;
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) $found[1], $xref);
        }

        return $xref;
    }
224
225
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the object at $startxref, collects the entries of its dictionary
     * (/Type, /Index, /Prev, /W, /DecodeParms and, when no trailer has been
     * collected yet, the trailer keys), reverses the PNG predictor if one is
     * declared, and converts the decoded rows into xref entries. If /Prev is
     * present, the referenced xref is loaded through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if unknown PNG predictor detected or the stream bytes cannot be unpacked
     */
    protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        // defined up front so every later read is on an initialised variable
        $index_blocks = null;
        $prevxref = null;
        // number of bytes of each of the three entry fields (/W); zeros until /W is seen
        $wb = [0, 0, 0];

        $sarr = $xrefcrs[0][1];
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            if ('/' != $v[0]) {
                // every entry handled below is introduced by a name object
                continue;
            }
            $name = $v[1];
            $next = $sarr[$k + 1] ?? null;

            if (('Type' == $name) && (null !== $next) && ('/' == $next[0]) && ('XRef' == $next[1])) {
                $valid_crs = true;
            } elseif (('Index' == $name) && (null !== $next)) {
                // initialize list for: first object number in the subsection / number of objects
                $index_blocks = [];
                $indexEntries = $next[1];
                // the size is computed once, not on each iteration
                for ($m = 0, $indexCount = \count($indexEntries); $m < $indexCount; $m += 2) {
                    $index_blocks[] = [$indexEntries[$m][1], $indexEntries[$m + 1][1]];
                }
            } elseif (('Prev' == $name) && (null !== $next) && ('numeric' == $next[0])) {
                // get previous xref offset
                $prevxref = (int) $next[1];
            } elseif (('W' == $name) && (null !== $next)) {
                // number of bytes (in the decoded stream) of the corresponding field
                $wb[0] = (int) $next[1][0][1];
                $wb[1] = (int) $next[1][1][1];
                $wb[2] = (int) $next[1][2][1];
            } elseif (('DecodeParms' == $name) && isset($next[1])) {
                $decpar = $next[1];
                foreach ($decpar as $kdc => $vdc) {
                    $parmValue = $decpar[$kdc + 1] ?? null;
                    if (('/' != $vdc[0]) || (null === $parmValue) || ('numeric' != $parmValue[0])) {
                        continue;
                    }
                    if ('Columns' == $vdc[1]) {
                        $columns = (int) $parmValue[1];
                    } elseif ('Predictor' == $vdc[1]) {
                        $predictor = (int) $parmValue[1];
                    }
                }
            } elseif ($filltrailer && (null !== $next)) {
                if (('Size' == $name) && ('numeric' == $next[0])) {
                    $xref['trailer']['size'] = $next[1];
                } elseif (('Root' == $name) && ('objref' == $next[0])) {
                    $xref['trailer']['root'] = $next[1];
                } elseif (('Info' == $name) && ('objref' == $next[0])) {
                    $xref['trailer']['info'] = $next[1];
                } elseif (('Encrypt' == $name) && ('objref' == $next[0])) {
                    $xref['trailer']['encrypt'] = $next[1];
                } elseif ('ID' == $name) {
                    $xref['trailer']['id'] = [$next[1][0][1], $next[1][1][1]];
                }
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (false === $bytes) {
                throw new \Exception('Unable to unpack cross-reference stream data');
            }

            if (null !== $predictor) {
                // number of bytes in a row: one predictor byte followed by $columns data bytes
                $rowlen = $columns + 1;
                // initialize decoded array
                $ddata = [];
                // initialize first row with zeros
                $prev_row = array_fill(0, $rowlen, 0);
                // for each row apply PNG unpredictor
                foreach (array_chunk($bytes, $rowlen) as $k => $row) {
                    // initialize new row
                    $ddata[$k] = [];
                    // get PNG predictor value
                    $rowPredictor = 10 + $row[0];
                    // for each byte on the row
                    for ($i = 1; $i <= $columns; ++$i) {
                        // new index
                        $j = $i - 1;
                        $row_up = $prev_row[$j];
                        if (1 == $i) {
                            $row_left = 0;
                            $row_upleft = 0;
                        } else {
                            // PNG filters are defined on the reconstructed byte to the left,
                            // not on the still-encoded input byte
                            $row_left = $ddata[$k][$j - 1];
                            $row_upleft = $prev_row[$j - 1];
                        }
                        switch ($rowPredictor) {
                            case 10:  // PNG prediction (on encoding, PNG None on all rows)
                                $ddata[$k][$j] = $row[$i];
                                break;

                            case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                                $ddata[$k][$j] = (($row[$i] + $row_left) & 0xFF);
                                break;

                            case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                                $ddata[$k][$j] = (($row[$i] + $row_up) & 0xFF);
                                break;

                            case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                                // integer division: "float & int" with a fractional float is deprecated since PHP 8.1
                                $ddata[$k][$j] = (($row[$i] + intdiv($row_left + $row_up, 2)) & 0xFF);
                                break;

                            case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                                // initial estimate
                                $p = ($row_left + $row_up - $row_upleft);
                                // distances
                                $pa = abs($p - $row_left);
                                $pb = abs($p - $row_up);
                                $pc = abs($p - $row_upleft);
                                $pmin = min($pa, $pb, $pc);
                                // pick the neighbour with the minimum distance (ties: left, up, up-left)
                                switch ($pmin) {
                                    case $pa:
                                        $ddata[$k][$j] = (($row[$i] + $row_left) & 0xFF);
                                        break;

                                    case $pb:
                                        $ddata[$k][$j] = (($row[$i] + $row_up) & 0xFF);
                                        break;

                                    case $pc:
                                        $ddata[$k][$j] = (($row[$i] + $row_upleft) & 0xFF);
                                        break;
                                }
                                break;

                            default:  // PNG prediction (on encoding, PNG optimum)
                                throw new \Exception('Unknown PNG predictor: '.$rowPredictor);
                        }
                    }
                    $prev_row = $ddata[$k];
                } // end for each row
            } else {
                // no predictor: a row is simply the three fields back to back
                $rowlen = array_sum($wb);
                // split the rows
                $ddata = array_chunk($bytes, $rowlen);
            }

            $sdata = [];

            // for every row
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column (most significant byte first)
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            if (isset($index_blocks)) {
                // load the first object number of the first /Index entry
                $obj_num = $index_blocks[0][0];
            } else {
                $obj_num = 0;
            }
            foreach ($sdata as $k => $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
                if (isset($index_blocks)) {
                    // reduce the number of remaining objects
                    --$index_blocks[0][1];
                    if (0 == $index_blocks[0][1]) {
                        // remove the actual used /Index entry
                        array_shift($index_blocks);
                        if (0 < \count($index_blocks)) {
                            // load the first object number of the following /Index entry
                            $obj_num = $index_blocks[0][0];
                        } else {
                            // no more entries: reset to null to avoid actions on an empty array
                            $index_blocks = null;
                        }
                    }
                }
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
498
499 41
    /**
     * Build the regular expression matching an object header ("<number> <generation> obj").
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return string regular expression, delimited by slashes
     */
    protected function getObjectHeaderPattern(array $objRefs): string
    {
        // the separators come from the configured PDF whitespace expression
        return sprintf(
            '/%s%s%s%sobj/',
            $objRefs[0],
            $this->config->getPdfWhitespacesRegex(),
            $objRefs[1],
            $this->config->getPdfWhitespacesRegex()
        );
    }
504
505 41
    /**
     * Compute the length of an object header such as "4 0 obj".
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return int length of both numbers plus 5 (two separators and the word "obj")
     */
    protected function getObjectHeaderLen(array $objRefs): int
    {
        $objectNumberLen = \strlen($objRefs[0]);
        $generationNumberLen = \strlen($objRefs[1]);

        // 2 whitespaces + strlen("obj") = 5
        return $objectNumberLen + $generationNumberLen + 5;
    }
511
512
    /**
     * Get content of indirect object.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Cross-reference data, handed to decodeStream() when a stream is decoded
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array list of the object's elements, or ['null', 'null', $offset] when no matching
     *               object header is found at the offset
     *
     * @throws \Exception if invalid object reference found
     */
    protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array
    {
        /*
         * build indirect object header
         */
        // $objHeader = "[object number] [generation number] obj"
        $objRefArr = explode('_', $objRef);
        if (2 !== \count($objRefArr)) {
            throw new \Exception('Invalid object reference for $obj.');
        }

        $objHeaderLen = $this->getObjectHeaderLen($objRefArr);

        /*
         * check if we are in position
         */
        // ignore whitespace characters at offset
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        // ignore leading zeros for object number
        $offset += strspn($pdfData, '0', $offset);
        if (0 == preg_match($this->getObjectHeaderPattern($objRefArr), substr($pdfData, $offset, $objHeaderLen))) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        /*
         * get content
         */
        // starting position of object content
        $offset += $objHeaderLen;
        $objContentArr = [];
        $i = 0; // object main index
        // last parsed element if it was a dictionary ('<<'), otherwise null
        $header = null;
        do {
            $oldOffset = $offset;
            // get element; "?? null" avoids reading an offset of null while no dictionary was seen yet
            $element = $this->getRawObject($pdfData, $offset, $header[1] ?? null);
            $offset = $element[2];
            // decode stream using stream's dictionary information
            if ($decoding && ('stream' === $element[0]) && (null !== $header)) {
                $element[3] = $this->decodeStream($pdfData, $xref, $header[1], $element[1]);
            }
            $objContentArr[$i] = $element;
            $header = isset($element[0]) && '<<' === $element[0] ? $element : null;
            ++$i;
            // stop at the end of the object, or when the parser makes no progress
        } while (('endobj' !== $element[0]) && ($offset !== $oldOffset));
        // remove closing delimiter
        array_pop($objContentArr);

        /*
         * return raw object content
         */
        return $objContentArr;
    }
578
579
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Resolved objects are memoised in $this->objects, keyed by the reference.
     *
     * @param string $pdfData PDF data
     * @param mixed  $xref    Lookup consulted for the reference's offset and passed on to getIndirectObject()
     * @param array  $obj     Object value
     *
     * @return array containing object data; $obj itself when it is not a resolvable reference
     *
     * @throws \Exception
     */
    protected function getObjectVal(string $pdfData, $xref, array $obj): array
    {
        if ('objref' != $obj[0]) {
            // plain value, nothing to resolve
            return $obj;
        }

        $reference = $obj[1];

        if (isset($this->objects[$reference])) {
            // this object has been already parsed
            return $this->objects[$reference];
        }

        if (!isset($xref[$reference])) {
            // unknown reference: hand the reference back untouched
            return $obj;
        }

        // parse new object (without decoding its streams) and remember it
        $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);

        return $this->objects[$reference];
    }
606
607
    /**
608
     * Get object type, raw value and offset to next object
609
     *
610
     * @param int $offset Object offset
611
     * @param array|null $headerDic obj header's dictionary, parsed by getRawObject. Used for stream parsing optimization
612 42
     *
613
     * @return array containing object type, raw value and offset to next object
614 42
     */
615 42
    protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
    {
        // Result is always the triplet [type, value, offset-after-object].
        $objtype = ''; // object type to be returned
        $objval = ''; // object value to be returned

        // skip initial white space chars
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);

        // get first char; at (or past) the end of the data there is none, in which case
        // no branch below matches and ['', '', $offset] is returned without a warning
        $char = $pdfData[$offset] ?? '';

        // get object type
        switch ($char) {
            case '%':  // \x25 PERCENT SIGN
                // skip comment and search for next token
                $next = strcspn($pdfData, "\r\n", $offset);
                if ($next > 0) {
                    $offset += $next;

                    // forward the header dictionary so that a "stream" keyword
                    // following the comment is still parsed with its /Length
                    return $this->getRawObject($pdfData, $offset, $headerDic);
                }
                break;

            case '/':  // \x2F SOLIDUS
                // name object
                $objtype = $char;
                ++$offset;
                // a name ends at the first whitespace or delimiter char (looks at most 256 bytes ahead)
                $span = strcspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20\n\t\r\v\f\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25", $offset, 256);
                if ($span > 0) {
                    $objval = substr($pdfData, $offset, $span); // unescaped value
                    $offset += $span;
                }
                break;

            case '(':   // \x28 LEFT PARENTHESIS
            case ')':  // \x29 RIGHT PARENTHESIS
                // literal string object
                $objtype = $char;
                ++$offset;
                $strpos = $offset;
                if ('(' === $char) {
                    // walk to the matching closing parenthesis, honouring nesting and escapes
                    $open_bracket = 1;
                    while ($open_bracket > 0) {
                        if (!isset($pdfData[$strpos])) {
                            break;
                        }
                        $ch = $pdfData[$strpos];
                        switch ($ch) {
                            case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
                                // skip next character
                                ++$strpos;
                                break;

                            case '(':  // LEFT PARENTHESIS (28h)
                                ++$open_bracket;
                                break;

                            case ')':  // RIGHT PARENTHESIS (29h)
                                --$open_bracket;
                                break;
                        }
                        ++$strpos;
                    }
                    // value excludes the closing parenthesis
                    $objval = substr($pdfData, $offset, $strpos - $offset - 1);
                    $offset = $strpos;
                }
                break;

            case '[':   // \x5B LEFT SQUARE BRACKET
            case ']':  // \x5D RIGHT SQUARE BRACKET
                // array object
                $objtype = $char;
                ++$offset;
                if ('[' === $char) {
                    // get array content
                    $objval = [];
                    do {
                        $oldOffset = $offset;
                        // get element
                        $element = $this->getRawObject($pdfData, $offset);
                        $offset = $element[2];
                        $objval[] = $element;
                        // stop at the closing bracket, or when no progress is made (malformed data)
                    } while ((']' !== $element[0]) && ($offset !== $oldOffset));
                    // remove closing delimiter
                    array_pop($objval);
                }
                break;

            case '<':  // \x3C LESS-THAN SIGN
            case '>':  // \x3E GREATER-THAN SIGN
                if (isset($pdfData[$offset + 1]) && ($pdfData[$offset + 1] === $char)) {
                    // dictionary object
                    $objtype = $char.$char;
                    $offset += 2;
                    if ('<' === $char) {
                        // get dictionary content as a flat list of elements
                        $objval = [];
                        do {
                            $oldOffset = $offset;
                            // get element
                            $element = $this->getRawObject($pdfData, $offset);
                            $offset = $element[2];
                            $objval[] = $element;
                            // stop at the closing delimiter, or when no progress is made (malformed data)
                        } while (('>>' !== $element[0]) && ($offset !== $oldOffset));
                        // remove closing delimiter
                        array_pop($objval);
                    }
                } else {
                    // hexadecimal string object
                    $objtype = $char;
                    ++$offset;

                    $span = strspn($pdfData, "0123456789abcdefABCDEF\x09\x0a\x0c\x0d\x20", $offset);
                    $dataToCheck = $pdfData[$offset + $span] ?? null;
                    if ('<' === $char && $span > 0 && '>' === $dataToCheck) {
                        // remove white space characters; strtr() with an empty replacement
                        // string leaves its input untouched, so str_replace() is used instead
                        $objval = str_replace(
                            str_split($this->config->getPdfWhitespaces()),
                            '',
                            substr($pdfData, $offset, $span)
                        );
                        $offset += $span + 1;
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
                        // not a well-formed hex string: skip past the next '>' and keep an empty value
                        $offset = $endpos + 1;
                    }
                }
                break;

            default:
                if ('endobj' === substr($pdfData, $offset, 6)) {
                    // indirect object
                    $objtype = 'endobj';
                    $offset += 6;
                } elseif ('null' === substr($pdfData, $offset, 4)) {
                    // null object
                    $objtype = 'null';
                    $offset += 4;
                    $objval = 'null';
                } elseif ('true' === substr($pdfData, $offset, 4)) {
                    // boolean true object
                    $objtype = 'boolean';
                    $offset += 4;
                    $objval = 'true';
                } elseif ('false' === substr($pdfData, $offset, 5)) {
                    // boolean false object
                    $objtype = 'boolean';
                    $offset += 5;
                    $objval = 'false';
                } elseif ('stream' === substr($pdfData, $offset, 6)) {
                    // start stream object
                    $objtype = 'stream';
                    $offset += 6;
                    if (1 === preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
                        $offset += \strlen($matches[0]);

                        // the declared /Length lets the "endstream" search skip the stream body;
                        // negative values are clamped because a negative preg offset counts from the end
                        $streamLen = max(0, (int) $this->getHeaderValue($headerDic, 'Length', 'numeric', '0'));
                        // image content is dropped unless the configuration asks to retain it
                        $skip = !$this->config->getRetainImageContent()
                            && 'XObject' === $this->getHeaderValue($headerDic, 'Type', '/')
                            && 'Image' === $this->getHeaderValue($headerDic, 'Subtype', '/');

                        $endstreamPattern = '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU';
                        $searchFrom = $offset + $streamLen;

                        // preg_match() fails outright when the start offset lies beyond the subject
                        $pregResult = 0;
                        if ($searchFrom <= \strlen($pdfData)) {
                            $pregResult = preg_match(
                                $endstreamPattern,
                                $pdfData,
                                $matches,
                                \PREG_OFFSET_CAPTURE,
                                $searchFrom
                            );
                        }

                        // a wrong (too large) /Length must not lose the stream: retry from its start
                        if (1 !== $pregResult && $streamLen > 0) {
                            $pregResult = preg_match(
                                $endstreamPattern,
                                $pdfData,
                                $matches,
                                \PREG_OFFSET_CAPTURE,
                                $offset
                            );
                        }

                        if (1 === $pregResult) {
                            $objval = $skip ? '' : substr($pdfData, $offset, $matches[0][1] - $offset);
                            // leave the offset on the "endstream" keyword so that it is parsed next
                            $offset = $matches[1][1];
                        }
                    }
                } elseif ('endstream' === substr($pdfData, $offset, 9)) {
                    // end stream object
                    $objtype = 'endstream';
                    $offset += 9;
                } elseif (1 === preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
                    // indirect object reference
                    $objtype = 'objref';
                    $offset += \strlen($matches[0]);
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
                } elseif (1 === preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
                    // object start
                    $objtype = 'obj';
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
                    $offset += \strlen($matches[0]);
                } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
                    // numeric object
                    $objtype = 'numeric';
                    $objval = substr($pdfData, $offset, $numlen);
                    $offset += $numlen;
                }
                break;
        }

        return [$objtype, $objval, $offset];
    }
806
807
    /**
     * Looks up the value of one entry in an object's header dictionary.
     *
     * The dictionary is scanned for a name element (type '/') whose value equals $key.
     * The element that directly follows it is taken as the entry's value and is returned
     * only if its type equals $type; otherwise the default is returned.
     *
     * @param array|null  $headerDic obj's header, parsed by getRawObject: a list of elements
     *                               whose index 0 is the type and index 1 the value
     * @param string      $key       header's section (name without the leading slash)
     * @param string      $type      expected type of the section's value (ie 'numeric', '/', etc.)
     * @param string|null $default   default value for header's section
     *
     * @return string|null value of obj header's section, or default value if the header is
     *                     missing, the section is not found, its type differs from $type or
     *                     its value cannot be represented as a string (array-valued types
     *                     such as '[' and '<<')
     */
    protected function getHeaderValue(?array $headerDic, string $key, string $type, ?string $default = ''): ?string
    {
        if (null === $headerDic) {
            return $default;
        }

        foreach ($headerDic as $i => $val) {
            $isRequestedName = \is_array($val)
                && 3 === \count($val)
                && '/' === ($val[0] ?? null)
                && $key === ($val[1] ?? null);

            if (!$isRequestedName || !isset($headerDic[$i + 1])) {
                continue;
            }

            // the first matching name decides the result, whatever follows it
            $next = $headerDic[$i + 1];
            if (!\is_array($next) || \count($next) < 2 || $type !== ($next[0] ?? null)) {
                return $default;
            }

            $value = $next[1] ?? null;
            if (null === $value) {
                return null;
            }

            // array values would violate the declared return type, so fall back to the default
            return \is_scalar($value) ? (string) $value : $default;
        }

        return $default;
    }
826
827 42
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The xref position is resolved as follows:
     *  - offset 0: the number given by the last "startxref ... %%EOF" marker of the document;
     *  - the data at the offset starts with "xref": the offset itself (xref table);
     *  - an "N N obj" header is found at or after the offset: the offset itself (xref stream);
     *  - otherwise: the number given by the first "startxref ... %%EOF" marker after the offset.
     *
     * @param string $pdfData PDF data
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if it was unable to find startxref
     * @throws \Exception if it was unable to find xref, or the offset lies outside of the data
     */
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';
        $pdfLength = \strlen($pdfData);

        // strpos() refuses offsets outside of the string (ValueError on PHP 8), so reject them up front
        if ($offset < 0 || $offset > $pdfLength) {
            throw new \Exception('Unable to find xref (PDF corrupted?)');
        }

        // each search below writes to its own variable: preg_match() always resets
        // the matches array it is given, even when nothing matched
        if (0 === $offset) {
            // find last startxref
            $found = preg_match_all($startxrefPattern, $pdfData, $allMatches, \PREG_SET_ORDER, $offset);
            if (false === $found || 0 === $found) {
                throw new \Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($allMatches);
            $startxref = (int) $lastMatch[1];
        } elseif ($offset === strpos($pdfData, 'xref', $offset)) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatch, \PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatch, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found
            $startxref = (int) $startxrefMatch[1][0];
        } else {
            throw new \Exception('Unable to find startxref');
        }

        if ($startxref > $pdfLength) {
            throw new \Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position; the comparison is strict so that a failed search (false)
        // is never mistaken for a match at position 0
        if ($startxref === strpos($pdfData, 'xref', $startxref)) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new \Exception('Unable to find xref');
        }

        return $xref;
    }
892
893
    /**
     * Parses raw PDF data into its cross-reference data and its document objects.
     *
     * Everything in front of the "%PDF-" header is discarded, the xref data is read via
     * getXrefData() and every xref entry with a positive offset is decoded via
     * getIndirectObject().
     *
     * @param string $data PDF data to parse
     *
     * @return array two elements: the xref data and the array of parsed PDF document objects
     *
     * @throws \Exception if empty PDF data given
     * @throws \Exception if PDF data missing %PDF header
     */
    public function parseData(string $data): array
    {
        if (empty($data)) {
            throw new \Exception('Empty PDF data given.');
        }

        // locate the pdf header
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new \Exception('Invalid PDF data: missing %PDF header.');
        }

        // drop any leading bytes so that the content starts at the header
        $pdfData = $data;
        if ($headerPos > 0) {
            $pdfData = substr($data, $headerPos);
        }

        // xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // decode every object that has a positive offset
        $objects = [];
        foreach ($xref['xref'] as $objId => $objOffset) {
            if (isset($objects[$objId]) || !($objOffset > 0)) {
                continue;
            }

            $objects[$objId] = $this->getIndirectObject($pdfData, $xref, $objId, $objOffset, true);
        }

        return [$xref, $objects];
    }
930
}
931