Test Failed
Push — master ( 68af1e...78cd9d )
by Konrad
05:29 queued 02:33
created

RawDataParser::getHeaderValue()   B

Complexity

Conditions 11
Paths 29

Size

Total Lines 28
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 14
CRAP Score 11.2363

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 11
eloc 13
c 1
b 0
f 0
nc 29
nop 4
dl 0
loc 28
ccs 14
cts 16
cp 0.875
crap 11.2363
rs 7.3166

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 *
19
 * @date    2020-01-06
20
 *
21
 * @license LGPLv3
22
 *
23
 * @url     <https://github.com/smalot/pdfparser>
24
 *
25
 *  PdfParser is a pdf library written in PHP, extraction oriented.
26
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
27
 *
28
 *  This program is free software: you can redistribute it and/or modify
29
 *  it under the terms of the GNU Lesser General Public License as published by
30
 *  the Free Software Foundation, either version 3 of the License, or
31
 *  (at your option) any later version.
32
 *
33
 *  This program is distributed in the hope that it will be useful,
34
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
35
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
36
 *  GNU Lesser General Public License for more details.
37
 *
38
 *  You should have received a copy of the GNU Lesser General Public License
39
 *  along with this program.
40
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
41
 */
42
43
namespace Smalot\PdfParser\RawData;
44
45
use Smalot\PdfParser\Config;
46
47
class RawDataParser
48
{
49
    /**
50
     * @var \Smalot\PdfParser\Config
51
     */
52
    private $config;
53
54
    /**
55
     * Configuration array.
56
     */
57
    protected $cfg = [
58
        // if `true` ignore filter decoding errors
59
        'ignore_filter_decoding_errors' => true,
60
        // if `true` ignore missing filter decoding errors
61
        'ignore_missing_filter_decoders' => true,
62
    ];
63
64
    protected $filterHelper;
65
    protected $objects;
66
67
    /**
     * Set up the parser: merge the given options into the defaults and create the helpers.
     *
     * @param array       $cfg    Configuration array, default is []; its entries override the defaults in $this->cfg
     * @param Config|null $config Library configuration; a fresh Config instance is used when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        // merge given array with default values
        $this->cfg = array_merge($this->cfg, $cfg);

        $this->filterHelper = new FilterHelper();
        // the nullable type is spelled out: relying on "= null" to make a typed parameter
        // nullable is deprecated as of PHP 8.4
        $this->config = $config ?? new Config();
    }
78
79
    /**
     * Decode the specified stream.
     *
     * Reads /Length and /Filter from the stream dictionary, cuts the stream down to the declared
     * length when that is shorter than the data, and then applies, in order, every filter the
     * filter helper reports as available. Filters it does not know are returned untouched.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Xref data, needed to resolve a /Filter value given as indirect reference
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws \Exception if a filter fails and the matching "ignore_*" option is disabled;
     *                    the original exception is available through getPrevious()
     */
    protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream): array
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name entries followed by a value are of interest
            if ('/' != $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }

            if (('Length' == $v[1]) && ('numeric' == $sdic[$k + 1][0])) {
                // get declared stream length
                $declength = (int) $sdic[$k + 1][1];
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $sdic[$k + 1]);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters())) {
                // add missing filter to array
                $remaining_filters[] = $filter;

                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
            } catch (\Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with '~' is governed by 'ignore_missing_filter_decoders', any
                // other message by 'ignore_filter_decoding_errors'; the emptiness check avoids
                // reading offset 0 of an empty message
                $missingDecoder = ('' !== $emsg) && ('~' === $emsg[0]);
                $ignore = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // chain the original exception so its type and stack trace are not lost
                    throw new \Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
147
148
    /**
     * Decode the Cross-Reference section
     *
     * Walks the table entries that directly follow the 'xref' keyword, then reads the trailer
     * dictionary behind them. Offsets already stored in $xref are kept, and the trailer is only
     * filled while $xref has none. If the trailer names a /Prev section, that section is loaded
     * through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if no trailer can be found behind the table
     */
    protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
    {
        // step over the word 'xref' (4 bytes) and the white space that follows it
        $tableStart = $startxref + 4;
        $offset = $tableStart + strspn($pdfData, $this->config->getPdfWhitespaces(), $tableStart);

        // object number the next table entry belongs to
        $objectNumber = 0;
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';
        while (1 === preg_match($entryPattern, $pdfData, $entry, \PREG_OFFSET_CAPTURE, $offset)) {
            if ($entry[0][1] != $offset) {
                // the match does not start right here: we are on another section
                break;
            }
            $offset += \strlen($entry[0][0]);

            switch ($entry[3][0]) {
                case 'n':
                    // entry in use, indexed as [object number]_[generation number]
                    $key = $objectNumber.'_'.(int) $entry[2][0];
                    if (!isset($xref['xref'][$key])) {
                        // first offset seen for this object wins
                        $xref['xref'][$key] = (int) $entry[1][0];
                    }
                    ++$objectNumber;
                    break;

                case 'f':
                    // free entry: nothing to store, but it still occupies an object number
                    ++$objectNumber;
                    break;

                default:
                    // subsection header: its first number is the object number to continue with
                    $objectNumber = (int) $entry[1][0];
            }
        }

        // get trailer data
        if (1 !== preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailerMatch, \PREG_OFFSET_CAPTURE, $offset)) {
            throw new \Exception('Unable to find trailer');
        }
        $trailerData = $trailerMatch[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = [];

            if (1 === preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $found)) {
                $xref['trailer']['size'] = (int) $found[1];
            }

            // indirect references, stored as [object number]_[generation number]
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $trailerKey => $pattern) {
                if (1 === preg_match($pattern, $trailerData, $found)) {
                    $xref['trailer'][$trailerKey] = (int) $found[1].'_'.(int) $found[2];
                }
            }

            if (1 === preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $found)) {
                $xref['trailer']['id'] = [$found[1], $found[2]];
            }
        }

        if (1 === preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $found)) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) $found[1], $xref);
        }

        return $xref;
    }
224
225
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream object at $startxref, evaluates the entries of its dictionary (/Type,
     * /Index, /Prev, /W, /DecodeParms and, while $xref has no trailer yet, the trailer keys),
     * reverses the PNG predictor if one is declared and merges the decoded entries into $xref.
     * If the dictionary names a /Prev section, that section is loaded through getXrefData().
     *
     * Malformed input is tolerated: a missing /W is treated as three zero-width fields, an
     * unpaired trailing /Index value is ignored, and a row length below 1 yields no entries.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $prevxref = null;
        // list of [first object number, number of objects] pairs taken from /Index
        $index_blocks = [];
        // number of bytes of each of the three entry fields, taken from /W
        $wb = [0, 0, 0];

        $sarr = $xrefcrs[0][1];
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            // every key handled below is a name that is followed by a value
            if ('/' != $v[0] || !isset($sarr[$k + 1])) {
                continue;
            }
            $next = $sarr[$k + 1];

            switch ($v[1]) {
                case 'Type':
                    if ('/' == $next[0] && 'XRef' == $next[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    $index_blocks = [];
                    $pairs = \is_array($next[1] ?? null) ? $next[1] : [];
                    // the size is computed once; a trailing value without partner is skipped
                    for ($m = 0, $pairCount = \count($pairs); $m + 1 < $pairCount; $m += 2) {
                        $index_blocks[] = [(int) $pairs[$m][1], (int) $pairs[$m + 1][1]];
                    }
                    break;

                case 'Prev':
                    if ('numeric' == $next[0]) {
                        // get previous xref offset
                        $prevxref = (int) $next[1];
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of the corresponding field
                    for ($c = 0; $c < 3; ++$c) {
                        $wb[$c] = (int) ($next[1][$c][1] ?? 0);
                    }
                    break;

                case 'DecodeParms':
                    $decpar = $next[1] ?? null;
                    if (!\is_array($decpar)) {
                        break;
                    }
                    foreach ($decpar as $kdc => $vdc) {
                        if ('/' != $vdc[0] || !isset($decpar[$kdc + 1]) || 'numeric' != $decpar[$kdc + 1][0]) {
                            continue;
                        }
                        if ('Columns' == $vdc[1]) {
                            $columns = (int) $decpar[$kdc + 1][1];
                        } elseif ('Predictor' == $vdc[1]) {
                            $predictor = (int) $decpar[$kdc + 1][1];
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && 'numeric' == $next[0]) {
                        $xref['trailer']['size'] = $next[1];
                    }
                    break;

                case 'Root':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['root'] = $next[1];
                    }
                    break;

                case 'Info':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['info'] = $next[1];
                    }
                    break;

                case 'Encrypt':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['encrypt'] = $next[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [];
                        $xref['trailer']['id'][0] = $next[1][0][1];
                        $xref['trailer']['id'][1] = $next[1][1][1];
                    }
                    break;
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (false === $bytes) {
                $bytes = [];
            }

            // Predictor values below 10 do not denote a PNG predictor, so such rows carry no
            // leading filter-type byte
            $usesPngPredictor = (null !== $predictor) && ($predictor >= 10);
            // number of bytes in a row
            $rowlen = $usesPngPredictor ? $columns + 1 : (int) array_sum($wb);
            // array_chunk() does not accept a length below 1
            $ddata = $rowlen > 0 ? array_chunk($bytes, $rowlen) : [];
            if ($usesPngPredictor) {
                $ddata = $this->reversePngPrediction($ddata, $columns);
            }

            $sdata = [];
            // for every row
            foreach ($ddata as $k => $row) {
                // a zero-width type field means the default type 1
                $sdata[$k] = [0 === $wb[0] ? 1 : 0, 0, 0];
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column (big-endian)
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref, starting at the first object number of the first /Index entry (if any)
            $obj_num = [] !== $index_blocks ? $index_blocks[0][0] : 0;
            foreach ($sdata as $row) {
                if (1 === $row[0]) {
                    // (n) objects that are in use but are not compressed
                    // create unique object index: [object number]_[generation number]
                    $index = $obj_num.'_'.$row[2];
                    // check if object already exist
                    if (!isset($xref['xref'][$index])) {
                        // store object offset position
                        $xref['xref'][$index] = $row[1];
                    }
                } elseif (2 === $row[0]) {
                    // compressed objects
                    // $row[1] = object number of the object stream in which this object is stored
                    // $row[2] = index of this object within the object stream
                    $index = $row[1].'_0_'.$row[2];
                    $xref['xref'][$index] = -1;
                }
                // type 0 (free objects) and unknown types store nothing

                ++$obj_num;
                if ([] !== $index_blocks) {
                    // reduce the number of remaining objects
                    --$index_blocks[0][1];
                    if (0 === $index_blocks[0][1]) {
                        // remove the actual used /Index entry
                        array_shift($index_blocks);
                        if ([] !== $index_blocks) {
                            // load the first object number of the following /Index entry
                            $obj_num = $index_blocks[0][0];
                        }
                    }
                }
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Reverse the PNG predictor on the rows of a cross-reference stream.
     *
     * Every input row consists of the PNG filter-type byte followed by $columns data bytes; one
     * byte per pixel is assumed. Bytes missing from a short final row are read as 0.
     *
     * @param array $rows    Rows as produced by array_chunk() (0-based lists of byte values)
     * @param int   $columns Number of data bytes per row
     *
     * @return array rows of reconstructed bytes, without the filter-type byte
     *
     * @throws \Exception if unknown PNG predictor detected
     */
    private function reversePngPrediction(array $rows, int $columns): array
    {
        $ddata = [];
        // the row above the first row counts as all zeros
        $prev_row = $columns > 0 ? array_fill(0, $columns, 0) : [];

        foreach ($rows as $k => $row) {
            $ddata[$k] = [];
            // get PNG predictor value
            $predictor = 10 + $row[0];

            for ($i = 1; $i <= $columns; ++$i) {
                $j = $i - 1;
                $current = $row[$i] ?? 0;
                $row_up = $prev_row[$j];
                if (1 === $i) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    // the filters are defined on the reconstructed byte to the left, not on the
                    // still-filtered one
                    $row_left = $ddata[$k][$j - 1];
                    $row_upleft = $prev_row[$j - 1];
                }

                switch ($predictor) {
                    case 10:  // PNG prediction (on encoding, PNG None on all rows)
                        $ddata[$k][$j] = $current;
                        break;

                    case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                        $ddata[$k][$j] = (($current + $row_left) & 0xFF);
                        break;

                    case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                        $ddata[$k][$j] = (($current + $row_up) & 0xFF);
                        break;

                    case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                        // integer division: the average is rounded down and no float is involved
                        $ddata[$k][$j] = (($current + intdiv($row_left + $row_up, 2)) & 0xFF);
                        break;

                    case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                        // initial estimate
                        $p = ($row_left + $row_up - $row_upleft);
                        // distances
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        // nearest neighbour wins; ties are resolved in the order left, up, upper left
                        if ($pa <= $pb && $pa <= $pc) {
                            $nearest = $row_left;
                        } elseif ($pb <= $pc) {
                            $nearest = $row_up;
                        } else {
                            $nearest = $row_upleft;
                        }
                        $ddata[$k][$j] = (($current + $nearest) & 0xFF);
                        break;

                    default:  // PNG prediction (on encoding, PNG optimum)
                        throw new \Exception('Unknown PNG predictor: '.$predictor);
                }
            }
            $prev_row = $ddata[$k];
        }

        return $ddata;
    }
495
496 9
    /**
     * Build the regular expression that matches an object header ("[object number] [generation number] obj").
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return string PCRE pattern delimited by '/'
     */
    protected function getObjectHeaderPattern(array $objRefs): string
    {
        // consider all whitespace character (PDF specifications)
        $whitespace = $this->config->getPdfWhitespacesRegex();

        // both parts are derived from PDF data, so they are escaped rather than trusted to be
        // plain digits; for digit-only input the resulting pattern is unchanged
        return '/'.preg_quote($objRefs[0], '/').$whitespace.preg_quote($objRefs[1], '/').$whitespace.'obj/';
    }
501
502 41
    /**
     * Compute the length of an object header such as "4 0 obj".
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return int length of both numbers plus 5 (two separating whitespaces and the word "obj")
     */
    protected function getObjectHeaderLen(array $objRefs): int
    {
        $objectNumberLen = \strlen($objRefs[0]);
        $generationLen = \strlen($objRefs[1]);

        // 2 whitespaces + strlen("obj") = 5
        return $objectNumberLen + $generationLen + 5;
    }
508
509 41
    /**
     * Get content of indirect object.
     *
     * Checks that the header "[object number] [generation number] obj" is located at $offset
     * (after white space and leading zeros) and then collects the raw elements that follow until
     * 'endobj' is read or the parser stops advancing.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Xref data, handed on to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data; ['null', 'null', $offset] if the header is not found
     *
     * @throws \Exception if invalid object reference found
     */
    protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array
    {
        // split "[object number]_[generation number]" into its two parts
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new \Exception('Invalid object reference for $obj.');
        }
        $headerLength = $this->getObjectHeaderLen($refParts);

        // move to the header: ignore whitespace characters first, then leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        $headerCandidate = substr($pdfData, $offset, $headerLength);
        if (1 !== preg_match($this->getObjectHeaderPattern($refParts), $headerCandidate)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // the object content starts right behind the header
        $offset += $headerLength;
        $elements = [];
        // set only while the element read last was a dictionary ('<<')
        $dictionary = null;
        do {
            $previousOffset = $offset;

            $element = $this->getRawObject($pdfData, $offset, null !== $dictionary ? $dictionary[1] : null);
            $offset = $element[2];

            if ($decoding && null !== $dictionary && 'stream' === $element[0]) {
                // decode stream using stream's dictionary information
                $element[3] = $this->decodeStream($pdfData, $xref, $dictionary[1], $element[1]);
            }

            $elements[] = $element;
            $dictionary = ('<<' === ($element[0] ?? null)) ? $element : null;
        } while ('endobj' !== $element[0] && $offset !== $previousOffset);

        // remove closing delimiter
        array_pop($elements);

        return $elements;
    }
575
576
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Anything that is not an 'objref', or whose reference is neither cached nor found in $xref,
     * is returned unchanged. Resolved objects are cached in $this->objects.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Lookup from object reference to object offset
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws \Exception
     */
    protected function getObjectVal(string $pdfData, $xref, array $obj): array
    {
        if ('objref' != $obj[0]) {
            // direct value, nothing to resolve
            return $obj;
        }

        if (isset($this->objects[$obj[1]])) {
            // this object has been already parsed
            return $this->objects[$obj[1]];
        }

        // NOTE(review): the lookup uses $xref[<reference>] directly, whereas decodeXref() stores
        // offsets below $xref['xref'] — confirm which shape callers pass in
        if (!isset($xref[$obj[1]])) {
            return $obj;
        }

        // parse new object (streams are left undecoded) and remember it
        $parsed = $this->getIndirectObject($pdfData, $xref, $obj[1], $xref[$obj[1]], false);
        $this->objects[$obj[1]] = $parsed;

        return $parsed;
    }
603
604
    /**
605
     * Get object type, raw value and offset to next object
606
     *
607
     * @param int        $offset    Object offset
608
     * @param array|null $headerDic obj header's dictionary, parsed by getRawObject. Used for stream parsing optimization
609
     *
610
     * @return array containing object type, raw value and offset to next object
611
     */
612 42
    protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
    {
        /*
         * Reads a single raw object starting at $offset and returns it as
         * [type, value, offset right after the object].
         *
         * $headerDic is the already parsed dictionary of the enclosing object. It is only
         * consulted when a "stream" keyword is met (its Length, Type and Subtype entries).
         */
        $objtype = ''; // object type to be returned
        $objval = ''; // object value to be returned

        // skip initial white space chars
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);

        // get first char; at the end of the data there is none, which leads to the
        // default branch below and an empty object instead of a string offset warning
        $char = $pdfData[$offset] ?? '';

        // get object type
        switch ($char) {
            case '%':  // \x25 PERCENT SIGN
                // skip comment and search for next token
                $next = strcspn($pdfData, "\r\n", $offset);
                if ($next > 0) {
                    $offset += $next;

                    // keep the header dictionary, a stream may directly follow the comment
                    return $this->getRawObject($pdfData, $offset, $headerDic);
                }
                break;

            case '/':  // \x2F SOLIDUS
                // name object (at most 256 chars are considered)
                $objtype = $char;
                ++$offset;
                $span = strcspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20\n\t\r\v\f\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25", $offset, 256);
                if ($span > 0) {
                    $objval = substr($pdfData, $offset, $span); // unescaped value
                    $offset += $span;
                }
                break;

            case '(':   // \x28 LEFT PARENTHESIS
            case ')':  // \x29 RIGHT PARENTHESIS
                // literal string object
                $objtype = $char;
                ++$offset;
                $strpos = $offset;
                if ('(' === $char) {
                    // walk to the matching closing parenthesis, honoring nesting and escapes
                    $open_bracket = 1;
                    while ($open_bracket > 0) {
                        if (!isset($pdfData[$strpos])) {
                            break;
                        }
                        $ch = $pdfData[$strpos];
                        switch ($ch) {
                            case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
                                // skip next character
                                ++$strpos;
                                break;

                            case '(':  // LEFT PARENTHESIS (28h)
                                ++$open_bracket;
                                break;

                            case ')':  // RIGHT PARENTHESIS (29h)
                                --$open_bracket;
                                break;
                        }
                        ++$strpos;
                    }
                    $objval = substr($pdfData, $offset, $strpos - $offset - 1);
                    $offset = $strpos;
                }
                break;

            case '[':   // \x5B LEFT SQUARE BRACKET
            case ']':  // \x5D RIGHT SQUARE BRACKET
                // array object
                $objtype = $char;
                ++$offset;
                if ('[' === $char) {
                    // get array content
                    $objval = [];
                    do {
                        $oldOffset = $offset;
                        // get element
                        $element = $this->getRawObject($pdfData, $offset);
                        $offset = $element[2];
                        $objval[] = $element;
                        // stop on the closing delimiter, or when no progress is made (end of data)
                    } while ((']' !== $element[0]) && ($offset !== $oldOffset));
                    // remove closing delimiter
                    array_pop($objval);
                }
                break;

            case '<':  // \x3C LESS-THAN SIGN
            case '>':  // \x3E GREATER-THAN SIGN
                if (isset($pdfData[$offset + 1]) && ($pdfData[$offset + 1] === $char)) {
                    // dictionary object
                    $objtype = $char.$char;
                    $offset += 2;
                    if ('<' === $char) {
                        // get dictionary content
                        $objval = [];
                        do {
                            $oldOffset = $offset;
                            // get element
                            $element = $this->getRawObject($pdfData, $offset);
                            $offset = $element[2];
                            $objval[] = $element;
                            // stop on the closing delimiter, or when no progress is made (end of data)
                        } while (('>>' !== $element[0]) && ($offset !== $oldOffset));
                        // remove closing delimiter
                        array_pop($objval);
                    }
                } else {
                    // hexadecimal string object
                    $objtype = $char;
                    ++$offset;

                    $span = strspn($pdfData, "0123456789abcdefABCDEF\x09\x0a\x0c\x0d\x20", $offset);
                    $dataToCheck = $pdfData[$offset + $span] ?? null;
                    if ('<' === $char && $span > 0 && '>' === $dataToCheck) {
                        // remove white space characters; strtr() with an empty replacement string
                        // would leave the value untouched, so each whitespace char is replaced explicitly
                        $objval = str_replace(
                            str_split($this->config->getPdfWhitespaces()),
                            '',
                            substr($pdfData, $offset, $span)
                        );
                        $offset += $span + 1;
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
                        // not a valid hex string: skip everything up to the closing delimiter
                        $offset = $endpos + 1;
                    }
                }
                break;

            default:
                if ('endobj' === substr($pdfData, $offset, 6)) {
                    // indirect object
                    $objtype = 'endobj';
                    $offset += 6;
                } elseif ('null' === substr($pdfData, $offset, 4)) {
                    // null object
                    $objtype = 'null';
                    $offset += 4;
                    $objval = 'null';
                } elseif ('true' === substr($pdfData, $offset, 4)) {
                    // boolean true object
                    $objtype = 'boolean';
                    $offset += 4;
                    $objval = 'true';
                } elseif ('false' === substr($pdfData, $offset, 5)) {
                    // boolean false object
                    $objtype = 'boolean';
                    $offset += 5;
                    $objval = 'false';
                } elseif ('stream' === substr($pdfData, $offset, 6)) {
                    // start stream object
                    $objtype = 'stream';
                    $offset += 6;
                    if (1 === preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
                        $offset += \strlen($matches[0]);

                        // we get stream length here to later help preg_match test less data;
                        // a negative value would be read as "offset from the end", so clamp it
                        $streamLen = max(0, (int) $this->getHeaderValue($headerDic, 'Length', 'numeric', 0));
                        // image XObject streams are dropped unless the config asks to retain them
                        $skip = false === $this->config->getRetainImageContent()
                            && 'XObject' === $this->getHeaderValue($headerDic, 'Type', '/')
                            && 'Image' === $this->getHeaderValue($headerDic, 'Subtype', '/');

                        $endstreamPattern = '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU';
                        $pregResult = preg_match(
                            $endstreamPattern,
                            $pdfData,
                            $matches,
                            \PREG_OFFSET_CAPTURE,
                            $offset + $streamLen
                        );

                        if (1 !== $pregResult && $streamLen > 0) {
                            // the declared Length points past the "endstream" keyword (or past the data):
                            // search again from the beginning of the stream instead of losing its content
                            $pregResult = preg_match(
                                $endstreamPattern,
                                $pdfData,
                                $matches,
                                \PREG_OFFSET_CAPTURE,
                                $offset
                            );
                        }

                        if (1 === $pregResult) {
                            $objval = $skip ? '' : substr($pdfData, $offset, $matches[0][1] - $offset);
                            $offset = $matches[1][1];
                        }
                    }
                } elseif ('endstream' === substr($pdfData, $offset, 9)) {
                    // end stream object
                    $objtype = 'endstream';
                    $offset += 9;
                } elseif (1 === preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
                    // indirect object reference
                    $objtype = 'objref';
                    $offset += \strlen($matches[0]);
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
                } elseif (1 === preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
                    // object start
                    $objtype = 'obj';
                    $objval = (int) $matches[1].'_'.(int) $matches[2];
                    $offset += \strlen($matches[0]);
                } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
                    // numeric object
                    $objtype = 'numeric';
                    $objval = substr($pdfData, $offset, $numlen);
                    $offset += $numlen;
                }
                break;
        }

        return [$objtype, $objval, $offset];
    }

    /**
     * Get value of an object header's section (obj << YYY >> part).
     *
     * It is similar to Header::get('...')->getContent(), the only difference is it can be used during the parsing process,
     * when no Smalot\PdfParser\Header objects are created yet.
     *
     * @param array|null        $headerDic header dictionary, as returned by RawDataParser::getRawObject (null if not available)
     * @param string            $key       header's section name
     * @param string            $type      type of the section (i.e. 'numeric', '/', '<<', etc.)
     * @param string|array|null $default   default value for header's section
     *
     * @return string|array|null value of obj header's section, or default value if none found, or its type doesn't match $type param
     */
    private function getHeaderValue(?array $headerDic, string $key, string $type, $default = '')
    {
        // no header dictionary available (e.g. object without a << ... >> part)
        if (null === $headerDic) {
            return $default;
        }

        /*
         * It receives the dictionary of header fields, as it is returned by RawDataParser::getRawObject,
         * and iterates over it, searching for a section of type '/' with the requested key.
         * If such a section is found, it tries to retrieve its value (the next object in the dictionary),
         * returning it if it matches the requested type, or the default value otherwise.
         *
         * All comparisons are strict: a loose comparison would treat numeric-looking names
         * such as '1' and '1.0' as equal.
         */
        foreach ($headerDic as $i => $val) {
            $isSectionName = \is_array($val) && 3 === \count($val) && '/' === $val[0];
            if (!$isSectionName || $val[1] !== $key || !isset($headerDic[$i + 1])) {
                continue;
            }

            // the entry following the section name is expected to be its value: [type, value, ...]
            $candidate = $headerDic[$i + 1];
            $isSectionValue = \is_array($candidate) && 1 < \count($candidate);

            // only the first occurrence of the key is considered
            return $isSectionValue && $type === $candidate[0]
                ? $candidate[1]
                : $default;
        }

        return $default;
    }

    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * @param string $pdfData PDF document data
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws \Exception if it was unable to find startxref
     * @throws \Exception if it was unable to find xref
     */
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';
        $dataLength = \strlen($pdfData);

        // an offset behind the data can not point to anything; fail with a regular
        // exception instead of letting the string functions below choke on it
        if ($offset > $dataLength) {
            throw new \Exception('Unable to find xref (PDF corrupted?)');
        }

        // Every regex below gets its own result variable: preg_match() resets the given
        // array even when it does not match, so sharing one would wipe earlier results.
        if (0 === $offset) {
            // find last startxref
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $allMatches, \PREG_SET_ORDER);
            if (false === $pregResult || 0 === $pregResult) {
                throw new \Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($allMatches);
            $startxref = (int) $lastMatch[1];
        } elseif ('xref' === substr($pdfData, $offset, 4)) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new \Exception('Unable to find startxref');
        }

        if ($startxref > $dataLength) {
            throw new \Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position: only the four bytes at $startxref matter, so compare them
        // directly instead of searching the rest of the document for the keyword
        if ('xref' === substr($pdfData, $startxref, 4)) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }

        if (empty($xref)) {
            throw new \Exception('Unable to find xref');
        }

        return $xref;
    }

    /**
     * Parses PDF data and returns extracted data as array.
     *
     * @param string $data PDF data to parse
     *
     * @return array array of parsed PDF document objects
     *
     * @throws \Exception if empty PDF data given
     * @throws \Exception if PDF data missing %PDF header
     */
    public function parseData(string $data): array
    {
        // reject input without any usable content
        if (empty($data)) {
            throw new \Exception('Empty PDF data given.');
        }

        // locate the PDF header; anything in front of it is ignored
        $headerPosition = strpos($data, '%PDF-');
        if (false === $headerPosition) {
            throw new \Exception('Invalid PDF data: missing %PDF header.');
        }

        $pdfData = $data;
        if ($headerPosition > 0) {
            $pdfData = substr($data, $headerPosition);
        }

        // cross-reference table and trailer data drive the object lookup below
        $xref = $this->getXrefData($pdfData);

        // decode every referenced object that has a positive offset
        $objects = [];
        foreach ($xref['xref'] as $objectId => $objectOffset) {
            if (!($objectOffset > 0) || isset($objects[$objectId])) {
                continue;
            }

            $objects[$objectId] = $this->getIndirectObject($pdfData, $xref, $objectId, $objectOffset, true);
        }

        return [$xref, $objects];
    }
951
}
952