Passed
Pull Request — master (#440)
by
unknown
02:15
created

RawDataParser::getXrefData()   B

Complexity

Conditions 9
Paths 22

Size

Total Lines 53
Code Lines 33

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 22
CRAP Score 9.5145

Importance

Changes 3
Bugs 1 Features 1
Metric Value
cc 9
eloc 33
c 3
b 1
f 1
nc 22
nop 3
dl 0
loc 53
ccs 22
cts 27
cp 0.8148
crap 9.5145
rs 8.0555

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
use Smalot\PdfParser\Config;
45
46
class RawDataParser
47
{
48
    /**
49
     * @var \Smalot\PdfParser\Config
50
     */
51
    private $config;
52
53
    /**
54
     * Configuration array.
55
     */
56
    protected $cfg = [
57
        // if `true` ignore filter decoding errors
58
        'ignore_filter_decoding_errors' => true,
59
        // if `true` ignore missing filter decoding errors
60
        'ignore_missing_filter_decoders' => true,
61
    ];
62
63
    protected $filterHelper;
64
    protected $objects;
65
66
    /**
     * Set up the parser with its settings and helper objects.
     *
     * @param array       $cfg    Configuration array, default is []; its entries override
     *                            the defaults declared on the $cfg property
     * @param Config|null $config Parser configuration; a new Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        // merge given array with default values (given values win)
        $this->cfg = array_merge($this->cfg, $cfg);

        $this->filterHelper = new FilterHelper();
        // explicit null check: only a missing configuration is replaced by the default one
        $this->config = $config ?? new Config();
    }
77
78
    /**
     * Decode the specified stream.
     *
     * Reads /Length and /Filter from the stream's dictionary, cuts the stream down to the
     * declared length when that is shorter than the data given, and then applies, in
     * declaration order, every filter the filter helper reports as available.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data, used to resolve the value of /Filter
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching `ignore_*` setting is disabled;
     *                   the original exception is attached as the previous one
     */
    protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream): array
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name objects that are followed by a value are relevant here
            if ('/' != $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $next = $sdic[$k + 1];

            if ('Length' == $v[1]) {
                if ('numeric' == $next[0]) {
                    // get declared stream length and drop any bytes beyond it
                    $declength = (int) $next[1];
                    if ($declength < $slength) {
                        $stream = substr($stream, 0, $declength);
                        $slength = $declength;
                    }
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        // fetched once instead of once per filter
        $availableFilters = $this->filterHelper->getAvailableFilters();
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters)) {
                // add missing filter to array
                $remaining_filters[] = $filter;

                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // messages starting with '~' are governed by `ignore_missing_filter_decoders`,
                // all others (including an empty message) by `ignore_filter_decoding_errors`
                $startsWithTilde = '' !== $emsg && '~' == $emsg[0];
                $ignore = $startsWithTilde
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];

                if (!$ignore) {
                    // keep the original exception attached so its type and trace are not lost
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
146
147
    /**
     * Decode the Cross-Reference section
     *
     * Walks the entries that follow the 'xref' keyword, stores the offset of every in-use
     * object that is not already present in $xref, then reads the trailer dictionary found
     * after the entries. A /Prev entry in the trailer is followed through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer is found after the entries
     */
    protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
    {
        // step over the word 'xref' (4 characters) and the white space that follows it
        $cursor = $startxref + 4;
        $cursor += strspn($pdfData, $this->config->getPdfWhitespaces(), $cursor);

        // number of the object the next entry line describes
        $objectNumber = 0;

        // consume cross-reference entries and subsection headers one line at a time
        while (true) {
            $found = preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $entry, \PREG_OFFSET_CAPTURE, $cursor);
            if (!($found > 0)) {
                break;
            }
            if ($entry[0][1] != $cursor) {
                // the next match is not directly at the cursor: we are on another section
                break;
            }
            $cursor += \strlen($entry[0][0]);

            switch ($entry[3][0]) {
                case 'n':
                    // in-use entry, indexed as [object number]_[generation number]
                    $key = $objectNumber.'_'.(int) $entry[2][0];
                    // an offset that is already known is never overwritten
                    if (!isset($xref['xref'][$key])) {
                        $xref['xref'][$key] = (int) $entry[1][0];
                    }
                    ++$objectNumber;
                    break;

                case 'f':
                    // free entry: nothing to store, only the numbering moves on
                    ++$objectNumber;
                    break;

                default:
                    // subsection header: its first number is the next object number
                    $objectNumber = (int) $entry[1][0];
            }
        }

        // get trailer data
        if (!(preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailerMatch, \PREG_OFFSET_CAPTURE, $cursor) > 0)) {
            throw new Exception('Unable to find trailer');
        }
        $trailerData = $trailerMatch[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
                $xref['trailer']['size'] = (int) $found[1];
            }

            // entries holding an indirect reference, stored as [object number]_[generation number]
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $trailerKey => $pattern) {
                if (preg_match($pattern, $trailerData, $found) > 0) {
                    $xref['trailer'][$trailerKey] = (int) $found[1].'_'.(int) $found[2];
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $found) > 0) {
                $xref['trailer']['id'] = [$found[1], $found[2]];
            }
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) $found[1], $xref);
        }

        return $xref;
    }
223
224
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the object at $startxref, collects the entries of its dictionary (/Type, /Index,
     * /Prev, /W, /DecodeParms and, when no trailer was collected yet, the trailer entries),
     * undoes the PNG predictor if one is declared, splits the decoded bytes into the three
     * fields of each entry and stores the results in $xref. A /Prev entry is followed through
     * getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version: the trailer is filled in only when none exists yet
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $index_first = null;
        $prevxref = null;
        // number of bytes of each of the three fields; stays at zero when /W is absent
        $wb = [0, 0, 0];

        $sarr = $xrefcrs[0][1] ?? null;
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            // every entry of interest is a name object followed by its value
            if ('/' != $v[0] || !isset($sarr[$k + 1])) {
                continue;
            }
            $name = $v[1];
            $next = $sarr[$k + 1];

            if ('Type' == $name) {
                if ('/' == $next[0] && 'XRef' == $next[1]) {
                    $valid_crs = true;
                }
            } elseif ('Index' == $name) {
                // first object number in the subsection
                $index_first = (int) ($next[1][0][1]);
            } elseif ('Prev' == $name) {
                if ('numeric' == $next[0]) {
                    // get previous xref offset
                    $prevxref = (int) $next[1];
                }
            } elseif ('W' == $name) {
                // number of bytes (in the decoded stream) of the corresponding field;
                // a missing width counts as zero
                $wb[0] = (int) ($next[1][0][1] ?? 0);
                $wb[1] = (int) ($next[1][1][1] ?? 0);
                $wb[2] = (int) ($next[1][2][1] ?? 0);
            } elseif ('DecodeParms' == $name) {
                if (isset($next[1]) && \is_array($next[1])) {
                    $decpar = $next[1];
                    foreach ($decpar as $kdc => $vdc) {
                        if ('/' != $vdc[0] || !isset($decpar[$kdc + 1]) || 'numeric' != $decpar[$kdc + 1][0]) {
                            continue;
                        }
                        if ('Columns' == $vdc[1]) {
                            $columns = (int) $decpar[$kdc + 1][1];
                        } elseif ('Predictor' == $vdc[1]) {
                            $predictor = (int) $decpar[$kdc + 1][1];
                        }
                    }
                }
            } elseif ($filltrailer) {
                if ('Size' == $name) {
                    if ('numeric' == $next[0]) {
                        $xref['trailer']['size'] = $next[1];
                    }
                } elseif ('Root' == $name) {
                    if ('objref' == $next[0]) {
                        $xref['trailer']['root'] = $next[1];
                    }
                } elseif ('Info' == $name) {
                    if ('objref' == $next[0]) {
                        $xref['trailer']['info'] = $next[1];
                    }
                } elseif ('Encrypt' == $name) {
                    if ('objref' == $next[0]) {
                        $xref['trailer']['encrypt'] = $next[1];
                    }
                } elseif ('ID' == $name) {
                    $xref['trailer']['id'] = [];
                    $xref['trailer']['id'][0] = $next[1][0][1];
                    $xref['trailer']['id'][1] = $next[1][1][1];
                }
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]) ?: [];

            // rows of decoded bytes, one row per cross-reference entry
            $ddata = [];

            if (null !== $predictor) {
                // number of bytes in a row: one leading filter byte plus the data columns
                $rowlen = $columns + 1;
                // split the rows; a non-positive row length yields no rows at all
                $encoded_rows = $rowlen > 0 ? array_chunk($bytes, $rowlen) : [];

                // decoded previous row; empty for the first row, where every lookup is zero
                $prev_row = [];

                // for each row apply PNG unpredictor
                foreach ($encoded_rows as $k => $row) {
                    // initialize new row
                    $ddata[$k] = [];
                    // get PNG predictor value from the first byte of the row
                    $predictor = 10 + $row[0];

                    // for each byte on the row
                    for ($i = 1; $i <= $columns; ++$i) {
                        // new index
                        $j = $i - 1;
                        // a truncated last row is treated as if padded with zeros
                        $cur = $row[$i] ?? 0;
                        $row_up = $prev_row[$j] ?? 0;
                        if (1 == $i) {
                            $row_left = 0;
                            $row_upleft = 0;
                        } else {
                            // the PNG filters are defined on the reconstructed bytes, so the
                            // left neighbour comes from the decoded row, not from the raw one
                            $row_left = $ddata[$k][$j - 1];
                            $row_upleft = $prev_row[$j - 1] ?? 0;
                        }

                        switch ($predictor) {
                            case 10:  // PNG prediction (on encoding, PNG None on all rows)
                                $ddata[$k][$j] = $cur;
                                break;

                            case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                                $ddata[$k][$j] = (($cur + $row_left) & 0xff);
                                break;

                            case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                                $ddata[$k][$j] = (($cur + $row_up) & 0xff);
                                break;

                            case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                                // integer halving: rounds down and never produces a float
                                $ddata[$k][$j] = (($cur + (($row_left + $row_up) >> 1)) & 0xff);
                                break;

                            case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                                // initial estimate
                                $p = ($row_left + $row_up - $row_upleft);
                                // distances
                                $pa = abs($p - $row_left);
                                $pb = abs($p - $row_up);
                                $pc = abs($p - $row_upleft);
                                // nearest neighbour wins; ties are resolved in the order left, up, upper left
                                if ($pa <= $pb && $pa <= $pc) {
                                    $nearest = $row_left;
                                } elseif ($pb <= $pc) {
                                    $nearest = $row_up;
                                } else {
                                    $nearest = $row_upleft;
                                }
                                $ddata[$k][$j] = (($cur + $nearest) & 0xff);
                                break;

                            default:  // PNG prediction (on encoding, PNG optimum)
                                throw new Exception('Unknown PNG predictor: '.$predictor);
                        }
                    }
                    $prev_row = $ddata[$k];
                } // end for each row
            } else {
                // number of bytes in a row; always an integer because the widths are cast to int
                $rowlen = array_sum($wb);
                // split the rows; without usable widths there is nothing to split
                $ddata = $rowlen > 0 ? array_chunk($bytes, $rowlen) : [];
            }

            // the three numeric fields of every entry
            $entries = [];

            // for every row
            foreach ($ddata as $k => $row) {
                // initialize new row
                $entries[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $entries[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column, most significant byte first
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $entries[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = $index_first ?? 0;
            foreach ($entries as $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
475
476 29
    /**
     * Build the regular expression matching an indirect object header such as "4 0 obj".
     *
     * @param array $objRefs Object number at index 0 and generation number at index 1
     *
     * @return string regular expression, delimiters included
     */
    protected function getObjectHeaderPattern(array $objRefs): string
    {
        // consider all whitespace character (PDF specifications)
        $whitespace = $this->config->getPdfWhitespacesRegex();

        // both numbers are meant literally: quoting them keeps regex metacharacters coming
        // from a malformed reference from altering or breaking the pattern
        $objectNumber = preg_quote((string) $objRefs[0], '/');
        $generationNumber = preg_quote((string) $objRefs[1], '/');

        return '/'.$objectNumber.$whitespace.$generationNumber.$whitespace.'obj/';
    }
481
482 29
    /**
     * Length of an indirect object header such as "4 0 obj".
     *
     * @param array $objRefs Object number at index 0 and generation number at index 1
     *
     * @return int length of both numbers plus 5 (two whitespaces and the word "obj")
     */
    protected function getObjectHeaderLen(array $objRefs): int
    {
        // characters taken by the object number and the generation number
        $numbersLength = \strlen($objRefs[0]) + \strlen($objRefs[1]);

        // one whitespace after each number, followed by "obj"
        $fixedLength = 2 + \strlen('obj');

        return $fixedLength + $numbersLength;
    }
488
489
    /**
     * Get content of indirect object.
     *
     * Checks that the header "[object number] [generation number] obj" is present at the
     * given offset, then reads raw elements until 'endobj' is reached or the read position
     * stops advancing.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Cross-reference data, handed over to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array
    {
        // split the reference into [object number, generation number]
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $headerLength = $this->getObjectHeaderLen($refParts);

        // move past whitespace characters, then past leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        // check if we are in position
        $header = substr($pdfData, $offset, $headerLength);
        if (!preg_match($this->getObjectHeaderPattern($refParts), $header)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // starting position of object content
        $offset += $headerLength;

        $elements = [];
        do {
            $startedAt = $offset;

            // get element; index 2 of the result is the offset to continue from
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // decode stream using stream's dictionary information, which is the element
            // read immediately before the stream
            if ($decoding && 'stream' === $element[0]) {
                $previous = end($elements);
                if (false !== $previous && isset($previous[0]) && '<<' === $previous[0]) {
                    $element[3] = $this->decodeStream($pdfData, $xref, $previous[1], $element[1]);
                }
            }

            $elements[] = $element;
        } while ('endobj' !== $element[0] && $offset !== $startedAt);

        // remove closing delimiter
        array_pop($elements);

        return $elements;
    }
553
554
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Anything that is not an 'objref', and any reference that can be found neither in the
     * cache of parsed objects nor in $xref, is returned unchanged.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Looked up by object reference to obtain the object's offset
     *                        NOTE(review): callers in this file pass the structure that keeps
     *                        offsets under its 'xref' key - confirm the lookup level is intended
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws Exception
     */
    protected function getObjectVal(string $pdfData, $xref, array $obj): array
    {
        if ('objref' != $obj[0]) {
            // direct value, nothing to resolve
            return $obj;
        }

        // reference to indirect object
        $reference = $obj[1];

        if (isset($this->objects[$reference])) {
            // this object has been already parsed
            return $this->objects[$reference];
        }

        if (!isset($xref[$reference])) {
            // no offset known for this reference: hand the reference back as it is
            return $obj;
        }

        // parse new object (streams are left undecoded) and remember it for later lookups
        $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);

        return $this->objects[$reference];
    }
581
582
    /**
583
     * Get object type, raw value and offset to next object
584
     *
585
     * @param int $offset Object offset
586
     *
587
     * @return array containing object type, raw value and offset to next object
588
     */
589 30
    protected function getRawObject(string $pdfData, int $offset = 0): array
590
    {
591 30
        $objtype = ''; // object type to be returned
592 30
        $objval = ''; // object value to be returned
593
594
        // skip initial white space chars
595 30
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
596
597
        // get first char
598 30
        $char = $pdfData[$offset];
599
        // get object type
600 30
        switch ($char) {
601 30
            case '%':  // \x25 PERCENT SIGN
602
                    // skip comment and search for next token
603
                    $next = strcspn($pdfData, "\r\n", $offset);
604
                    if ($next > 0) {
605
                        $offset += $next;
606
607
                        return $this->getRawObject($pdfData, $offset);
608
                    }
609
                    break;
610
611 30
            case '/':  // \x2F SOLIDUS
612
                    // name object
613 30
                    $objtype = $char;
614 30
                    ++$offset;
615 30
                    $pregResult = preg_match(
616 30
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
617 30
                        substr($pdfData, $offset, 256),
618
                        $matches
619
                    );
620 30
                    if (1 == $pregResult) {
621 30
                        $objval = $matches[1]; // unescaped value
622 30
                        $offset += \strlen($objval);
623
                    }
624 30
                    break;
625
626 30
            case '(':   // \x28 LEFT PARENTHESIS
627 30
            case ')':  // \x29 RIGHT PARENTHESIS
628
                    // literal string object
629 27
                    $objtype = $char;
630 27
                    ++$offset;
631 27
                    $strpos = $offset;
632 27
                    if ('(' == $char) {
633 27
                        $open_bracket = 1;
634 27
                        while ($open_bracket > 0) {
635 27
                            if (!isset($pdfData[$strpos])) {
636
                                break;
637
                            }
638 27
                            $ch = $pdfData[$strpos];
639 27
                            switch ($ch) {
640 27
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
641
                                        // skip next character
642 15
                                        ++$strpos;
643 15
                                        break;
644
645 27
                                case '(':  // LEFT PARENHESIS (28h)
646
                                        ++$open_bracket;
647
                                        break;
648
649 27
                                case ')':  // RIGHT PARENTHESIS (29h)
650 27
                                        --$open_bracket;
651 27
                                        break;
652
                            }
653 27
                            ++$strpos;
654
                        }
655 27
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
656 27
                        $offset = $strpos;
657
                    }
658 27
                    break;
659
660 30
            case '[':   // \x5B LEFT SQUARE BRACKET
661 30
            case ']':  // \x5D RIGHT SQUARE BRACKET
662
                // array object
663 29
                $objtype = $char;
664 29
                ++$offset;
665 29
                if ('[' == $char) {
666
                    // get array content
667 29
                    $objval = [];
668
                    do {
669 29
                        $oldOffset = $offset;
670
                        // get element
671 29
                        $element = $this->getRawObject($pdfData, $offset);
672 29
                        $offset = $element[2];
673 29
                        $objval[] = $element;
674 29
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
675
                    // remove closing delimiter
676 29
                    array_pop($objval);
677
                }
678 29
                break;
679
680 30
            case '<':  // \x3C LESS-THAN SIGN
681 30
            case '>':  // \x3E GREATER-THAN SIGN
682 30
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
683
                    // dictionary object
684 30
                    $objtype = $char.$char;
685 30
                    $offset += 2;
686 30
                    if ('<' == $char) {
687
                        // get array content
688 30
                        $objval = [];
689
                        do {
690 30
                            $oldOffset = $offset;
691
                            // get element
692 30
                            $element = $this->getRawObject($pdfData, $offset);
693 30
                            $offset = $element[2];
694 30
                            $objval[] = $element;
695 30
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
696
                        // remove closing delimiter
697 30
                        array_pop($objval);
698
                    }
699
                } else {
700
                    // hexadecimal string object
701 12
                    $objtype = $char;
702 12
                    ++$offset;
703 12
                    $pregResult = preg_match(
704 12
                        '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
705 12
                        substr($pdfData, $offset),
706
                        $matches
707
                    );
708 12
                    if (('<' == $char) && 1 == $pregResult) {
709
                        // remove white space characters
710 12
                        $objval = strtr($matches[1], $this->config->getPdfWhitespaces(), '');
711 12
                        $offset += \strlen($matches[0]);
712
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
713
                        $offset = $endpos + 1;
714
                    }
715
                }
716 30
                break;
717
718
            default:
719 30
                if ('endobj' == substr($pdfData, $offset, 6)) {
720
                    // indirect object
721 29
                    $objtype = 'endobj';
722 29
                    $offset += 6;
723 30
                } elseif ('null' == substr($pdfData, $offset, 4)) {
724
                    // null object
725 4
                    $objtype = 'null';
726 4
                    $offset += 4;
727 4
                    $objval = 'null';
728 30
                } elseif ('true' == substr($pdfData, $offset, 4)) {
729
                    // boolean true object
730 9
                    $objtype = 'boolean';
731 9
                    $offset += 4;
732 9
                    $objval = 'true';
733 30
                } elseif ('false' == substr($pdfData, $offset, 5)) {
734
                    // boolean false object
735 2
                    $objtype = 'boolean';
736 2
                    $offset += 5;
737 2
                    $objval = 'false';
738 30
                } elseif ('stream' == substr($pdfData, $offset, 6)) {
739
                    // start stream object
740 29
                    $objtype = 'stream';
741 29
                    $offset += 6;
742 29
                    if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
743 29
                        $offset += \strlen($matches[0]);
744 29
                        $pregResult = preg_match(
745 29
                            '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
746 29
                            substr($pdfData, $offset),
747
                            $matches,
748 29
                            \PREG_OFFSET_CAPTURE
749
                        );
750 29
                        if (1 == $pregResult) {
751 29
                            $objval = substr($pdfData, $offset, $matches[0][1]);
752 29
                            $offset += $matches[1][1];
753
                        }
754
                    }
755 30
                } elseif ('endstream' == substr($pdfData, $offset, 9)) {
756
                    // end stream object
757 29
                    $objtype = 'endstream';
758 29
                    $offset += 9;
759 30
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
760
                    // indirect object reference
761 29
                    $objtype = 'objref';
762 29
                    $offset += \strlen($matches[0]);
763 29
                    $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
764 30
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
765
                    // object start
766 6
                    $objtype = 'obj';
767 6
                    $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
768 6
                    $offset += \strlen($matches[0]);
769 30
                } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
770
                    // numeric object
771 29
                    $objtype = 'numeric';
772 29
                    $objval = substr($pdfData, $offset, $numlen);
773 29
                    $offset += $numlen;
774
                }
775 30
                break;
776
        }
777
778 30
        return [$objtype, $objval, $offset];
779
    }
780
781
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the xref section is resolved in this order:
     *  - $offset is 0: the last "startxref <n> %%EOF" marker of the document is used;
     *  - $offset points exactly at the "xref" keyword: $offset is used as-is;
     *  - an "<n> <g> obj" header is found at or after $offset: $offset is used as-is
     *    and the section is decoded as a cross-reference stream;
     *  - otherwise the first "startxref <n> %%EOF" marker at or after $offset is used.
     *
     * @param string $pdfData PDF data
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        // True when the 4 bytes at $position are the "xref" keyword. Only those
        // bytes are inspected (no scan of the remaining document) and the result
        // is a strict boolean, so a "not found" can never compare equal to 0.
        $pointsAtXrefKeyword = static function (int $position) use ($pdfData): bool {
            return $position >= 0 && 'xref' === substr($pdfData, $position, 4);
        };

        if (0 === $offset) {
            // find last startxref
            $found = preg_match_all($startxrefPattern, $pdfData, $allMatches, \PREG_SET_ORDER);
            if (!$found) {
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = end($allMatches);
            // the captured digits are a string; normalise to int for the checks below
            $startxref = (int) $lastMatch[1];
        } elseif ($pointsAtXrefKeyword($offset)) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, 0, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found; evaluated lazily and kept in its own match array so the
            // object-header search above cannot overwrite the captured offset
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position
        if ($pointsAtXrefKeyword($startxref)) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
846
847
    /**
     * Parses PDF data and returns the extracted xref data and document objects.
     *
     * Everything before the first "%PDF-" marker is discarded before parsing.
     *
     * @param string $data PDF data to parse
     *
     * @return array two-element list: the xref data returned by getXrefData(),
     *               followed by the decoded objects keyed like the xref entries
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     * @throws Exception if the xref data could not be located (see getXrefData())
     */
    public function parseData(string $data): array
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the PDF header; leading bytes in front of it are ignored
        $headerPosition = strpos($data, '%PDF-');
        if (false === $headerPosition) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }

        // work on the document starting at its header
        $pdfData = substr($data, $headerPosition);

        // cross-reference and trailer information
        $xref = $this->getXrefData($pdfData);

        // decode every object that the xref table lists with a positive offset
        $objects = [];
        foreach ($xref['xref'] as $obj => $offset) {
            if (isset($objects[$obj]) || !($offset > 0)) {
                continue;
            }

            $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true);
        }

        return [$xref, $objects];
    }
884
}
885