Passed
Pull Request — master (#440)
by
unknown
02:14
created

RawDataParser::decodeXref()   C

Complexity

Conditions 15
Paths 134

Size

Total Lines 64
Code Lines 37

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 34
CRAP Score 15.0386

Importance

Changes 3
Bugs 1 Features 1
Metric Value
cc 15
eloc 37
c 3
b 1
f 1
nc 134
nop 3
dl 0
loc 64
ccs 34
cts 36
cp 0.9444
crap 15.0386
rs 5.6333

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
use Smalot\PdfParser\Config;
45
46
class RawDataParser
47
{
48
    /**
49
     * @var \Smalot\PdfParser\Config
50
     */
51
    private $config;
52
53
    /**
54
     * Configuration array.
55
     */
56
    protected $cfg = [
57
        // if `true` ignore filter decoding errors
58
        'ignore_filter_decoding_errors' => true,
59
        // if `true` ignore missing filter decoding errors
60
        'ignore_missing_filter_decoders' => true,
61
    ];
62
63
    protected $filterHelper;
64
    protected $objects;
65
66
    /**
     * Create a parser, layering the caller's options over the defaults.
     *
     * @param array       $cfg    Configuration array, default is []; its entries override the defaults in $this->cfg
     * @param Config|null $config Library configuration; a new Config instance is created when none is given
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        // merge given array with default values
        $this->cfg = array_merge($this->cfg, $cfg);

        $this->filterHelper = new FilterHelper();
        // explicit nullable type + null coalescing: a missing config falls back to a fresh one
        $this->config = $config ?? new Config();
    }
77
78
    /**
     * Decode the specified stream.
     *
     * The stream is first cut down to the /Length declared in its dictionary (when
     * that is shorter than the raw data). The /Filter entry is then resolved and
     * every filter known to the filter helper is applied in declaration order.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data, passed on when resolving the /Filter value
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the corresponding "ignore" option is disabled
     */
    protected function decodeStream(string $pdfData, array $xref, array $sdic, string $stream): array
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name entries followed by a value are of interest
            if ('/' != $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $next = $sdic[$k + 1];

            if ('Length' == $v[1] && 'numeric' == $next[0]) {
                // get declared stream length
                $declength = (int) $next[1];
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $availableFilters = $this->filterHelper->getAvailableFilters();
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters, true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with "~" is handled as a missing-decoder error,
                // anything else (including an empty message) as a decoding error
                $missingDecoder = ('' !== $emsg) && ('~' == $emsg[0]);
                $ignoreOption = $missingDecoder ? 'ignore_missing_filter_decoders' : 'ignore_filter_decoding_errors';
                if (!$this->cfg[$ignoreOption]) {
                    // keep the original exception reachable through getPrevious()
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
146
147
    /**
     * Decode the Cross-Reference section
     *
     * Reads the subsection headers and entries that directly follow the 'xref'
     * keyword, then parses the trailer dictionary. Offsets already present in
     * $xref are never overwritten, and the trailer is only filled while it is
     * still empty. When the trailer has a /Prev entry, that offset is handed to
     * getXrefData() together with the data collected so far.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found after the entries
     */
    protected function decodeXref(string $pdfData, int $startxref, array $xref = []): array
    {
        $startxref += 4; // 4 is the length of the word 'xref'
        // skip initial white space chars
        $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref);
        // object number of the next entry in the current subsection
        $obj_num = 0;

        // search for cross-reference entries or subsection
        while (preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($matches[0][1] != $offset) {
                // we are on another section
                break;
            }
            $offset += \strlen($matches[0][0]);

            $entryType = $matches[3][0];
            if ('n' == $entryType) {
                // create unique object index: [object number]_[generation number]
                $index = $obj_num.'_'.(int) ($matches[2][0]);
                // keep the first offset seen for an object
                if (!isset($xref['xref'][$index])) {
                    $xref['xref'][$index] = (int) ($matches[1][0]);
                }
                ++$obj_num;
            } elseif ('f' == $entryType) {
                // free entry: nothing stored, but it still occupies an object number
                ++$obj_num;
            } else {
                // subsection header: first object number of the subsection
                $obj_num = (int) ($matches[1][0]);
            }
        }

        // get trailer data
        if (1 !== preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) {
            throw new Exception('Unable to find trailer');
        }
        $trailer_data = $matches[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = $this->extractTrailerEntries($trailer_data);
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) ($matches[1]), $xref);
        }

        return $xref;
    }

    /**
     * Extract the size, root, encrypt, info and id entries from raw trailer dictionary text.
     *
     * @param string $trailerData Text found between the trailer's "<<" and ">>"
     *
     * @return array trailer entries; a key is only present when its entry was found
     */
    private function extractTrailerEntries(string $trailerData): array
    {
        $trailer = [];

        if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $matches) > 0) {
            $trailer['size'] = (int) ($matches[1]);
        }

        // indirect references are stored as "[object number]_[generation number]"
        $references = ['root' => 'Root', 'encrypt' => 'Encrypt', 'info' => 'Info'];
        foreach ($references as $key => $name) {
            if (preg_match('/'.$name.'[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $matches) > 0) {
                $trailer[$key] = ((int) $matches[1]).'_'.((int) $matches[2]);
            }
        }

        if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $matches) > 0) {
            $trailer['id'] = [$matches[1], $matches[2]];
        }

        return $trailer;
    }
223
224
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the object at $startxref, collects the relevant dictionary entries
     * (Type, Index, Prev, W, DecodeParms and, while the trailer is still empty,
     * Size/Root/Info/Encrypt/ID), then splits the decoded stream into rows of
     * three fields and registers them in $xref['xref']. When a /Prev entry is
     * present, that offset is handed to getXrefData() afterwards.
     *
     * NOTE(review): only the first object number of /Index is used; additional
     * subsections listed in /Index are not handled separately.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream(string $pdfData, int $startxref, array $xref = []): array
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $index_first = 0;
        $prevxref = null;
        // field widths default to zero so that a missing /W never leaves them undefined
        $wb = [0, 0, 0];

        $sarr = $xrefcrs[0][1] ?? null;
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            // every entry of interest is a name followed by its value
            if ('/' != $v[0] || !isset($sarr[$k + 1])) {
                continue;
            }
            $next = $sarr[$k + 1];

            switch ($v[1]) {
                case 'Type':
                    if ('/' == $next[0] && 'XRef' == $next[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // first object number in the subsection
                    $index_first = (int) ($next[1][0][1] ?? 0);
                    break;

                case 'Prev':
                    if ('numeric' == $next[0]) {
                        // get previous xref offset
                        $prevxref = (int) ($next[1]);
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of the corresponding field
                    for ($c = 0; $c < 3; ++$c) {
                        $wb[$c] = (int) ($next[1][$c][1] ?? 0);
                    }
                    break;

                case 'DecodeParms':
                    if (isset($next[1]) && \is_array($next[1])) {
                        $decpar = $next[1];
                        foreach ($decpar as $kdc => $vdc) {
                            if ('/' != $vdc[0] || !isset($decpar[$kdc + 1]) || 'numeric' != $decpar[$kdc + 1][0]) {
                                continue;
                            }
                            if ('Columns' == $vdc[1]) {
                                $columns = (int) ($decpar[$kdc + 1][1]);
                            } elseif ('Predictor' == $vdc[1]) {
                                $predictor = (int) ($decpar[$kdc + 1][1]);
                            }
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && 'numeric' == $next[0]) {
                        $xref['trailer']['size'] = $next[1];
                    }
                    break;

                case 'Root':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['root'] = $next[1];
                    }
                    break;

                case 'Info':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['info'] = $next[1];
                    }
                    break;

                case 'Encrypt':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['encrypt'] = $next[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [];
                        $xref['trailer']['id'][0] = $next[1][0][1];
                        $xref['trailer']['id'][1] = $next[1][1][1];
                    }
                    break;
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            $stream = $xrefcrs[1][3][0];
            if (null !== $predictor) {
                $ddata = $this->undoPngPrediction($stream, $columns);
            } else {
                // number of bytes in a row
                $rowlen = array_sum($wb);
                // array_chunk() rejects a chunk size below 1: without usable widths there are no rows
                $ddata = $rowlen > 0 ? array_chunk(unpack('C*', $stream) ?: [], $rowlen) : [];
            }

            // fill xref, starting at the first object number announced by /Index (0 when absent)
            $obj_num = $index_first;
            foreach ($ddata as $row) {
                // combine the bytes of every field into one integer (big-endian)
                $fields = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $fields[0] = 1;
                }
                $i = 0; // count bytes in the row
                for ($c = 0; $c < 3; ++$c) {
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $fields[$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }

                switch ($fields[0]) {
                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$fields[2];
                        // keep the first offset seen for an object
                        if (!isset($xref['xref'][$index])) {
                            $xref['xref'][$index] = $fields[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $fields[1] = object number of the object stream in which this object is stored
                        // $fields[2] = index of this object within the object stream
                        $xref['xref'][$fields[1].'_0_'.$fields[2]] = -1;
                        break;

                    default:  // 0 = (f) free objects, anything else = null objects
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Reverse PNG row prediction on a decoded stream.
     *
     * Every row consists of one filter-type byte followed by $columns data bytes.
     * Reconstruction uses already reconstructed neighbours (left, up, upper-left).
     *
     * @param string $data    Stream bytes
     * @param int    $columns Number of data bytes per row
     *
     * @return array list of rows, each a list of reconstructed byte values
     *
     * @throws Exception if unknown PNG predictor detected
     */
    private function undoPngPrediction(string $data, int $columns): array
    {
        // number of bytes in a row (filter-type byte + data bytes)
        $rowlen = $columns + 1;
        if ($rowlen < 1) {
            return [];
        }

        // convert the stream into an array of integers and split the rows
        $rows = array_chunk(unpack('C*', $data) ?: [], $rowlen);

        $decoded = [];
        // the row above the first row is all zeros
        $prev_row = array_fill(0, $rowlen, 0);

        foreach ($rows as $k => $row) {
            $decoded[$k] = [];
            // get PNG predictor value
            $rowPredictor = 10 + $row[0];

            for ($i = 1; $i <= $columns; ++$i) {
                $j = $i - 1; // index in the reconstructed row
                $current = $row[$i] ?? 0; // a short last row is padded with zeros
                $up = $prev_row[$j];
                if (0 === $j) {
                    $left = 0;
                    $upleft = 0;
                } else {
                    // neighbours must be reconstructed bytes, not the still-filtered input
                    $left = $decoded[$k][$j - 1];
                    $upleft = $prev_row[$j - 1];
                }

                switch ($rowPredictor) {
                    case 10:  // PNG prediction (on encoding, PNG None on all rows)
                        $value = $current;
                        break;

                    case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                        $value = ($current + $left) & 0xff;
                        break;

                    case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                        $value = ($current + $up) & 0xff;
                        break;

                    case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                        // integer division keeps the operand of "&" an int
                        $value = ($current + intdiv($left + $up, 2)) & 0xff;
                        break;

                    case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                        // initial estimate
                        $p = $left + $up - $upleft;
                        // distances
                        $pa = abs($p - $left);
                        $pb = abs($p - $up);
                        $pc = abs($p - $upleft);
                        // pick the neighbour with the minimum distance; ties prefer left, then up
                        if ($pa <= $pb && $pa <= $pc) {
                            $nearest = $left;
                        } elseif ($pb <= $pc) {
                            $nearest = $up;
                        } else {
                            $nearest = $upleft;
                        }
                        $value = ($current + $nearest) & 0xff;
                        break;

                    default:  // PNG prediction (on encoding, PNG optimum)
                        throw new Exception('Unknown PNG predictor: '.$rowPredictor);
                }

                $decoded[$k][$j] = $value;
            }
            $prev_row = $decoded[$k];
        }

        return $decoded;
    }
475
476 28
    /**
     * Build the regular expression that matches an indirect object header
     * of the form "[object number] [generation number] obj".
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return string delimited regular expression
     */
    protected function getObjectHeaderPattern(array $objRefs): string
    {
        // consider all whitespace character (PDF specifications)
        $whitespace = $this->config->getPdfWhitespacesRegex();

        // The reference parts originate from the parsed document: quote them so that
        // regex metacharacters or the "/" delimiter cannot change or break the pattern.
        $objectNumber = preg_quote((string) $objRefs[0], '/');
        $generation = preg_quote((string) $objRefs[1], '/');

        return '/'.$objectNumber.$whitespace.$generation.$whitespace.'obj'.'/';
    }
481
482 28
    /**
     * Compute the length in bytes of an object header such as "4 0 obj".
     *
     * @param array $objRefs Object number at index 0, generation number at index 1
     *
     * @return int length of both numbers plus 5 (two separators and the word "obj")
     */
    protected function getObjectHeaderLen(array $objRefs): int
    {
        $numbersLen = \strlen($objRefs[0]) + \strlen($objRefs[1]);

        // 2 whitespaces + strlen("obj") = 5
        return $numbersLen + 5;
    }
488
489
    /**
     * Get content of indirect object.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Cross-reference data, passed on to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data; ['null', 'null', offset] when no matching
     *               object header is found at the given offset
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject(string $pdfData, array $xref, string $objRef, int $offset = 0, bool $decoding = true): array
    {
        // split "[object number]_[generation number]" into its two parts
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $headerLen = $this->getObjectHeaderLen($refParts);

        // step over whitespace at the offset, then over leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        // make sure "[object number] [generation number] obj" really sits at this position
        $headerCandidate = substr($pdfData, $offset, $headerLen);
        $headerMatch = preg_match($this->getObjectHeaderPattern($refParts), $headerCandidate);
        if (0 == $headerMatch) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // the object content starts right after the header
        $offset += $headerLen;
        $content = [];
        $position = 0; // index of the element being read

        do {
            $previousOffset = $offset;

            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // a stream is decoded with the dictionary that immediately precedes it
            $afterDictionary = isset($content[$position - 1][0]) && ('<<' === $content[$position - 1][0]);
            if ($decoding && ('stream' === $element[0]) && $afterDictionary) {
                $element[3] = $this->decodeStream($pdfData, $xref, $content[$position - 1][1], $element[1]);
            }

            $content[$position] = $element;
            ++$position;
            // stop at "endobj", or when the reader no longer advances
        } while (('endobj' !== $element[0]) && ($offset !== $previousOffset));

        // the last element read is the closing delimiter: drop it
        array_pop($content);

        return $content;
    }
553
554
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Resolved objects are kept in $this->objects, so each reference is parsed once.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Map from object reference to object offset
     * @param array  $obj     Object value
     *
     * @return array containing object data; $obj itself when it is not a reference
     *               or when the reference is unknown
     *
     * @throws Exception
     */
    protected function getObjectVal(string $pdfData, $xref, array $obj): array
    {
        // anything that is not an indirect reference already is the value
        if ('objref' != $obj[0]) {
            return $obj;
        }

        $reference = $obj[1];
        if (!isset($this->objects[$reference])) {
            if (!isset($xref[$reference])) {
                // unknown reference: hand the reference object back unchanged
                return $obj;
            }
            // parse new object (streams are left undecoded) and remember it
            $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);
        }

        return $this->objects[$reference];
    }
581
582
    /**
583
     * Get object type, raw value and offset to next object
584
     *
585
     * @param int $offset Object offset
586
     *
587
     * @return array containing object type, raw value and offset to next object
588
     */
589 29
    protected function getRawObject(string $pdfData, int $offset = 0): array
590
    {
591 29
        $objtype = ''; // object type to be returned
592 29
        $objval = ''; // object value to be returned
593
594
        // skip initial white space chars
595 29
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
596
597
        // get first char
598 29
        $char = $pdfData[$offset];
599
        // get object type
600 29
        switch ($char) {
601 29
            case '%':  // \x25 PERCENT SIGN
602
                    // skip comment and search for next token
603
                    $next = strcspn($pdfData, "\r\n", $offset);
604
                    if ($next > 0) {
605
                        $offset += $next;
606
607
                        return $this->getRawObject($pdfData, $offset);
608
                    }
609
                    break;
610
611 29
            case '/':  // \x2F SOLIDUS
612
                    // name object
613 29
                    $objtype = $char;
614 29
                    ++$offset;
615 29
                    $pregResult = preg_match(
616 29
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
617 29
                        substr($pdfData, $offset, 256),
618
                        $matches
619
                    );
620 29
                    if (1 == $pregResult) {
621 29
                        $objval = $matches[1]; // unescaped value
622 29
                        $offset += \strlen($objval);
623
                    }
624 29
                    break;
625
626 29
            case '(':   // \x28 LEFT PARENTHESIS
627 29
            case ')':  // \x29 RIGHT PARENTHESIS
628
                    // literal string object
629 26
                    $objtype = $char;
630 26
                    ++$offset;
631 26
                    $strpos = $offset;
632 26
                    if ('(' == $char) {
633 26
                        $open_bracket = 1;
634 26
                        while ($open_bracket > 0) {
635 26
                            if (!isset($pdfData[$strpos])) {
636
                                break;
637
                            }
638 26
                            $ch = $pdfData[$strpos];
639 26
                            switch ($ch) {
640 26
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
641
                                        // skip next character
642 15
                                        ++$strpos;
643 15
                                        break;
644
645 26
                                case '(':  // LEFT PARENHESIS (28h)
646
                                        ++$open_bracket;
647
                                        break;
648
649 26
                                case ')':  // RIGHT PARENTHESIS (29h)
650 26
                                        --$open_bracket;
651 26
                                        break;
652
                            }
653 26
                            ++$strpos;
654
                        }
655 26
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
656 26
                        $offset = $strpos;
657
                    }
658 26
                    break;
659
660 29
            case '[':   // \x5B LEFT SQUARE BRACKET
661 29
            case ']':  // \x5D RIGHT SQUARE BRACKET
662
                // array object
663 28
                $objtype = $char;
664 28
                ++$offset;
665 28
                if ('[' == $char) {
666
                    // get array content
667 28
                    $objval = [];
668
                    do {
669 28
                        $oldOffset = $offset;
670
                        // get element
671 28
                        $element = $this->getRawObject($pdfData, $offset);
672 28
                        $offset = $element[2];
673 28
                        $objval[] = $element;
674 28
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
675
                    // remove closing delimiter
676 28
                    array_pop($objval);
677
                }
678 28
                break;
679
680 29
            case '<':  // \x3C LESS-THAN SIGN
681 29
            case '>':  // \x3E GREATER-THAN SIGN
682 29
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
683
                    // dictionary object
684 29
                    $objtype = $char.$char;
685 29
                    $offset += 2;
686 29
                    if ('<' == $char) {
687
                        // get array content
688 29
                        $objval = [];
689
                        do {
690 29
                            $oldOffset = $offset;
691
                            // get element
692 29
                            $element = $this->getRawObject($pdfData, $offset);
693 29
                            $offset = $element[2];
694 29
                            $objval[] = $element;
695 29
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
696
                        // remove closing delimiter
697 29
                        array_pop($objval);
698
                    }
699
                } else {
700
                    // hexadecimal string object
701 11
                    $objtype = $char;
702 11
                    ++$offset;
703 11
                    $pregResult = preg_match(
704 11
                        '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
705 11
                        substr($pdfData, $offset),
706
                        $matches
707
                    );
708 11
                    if (('<' == $char) && 1 == $pregResult) {
709
                        // remove white space characters
710 11
                        $objval = strtr($matches[1], $this->config->getPdfWhitespaces(), '');
711 11
                        $offset += \strlen($matches[0]);
712
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
713
                        $offset = $endpos + 1;
714
                    }
715
                }
716 29
                break;
717
718
            default:
719 29
                if ('endobj' == substr($pdfData, $offset, 6)) {
720
                    // indirect object
721 28
                    $objtype = 'endobj';
722 28
                    $offset += 6;
723 29
                } elseif ('null' == substr($pdfData, $offset, 4)) {
724
                    // null object
725 3
                    $objtype = 'null';
726 3
                    $offset += 4;
727 3
                    $objval = 'null';
728 29
                } elseif ('true' == substr($pdfData, $offset, 4)) {
729
                    // boolean true object
730 8
                    $objtype = 'boolean';
731 8
                    $offset += 4;
732 8
                    $objval = 'true';
733 29
                } elseif ('false' == substr($pdfData, $offset, 5)) {
734
                    // boolean false object
735 1
                    $objtype = 'boolean';
736 1
                    $offset += 5;
737 1
                    $objval = 'false';
738 29
                } elseif ('stream' == substr($pdfData, $offset, 6)) {
739
                    // start stream object
740 28
                    $objtype = 'stream';
741 28
                    $offset += 6;
742 28
                    if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
743 28
                        $offset += \strlen($matches[0]);
744 28
                        $pregResult = preg_match(
745 28
                            '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
746 28
                            substr($pdfData, $offset),
747
                            $matches,
748 28
                            \PREG_OFFSET_CAPTURE
749
                        );
750 28
                        if (1 == $pregResult) {
751 28
                            $objval = substr($pdfData, $offset, $matches[0][1]);
752 28
                            $offset += $matches[1][1];
753
                        }
754
                    }
755 29
                } elseif ('endstream' == substr($pdfData, $offset, 9)) {
756
                    // end stream object
757 28
                    $objtype = 'endstream';
758 28
                    $offset += 9;
759 29
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
760
                    // indirect object reference
761 28
                    $objtype = 'objref';
762 28
                    $offset += \strlen($matches[0]);
763 28
                    $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
764 29
                } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
765
                    // object start
766 5
                    $objtype = 'obj';
767 5
                    $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
768 5
                    $offset += \strlen($matches[0]);
769 29
                } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
770
                    // numeric object
771 28
                    $objtype = 'numeric';
772 28
                    $objval = substr($pdfData, $offset, $numlen);
773 28
                    $offset += $numlen;
774
                }
775 29
                break;
776
        }
777
778 29
        return [$objtype, $objval, $offset];
779
    }
780
781
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The start of the xref section is resolved in this order:
     *  - $offset is 0: the number following the last "startxref" keyword is used;
     *  - the "xref" keyword sits exactly at $offset: $offset is used as is;
     *  - an "N N obj" header is found anywhere at or after $offset: $offset is
     *    treated as the start of a Cross-Reference Stream object;
     *  - otherwise the first "startxref ... %%EOF" entry at or after $offset is used.
     *
     * @param string $pdfData PDF data to search
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 === $offset) {
            // find last startxref
            $found = preg_match_all($startxrefPattern, $pdfData, $allMatches, \PREG_SET_ORDER, $offset);
            if (!$found) {
                // covers both "no match" (0) and a PCRE failure (false)
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($allMatches);
            $startxref = (int) $lastMatch[1];
        } elseif (strpos($pdfData, 'xref', $offset) === $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatch, \PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatch, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found
            // The match is kept in its own variable: preg_match() resets the
            // array it is given even when nothing matches, so sharing one
            // variable with the "obj" lookup above would wipe this result.
            $startxref = (int) $startxrefMatch[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position
        // NOTE(review): the loose comparison is kept on purpose. With a
        // startxref of 0 and no "xref" keyword, strpos() returns false and
        // false == 0 still routes the data to decodeXref(), as it always has.
        if (strpos($pdfData, 'xref', $startxref) == $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }

        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
846
847
    /**
     * Parses PDF data and returns the xref data together with the extracted objects.
     *
     * Anything in front of the first "%PDF-" marker is dropped before parsing.
     * Only xref entries with a positive offset are decoded.
     *
     * @param string $data PDF data to parse
     *
     * @return array two elements: the xref and trailer data, followed by the
     *               parsed PDF document objects indexed by their xref key
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     * @throws Exception if the xref data cannot be located
     */
    public function parseData(string $data): array
    {
        // Compare with '' explicitly: empty() would also reject the string "0",
        // which is not empty and belongs to the "missing header" case below.
        if ('' === $data) {
            throw new Exception('Empty PDF data given.');
        }

        // find the pdf header starting position
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }

        // get PDF content string
        $pdfData = substr($data, $headerPos);

        // get xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // parse all document objects
        $objects = [];
        foreach ($xref['xref'] as $obj => $offset) {
            if (isset($objects[$obj]) || $offset <= 0) {
                // already decoded, or no positive offset to decode from
                continue;
            }

            $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true);
        }

        return [$xref, $objects];
    }
884
}
885