Passed
Pull Request — master (#400)
by Sebastien
05:01 queued 02:27
created

RawDataParser::getXrefData()   B

Complexity

Conditions 9
Paths 22

Size

Total Lines 53
Code Lines 33

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 22
CRAP Score 9.5145

Importance

Changes 3
Bugs 1 Features 1
Metric Value
cc 9
eloc 33
c 3
b 1
f 1
nc 22
nop 3
dl 0
loc 53
ccs 22
cts 27
cp 0.8148
crap 9.5145
rs 8.0555

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
48
     * Configuration array.
49
     */
50
    protected $cfg = [
51
        // if `true` ignore filter decoding errors
52
        'ignore_filter_decoding_errors' => true,
53
        // if `true` ignore missing filter decoding errors
54
        'ignore_missing_filter_decoders' => true,
55
    ];
56
57
    protected $filterHelper;
58
    protected $objects;
59
60
    /**
61
     * @param array $cfg Configuration array, default is []
62
     */
63 31
    public function __construct($cfg = [])
    {
        // helper that knows which stream filters can be decoded and how
        $this->filterHelper = new FilterHelper();

        // start from the class defaults and let the caller's options override them
        $defaults = $this->cfg;
        $this->cfg = array_merge($defaults, $cfg);
    }
70
71
    /**
72
     * Decode the specified stream.
73
     *
74
     * @param string $pdfData PDF data
75
     * @param array  $xref    Cross-reference data, forwarded to getObjectVal() to resolve the /Filter value
76
     * @param array  $sdic    Stream's dictionary array
77
     * @param string $stream  Stream to decode
78
     *
79
     * @return array containing decoded stream data and remaining filters
80
     */
81 27
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // Applies the filters named in the stream dictionary to the raw stream.
        //
        // Returns a two element array: the stream after every decodable filter
        // has been applied, and the list of filter names for which the filter
        // helper reports no decoder.
        //
        // Throws Exception when a filter fails and the matching "ignore_*"
        // configuration flag is switched off.

        // an empty stream has nothing to decode
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        // Walk the dictionary: a name element ('/') is a key and the element
        // that follows it is its value.
        $filters = [];
        foreach ($sdic as $k => $v) {
            if ('/' != $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $value = $sdic[$k + 1];

            if ('Length' == $v[1] && 'numeric' == $value[0]) {
                // trust the declared length only when it shortens the stream
                $declength = (int) $value[1];
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // the filter may be given through an indirect reference
                $objval = $this->getObjectVal($pdfData, $xref, $value);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters: keep the name elements only
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // Ask for the decodable filters once instead of once per filter
        // (assumes the helper's list does not change while decoding).
        $availableFilters = [] === $filters ? [] : $this->filterHelper->getAvailableFilters();

        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters, true)) {
                // no decoder for this filter: report it back to the caller
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // A message starting with '~' is treated as "missing decoder";
                // anything else (including an empty message) as a decoding error.
                $missingDecoder = ('' !== $emsg) && ('~' === $emsg[0]);
                $ignored = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignored) {
                    // same type and message as before, with the cause kept for debugging
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
138
139
    /**
140
     * Decode the Cross-Reference section
141
     *
142
     * @param string $pdfData   PDF data
143
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
144
     * @param array  $xref      Previous xref array (if any)
145
     *
146
     * @return array containing xref and trailer data
147
     */
148 22
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // Reads a classic cross-reference table located at $startxref, merges
        // its in-use entries into $xref['xref'], reads the trailer that follows
        // and then continues with the section referenced by /Prev, if any.

        // step over the keyword 'xref' (4 characters) ...
        $cursor = $startxref + 4;
        // ... and over the white space after it: NUL, HT, LF, FF, CR, SP
        $cursor += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $cursor);

        // number of the object described by the next entry
        $objectNumber = 0;

        // Each match is either an entry ("offset generation n|f") or a
        // subsection header ("first-object count").
        while (preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $entry, \PREG_OFFSET_CAPTURE, $cursor) > 0) {
            if ($entry[0][1] != $cursor) {
                // the next match is not contiguous: we are on another section
                break;
            }
            $cursor += \strlen($entry[0][0]);

            switch ($entry[3][0]) {
                case 'n':
                    // in-use entry, indexed as [object number]_[generation number]
                    $index = $objectNumber.'_'.((int) $entry[2][0]);
                    if (!isset($xref['xref'][$index])) {
                        // keep an offset that was already stored for this object
                        $xref['xref'][$index] = (int) $entry[1][0];
                    }
                    ++$objectNumber;
                    break;

                case 'f':
                    // free entry: nothing to store, but it still uses up an object number
                    ++$objectNumber;
                    break;

                default:
                    // subsection header: first field is the first object number
                    $objectNumber = (int) $entry[1][0];
                    break;
            }
        }

        // the trailer dictionary is mandatory
        if (1 !== preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailerMatch, \PREG_OFFSET_CAPTURE, $cursor)) {
            throw new Exception('Unable to find trailer');
        }
        $trailer_data = $trailerMatch[1][0];

        if (empty($xref['trailer'])) {
            // no trailer stored yet: this one is the last updated version
            $xref['trailer'] = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $found) > 0) {
                $xref['trailer']['size'] = (int) $found[1];
            }

            // entries that hold an indirect reference, stored as [object]_[generation]
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailer_data, $found) > 0) {
                    $xref['trailer'][$key] = ((int) $found[1]).'_'.((int) $found[2]);
                }
            }

            // the two hexadecimal strings of the file identifier
            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $found) > 0) {
                $xref['trailer']['id'] = [$found[1], $found[2]];
            }
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $found) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) $found[1], $xref);
        }

        return $xref;
    }
213
214
    /**
215
     * Decode the Cross-Reference Stream section
216
     *
217
     * @param string $pdfData   PDF data
218
     * @param int    $startxref Offset at which the xref section starts
219
     * @param array  $xref      Previous xref array (if any)
220
     *
221
     * @return array containing xref and trailer data
222
     *
223
     * @throws Exception if unknown PNG predictor detected
224
     */
225 6
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // Reads the cross-reference stream object located at $startxref:
        //  1. collects /Type, /Index, /Prev, /W, /DecodeParms and, when no
        //     trailer has been stored yet, the trailer entries from its dictionary;
        //  2. turns the decoded stream into rows of three integer fields and
        //     merges them into $xref['xref'];
        //  3. continues with the section referenced by /Prev, if any.
        //
        // Throws Exception if an unknown PNG predictor is found in the stream.

        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // only the first trailer met (the last updated version) is kept
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $index_first = 0;
        $prevxref = null;
        $wb = [];

        // the stream dictionary is expected as the value of the first element
        $sarr = (isset($xrefcrs[0][1]) && \is_array($xrefcrs[0][1])) ? $xrefcrs[0][1] : [];

        foreach ($sarr as $k => $v) {
            // a name element ('/') is a key, the element after it is its value
            if ('/' != $v[0] || !isset($sarr[$k + 1])) {
                continue;
            }
            $next = $sarr[$k + 1];

            switch ($v[1]) {
                case 'Type':
                    if ('/' == $next[0] && 'XRef' == $next[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // first object number in the (first) subsection
                    if (\is_array($next[1]) && isset($next[1][0][1])) {
                        $index_first = (int) $next[1][0][1];
                    }
                    break;

                case 'Prev':
                    // offset of the previous cross-reference section
                    if ('numeric' == $next[0]) {
                        $prevxref = (int) $next[1];
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of each of the three fields
                    $widths = $next[1];
                    if (\is_array($widths) && isset($widths[0][1], $widths[1][1], $widths[2][1])) {
                        $wb = [(int) $widths[0][1], (int) $widths[1][1], (int) $widths[2][1]];
                    }
                    break;

                case 'DecodeParms':
                    if (!isset($next[1]) || !\is_array($next[1])) {
                        break;
                    }
                    $decpar = $next[1];
                    foreach ($decpar as $kdc => $vdc) {
                        if ('/' != $vdc[0] || !isset($decpar[$kdc + 1]) || 'numeric' != $decpar[$kdc + 1][0]) {
                            continue;
                        }
                        if ('Columns' == $vdc[1]) {
                            $columns = (int) $decpar[$kdc + 1][1];
                        } elseif ('Predictor' == $vdc[1]) {
                            $predictor = (int) $decpar[$kdc + 1][1];
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && 'numeric' == $next[0]) {
                        $xref['trailer']['size'] = $next[1];
                    }
                    break;

                case 'Root':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['root'] = $next[1];
                    }
                    break;

                case 'Info':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['info'] = $next[1];
                    }
                    break;

                case 'Encrypt':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer']['encrypt'] = $next[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [
                            isset($next[1][0][1]) ? $next[1][0][1] : null,
                            isset($next[1][1][1]) ? $next[1][1][1] : null,
                        ];
                    }
                    break;
            }
        }

        // Rows can only be decoded when /W describes three non-negative field
        // widths with at least one byte per row; otherwise there is nothing
        // meaningful to extract (and array_chunk() would reject a zero length).
        $rowWidth = (int) array_sum($wb);
        $decodable = $valid_crs
            && isset($xrefcrs[1][3][0])
            && 3 === \count($wb)
            && $rowWidth > 0
            && min($wb) >= 0;

        if ($decodable) {
            $stream = $xrefcrs[1][3][0];
            if (null !== $predictor) {
                // rows were PNG-predicted before compression
                $ddata = $this->decodeXrefStreamPngRows($stream, $columns);
            } else {
                // plain rows of $rowWidth bytes each
                $ddata = array_chunk(unpack('C*', $stream), $rowWidth);
            }

            // combine the bytes of every row into its three big-endian fields
            $sdata = [];
            foreach ($ddata as $k => $row) {
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // type field absent: it defaults to 1
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                for ($c = 0; $c < 3; ++$c) {
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = $index_first;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
            }
        }

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Reverse the PNG row prediction of a cross-reference stream.
     *
     * Every encoded row is made of one filter-type byte followed by $columns
     * data bytes. Neighbours are taken one byte apart (one byte per pixel).
     *
     * @param string $stream  Stream bytes, after the stream filters have been applied
     * @param int    $columns Number of data bytes in a row
     *
     * @return array list of rows, each row being a list of byte values (0-255)
     *
     * @throws Exception if unknown PNG predictor detected
     */
    private function decodeXrefStreamPngRows($stream, $columns)
    {
        // number of bytes in an encoded row (filter-type byte included)
        $rowlen = $columns + 1;
        $rows = array_chunk(unpack('C*', $stream), $rowlen);

        $decoded = [];
        // the row above the first one counts as all zeros
        $prev_row = array_fill(0, $rowlen, 0);

        foreach ($rows as $k => $row) {
            $decoded[$k] = [];
            // filter type 0..4 maps to predictor 10..14
            $predictor = 10 + $row[0];

            for ($i = 1; $i <= $columns; ++$i) {
                $j = $i - 1;
                // a truncated last row is read as if padded with zeros
                $raw = isset($row[$i]) ? $row[$i] : 0;
                $row_up = $prev_row[$j];
                if (1 == $i) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    // neighbours are the already reconstructed bytes, not the encoded ones
                    $row_left = $decoded[$k][$j - 1];
                    $row_upleft = $prev_row[$j - 1];
                }

                switch ($predictor) {
                    case 10:  // PNG None
                        $value = $raw;
                        break;

                    case 11:  // PNG Sub
                        $value = $raw + $row_left;
                        break;

                    case 12:  // PNG Up
                        $value = $raw + $row_up;
                        break;

                    case 13:  // PNG Average: floor of the mean, kept as an integer
                        $value = $raw + (($row_left + $row_up) >> 1);
                        break;

                    case 14:  // PNG Paeth: neighbour closest to left + up - upleft
                        $p = $row_left + $row_up - $row_upleft;
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        if ($pa <= $pb && $pa <= $pc) {
                            $value = $raw + $row_left;
                        } elseif ($pb <= $pc) {
                            $value = $raw + $row_up;
                        } else {
                            $value = $raw + $row_upleft;
                        }
                        break;

                    default:  // PNG prediction (on encoding, PNG optimum)
                        throw new Exception('Unknown PNG predictor: '.$predictor);
                }

                $decoded[$k][$j] = $value & 0xff;
            }

            $prev_row = $decoded[$k];
        }

        return $decoded;
    }
465
466
    /**
467
     * Get content of indirect object.
468
     *
469
     * @param string $pdfData  PDF data
470
     * @param array  $xref     Cross-reference data, forwarded to decodeStream() when streams are decoded
471
     * @param string $objRef   Object number and generation number separated by underscore character
472
     * @param int    $offset   Object offset
473
     * @param bool   $decoding If true decode streams
474
     *
475
     * @return array containing object data
476
     *
477
     * @throws Exception if invalid object reference found
478
     */
479 27
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        // Reads the indirect object "$objRef" expected at $offset and returns
        // the list of raw elements found between its header and 'endobj'.

        // the header looks like "[object number] [generation number] obj"
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        list($objectNumber, $generationNumber) = $refParts;
        $header = $objectNumber.' '.$generationNumber.' obj';
        $headerLength = \strlen($header);

        // skip whitespace characters at offset (NUL, HT, LF, FF, CR, SP) ...
        $offset += strspn($pdfData, "\0\t\n\f\r ", $offset);
        // ... and leading zeros of the object number
        $offset += strspn($pdfData, '0', $offset);

        if (substr($pdfData, $offset, $headerLength) !== $header) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // the object content starts right after the header
        $offset += $headerLength;

        $elements = [];
        do {
            $previousOffset = $offset;

            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // a stream is decoded with the dictionary found just before it
            $lastIndex = \count($elements) - 1;
            $followsDictionary = isset($elements[$lastIndex][0]) && ('<<' === $elements[$lastIndex][0]);
            if ($decoding && ('stream' === $element[0]) && $followsDictionary) {
                $element[3] = $this->decodeStream($pdfData, $xref, $elements[$lastIndex][1], $element[1]);
            }

            $elements[] = $element;
            // stop on 'endobj' or as soon as the parser stops moving forward
        } while (('endobj' !== $element[0]) && ($offset !== $previousOffset));

        // remove closing delimiter
        array_pop($elements);

        return $elements;
    }
530
531
    /**
532
     * Get the content of object, resolving indirect object reference if necessary.
533
     *
534
     * @param string $pdfData PDF data
535
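     * @param array  $xref    Offsets indexed by object reference, used to locate an object that is not cached yet
     * @param array  $obj     Object value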
536
     *
537
     * @return array containing object data
538
     *
539
     * @throws Exception
540
     */
541 27
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        // anything but an indirect reference is already the value
        if ('objref' != $obj[0]) {
            return $obj;
        }

        $reference = $obj[1];

        // this object has been already parsed
        if (isset($this->objects[$reference])) {
            return $this->objects[$reference];
        }

        // no known offset for this reference: hand the reference back untouched
        if (!isset($xref[$reference])) {
            return $obj;
        }

        // parse the object (streams are left undecoded) and remember it
        $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);

        return $this->objects[$reference];
    }
558
559
    /**
560
     * Get object type, raw value and offset to next object
561
     *
562
     * @param string $pdfData PDF data
     * @param int    $offset  Object offset
563
     *
564
     * @return array containing object type, raw value and offset to next object
565
     */
566 28
    protected function getRawObject($pdfData, $offset = 0)
567
    {
568 28
        $objtype = ''; // object type to be returned
569 28
        $objval = ''; // object value to be returned
570
571
        /*
572
         * skip initial white space chars:
573
         *      \x00 null (NUL)
574
         *      \x09 horizontal tab (HT)
575
         *      \x0A line feed (LF)
576
         *      \x0C form feed (FF)
577
         *      \x0D carriage return (CR)
578
         *      \x20 space (SP)
579
         */
580 28
        $offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
581
582
        // get first char
583 28
        $char = $pdfData[$offset];
584
        // get object type
585 28
        switch ($char) {
586 28
            case '%':  // \x25 PERCENT SIGN
587
                    // skip comment and search for next token
588
                    $next = strcspn($pdfData, "\r\n", $offset);
589
                    if ($next > 0) {
590
                        $offset += $next;
591
592
                        return $this->getRawObject($pdfData, $offset);
593
                    }
594
                    break;
595
596 28
            case '/':  // \x2F SOLIDUS
597
                    // name object
598 28
                    $objtype = $char;
599 28
                    ++$offset;
600 28
                    $pregResult = preg_match(
601 28
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
602 28
                        substr($pdfData, $offset, 256),
603
                        $matches
604
                    );
605 28
                    if (1 == $pregResult) {
606 28
                        $objval = $matches[1]; // unescaped value
607 28
                        $offset += \strlen($objval);
608
                    }
609 28
                    break;
610
611 28
            case '(':   // \x28 LEFT PARENTHESIS
612 28
            case ')':  // \x29 RIGHT PARENTHESIS
613
                    // literal string object
614 25
                    $objtype = $char;
615 25
                    ++$offset;
616 25
                    $strpos = $offset;
617 25
                    if ('(' == $char) {
618 25
                        $open_bracket = 1;
619 25
                        while ($open_bracket > 0) {
620 25
                            if (!isset($pdfData[$strpos])) {
621
                                break;
622
                            }
623 25
                            $ch = $pdfData[$strpos];
624 25
                            switch ($ch) {
625 25
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
626
                                        // skip next character
627 15
                                        ++$strpos;
628 15
                                        break;
629
630 25
                                case '(':  // LEFT PARENHESIS (28h)
631
                                        ++$open_bracket;
632
                                        break;
633
634 25
                                case ')':  // RIGHT PARENTHESIS (29h)
635 25
                                        --$open_bracket;
636 25
                                        break;
637
                            }
638 25
                            ++$strpos;
639
                        }
640 25
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
641 25
                        $offset = $strpos;
642
                    }
643 25
                    break;
644
645 28
            case '[':   // \x5B LEFT SQUARE BRACKET
646 28
            case ']':  // \x5D RIGHT SQUARE BRACKET
647
                // array object
648 27
                $objtype = $char;
649 27
                ++$offset;
650 27
                if ('[' == $char) {
651
                    // get array content
652 27
                    $objval = [];
653
                    do {
654 27
                        $oldOffset = $offset;
655
                        // get element
656 27
                        $element = $this->getRawObject($pdfData, $offset);
657 27
                        $offset = $element[2];
658 27
                        $objval[] = $element;
659 27
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
660
                    // remove closing delimiter
661 27
                    array_pop($objval);
662
                }
663 27
                break;
664
665 28
            case '<':  // \x3C LESS-THAN SIGN
666 28
            case '>':  // \x3E GREATER-THAN SIGN
667 28
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
668
                    // dictionary object
669 28
                    $objtype = $char.$char;
670 28
                    $offset += 2;
671 28
                    if ('<' == $char) {
672
                        // get array content
673 28
                        $objval = [];
674
                        do {
675 28
                            $oldOffset = $offset;
676
                            // get element
677 28
                            $element = $this->getRawObject($pdfData, $offset);
678 28
                            $offset = $element[2];
679 28
                            $objval[] = $element;
680 28
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
681
                        // remove closing delimiter
682 28
                        array_pop($objval);
683
                    }
684
                } else {
685
                    // hexadecimal string object
686 10
                    $objtype = $char;
687 10
                    ++$offset;
688 10
                    $pregResult = preg_match(
689 10
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
690 10
                            substr($pdfData, $offset),
691
                            $matches
692
                        );
693 10
                    if (('<' == $char) && 1 == $pregResult) {
694
                        // remove white space characters
695 10
                        $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
696 10
                        $offset += \strlen($matches[0]);
697
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
698
                        $offset = $endpos + 1;
699
                    }
700
                }
701 28
                    break;
702
703
            default:
704 28
                    if ('endobj' == substr($pdfData, $offset, 6)) {
705
                        // indirect object
706 27
                        $objtype = 'endobj';
707 27
                        $offset += 6;
708 28
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
709
                        // null object
710 3
                        $objtype = 'null';
711 3
                        $offset += 4;
712 3
                        $objval = 'null';
713 28
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
714
                        // boolean true object
715 8
                        $objtype = 'boolean';
716 8
                        $offset += 4;
717 8
                        $objval = 'true';
718 28
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
719
                        // boolean false object
720 2
                        $objtype = 'boolean';
721 2
                        $offset += 5;
722 2
                        $objval = 'false';
723 28
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
724
                        // start stream object
725 27
                        $objtype = 'stream';
726 27
                        $offset += 6;
727 27
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
728 27
                            $offset += \strlen($matches[0]);
729 27
                            $pregResult = preg_match(
730 27
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
731 27
                                substr($pdfData, $offset),
732
                                $matches,
733 27
                                \PREG_OFFSET_CAPTURE
734
                            );
735 27
                            if (1 == $pregResult) {
736 27
                                $objval = substr($pdfData, $offset, $matches[0][1]);
737 27
                                $offset += $matches[1][1];
738
                            }
739
                        }
740 28
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
741
                        // end stream object
742 27
                        $objtype = 'endstream';
743 27
                        $offset += 9;
744 28
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
745
                        // indirect object reference
746 27
                        $objtype = 'objref';
747 27
                        $offset += \strlen($matches[0]);
748 27
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
749 28
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
750
                        // object start
751 6
                        $objtype = 'obj';
752 6
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
753 6
                        $offset += \strlen($matches[0]);
754 28
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
755
                        // numeric object
756 27
                        $objtype = 'numeric';
757 27
                        $objval = substr($pdfData, $offset, $numlen);
758 27
                        $offset += $numlen;
759
                    }
760 28
                    break;
761
        }
762
763 28
        return [$objtype, $objval, $offset];
764
    }
765
766
    /**
767
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
768
     *
769
     * @param string $pdfData
770
     * @param int    $offset  xref offset (if known)
771
     * @param array  $xref    previous xref array (if any)
772
     *
773
     * @return array containing xref and trailer data
774
     *
775
     * @throws Exception if it was unable to find startxref
776
     * @throws Exception if it was unable to find xref
777
     */
778 28
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        // One pattern serves both lookups below: "last startxref of the file"
        // and "first startxref located after $offset".
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 == $offset) {
            // No offset given: take the last startxref entry of the document.
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $startxrefMatches, \PREG_SET_ORDER, 0);
            if (0 == $pregResult) {
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($startxrefMatches);
            // The capture is a digit string; normalise it to an integer offset.
            $startxref = (int) $lastMatch[1];
        } elseif ($offset > 0 && 'xref' === substr($pdfData, $offset, 4)) {
            // Already pointing at the xref table.
            // Only the four bytes at $offset are inspected: no scan of the
            // remaining data and no error when $offset lies past the end.
            $startxref = $offset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, 0, $offset)) {
            // Cross-Reference Stream object
            // ($objMatches is deliberately separate: preg_match() resets its
            // match array even on failure, which must not affect the
            // startxref lookup in the next branch.)
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatches, 0, $offset)) {
            // startxref found after $offset
            $startxref = (int) $startxrefMatches[1];
        } else {
            throw new Exception('Unable to find startxref');
        }

        // An offset outside of the data cannot point to a cross-reference section.
        if ($startxref < 0 || $startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position
        // (strict comparison of the keyword itself, so a missing "xref" can
        // never be mistaken for "xref at position 0")
        if ('xref' === substr($pdfData, $startxref, 4)) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }

        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
832
833
    /**
834
     * Parses PDF data and returns extracted data as array.
835
     *
836
     * @param string $data PDF data to parse
837
     *
838
     * @return array array of parsed PDF document objects
839
     *
840
     * @throws Exception if empty PDF data given
841
     * @throws Exception if PDF data missing %PDF header
842
     */
843 28
    public function parseData($data)
    {
        // Guard: nothing to parse.
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // Guard: the document must contain a PDF header somewhere.
        $headerPosition = strpos($data, '%PDF-');
        if (false === $headerPosition) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }

        // Drop everything in front of the header, then read xref and trailer data.
        $pdfData = substr($data, $headerPosition);
        $xref = $this->getXrefData($pdfData);

        // Decode every object that the xref table lists with a positive offset.
        $objects = [];
        foreach ($xref['xref'] as $objectId => $objectOffset) {
            if (isset($objects[$objectId]) || !($objectOffset > 0)) {
                continue;
            }

            $objects[$objectId] = $this->getIndirectObject($pdfData, $xref, $objectId, $objectOffset, true);
        }

        return [$xref, $objects];
    }
870
}
871