Passed
Pull Request — master (#405)
by
unknown
07:01
created

RawDataParser::getIndirectObject()   B

Complexity

Conditions 9
Paths 4

Size

Total Lines 52
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 22
CRAP Score 9.0066

Importance

Changes 7
Bugs 3 Features 1
Metric Value
cc 9
eloc 23
c 7
b 3
f 1
nc 4
nop 5
dl 0
loc 52
ccs 22
cts 23
cp 0.9565
crap 9.0066
rs 8.0555

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

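Commonly applied refactorings include: Extract Method. If many parameters or temporary variables are present: Replace Temp with Query, Introduce Parameter Object, Preserve Whole Object.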

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
     * Parser configuration. Values passed to the constructor are merged over
     * these defaults.
     *
     * @var array
     */
    protected $cfg = [
        // if `true` ignore filter decoding errors
        'ignore_filter_decoding_errors' => true,
        // if `true` ignore missing filter decoding errors
        'ignore_missing_filter_decoders' => true,
    ];

    /**
     * Helper used to decode stream filters; instantiated in the constructor.
     *
     * @var FilterHelper
     */
    protected $filterHelper;

    /**
     * Cache of indirect objects that were already parsed, keyed by object
     * reference (see getObjectVal()).
     *
     * @var array
     */
    protected $objects = [];
59
60
    /**
     * Create a parser, optionally overriding the default configuration.
     *
     * @param array $cfg Configuration array, default is []; its entries
     *                   replace the defaults that share the same key
     */
    public function __construct($cfg = [])
    {
        $this->filterHelper = new FilterHelper();

        // caller-supplied options take precedence over the built-in defaults
        $defaults = $this->cfg;
        $this->cfg = array_merge($defaults, $cfg);
    }
70
71
    /**
     * Decode the specified stream.
     *
     * Reads /Length and /Filter from the stream dictionary, truncates the
     * stream to the declared length when that is shorter than the raw data,
     * then applies, in listed order, every filter the filter helper reports
     * as available.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    xref data, forwarded to getObjectVal() when resolving /Filter
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array two entries: the (possibly decoded) stream data and the
     *               list of filters that were skipped because the helper does
     *               not list them as available
     *
     * @throws Exception if a filter fails and the matching "ignore_*" option is disabled
     */
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only "/Name value" pairs are of interest
            if ('/' != $v[0] || !isset($sdic[($k + 1)])) {
                continue;
            }
            $next = $sdic[($k + 1)];

            if (('Length' == $v[1]) && ('numeric' == $next[0])) {
                // get declared stream length
                $declength = (int) ($next[1]);
                // a negative length is bogus; passing it to substr() would cut from the end
                if ($declength >= 0 && $declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                // a resolved indirect object is a list of elements: the value is its first element
                if (isset($objval[0]) && \is_array($objval[0])) {
                    $objval = $objval[0];
                }
                if (!isset($objval[0], $objval[1])) {
                    continue;
                }
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0] && \is_array($objval[1])) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        if ([] === $filters) {
            return [$stream, $remaining_filters];
        }

        // fetched once: assumed not to change while this stream is being decoded
        $availableFilters = $this->filterHelper->getAvailableFilters();
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with '~' is handled as "missing decoder",
                // anything else (including an empty message) as "decoding error"
                $missingDecoder = ('' !== $emsg) && ('~' == $emsg[0]);
                $ignored = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignored) {
                    // keep the original exception as the cause
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
138
139
    /**
     * Decode the Cross-Reference section
     *
     * Walks the entries that follow the 'xref' keyword, records the offset of
     * every in-use ("n") entry that is not known yet, then reads the trailer
     * dictionary located after the table. When the trailer holds a /Prev
     * value the older section is loaded through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer is found after the entries
     */
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the keyword 'xref' (4 characters) and the white space after it:
        // \x00 NUL, \x09 HT, \x0A LF, \x0C FF, \x0D CR, \x20 SP
        $cursor = $startxref + 4;
        $cursor += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $cursor);

        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';
        $objectNumber = 0;

        // consume cross-reference entries and subsection headers one by one
        while (true) {
            $found = preg_match($entryPattern, $pdfData, $m, \PREG_OFFSET_CAPTURE, $cursor);
            if (!($found > 0) || $m[0][1] != $cursor) {
                // nothing left, or the next match belongs to another section
                break;
            }
            $cursor += \strlen($m[0][0]);

            switch ($m[3][0]) {
                case 'n':
                    // unique object index: [object number]_[generation number]
                    $key = $objectNumber.'_'.((int) $m[2][0]);
                    if (!isset($xref['xref'][$key])) {
                        // first definition wins: store object offset position
                        $xref['xref'][$key] = (int) $m[1][0];
                    }
                    ++$objectNumber;
                    break;

                case 'f':
                    ++$objectNumber;
                    break;

                default:
                    // subsection header: first number is the object number (index)
                    $objectNumber = (int) $m[1][0];
            }
        }

        // get trailer data
        if (!(preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $m, \PREG_OFFSET_CAPTURE, $cursor) > 0)) {
            throw new Exception('Unable to find trailer');
        }
        $trailerData = $m[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $trailer = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $m) > 0) {
                $trailer['size'] = (int) $m[1];
            }

            // indirect references are stored as "[object number]_[generation number]"
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $name => $pattern) {
                if (preg_match($pattern, $trailerData, $m) > 0) {
                    $trailer[$name] = ((int) $m[1]).'_'.((int) $m[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $m) > 0) {
                $trailer['id'] = [$m[1], $m[2]];
            }

            $xref['trailer'] = $trailer;
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $m) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) $m[1], $xref);
        }

        return $xref;
    }
213
214
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream object at $startxref, collects the entries of its
     * dictionary (/Type, /Index, /Prev, /W, /DecodeParms and, when no trailer
     * is known yet, the trailer keys), undoes the PNG predictor if one is
     * declared and turns the rows into xref entries.
     *
     * NOTE(review): only the first object number of /Index is used, so a
     * stream with several /Index subsections is numbered as one contiguous
     * range - confirm whether such files must be supported.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);
        if (!isset($xref['trailer']) || empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = [];
            $filltrailer = true;
        } else {
            $filltrailer = false;
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $index_first = 0; // first object number in the subsection
        $prevxref = null; // offset of the previous xref section, if any
        $wb = [0, 0, 0];  // byte width of the three fields of an entry

        // stream dictionary: a flat list of elements, each name followed by its value
        $sarr = (isset($xrefcrs[0][1]) && \is_array($xrefcrs[0][1])) ? $xrefcrs[0][1] : [];

        foreach ($sarr as $k => $v) {
            // every entry of interest is a name followed by a value
            if ('/' != $v[0] || !isset($sarr[($k + 1)])) {
                continue;
            }
            $next = $sarr[($k + 1)];

            switch ($v[1]) {
                case 'Type':
                    if ('/' == $next[0] && 'XRef' == $next[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    if (isset($next[1][0][1])) {
                        $index_first = (int) ($next[1][0][1]);
                    }
                    break;

                case 'Prev':
                    if ('numeric' == $next[0]) {
                        // get previous xref offset
                        $prevxref = (int) ($next[1]);
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of the corresponding field;
                    // a missing width is read as 0 instead of touching an undefined offset
                    for ($c = 0; $c < 3; ++$c) {
                        $wb[$c] = isset($next[1][$c][1]) ? (int) ($next[1][$c][1]) : 0;
                    }
                    break;

                case 'DecodeParms':
                    if (isset($next[1]) && \is_array($next[1])) {
                        $decpar = $next[1];
                        foreach ($decpar as $kdc => $vdc) {
                            if ('/' != $vdc[0] || !isset($decpar[($kdc + 1)]) || 'numeric' != $decpar[($kdc + 1)][0]) {
                                continue;
                            }
                            if ('Columns' == $vdc[1]) {
                                $columns = (int) ($decpar[($kdc + 1)][1]);
                            } elseif ('Predictor' == $vdc[1]) {
                                $predictor = (int) ($decpar[($kdc + 1)][1]);
                            }
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && 'numeric' == $next[0]) {
                        $xref['trailer']['size'] = $next[1];
                    }
                    break;

                case 'Root':
                case 'Info':
                case 'Encrypt':
                    if ($filltrailer && 'objref' == $next[0]) {
                        // stored under the lower-case key: 'root', 'info', 'encrypt'
                        $xref['trailer'][strtolower($v[1])] = $next[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [
                            isset($next[1][0][1]) ? $next[1][0][1] : null,
                            isset($next[1][1][1]) ? $next[1][1][1] : null,
                        ];
                    }
                    break;
            }
        }

        // number of bytes of one decoded entry; without /W the rows cannot be interpreted
        $rowlen = (int) array_sum($wb);

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0]) && $rowlen > 0) {
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (!\is_array($bytes)) {
                $bytes = [];
            }

            if (null !== $predictor) {
                $ddata = $this->unpredictPngRows($bytes, $columns);
            } else {
                // split the rows
                $ddata = array_chunk($bytes, $rowlen);
            }

            $sdata = [];
            // for every row
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column (most significant byte first)
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = $index_first;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // 0 = (f) free object, anything else = null object: nothing to store
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Undo the PNG predictor applied to the rows of a cross-reference stream.
     *
     * Every encoded row is one filter-type byte followed by $columns data
     * bytes; one sample per byte is assumed, so "left" is the previous byte.
     *
     * @param array $bytes   Stream bytes as integers
     * @param int   $columns Number of data bytes per row
     *
     * @return array list of rows, each a list of $columns reconstructed bytes
     *
     * @throws Exception if unknown PNG predictor detected
     */
    private function unpredictPngRows(array $bytes, $columns)
    {
        if ($columns < 1) {
            // no data bytes per row: nothing can be reconstructed
            return [];
        }

        $decoded = [];
        // the row "above" the first row is all zeros
        $prev_row = array_fill(0, $columns, 0);

        foreach (array_chunk($bytes, $columns + 1) as $k => $row) {
            // get PNG predictor value
            $predictor = (10 + $row[0]);
            $decoded[$k] = [];

            for ($i = 1; $i <= $columns; ++$i) {
                $j = ($i - 1);
                // a truncated last row is padded with zeros
                $raw = isset($row[$i]) ? $row[$i] : 0;
                $row_up = $prev_row[$j];
                if (1 == $i) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    // PNG reconstruction works on the already reconstructed left byte,
                    // not on the raw (still filtered) one
                    $row_left = $decoded[$k][($j - 1)];
                    $row_upleft = $prev_row[($j - 1)];
                }

                switch ($predictor) {
                    case 10:  // PNG prediction (on encoding, PNG None on all rows)
                        $value = $raw;
                        break;

                    case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                        $value = (($raw + $row_left) & 0xff);
                        break;

                    case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                        $value = (($raw + $row_up) & 0xff);
                        break;

                    case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                        // integer floor of the mean; a float must not reach the bitwise AND
                        $value = (($raw + (($row_left + $row_up) >> 1)) & 0xff);
                        break;

                    case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                        // initial estimate
                        $p = ($row_left + $row_up - $row_upleft);
                        // distances
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        // nearest neighbour wins, ties resolved in the order left, up, upper-left
                        if ($pa <= $pb && $pa <= $pc) {
                            $value = (($raw + $row_left) & 0xff);
                        } elseif ($pb <= $pc) {
                            $value = (($raw + $row_up) & 0xff);
                        } else {
                            $value = (($raw + $row_upleft) & 0xff);
                        }
                        break;

                    default:  // PNG prediction (on encoding, PNG optimum)
                        throw new Exception('Unknown PNG predictor: '.$predictor);
                }

                $decoded[$k][$j] = $value;
            }

            $prev_row = $decoded[$k];
        }

        return $decoded;
    }
465
466
    /**
     * Get content of indirect object.
     *
     * Checks that the "[object number] [generation number] obj" header is
     * present at $offset and then reads raw elements until 'endobj' is found
     * or no further progress is made.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     xref data, forwarded to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array list of the raw elements of the object; when the header
     *               is not found at $offset the null object is returned as
     *               ['null', 'null', offset]
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        /*
         * build indirect object header
         */
        // $objHeader = "[object number] [generation number] obj"
        // a non-scalar reference (e.g. an array) must not reach explode()
        $objRefArr = \is_scalar($objRef) ? explode('_', (string) $objRef) : [];
        if (2 !== \count($objRefArr)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $objHeader = $objRefArr[0].' '.$objRefArr[1].' obj';

        /*
         * check if we are in position
         */
        if ($offset > \strlen($pdfData)) {
            // an offset behind the end of the data cannot point to an object;
            // an indirect reference to an undefined object is the null object
            return ['null', 'null', $offset];
        }
        // ignore whitespace characters at offset (NUL, HT, LF, FF, CR, SP)
        $offset += strspn($pdfData, "\0\t\n\f\r ", $offset);
        // ignore leading zeros for object number
        $offset += strspn($pdfData, '0', $offset);
        // consider all whitespace character (PDF specifications); the reference
        // parts come from the parsed document, so they are quoted before being
        // used inside the pattern
        $objHeaderPattern = '/'.preg_quote($objRefArr[0], '/').'[\0\t\n\f\r ]'
            .preg_quote($objRefArr[1], '/').'[\0\t\n\f\r ]obj'.'/';
        if (1 !== preg_match($objHeaderPattern, substr($pdfData, $offset, \strlen($objHeader)))) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        /*
         * get content
         */
        // starting position of object content
        $offset += \strlen($objHeader);
        $objContentArr = [];
        $i = 0; // object main index
        do {
            $oldOffset = $offset;
            // get element
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];
            // decode stream using stream's dictionary information (the element read just before the stream)
            if ($decoding && ('stream' === $element[0]) && (isset($objContentArr[($i - 1)][0])) && ('<<' === $objContentArr[($i - 1)][0])) {
                $element[3] = $this->decodeStream($pdfData, $xref, $objContentArr[($i - 1)][1], $element[1]);
            }
            $objContentArr[$i] = $element;
            ++$i;
            // stop at 'endobj', or when the reader did not advance (prevents an endless loop)
        } while (('endobj' !== $element[0]) && ($offset !== $oldOffset));
        // remove closing delimiter
        array_pop($objContentArr);

        /*
         * return raw object content
         */
        return $objContentArr;
    }
532
533
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Anything that is not an 'objref' element is returned unchanged. A
     * reference is answered from the object cache when possible; otherwise
     * the object is read (without decoding streams) at the offset recorded
     * for it and cached. A reference with no recorded offset is returned
     * unchanged.
     *
     * NOTE(review): the cache is keyed by object reference only - confirm
     * that one parser instance is never reused for a second document.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    xref data; offsets are read from $xref['xref'],
     *                        a flat "reference => offset" map is accepted too
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws Exception
     */
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        if ('objref' != $obj[0]) {
            return $obj;
        }

        $ref = $obj[1];
        if (isset($this->objects[$ref])) {
            // this object has been already parsed
            return $this->objects[$ref];
        }

        // decodeXref() and decodeXrefStream() store offsets under the 'xref' key;
        // looking only at the top level never finds them
        if (isset($xref['xref'][$ref])) {
            $offset = $xref['xref'][$ref];
        } elseif (isset($xref[$ref])) {
            $offset = $xref[$ref];
        } else {
            return $obj;
        }
        if ($offset < 0) {
            // -1 marks an object stored inside an object stream: no file offset to read from
            return $obj;
        }

        // parse new object
        $this->objects[$ref] = $this->getIndirectObject($pdfData, $xref, $ref, $offset, false);

        return $this->objects[$ref];
    }
560
561
    /**
562
     * Get object type, raw value and offset to next object
563
     *
564
     * @param int $offset Object offset
565
     *
566
     * @return array containing object type, raw value and offset to next object
567
     */
568 29
    protected function getRawObject($pdfData, $offset = 0)
569
    {
570 29
        $objtype = ''; // object type to be returned
571 29
        $objval = ''; // object value to be returned
572
573
        /*
574
         * skip initial white space chars:
575
         *      \x00 null (NUL)
576
         *      \x09 horizontal tab (HT)
577
         *      \x0A line feed (LF)
578
         *      \x0C form feed (FF)
579
         *      \x0D carriage return (CR)
580
         *      \x20 space (SP)
581
         */
582 29
        $offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
583
584
        // get first char
585 29
        $char = $pdfData[$offset];
586
        // get object type
587 29
        switch ($char) {
588 29
            case '%':  // \x25 PERCENT SIGN
589
                    // skip comment and search for next token
590
                    $next = strcspn($pdfData, "\r\n", $offset);
591
                    if ($next > 0) {
592
                        $offset += $next;
593
594
                        return $this->getRawObject($pdfData, $offset);
595
                    }
596
                    break;
597
598 29
            case '/':  // \x2F SOLIDUS
599
                    // name object
600 29
                    $objtype = $char;
601 29
                    ++$offset;
602 29
                    $pregResult = preg_match(
603 29
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
604 29
                        substr($pdfData, $offset, 256),
605
                        $matches
606
                    );
607 29
                    if (1 == $pregResult) {
608 29
                        $objval = $matches[1]; // unescaped value
609 29
                        $offset += \strlen($objval);
610
                    }
611 29
                    break;
612
613 29
            case '(':   // \x28 LEFT PARENTHESIS
614 29
            case ')':  // \x29 RIGHT PARENTHESIS
615
                    // literal string object
616 26
                    $objtype = $char;
617 26
                    ++$offset;
618 26
                    $strpos = $offset;
619 26
                    if ('(' == $char) {
620 26
                        $open_bracket = 1;
621 26
                        while ($open_bracket > 0) {
622 26
                            if (!isset($pdfData[$strpos])) {
623
                                break;
624
                            }
625 26
                            $ch = $pdfData[$strpos];
626 26
                            switch ($ch) {
627 26
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
628
                                        // skip next character
629 15
                                        ++$strpos;
630 15
                                        break;
631
632 26
                                case '(':  // LEFT PARENHESIS (28h)
633
                                        ++$open_bracket;
634
                                        break;
635
636 26
                                case ')':  // RIGHT PARENTHESIS (29h)
637 26
                                        --$open_bracket;
638 26
                                        break;
639
                            }
640 26
                            ++$strpos;
641
                        }
642 26
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
643 26
                        $offset = $strpos;
644
                    }
645 26
                    break;
646
647 29
            case '[':   // \x5B LEFT SQUARE BRACKET
648 29
            case ']':  // \x5D RIGHT SQUARE BRACKET
649
                // array object
650 28
                $objtype = $char;
651 28
                ++$offset;
652 28
                if ('[' == $char) {
653
                    // get array content
654 28
                    $objval = [];
655
                    do {
656 28
                        $oldOffset = $offset;
657
                        // get element
658 28
                        $element = $this->getRawObject($pdfData, $offset);
659 28
                        $offset = $element[2];
660 28
                        $objval[] = $element;
661 28
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
662
                    // remove closing delimiter
663 28
                    array_pop($objval);
664
                }
665 28
                break;
666
667 29
            case '<':  // \x3C LESS-THAN SIGN
668 29
            case '>':  // \x3E GREATER-THAN SIGN
669 29
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
670
                    // dictionary object
671 29
                    $objtype = $char.$char;
672 29
                    $offset += 2;
673 29
                    if ('<' == $char) {
674
                        // get array content
675 29
                        $objval = [];
676
                        do {
677 29
                            $oldOffset = $offset;
678
                            // get element
679 29
                            $element = $this->getRawObject($pdfData, $offset);
680 29
                            $offset = $element[2];
681 29
                            $objval[] = $element;
682 29
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
683
                        // remove closing delimiter
684 29
                        array_pop($objval);
685
                    }
686
                } else {
687
                    // hexadecimal string object
688 11
                    $objtype = $char;
689 11
                    ++$offset;
690 11
                    $pregResult = preg_match(
691 11
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
692 11
                            substr($pdfData, $offset),
693
                            $matches
694
                        );
695 11
                    if (('<' == $char) && 1 == $pregResult) {
696
                        // remove white space characters
697 11
                        $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
698 11
                        $offset += \strlen($matches[0]);
699
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
700
                        $offset = $endpos + 1;
701
                    }
702
                }
703 29
                    break;
704
705
            default:
706 29
                    if ('endobj' == substr($pdfData, $offset, 6)) {
707
                        // indirect object
708 28
                        $objtype = 'endobj';
709 28
                        $offset += 6;
710 29
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
711
                        // null object
712 4
                        $objtype = 'null';
713 4
                        $offset += 4;
714 4
                        $objval = 'null';
715 29
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
716
                        // boolean true object
717 9
                        $objtype = 'boolean';
718 9
                        $offset += 4;
719 9
                        $objval = 'true';
720 29
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
721
                        // boolean false object
722 2
                        $objtype = 'boolean';
723 2
                        $offset += 5;
724 2
                        $objval = 'false';
725 29
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
726
                        // start stream object
727 28
                        $objtype = 'stream';
728 28
                        $offset += 6;
729 28
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
730 28
                            $offset += \strlen($matches[0]);
731 28
                            $pregResult = preg_match(
732 28
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
733 28
                                substr($pdfData, $offset),
734
                                $matches,
735 28
                                \PREG_OFFSET_CAPTURE
736
                            );
737 28
                            if (1 == $pregResult) {
738 28
                                $objval = substr($pdfData, $offset, $matches[0][1]);
739 28
                                $offset += $matches[1][1];
740
                            }
741
                        }
742 29
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
743
                        // end stream object
744 28
                        $objtype = 'endstream';
745 28
                        $offset += 9;
746 29
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
747
                        // indirect object reference
748 28
                        $objtype = 'objref';
749 28
                        $offset += \strlen($matches[0]);
750 28
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
751 29
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
752
                        // object start
753 6
                        $objtype = 'obj';
754 6
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
755 6
                        $offset += \strlen($matches[0]);
756 29
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
757
                        // numeric object
758 28
                        $objtype = 'numeric';
759 28
                        $objval = substr($pdfData, $offset, $numlen);
760 28
                        $offset += $numlen;
761
                    }
762 29
                    break;
763
        }
764
765 29
        return [$objtype, $objval, $offset];
766
    }
767
768
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * With an $offset of 0 the xref position is read from the last
     * "startxref <n> %%EOF" marker in the document. With any other $offset the
     * position is resolved, in order, as: an xref table located exactly at
     * $offset, an "<n> <n> obj" header found at or after $offset (handled as a
     * cross-reference stream), or the first "startxref" marker after $offset.
     *
     * @param string $pdfData
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 == $offset) {
            // find last startxref
            $pregResult = preg_match_all(
                $startxrefPattern,
                $pdfData,
                $allMarkers,
                \PREG_SET_ORDER,
                $offset
            );
            if (0 == $pregResult) {
                throw new Exception('Unable to find startxref');
            }
            $lastMarker = array_pop($allMarkers);
            // the captured group is a digit string; normalise it to an integer position
            $startxref = (int) $lastMarker[1];
        } elseif ($this->isXrefKeywordAt($pdfData, $offset)) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objHeader, 0, $offset)) {
            // Cross-Reference Stream object
            // NOTE(review): the pattern is not anchored at $offset, so any object
            // header located after $offset satisfies this branch as well.
            // The result goes into its own variable so that a failed match cannot
            // wipe the captures needed by the startxref branch below.
            $startxref = $offset;
        } elseif (preg_match($startxrefPattern, $pdfData, $marker, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found; evaluated lazily because only this branch needs it
            $startxref = (int) $marker[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position
        if ($this->isXrefKeywordAt($pdfData, $startxref)) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }

    /**
     * Tells whether the first "xref" keyword found from $position onwards sits exactly at $position.
     *
     * A failed search (strpos() returning false) is reported as "no", even for
     * position 0, which a loose comparison of the raw strpos() result would not do.
     *
     * @param string     $pdfData
     * @param int|string $position byte position to test
     *
     * @return bool
     */
    private function isXrefKeywordAt($pdfData, $position)
    {
        $found = strpos($pdfData, 'xref', $position);

        return false !== $found && $found == $position;
    }
834
835
    /**
     * Parses PDF data and returns the cross-reference data together with the decoded objects.
     *
     * Any bytes located before the first "%PDF-" marker are dropped before parsing starts.
     *
     * @param string $data PDF data to parse
     *
     * @return array two entries: the xref/trailer data, then the parsed document objects
     *               indexed by their xref key
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData($data)
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the PDF header; everything in front of it is ignored
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }
        $pdfData = substr($data, $headerPos);

        // xref and trailer data drive the object lookup below
        $xref = $this->getXrefData($pdfData);

        $objects = [];
        foreach ($xref['xref'] as $obj => $offset) {
            // only entries with a positive offset that were not decoded yet are parsed
            if (isset($objects[$obj]) || !($offset > 0)) {
                continue;
            }

            $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true);
        }

        return [$xref, $objects];
    }
872
}
873