Passed
Pull Request — master (#405)
by
unknown
03:27 queued 01:06
created

RawDataParser::decodeXrefStream()   F

Complexity

Conditions 79
Paths 1536

Size

Total Lines 239
Code Lines 154

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 114
CRAP Score 126.8602

Importance

Changes 4
Bugs 1 Features 1
Metric Value
cc 79
eloc 154
c 4
b 1
f 1
nc 1536
nop 3
dl 0
loc 239
ccs 114
cts 142
cp 0.8028
crap 126.8602
rs 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
48
     * Configuration array.
49
     */
50
    protected $cfg = [
51
        // if `true` ignore filter decoding errors
52
        'ignore_filter_decoding_errors' => true,
53
        // if `true` ignore missing filter decoding errors
54
        'ignore_missing_filter_decoders' => true,
55
    ];
56
57
    protected $filterHelper;
58
    protected $objects;
59
60
    /**
     * Set up the parser: create the filter helper and overlay the given
     * settings on the default configuration.
     *
     * @param array $cfg Configuration array, default is []; its entries replace
     *                   the defaults that share the same key
     */
    public function __construct($cfg = [])
    {
        $this->filterHelper = new FilterHelper();

        // values supplied by the caller win over the defaults declared on the property
        $defaults = $this->cfg;
        $this->cfg = array_merge($defaults, $cfg);
    }
70
71
    /**
     * Decode the specified stream.
     *
     * Walks the stream dictionary for /Length and /Filter, cuts the stream down
     * to the declared length when that is shorter than the data received, then
     * runs the stream through every filter the filter helper reports as
     * available. Filters that are not available are returned untouched so the
     * caller can see what is still pending.
     *
     * @param string $pdfData PDF data
     * @param array  $xref
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching "ignore" option in
     *                   the configuration is switched off
     */
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name tokens that are followed by a value are of interest
            if ('/' != $v[0] || !isset($sdic[($k + 1)])) {
                continue;
            }
            $value = $sdic[($k + 1)];

            if (('Length' == $v[1]) && ('numeric' == $value[0])) {
                // get declared stream length
                $declength = (int) ($value[1]);
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $value);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters())) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // A message starting with '~' is handled as "missing decoder", anything
                // else as a decoding error. The emptiness check avoids reading offset 0
                // of an empty message, which raises a warning.
                $missingDecoder = ('' !== $emsg) && ('~' == $emsg[0]);
                $ignore = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // same type, message and code as before; the original exception is
                    // chained so its stack trace is not lost
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
138
139
    /**
     * Decode the Cross-Reference section
     *
     * Reads the table entries that start right after the 'xref' keyword, then the
     * trailer dictionary that follows them. Offsets already present in $xref are
     * never overwritten, and the trailer is only filled in when $xref does not
     * carry one yet. When the trailer names a /Prev offset, the result is handed
     * on to getXrefData() together with that offset.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found after the table
     */
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the word 'xref' (4 characters)
        $tableStart = $startxref + 4;
        // then over white space: \x00 NUL, \x09 HT, \x0A LF, \x0C FF, \x0D CR, \x20 SP
        $cursor = $tableStart + strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $tableStart);

        // Walk the entries / subsection headers for as long as they follow one
        // another without a gap; a match further ahead belongs to another section.
        $objectNumber = 0;
        while (
            preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $entry, \PREG_OFFSET_CAPTURE, $cursor) > 0
            && $entry[0][1] == $cursor
        ) {
            $cursor += \strlen($entry[0][0]);

            switch ($entry[3][0]) {
                case 'n':
                    // in-use entry, keyed as [object number]_[generation number]
                    $key = $objectNumber.'_'.(int) ($entry[2][0]);
                    if (!isset($xref['xref'][$key])) {
                        $xref['xref'][$key] = (int) ($entry[1][0]);
                    }
                    ++$objectNumber;
                    break;

                case 'f':
                    // free entry: it only consumes an object number
                    ++$objectNumber;
                    break;

                default:
                    // subsection header: its first number is the next object number
                    $objectNumber = (int) ($entry[1][0]);
                    break;
            }
        }

        // get trailer data
        $hasTrailer = preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailerMatch, \PREG_OFFSET_CAPTURE, $cursor) > 0;
        if (!$hasTrailer) {
            throw new Exception('Unable to find trailer');
        }
        $trailerBody = $trailerMatch[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $trailer = [];
            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerBody, $hit) > 0) {
                $trailer['size'] = (int) ($hit[1]);
            }
            if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerBody, $hit) > 0) {
                $trailer['root'] = (int) ($hit[1]).'_'.(int) ($hit[2]);
            }
            if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerBody, $hit) > 0) {
                $trailer['encrypt'] = (int) ($hit[1]).'_'.(int) ($hit[2]);
            }
            if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerBody, $hit) > 0) {
                $trailer['info'] = (int) ($hit[1]).'_'.(int) ($hit[2]);
            }
            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerBody, $hit) > 0) {
                $trailer['id'] = [$hit[1], $hit[2]];
            }
            $xref['trailer'] = $trailer;
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerBody, $hit) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) ($hit[1]), $xref);
        }

        return $xref;
    }
213
214
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream object found at $startxref, collects the relevant entries
     * of its dictionary (/Type, /Index, /Prev, /W, /DecodeParms and, when $xref has
     * no trailer yet, the trailer entries), undoes PNG prediction when the
     * dictionary asks for it and then converts every row of the stream into an
     * entry of $xref['xref']:
     *  - type 1 rows are stored as "[object number]_[generation]" => offset,
     *    unless that key already exists;
     *  - type 2 rows are stored as "[stream object number]_0_[index]" => -1;
     *  - all other rows only consume an object number.
     * When the dictionary has a /Prev entry the result is handed on to
     * getXrefData() together with that offset.
     *
     * The data is left undecoded when the dictionary is not of /Type /XRef or
     * when /W is missing, incomplete or sums up to zero, because the row layout
     * is unknown in that case.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $prevxref = null;
        $indexBlocks = []; // list of [first object number, number of entries or null]
        $wb = []; // byte width of each of the three fields of a row

        $sarr = (isset($xrefcrs[0][1]) && \is_array($xrefcrs[0][1])) ? $xrefcrs[0][1] : [];

        foreach ($sarr as $k => $v) {
            // every entry of interest is a name token followed by its value
            if ('/' != $v[0] || !isset($sarr[($k + 1)])) {
                continue;
            }
            $value = $sarr[($k + 1)];

            switch ($v[1]) {
                case 'Type':
                    if (('/' == $value[0]) && ('XRef' == $value[1])) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // pairs of: first object number in the subsection, number of entries
                    $indexBlocks = [];
                    $pairs = \is_array($value[1]) ? array_values($value[1]) : [];
                    for ($m = 0; isset($pairs[$m][1]); $m += 2) {
                        $indexBlocks[] = [
                            (int) ($pairs[$m][1]),
                            isset($pairs[($m + 1)][1]) ? (int) ($pairs[($m + 1)][1]) : null,
                        ];
                    }
                    break;

                case 'Prev':
                    if ('numeric' == $value[0]) {
                        // get previous xref offset
                        $prevxref = (int) ($value[1]);
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of the corresponding field
                    for ($c = 0; $c < 3; ++$c) {
                        if (isset($value[1][$c][1])) {
                            $wb[$c] = (int) ($value[1][$c][1]);
                        }
                    }
                    break;

                case 'DecodeParms':
                    if (!isset($value[1]) || !\is_array($value[1])) {
                        break;
                    }
                    $decpar = $value[1];
                    foreach ($decpar as $kdc => $vdc) {
                        if (
                            '/' != $vdc[0]
                            || !isset($decpar[($kdc + 1)])
                            || 'numeric' != $decpar[($kdc + 1)][0]
                        ) {
                            continue;
                        }
                        if ('Columns' == $vdc[1]) {
                            $columns = (int) ($decpar[($kdc + 1)][1]);
                        } elseif ('Predictor' == $vdc[1]) {
                            $predictor = (int) ($decpar[($kdc + 1)][1]);
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && ('numeric' == $value[0])) {
                        $xref['trailer']['size'] = $value[1];
                    }
                    break;

                case 'Root':
                case 'Info':
                case 'Encrypt':
                    if ($filltrailer && ('objref' == $value[0])) {
                        // stored under the lower-case key: root, info, encrypt
                        $xref['trailer'][strtolower($v[1])] = $value[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [];
                        $xref['trailer']['id'][0] = $value[1][0][1];
                        $xref['trailer']['id'][1] = $value[1][1][1];
                    }
                    break;
            }
        }

        // decode data
        $rowWidth = (3 === \count($wb)) ? array_sum($wb) : 0;
        if ($valid_crs && isset($xrefcrs[1][3][0]) && $rowWidth > 0) {
            if ((null !== $predictor) && ($predictor >= 10)) {
                // PNG prediction: every row starts with a byte naming its filter
                $ddata = $this->undoXrefPngPrediction($xrefcrs[1][3][0], $columns);
            } else {
                // No PNG prediction (/Predictor absent or below 10): the stream already
                // consists of plain rows.
                // NOTE(review): TIFF prediction (/Predictor 2) is not undone here - confirm
                // whether it can occur for cross-reference streams.
                $bytes = unpack('C*', $xrefcrs[1][3][0]);
                $ddata = array_chunk(\is_array($bytes) ? $bytes : [], $rowWidth);
            }

            $sdata = $this->splitXrefRowFields($ddata, $wb);

            // Fill xref. Object numbers follow the /Index subsections: each subsection
            // restarts the numbering at its own first object number. Rows beyond the
            // last subsection simply keep counting upwards.
            $block = 0;
            $obj_num = isset($indexBlocks[0]) ? $indexBlocks[0][0] : 0;
            $remaining = isset($indexBlocks[0]) ? $indexBlocks[0][1] : null; // null: no limit known

            foreach ($sdata as $row) {
                while ((null !== $remaining) && ($remaining <= 0) && isset($indexBlocks[($block + 1)])) {
                    ++$block;
                    $obj_num = $indexBlocks[$block][0];
                    $remaining = $indexBlocks[$block][1];
                }

                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }

                ++$obj_num;
                if (null !== $remaining) {
                    --$remaining;
                }
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Undo PNG prediction on the data of a cross-reference stream.
     *
     * The data is cut into rows of one filter byte followed by $columns data
     * bytes. Each data byte is reconstructed from the already reconstructed byte
     * to its left and from the reconstructed previous row (one byte per pixel).
     *
     * @param string $stream  Stream data after the stream filters have been applied
     * @param int    $columns Number of data bytes per row (/Columns)
     *
     * @return array list of rows, each a list of reconstructed byte values
     *
     * @throws Exception if unknown PNG predictor detected
     */
    private function undoXrefPngPrediction($stream, $columns)
    {
        // number of bytes in a row: the filter byte plus the data bytes
        $rowlen = max(1, $columns + 1);
        // convert the stream into an array of integers and split the rows
        $bytes = unpack('C*', $stream);
        $rows = array_chunk(\is_array($bytes) ? $bytes : [], $rowlen);

        $decoded = [];
        // the row above the first one counts as all zeros
        $prev_row = array_fill(0, $rowlen, 0);

        foreach ($rows as $k => $row) {
            $decoded[$k] = [];
            // get PNG predictor value of this row
            $rowPredictor = (10 + $row[0]);

            for ($i = 1; $i <= $columns; ++$i) {
                $j = ($i - 1);
                // a truncated last row is read as if padded with zeros
                $raw = isset($row[$i]) ? $row[$i] : 0;
                $up = $prev_row[$j];
                // neighbours to the left are taken from the reconstructed data,
                // not from the still-encoded row
                $left = ($j > 0) ? $decoded[$k][($j - 1)] : 0;
                $upleft = ($j > 0) ? $prev_row[($j - 1)] : 0;

                switch ($rowPredictor) {
                    case 10:  // PNG None
                        $decoded[$k][$j] = $raw;
                        break;

                    case 11:  // PNG Sub
                        $decoded[$k][$j] = (($raw + $left) & 0xff);
                        break;

                    case 12:  // PNG Up
                        $decoded[$k][$j] = (($raw + $up) & 0xff);
                        break;

                    case 13:  // PNG Average (integer mean, rounded down)
                        $decoded[$k][$j] = (($raw + (($left + $up) >> 1)) & 0xff);
                        break;

                    case 14:  // PNG Paeth
                        // initial estimate and its distance to each neighbour
                        $p = ($left + $up - $upleft);
                        $pa = abs($p - $left);
                        $pb = abs($p - $up);
                        $pc = abs($p - $upleft);
                        // the nearest neighbour wins; ties go to left, then up, then upper left
                        if (($pa <= $pb) && ($pa <= $pc)) {
                            $nearest = $left;
                        } elseif ($pb <= $pc) {
                            $nearest = $up;
                        } else {
                            $nearest = $upleft;
                        }
                        $decoded[$k][$j] = (($raw + $nearest) & 0xff);
                        break;

                    default:
                        throw new Exception('Unknown PNG predictor: '.$rowPredictor);
                }
            }
            $prev_row = $decoded[$k];
        }

        return $decoded;
    }

    /**
     * Combine the bytes of every row into the three big-endian fields of a
     * cross-reference stream entry.
     *
     * @param array $rows List of rows, each a list of byte values
     * @param array $wb   Byte width of field 0, 1 and 2
     *
     * @return array list of [type, field 2, field 3] triples; the type is 1 when
     *               the type field has width 0
     */
    private function splitXrefRowFields(array $rows, array $wb)
    {
        $entries = [];
        foreach ($rows as $k => $row) {
            $entries[$k] = [0, 0, 0];
            if (0 == $wb[0]) {
                // default type field
                $entries[$k][0] = 1;
            }
            $i = 0; // count bytes in the row
            for ($c = 0; $c < 3; ++$c) {
                for ($b = 0; $b < $wb[$c]; ++$b) {
                    if (isset($row[$i])) {
                        // most significant byte first
                        $entries[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                    }
                    ++$i;
                }
            }
        }

        return $entries;
    }
465
466
    /**
     * Get content of indirect object.
     *
     * Checks that the header "[object number] [generation number] obj" is found
     * at $offset (after white space and leading zeros), then collects raw
     * elements until 'endobj' is met or the position no longer advances. A
     * 'stream' element that directly follows a dictionary gets its decoded form
     * stored at index 3 when $decoding is set.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data; ['null', 'null', $offset] when the header is not found at $offset
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        // split "[object number]_[generation number]"
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        list($objNumber, $genNumber) = $refParts;
        $headerLength = \strlen($objNumber.' '.$genNumber.' obj');

        // ignore whitespace characters at offset (NUL, HT, LF, FF, CR, SP)
        $offset += strspn($pdfData, "\0\t\n\f\r ", $offset);
        // ignore leading zeros for object number
        $offset += strspn($pdfData, '0', $offset);

        // the parts of the header may be separated by \n instead of a space
        $headerPattern = '/'.$objNumber.'[ \n]'.$genNumber.'[ \n]obj'.'/';
        $candidate = substr($pdfData, $offset, $headerLength);
        if (0 == preg_match($headerPattern, $candidate)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // the object content starts right behind the header
        $offset += $headerLength;

        $elements = [];
        $previous = null; // element collected in the previous round, if any
        do {
            $before = $offset;

            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // decode stream using stream's dictionary information
            $followsDictionary = isset($previous[0]) && ('<<' === $previous[0]);
            if ($decoding && ('stream' === $element[0]) && $followsDictionary) {
                $element[3] = $this->decodeStream($pdfData, $xref, $previous[1], $element[1]);
            }

            $elements[] = $element;
            $previous = $element;
        } while (('endobj' !== $element[0]) && ($offset !== $before));

        // remove closing delimiter
        array_pop($elements);

        return $elements;
    }
532
533
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Anything that is not an 'objref' is returned as it is. A reference is
     * answered from the cache of already parsed objects when possible; otherwise
     * it is parsed (without stream decoding), cached and returned, provided $xref
     * has an entry for it. A reference that $xref does not know is returned as it is.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Array in which the reference is looked up to find the object offset
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws Exception
     */
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        if ('objref' != $obj[0]) {
            return $obj;
        }

        $reference = $obj[1];
        if (!isset($this->objects[$reference])) {
            // NOTE(review): the lookup uses $xref[$reference] directly, while the decoders in
            // this class keep object offsets under $xref['xref'] - confirm which shape callers pass.
            if (!isset($xref[$reference])) {
                return $obj;
            }
            // parse new object and remember it
            $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);
        }

        return $this->objects[$reference];
    }
560
561
    /**
562
     * Get object type, raw value and offset to next object
563
     *
564
     * @param int $offset Object offset
565
     *
566
     * @return array containing object type, raw value and offset to next object
567
     */
568 28
    protected function getRawObject($pdfData, $offset = 0)
569
    {
570 28
        $objtype = ''; // object type to be returned
571 28
        $objval = ''; // object value to be returned
572
573
        /*
574
         * skip initial white space chars:
575
         *      \x00 null (NUL)
576
         *      \x09 horizontal tab (HT)
577
         *      \x0A line feed (LF)
578
         *      \x0C form feed (FF)
579
         *      \x0D carriage return (CR)
580
         *      \x20 space (SP)
581
         */
582 28
        $offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
583
584
        // get first char
585 28
        $char = $pdfData[$offset];
586
        // get object type
587 28
        switch ($char) {
588 28
            case '%':  // \x25 PERCENT SIGN
589
                    // skip comment and search for next token
590
                    $next = strcspn($pdfData, "\r\n", $offset);
591
                    if ($next > 0) {
592
                        $offset += $next;
593
594
                        return $this->getRawObject($pdfData, $offset);
595
                    }
596
                    break;
597
598 28
            case '/':  // \x2F SOLIDUS
599
                    // name object
600 28
                    $objtype = $char;
601 28
                    ++$offset;
602 28
                    $pregResult = preg_match(
603 28
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
604 28
                        substr($pdfData, $offset, 256),
605
                        $matches
606
                    );
607 28
                    if (1 == $pregResult) {
608 28
                        $objval = $matches[1]; // unescaped value
609 28
                        $offset += \strlen($objval);
610
                    }
611 28
                    break;
612
613 28
            case '(':   // \x28 LEFT PARENTHESIS
614 28
            case ')':  // \x29 RIGHT PARENTHESIS
615
                    // literal string object
616 25
                    $objtype = $char;
617 25
                    ++$offset;
618 25
                    $strpos = $offset;
619 25
                    if ('(' == $char) {
620 25
                        $open_bracket = 1;
621 25
                        while ($open_bracket > 0) {
622 25
                            if (!isset($pdfData[$strpos])) {
623
                                break;
624
                            }
625 25
                            $ch = $pdfData[$strpos];
626 25
                            switch ($ch) {
627 25
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
628
                                        // skip next character
629 15
                                        ++$strpos;
630 15
                                        break;
631
632 25
                                case '(':  // LEFT PARENHESIS (28h)
633
                                        ++$open_bracket;
634
                                        break;
635
636 25
                                case ')':  // RIGHT PARENTHESIS (29h)
637 25
                                        --$open_bracket;
638 25
                                        break;
639
                            }
640 25
                            ++$strpos;
641
                        }
642 25
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
643 25
                        $offset = $strpos;
644
                    }
645 25
                    break;
646
647 28
            case '[':   // \x5B LEFT SQUARE BRACKET
648 28
            case ']':  // \x5D RIGHT SQUARE BRACKET
649
                // array object
650 27
                $objtype = $char;
651 27
                ++$offset;
652 27
                if ('[' == $char) {
653
                    // get array content
654 27
                    $objval = [];
655
                    do {
656 27
                        $oldOffset = $offset;
657
                        // get element
658 27
                        $element = $this->getRawObject($pdfData, $offset);
659 27
                        $offset = $element[2];
660 27
                        $objval[] = $element;
661 27
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
662
                    // remove closing delimiter
663 27
                    array_pop($objval);
664
                }
665 27
                break;
666
667 28
            case '<':  // \x3C LESS-THAN SIGN
668 28
            case '>':  // \x3E GREATER-THAN SIGN
669 28
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
670
                    // dictionary object
671 28
                    $objtype = $char.$char;
672 28
                    $offset += 2;
673 28
                    if ('<' == $char) {
674
                        // get array content
675 28
                        $objval = [];
676
                        do {
677 28
                            $oldOffset = $offset;
678
                            // get element
679 28
                            $element = $this->getRawObject($pdfData, $offset);
680 28
                            $offset = $element[2];
681 28
                            $objval[] = $element;
682 28
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
683
                        // remove closing delimiter
684 28
                        array_pop($objval);
685
                    }
686
                } else {
687
                    // hexadecimal string object
688 10
                    $objtype = $char;
689 10
                    ++$offset;
690 10
                    $pregResult = preg_match(
691 10
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
692 10
                            substr($pdfData, $offset),
693
                            $matches
694
                        );
695 10
                    if (('<' == $char) && 1 == $pregResult) {
696
                        // remove white space characters
697 10
                        $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
698 10
                        $offset += \strlen($matches[0]);
699
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
700
                        $offset = $endpos + 1;
701
                    }
702
                }
703 28
                    break;
704
705
            default:
706 28
                    if ('endobj' == substr($pdfData, $offset, 6)) {
707
                        // indirect object
708 27
                        $objtype = 'endobj';
709 27
                        $offset += 6;
710 28
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
711
                        // null object
712 3
                        $objtype = 'null';
713 3
                        $offset += 4;
714 3
                        $objval = 'null';
715 28
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
716
                        // boolean true object
717 8
                        $objtype = 'boolean';
718 8
                        $offset += 4;
719 8
                        $objval = 'true';
720 28
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
721
                        // boolean false object
722 2
                        $objtype = 'boolean';
723 2
                        $offset += 5;
724 2
                        $objval = 'false';
725 28
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
726
                        // start stream object
727 27
                        $objtype = 'stream';
728 27
                        $offset += 6;
729 27
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
730 27
                            $offset += \strlen($matches[0]);
731 27
                            $pregResult = preg_match(
732 27
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
733 27
                                substr($pdfData, $offset),
734
                                $matches,
735 27
                                \PREG_OFFSET_CAPTURE
736
                            );
737 27
                            if (1 == $pregResult) {
738 27
                                $objval = substr($pdfData, $offset, $matches[0][1]);
739 27
                                $offset += $matches[1][1];
740
                            }
741
                        }
742 28
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
743
                        // end stream object
744 27
                        $objtype = 'endstream';
745 27
                        $offset += 9;
746 28
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
747
                        // indirect object reference
748 27
                        $objtype = 'objref';
749 27
                        $offset += \strlen($matches[0]);
750 27
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
751 28
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
752
                        // object start
753 6
                        $objtype = 'obj';
754 6
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
755 6
                        $offset += \strlen($matches[0]);
756 28
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
757
                        // numeric object
758 27
                        $objtype = 'numeric';
759 27
                        $objval = substr($pdfData, $offset, $numlen);
760 27
                        $offset += $numlen;
761
                    }
762 28
                    break;
763
        }
764
765 28
        return [$objtype, $objval, $offset];
766
    }
767
768
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the xref section is resolved as follows:
     *  - $offset == 0: use the number given by the LAST "startxref ... %%EOF" marker in the document;
     *  - $offset points directly at the keyword "xref": use $offset (classic xref table);
     *  - an "N G obj" header is found at or after $offset: use $offset (cross-reference stream);
     *  - otherwise: use the number given by the first "startxref ... %%EOF" marker at or after $offset.
     *
     * @param string $pdfData
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 == $offset) {
            // find last startxref
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $allMatches, \PREG_SET_ORDER);
            if (!$pregResult) {
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($allMatches);
            $startxref = (int) $lastMatch[1];
        } elseif ('xref' === substr($pdfData, $offset, 4)) {
            // Already pointing at the xref table.
            // substr() only inspects the 4 bytes at $offset and does not fail when
            // $offset lies beyond the end of the data (the comparison is simply false).
            $startxref = (int) $offset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = (int) $offset;
        } elseif (preg_match($startxrefPattern, $pdfData, $startxrefMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found; a dedicated match array is used here because the
            // "obj" lookup above resets its own match array when it fails.
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position (strict comparison, so a start offset of 0
        // is never mistaken for "keyword found")
        if ('xref' === substr($pdfData, $startxref, 4)) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
834
835
    /**
     * Parses raw PDF data: locates the "%PDF-" header, reads the cross-reference
     * data and decodes every object listed there with a positive offset.
     *
     * @param string $data PDF data to parse
     *
     * @return array two elements: the xref/trailer data and the array of parsed PDF document objects
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData($data)
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // anything in front of the header is discarded
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }
        $pdfData = substr($data, $headerPos);

        // xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // walk the xref entries and decode each object once
        $objects = [];
        foreach ($xref['xref'] as $objKey => $objOffset) {
            if (isset($objects[$objKey]) || !($objOffset > 0)) {
                // already decoded, or no positive offset to read from
                continue;
            }
            $objects[$objKey] = $this->getIndirectObject($pdfData, $xref, $objKey, $objOffset, true);
        }

        return [$xref, $objects];
    }
872
}
873