Passed
Pull Request — master (#405)
by
unknown
08:18
created

RawDataParser::getIndirectObject()   B

Complexity

Conditions 9
Paths 4

Size

Total Lines 53
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 22
CRAP Score 9.0066

Importance

Changes 5
Bugs 3 Features 1
Metric Value
cc 9
eloc 23
c 5
b 3
f 1
nc 4
nop 5
dl 0
loc 53
ccs 22
cts 23
cp 0.9565
crap 9.0066
rs 8.0555

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
     * Configuration array.
     *
     * These are the defaults; the constructor merges the caller-supplied
     * array over them.
     */
    protected $cfg = [
        // if `true` ignore filter decoding errors
        'ignore_filter_decoding_errors' => true,
        // if `true` ignore missing filter decoding errors
        'ignore_missing_filter_decoders' => true,
    ];

    /**
     * Filter helper instantiated in the constructor; decodeStream() asks it
     * for the available filters and uses it to decode stream data.
     *
     * @var FilterHelper
     */
    protected $filterHelper;

    /**
     * Cache of indirect objects already parsed by getObjectVal(), keyed by
     * the object reference.
     */
    protected $objects;
59
60
    /**
     * Set up the filter helper and apply the caller's configuration.
     *
     * @param array $cfg Configuration array, default is []
     */
    public function __construct($cfg = [])
    {
        $this->filterHelper = new FilterHelper();

        // entries of $cfg take precedence over the defaults declared on the property
        $this->cfg = array_merge($this->cfg, $cfg);
    }
70
71
    /**
     * Decode the specified stream.
     *
     * The stream dictionary is scanned for a numeric `/Length` (the raw stream
     * is truncated to it when it is shorter than the data) and for `/Filter`
     * (either a single name or an array of names). Every filter known to the
     * filter helper is applied in order; the names of unknown filters are
     * collected and returned so that the caller can deal with them.
     *
     * @param string $pdfData PDF data
     * @param array  $xref
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the configuration does not allow
     *                   that kind of failure to be ignored
     */
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name entries that are followed by a value are of interest
            if ('/' != $v[0] || !isset($sdic[($k + 1)])) {
                continue;
            }
            $next = $sdic[($k + 1)];

            if (('Length' == $v[1]) && ('numeric' == $next[0])) {
                // get declared stream length
                $declength = (int) ($next[1]);
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // the list of supported filters does not change while decoding: ask for it once
        $availableFilters = $this->filterHelper->getAvailableFilters();

        // decode the stream
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters, true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // Messages starting with '~' are governed by 'ignore_missing_filter_decoders',
                // all others by 'ignore_filter_decoding_errors'. The emptiness check avoids an
                // "uninitialized string offset" warning when the message is empty.
                $startsWithTilde = ('' !== $emsg) && ('~' == $emsg[0]);
                $ignorable = $startsWithTilde
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignorable) {
                    // keep the original exception reachable through getPrevious()
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
138
139
    /**
     * Decode the Cross-Reference section
     *
     * Walks the classic (table based) xref section located at $startxref,
     * records the offset of every in-use object that is not known yet, then
     * reads the trailer dictionary that follows. When the trailer names a
     * previous section (`/Prev`) that section is processed through
     * getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found after the xref entries
     */
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the 'xref' keyword (4 bytes long) ...
        $startxref += 4;
        // ... and over the white space that follows it: NUL, HT, LF, FF, CR, SP
        $offset = $startxref + strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $startxref);

        // number of the object described by the next entry
        $objectNumber = 0;

        // consume cross-reference entries and subsection headers one after another
        while (preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $entry, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($entry[0][1] != $offset) {
                // the match is not contiguous: we are on another section
                break;
            }
            $offset += \strlen($entry[0][0]);

            switch ($entry[3][0]) {
                case 'n':
                    // in-use object, indexed as [object number]_[generation number]
                    $index = $objectNumber.'_'.(int) ($entry[2][0]);
                    if (!isset($xref['xref'][$index])) {
                        // first definition wins: store object offset position
                        $xref['xref'][$index] = (int) ($entry[1][0]);
                    }
                    ++$objectNumber;
                    break;

                case 'f':
                    // free object: nothing to store
                    ++$objectNumber;
                    break;

                default:
                    // subsection header: first number is the starting object number
                    $objectNumber = (int) ($entry[1][0]);
                    break;
            }
        }

        // get trailer data
        if (!(preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailerMatch, \PREG_OFFSET_CAPTURE, $offset) > 0)) {
            throw new Exception('Unable to find trailer');
        }
        $trailerData = $trailerMatch[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $trailer = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
                $trailer['size'] = (int) ($found[1]);
            }

            // indirect references, stored as [object number]_[generation number]
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailerData, $found) > 0) {
                    $trailer[$key] = (int) ($found[1]).'_'.(int) ($found[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $found) > 0) {
                $trailer['id'] = [$found[1], $found[2]];
            }

            $xref['trailer'] = $trailer;
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) ($found[1]), $xref);
        }

        return $xref;
    }
213
214
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the indirect object found at $startxref and inspects its
     * dictionary (`/Type`, `/Index`, `/Prev`, `/W`, `/DecodeParms` and, only
     * while no trailer has been collected yet, the trailer keys). When the
     * dictionary is of type XRef and decoded stream data is available, the
     * PNG predictor (if any) is reversed and every row of the stream is
     * turned into an entry of `$xref['xref']`. A `/Prev` offset is followed
     * through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);
        if (!isset($xref['trailer']) || empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = [];
            $filltrailer = true;
        } else {
            $filltrailer = false;
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }
        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $index_first = null; // first object number of the subsection (from /Index), if given
        $prevxref = null; // offset of the previous xref section (from /Prev), if given
        $sarr = $xrefcrs[0][1];
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        $wb = [];

        foreach ($sarr as $k => $v) {
            if (
                ('/' == $v[0])
                && ('Type' == $v[1])
                && (
                    isset($sarr[($k + 1)])
                    && '/' == $sarr[($k + 1)][0]
                    && 'XRef' == $sarr[($k + 1)][1]
                )
            ) {
                $valid_crs = true;
            } elseif (('/' == $v[0]) && ('Index' == $v[1]) && (isset($sarr[($k + 1)]))) {
                // first object number in the subsection
                $index_first = (int) ($sarr[($k + 1)][1][0][1]);
            } elseif (('/' == $v[0]) && ('Prev' == $v[1]) && (isset($sarr[($k + 1)]) && ('numeric' == $sarr[($k + 1)][0]))) {
                // get previous xref offset
                $prevxref = (int) ($sarr[($k + 1)][1]);
            } elseif (('/' == $v[0]) && ('W' == $v[1]) && (isset($sarr[($k + 1)]))) {
                // number of bytes (in the decoded stream) of the corresponding field
                $wb[0] = (int) ($sarr[($k + 1)][1][0][1]);
                $wb[1] = (int) ($sarr[($k + 1)][1][1][1]);
                $wb[2] = (int) ($sarr[($k + 1)][1][2][1]);
            } elseif (('/' == $v[0]) && ('DecodeParms' == $v[1]) && (isset($sarr[($k + 1)][1]))) {
                $decpar = $sarr[($k + 1)][1];
                foreach ($decpar as $kdc => $vdc) {
                    if (
                        '/' == $vdc[0]
                        && 'Columns' == $vdc[1]
                        && (
                            isset($decpar[($kdc + 1)])
                            && 'numeric' == $decpar[($kdc + 1)][0]
                        )
                    ) {
                        $columns = (int) ($decpar[($kdc + 1)][1]);
                    } elseif (
                        '/' == $vdc[0]
                        && 'Predictor' == $vdc[1]
                        && (
                            isset($decpar[($kdc + 1)])
                            && 'numeric' == $decpar[($kdc + 1)][0]
                        )
                    ) {
                        $predictor = (int) ($decpar[($kdc + 1)][1]);
                    }
                }
            } elseif ($filltrailer) {
                if (('/' == $v[0]) && ('Size' == $v[1]) && (isset($sarr[($k + 1)]) && ('numeric' == $sarr[($k + 1)][0]))) {
                    $xref['trailer']['size'] = $sarr[($k + 1)][1];
                } elseif (('/' == $v[0]) && ('Root' == $v[1]) && (isset($sarr[($k + 1)]) && ('objref' == $sarr[($k + 1)][0]))) {
                    $xref['trailer']['root'] = $sarr[($k + 1)][1];
                } elseif (('/' == $v[0]) && ('Info' == $v[1]) && (isset($sarr[($k + 1)]) && ('objref' == $sarr[($k + 1)][0]))) {
                    $xref['trailer']['info'] = $sarr[($k + 1)][1];
                } elseif (('/' == $v[0]) && ('Encrypt' == $v[1]) && (isset($sarr[($k + 1)]) && ('objref' == $sarr[($k + 1)][0]))) {
                    $xref['trailer']['encrypt'] = $sarr[($k + 1)][1];
                } elseif (('/' == $v[0]) && ('ID' == $v[1]) && (isset($sarr[($k + 1)]))) {
                    $xref['trailer']['id'] = [];
                    $xref['trailer']['id'][0] = $sarr[($k + 1)][1][0][1];
                    $xref['trailer']['id'][1] = $sarr[($k + 1)][1][1][1];
                }
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            if (null !== $predictor) {
                // number of bytes in a row: one predictor tag byte plus the data columns
                $rowlen = ($columns + 1);
                // convert the stream into an array of integers
                $sdata = unpack('C*', $xrefcrs[1][3][0]);
                // split the rows
                $sdata = array_chunk($sdata, $rowlen);

                // initialize decoded array
                $ddata = [];
                // initialize first row with zeros
                $prev_row = array_fill(0, $rowlen, 0);
                // for each row apply PNG unpredictor
                foreach ($sdata as $k => $row) {
                    // initialize new row
                    $ddata[$k] = [];
                    // get PNG predictor value (the first byte of every row is the filter type)
                    $predictor = (10 + $row[0]);
                    // for each byte on the row
                    for ($i = 1; $i <= $columns; ++$i) {
                        // new index
                        $j = ($i - 1);
                        $row_up = $prev_row[$j];
                        if (1 == $i) {
                            $row_left = 0;
                            $row_upleft = 0;
                        } else {
                            // PNG filters are defined on reconstructed bytes, so the left
                            // neighbour is the byte decoded in the previous iteration,
                            // not the still-encoded byte of the input row
                            $row_left = $ddata[$k][($j - 1)];
                            $row_upleft = $prev_row[($j - 1)];
                        }
                        switch ($predictor) {
                            case 10:  // PNG prediction (on encoding, PNG None on all rows)
                                $ddata[$k][$j] = $row[$i];
                                break;

                            case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                                $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff);
                                break;

                            case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                                $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff);
                                break;

                            case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                                // integer floor of the mean; a plain "/ 2" would feed a float
                                // into the bitwise AND
                                $ddata[$k][$j] = (($row[$i] + (($row_left + $row_up) >> 1)) & 0xff);
                                break;

                            case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                                // initial estimate
                                $p = ($row_left + $row_up - $row_upleft);
                                // distances
                                $pa = abs($p - $row_left);
                                $pb = abs($p - $row_up);
                                $pc = abs($p - $row_upleft);
                                $pmin = min($pa, $pb, $pc);
                                // pick the neighbour with the minimum distance (ties: left, up, upper-left)
                                switch ($pmin) {
                                    case $pa:
                                        $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff);
                                        break;

                                    case $pb:
                                        $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff);
                                        break;

                                    case $pc:
                                        $ddata[$k][$j] = (($row[$i] + $row_upleft) & 0xff);
                                        break;
                                }
                                break;

                            default:  // PNG prediction (on encoding, PNG optimum)
                                throw new Exception('Unknown PNG predictor: '.$predictor);
                        }
                    }
                    $prev_row = $ddata[$k];
                } // end for each row
                // complete decoding
            } else {
                // number of bytes in a row; $wb only holds integers, the cast makes that explicit
                $rowlen = (int) array_sum($wb);
                // convert the stream into an array of integers
                $sdata = unpack('C*', $xrefcrs[1][3][0]);
                // split the rows
                $ddata = array_chunk($sdata, $rowlen);
            }

            $sdata = [];

            // for every row
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column (fields are big-endian)
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = isset($index_first) ? $index_first : 0;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data

        if (isset($prevxref)) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
465
466
    /**
     * Get content of indirect object.
     *
     * Checks that the header "[object number] [generation number] obj" is
     * present at $offset (after skipping white space and leading zeros), then
     * reads raw objects one after another until `endobj` is reached or the
     * parser stops advancing. A stream that directly follows a dictionary is
     * decoded with that dictionary when $decoding is true; the result of
     * decodeStream() is stored at index 3 of the stream element.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data; `['null', 'null', $offset]` when
     *               the expected object header is not found at the offset
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        /*
         * build indirect object header
         */
        // $objHeader = "[object number] [generation number] obj"
        $objRefArr = explode('_', $objRef);
        if (2 !== \count($objRefArr)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $objHeader = $objRefArr[0].' '.$objRefArr[1].' obj';

        /*
         * check if we are in position
         */
        // ignore whitespace characters at offset (NUL, HT, LF, FF, CR, SP)
        $offset += strspn($pdfData, "\0\t\n\f\r ", $offset);
        // ignore leading zeros for object number
        $offset += strspn($pdfData, '0', $offset);

        // Newer pdf may use \n instead of whitespace.
        // Both numbers originate from the PDF data, so they are quoted before being
        // placed into the pattern; the match is anchored at the start of the header.
        $objHeaderPattern = '/^'.preg_quote($objRefArr[0], '/').'[ \n]'.preg_quote($objRefArr[1], '/').'[ \n]obj/';
        if (1 !== preg_match($objHeaderPattern, substr($pdfData, $offset, \strlen($objHeader)))) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        /*
         * get content
         */
        // starting position of object content
        $offset += \strlen($objHeader);
        $objContentArr = [];
        $i = 0; // object main index
        do {
            $oldOffset = $offset;
            // get element
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];
            // decode stream using stream's dictionary information
            if ($decoding && ('stream' === $element[0]) && (isset($objContentArr[($i - 1)][0])) && ('<<' === $objContentArr[($i - 1)][0])) {
                $element[3] = $this->decodeStream($pdfData, $xref, $objContentArr[($i - 1)][1], $element[1]);
            }
            $objContentArr[$i] = $element;
            ++$i;
            // stop at the closing keyword, or when no progress is made (guards against endless loops)
        } while (('endobj' !== $element[0]) && ($offset !== $oldOffset));
        // remove closing delimiter
        array_pop($objContentArr);

        /*
         * return raw object content
         */
        return $objContentArr;
    }
533
534
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Objects that are not of type 'objref' are returned unchanged. A
     * reference is answered from the cache of parsed objects when possible;
     * otherwise, if an entry for it exists in $xref, the object is parsed
     * (without stream decoding), cached and returned. A reference that cannot
     * be resolved is returned as is.
     *
     * @param string $pdfData PDF data
     * @param array  $xref
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws Exception
     */
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        // anything that is not an indirect reference already is the value
        if ('objref' != $obj[0]) {
            return $obj;
        }

        $reference = $obj[1];

        if (isset($this->objects[$reference])) {
            // this object has been already parsed
            return $this->objects[$reference];
        }

        // NOTE(review): decodeXref() and decodeXrefStream() store object offsets under
        // $xref['xref'][...], while the lookup below reads $xref[...] directly - confirm
        // which array shape the callers are expected to pass.
        if (isset($xref[$reference])) {
            // parse new object and remember it for later lookups
            $parsed = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);
            $this->objects[$reference] = $parsed;

            return $this->objects[$reference];
        }

        // unresolved reference: hand it back untouched
        return $obj;
    }
561
562
    /**
563
     * Get object type, raw value and offset to next object
564
     *
565
     * @param int $offset Object offset
566
     *
567
     * @return array containing object type, raw value and offset to next object
568
     */
569 28
    protected function getRawObject($pdfData, $offset = 0)
570
    {
571 28
        $objtype = ''; // object type to be returned
572 28
        $objval = ''; // object value to be returned
573
574
        /*
575
         * skip initial white space chars:
576
         *      \x00 null (NUL)
577
         *      \x09 horizontal tab (HT)
578
         *      \x0A line feed (LF)
579
         *      \x0C form feed (FF)
580
         *      \x0D carriage return (CR)
581
         *      \x20 space (SP)
582
         */
583 28
        $offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
584
585
        // get first char
586 28
        $char = $pdfData[$offset];
587
        // get object type
588 28
        switch ($char) {
589 28
            case '%':  // \x25 PERCENT SIGN
590
                    // skip comment and search for next token
591
                    $next = strcspn($pdfData, "\r\n", $offset);
592
                    if ($next > 0) {
593
                        $offset += $next;
594
595
                        return $this->getRawObject($pdfData, $offset);
596
                    }
597
                    break;
598
599 28
            case '/':  // \x2F SOLIDUS
600
                    // name object
601 28
                    $objtype = $char;
602 28
                    ++$offset;
603 28
                    $pregResult = preg_match(
604 28
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
605 28
                        substr($pdfData, $offset, 256),
606
                        $matches
607
                    );
608 28
                    if (1 == $pregResult) {
609 28
                        $objval = $matches[1]; // unescaped value
610 28
                        $offset += \strlen($objval);
611
                    }
612 28
                    break;
613
614 28
            case '(':   // \x28 LEFT PARENTHESIS
615 28
            case ')':  // \x29 RIGHT PARENTHESIS
616
                    // literal string object
617 25
                    $objtype = $char;
618 25
                    ++$offset;
619 25
                    $strpos = $offset;
620 25
                    if ('(' == $char) {
621 25
                        $open_bracket = 1;
622 25
                        while ($open_bracket > 0) {
623 25
                            if (!isset($pdfData[$strpos])) {
624
                                break;
625
                            }
626 25
                            $ch = $pdfData[$strpos];
627 25
                            switch ($ch) {
628 25
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
629
                                        // skip next character
630 15
                                        ++$strpos;
631 15
                                        break;
632
633 25
                                case '(':  // LEFT PARENHESIS (28h)
634
                                        ++$open_bracket;
635
                                        break;
636
637 25
                                case ')':  // RIGHT PARENTHESIS (29h)
638 25
                                        --$open_bracket;
639 25
                                        break;
640
                            }
641 25
                            ++$strpos;
642
                        }
643 25
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
644 25
                        $offset = $strpos;
645
                    }
646 25
                    break;
647
648 28
            case '[':   // \x5B LEFT SQUARE BRACKET
649 28
            case ']':  // \x5D RIGHT SQUARE BRACKET
650
                // array object
651 27
                $objtype = $char;
652 27
                ++$offset;
653 27
                if ('[' == $char) {
654
                    // get array content
655 27
                    $objval = [];
656
                    do {
657 27
                        $oldOffset = $offset;
658
                        // get element
659 27
                        $element = $this->getRawObject($pdfData, $offset);
660 27
                        $offset = $element[2];
661 27
                        $objval[] = $element;
662 27
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
663
                    // remove closing delimiter
664 27
                    array_pop($objval);
665
                }
666 27
                break;
667
668 28
            case '<':  // \x3C LESS-THAN SIGN
669 28
            case '>':  // \x3E GREATER-THAN SIGN
670 28
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
671
                    // dictionary object
672 28
                    $objtype = $char.$char;
673 28
                    $offset += 2;
674 28
                    if ('<' == $char) {
675
                        // get array content
676 28
                        $objval = [];
677
                        do {
678 28
                            $oldOffset = $offset;
679
                            // get element
680 28
                            $element = $this->getRawObject($pdfData, $offset);
681 28
                            $offset = $element[2];
682 28
                            $objval[] = $element;
683 28
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
684
                        // remove closing delimiter
685 28
                        array_pop($objval);
686
                    }
687
                } else {
688
                    // hexadecimal string object
689 10
                    $objtype = $char;
690 10
                    ++$offset;
691 10
                    $pregResult = preg_match(
692 10
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
693 10
                            substr($pdfData, $offset),
694
                            $matches
695
                        );
696 10
                    if (('<' == $char) && 1 == $pregResult) {
697
                        // remove white space characters
698 10
                        $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
699 10
                        $offset += \strlen($matches[0]);
700
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
701
                        $offset = $endpos + 1;
702
                    }
703
                }
704 28
                    break;
705
706
            default:
707 28
                    if ('endobj' == substr($pdfData, $offset, 6)) {
708
                        // indirect object
709 27
                        $objtype = 'endobj';
710 27
                        $offset += 6;
711 28
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
712
                        // null object
713 3
                        $objtype = 'null';
714 3
                        $offset += 4;
715 3
                        $objval = 'null';
716 28
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
717
                        // boolean true object
718 8
                        $objtype = 'boolean';
719 8
                        $offset += 4;
720 8
                        $objval = 'true';
721 28
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
722
                        // boolean false object
723 2
                        $objtype = 'boolean';
724 2
                        $offset += 5;
725 2
                        $objval = 'false';
726 28
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
727
                        // start stream object
728 27
                        $objtype = 'stream';
729 27
                        $offset += 6;
730 27
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
731 27
                            $offset += \strlen($matches[0]);
732 27
                            $pregResult = preg_match(
733 27
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
734 27
                                substr($pdfData, $offset),
735
                                $matches,
736 27
                                \PREG_OFFSET_CAPTURE
737
                            );
738 27
                            if (1 == $pregResult) {
739 27
                                $objval = substr($pdfData, $offset, $matches[0][1]);
740 27
                                $offset += $matches[1][1];
741
                            }
742
                        }
743 28
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
744
                        // end stream object
745 27
                        $objtype = 'endstream';
746 27
                        $offset += 9;
747 28
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
748
                        // indirect object reference
749 27
                        $objtype = 'objref';
750 27
                        $offset += \strlen($matches[0]);
751 27
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
752 28
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
753
                        // object start
754 6
                        $objtype = 'obj';
755 6
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
756 6
                        $offset += \strlen($matches[0]);
757 28
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
758
                        // numeric object
759 27
                        $objtype = 'numeric';
760 27
                        $objval = substr($pdfData, $offset, $numlen);
761 27
                        $offset += $numlen;
762
                    }
763 28
                    break;
764
        }
765
766 28
        return [$objtype, $objval, $offset];
767
    }
768
769
    /**
770
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
771
     *
772
     * @param string $pdfData
773
     * @param int    $offset  xref offset (if known)
774
     * @param array  $xref    previous xref array (if any)
775
     *
776
     * @return array containing xref and trailer data
777
     *
778
     * @throws Exception if it was unable to find startxref
779
     * @throws Exception if it was unable to find xref
780
     */
781 28
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        // Resolution strategy for the position of the cross-reference section:
        //   1. no offset given    -> take the offset announced by the LAST
        //                            "startxref ... %%EOF" trailer in the data;
        //   2. offset on "xref"   -> it already points at a classic xref table;
        //   3. "N N obj" follows  -> treat the offset as a cross-reference stream;
        //   4. otherwise          -> fall back to the next startxref trailer
        //                            found after the given offset.

        // Group 1 captures the byte offset that follows the startxref keyword.
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 == $offset) {
            // find last startxref
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $trailerMatches, \PREG_SET_ORDER);
            if (!$pregResult) {
                // covers both "no match" (0) and a PCRE failure (false)
                throw new Exception('Unable to find startxref');
            }
            $lastTrailer = end($trailerMatches);
            $startxref = $lastTrailer[1];
        } elseif ($offset > 0 && 'xref' === substr($pdfData, $offset, 4)) {
            // Already pointing at the xref table. Only the four bytes at
            // $offset are inspected: no scan of the remaining document, and an
            // offset beyond the end of the data simply does not match instead
            // of raising a ValueError from strpos() on PHP 8.
            $startxref = $offset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, 0, $offset)) {
            // Cross-Reference Stream object
            // ($objMatches is intentionally a separate variable so a failed
            // match here cannot wipe the startxref capture used below.)
            $startxref = $offset;
        } elseif (preg_match($startxrefPattern, $pdfData, $trailerMatch, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found; with PREG_OFFSET_CAPTURE each group is
            // [text, position], so [1][0] is the captured offset text.
            $startxref = $trailerMatch[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // The offset may still be a numeric string taken from a regex capture;
        // normalise it once the range check above has passed.
        $startxref = (int) $startxref;

        // check xref position
        if ($startxref >= 0 && 'xref' === substr($pdfData, $startxref, 4)) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
835
836
    /**
837
     * Parses PDF data and returns extracted data as array.
838
     *
839
     * @param string $data PDF data to parse
840
     *
841
     * @return array array of parsed PDF document objects
842
     *
843
     * @throws Exception if empty PDF data given
844
     * @throws Exception if PDF data missing %PDF header
845
     */
846 28
    public function parseData($data)
    {
        // Reject missing/empty input before doing any work.
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // Locate the "%PDF-" marker; anything in front of it is discarded.
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }
        $pdfData = substr($data, $headerPos);

        // Resolve the cross-reference table and trailer data.
        $xref = $this->getXrefData($pdfData);

        // Decode every referenced object that has a positive offset and has
        // not been decoded yet.
        $objects = [];
        foreach ($xref['xref'] as $obj => $offset) {
            if (isset($objects[$obj])) {
                continue;
            }
            if ($offset > 0) {
                $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true);
            }
        }

        return [$xref, $objects];
    }
873
}
874