Passed
Pull Request — master (#405)
by
unknown
03:27 queued 01:06
created

RawDataParser::getIndirectObject()   B

Complexity

Conditions 9
Paths 4

Size

Total Lines 52
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 22
CRAP Score 9.0066

Importance

Changes 6
Bugs 3 Features 1
Metric Value
cc 9
eloc 23
c 6
b 3
f 1
nc 4
nop 5
dl 0
loc 52
ccs 22
cts 23
cp 0.9565
crap 9.0066
rs 8.0555

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
48
     * Configuration array.
49
     */
50
    protected $cfg = [
51
        // if `true` ignore filter decoding errors
52
        'ignore_filter_decoding_errors' => true,
53
        // if `true` ignore missing filter decoding errors
54
        'ignore_missing_filter_decoders' => true,
55
    ];
56
57
    protected $filterHelper;
58
    protected $objects;
59
60
    /**
61
     * @param array $cfg Configuration array, default is []
62
     */
63 31
    public function __construct($cfg = [])
    {
        // helper used by decodeStream() to run the individual stream filters
        $this->filterHelper = new FilterHelper();

        // start from the built-in defaults and let the given options override them
        $defaults = $this->cfg;
        $this->cfg = array_merge($defaults, $cfg);
    }
70
71
    /**
72
     * Decode the specified stream.
73
     *
74
     * @param string $pdfData PDF data
75
     * @param array  $xref
76
     * @param array  $sdic    Stream's dictionary array
77
     * @param string $stream  Stream to decode
78
     *
79
     * @return array containing decoded stream data and remaining filters
80
     */
81 27
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // an empty stream has nothing to decode and no filters left over
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        // The dictionary is a flat list of elements in which a name element
        // ('/') is followed by its value element. Collect the declared length
        // and the names of the filters to apply.
        $filters = [];
        foreach ($sdic as $k => $v) {
            // both entries of interest need a name element followed by a value
            if ('/' !== $v[0] || !isset($sdic[($k + 1)])) {
                continue;
            }
            $next = $sdic[($k + 1)];

            if ('Length' === $v[1]) {
                if ('numeric' === $next[0]) {
                    // Cut the stream down to the declared length. A negative
                    // value is ignored: substr() would otherwise interpret it
                    // as "drop that many bytes from the end".
                    $declength = (int) ($next[1]);
                    if ($declength >= 0 && $declength < $slength) {
                        $stream = substr($stream, 0, $declength);
                        $slength = $declength;
                    }
                }
            } elseif ('Filter' === $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                if ('/' === $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' === $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' === $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream, applying the filters in the order they were listed
        $remaining_filters = [];
        $availableFilters = $this->filterHelper->getAvailableFilters();
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters, true)) {
                // no decoder for this filter: report it back to the caller
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with '~' is handled as "missing decoder",
                // everything else (including an empty message) as a decoding error
                $missingDecoder = isset($emsg[0]) && '~' === $emsg[0];
                $ignore = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // keep the original exception reachable through getPrevious()
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
138
139
    /**
140
     * Decode the Cross-Reference section
141
     *
142
     * @param string $pdfData   PDF data
143
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
144
     * @param array  $xref      Previous xref array (if any)
145
     *
146
     * @return array containing xref and trailer data
147
     */
148 22
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the 'xref' keyword (4 bytes) ...
        $startxref += 4;
        // ... and over the white space behind it (NUL, HT, LF, FF, CR, SP)
        $cursor = $startxref + strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $startxref);

        // object number the next table entry belongs to
        $objectNumber = 0;

        // consume table lines for as long as they follow each other without a gap
        while (preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $line, \PREG_OFFSET_CAPTURE, $cursor) > 0) {
            if ($line[0][1] != $cursor) {
                // the next match starts further ahead, so this table ends here
                break;
            }
            $cursor += \strlen($line[0][0]);

            $kind = $line[3][0];
            if ('n' != $kind && 'f' != $kind) {
                // neither 'n' nor 'f': the first number is the new object number
                $objectNumber = (int) ($line[1][0]);
                continue;
            }

            if ('n' == $kind) {
                // unique object index: [object number]_[generation number]
                $key = $objectNumber.'_'.(int) ($line[2][0]);
                // an offset that is already known is never overwritten
                if (!isset($xref['xref'][$key])) {
                    $xref['xref'][$key] = (int) ($line[1][0]);
                }
            }

            // both 'n' and 'f' entries use up one object number
            ++$objectNumber;
        }

        // the trailer dictionary has to follow the table
        if (1 !== preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $found, \PREG_OFFSET_CAPTURE, $cursor)) {
            throw new Exception('Unable to find trailer');
        }
        $trailerData = $found[1][0];

        if (empty($xref['trailer'])) {
            // no trailer stored so far: take the values from this one
            $trailer = [];
            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $hit) > 0) {
                $trailer['size'] = (int) ($hit[1]);
            }
            if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $hit) > 0) {
                $trailer['root'] = (int) ($hit[1]).'_'.(int) ($hit[2]);
            }
            if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $hit) > 0) {
                $trailer['encrypt'] = (int) ($hit[1]).'_'.(int) ($hit[2]);
            }
            if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $hit) > 0) {
                $trailer['info'] = (int) ($hit[1]).'_'.(int) ($hit[2]);
            }
            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $hit) > 0) {
                $trailer['id'] = [$hit[1], $hit[2]];
            }
            $xref['trailer'] = $trailer;
        }

        // follow the link to the previous cross-reference section, if any
        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $hit) > 0) {
            $xref = $this->getXrefData($pdfData, (int) ($hit[1]), $xref);
        }

        return $xref;
    }
213
214
    /**
215
     * Decode the Cross-Reference Stream section
216
     *
217
     * @param string $pdfData   PDF data
218
     * @param int    $startxref Offset at which the xref section starts
219
     * @param array  $xref      Previous xref array (if any)
220
     *
221
     * @return array containing xref and trailer data
222
     *
223
     * @throws Exception if unknown PNG predictor detected
224
     */
225 6
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // read the object found at $startxref; its stream gets decoded on the way
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // the trailer is only filled when none has been stored yet
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false; // true once "/Type /XRef" has been seen
        $columns = 1;       // /Columns of /DecodeParms (the PDF default is 1)
        $predictor = null;  // /Predictor of /DecodeParms, null if absent
        $index_first = 0;   // first object number taken from /Index, 0 if absent
        $prevxref = null;   // offset of the previous xref section (/Prev)
        $wb = [];           // byte widths of the three entry fields (/W)

        // stream dictionary as a flat list of [type, value, offset] elements
        $sarr = [];
        if (isset($xrefcrs[0][1]) && \is_array($xrefcrs[0][1])) {
            $sarr = $xrefcrs[0][1];
        }

        foreach ($sarr as $k => $v) {
            // every entry handled below is keyed by a name element
            if ('/' != $v[0]) {
                continue;
            }
            $hasNext = isset($sarr[($k + 1)]);
            $next = $hasNext ? $sarr[($k + 1)] : null;

            switch ($v[1]) {
                case 'Type':
                    if ($hasNext && '/' == $next[0] && 'XRef' == $next[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // NOTE(review): only the first element of /Index is read, so
                    // a stream with several subsections is numbered as one run.
                    if ($hasNext) {
                        $index_first = (int) ($next[1][0][1]);
                    }
                    break;

                case 'Prev':
                    if ($hasNext && 'numeric' == $next[0]) {
                        $prevxref = (int) ($next[1]);
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of each field
                    if ($hasNext) {
                        $wb[0] = (int) ($next[1][0][1]);
                        $wb[1] = (int) ($next[1][1][1]);
                        $wb[2] = (int) ($next[1][2][1]);
                    }
                    break;

                case 'DecodeParms':
                    if (isset($next[1]) && \is_array($next[1])) {
                        $decpar = $next[1];
                        foreach ($decpar as $kdc => $vdc) {
                            if ('/' != $vdc[0]
                                || !isset($decpar[($kdc + 1)])
                                || 'numeric' != $decpar[($kdc + 1)][0]
                            ) {
                                continue;
                            }
                            if ('Columns' == $vdc[1]) {
                                $columns = (int) ($decpar[($kdc + 1)][1]);
                            } elseif ('Predictor' == $vdc[1]) {
                                $predictor = (int) ($decpar[($kdc + 1)][1]);
                            }
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && $hasNext && 'numeric' == $next[0]) {
                        $xref['trailer']['size'] = $next[1];
                    }
                    break;

                case 'Root':
                case 'Info':
                case 'Encrypt':
                    // stored under the lower-case key: root, info, encrypt
                    if ($filltrailer && $hasNext && 'objref' == $next[0]) {
                        $xref['trailer'][strtolower($v[1])] = $next[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer && $hasNext) {
                        $xref['trailer']['id'] = [];
                        $xref['trailer']['id'][0] = $next[1][0][1];
                        $xref['trailer']['id'][1] = $next[1][1][1];
                    }
                    break;
            }
        }

        // Decode data. Without usable field widths (/W) the rows cannot be split
        // into fields, so such a stream is skipped instead of being chunked with
        // a row length of 0.
        $entryLength = array_sum($wb);
        if ($valid_crs && $entryLength > 0 && isset($xrefcrs[1][3][0])) {
            // convert the decoded stream into a list of byte values
            $bytes = unpack('C*', $xrefcrs[1][3][0]);

            if (null !== $predictor && $predictor >= 10) {
                // PNG prediction: every row is prefixed with one filter-type byte.
                // Predictor values below 10 carry no such byte and are handled by
                // the plain branch below (TIFF prediction, value 2, is not
                // implemented here).
                $rowlen = ($columns + 1);
                $rows = array_chunk($bytes, $rowlen);

                $ddata = [];
                // the row "above" the first row consists of zeros
                $prev_row = array_fill(0, $rowlen, 0);
                foreach ($rows as $k => $row) {
                    $ddata[$k] = [];
                    // filter type of this row, mapped to the 10..14 predictor values
                    $rowPredictor = (10 + $row[0]);
                    for ($i = 1; $i <= $columns; ++$i) {
                        // $i indexes the filtered row, $j the reconstructed row
                        $j = ($i - 1);
                        $row_up = $prev_row[$j];
                        if (1 == $i) {
                            $row_left = 0;
                            $row_upleft = 0;
                        } else {
                            // the left neighbour is the byte that has already
                            // been reconstructed, not the filtered input byte
                            $row_left = $ddata[$k][($j - 1)];
                            $row_upleft = $prev_row[($j - 1)];
                        }
                        switch ($rowPredictor) {
                            case 10:  // PNG None
                                $ddata[$k][$j] = $row[$i];
                                break;

                            case 11:  // PNG Sub
                                $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff);
                                break;

                            case 12:  // PNG Up
                                $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff);
                                break;

                            case 13:  // PNG Average (integer floor of the mean)
                                $ddata[$k][$j] = (($row[$i] + (($row_left + $row_up) >> 1)) & 0xff);
                                break;

                            case 14:  // PNG Paeth
                                // initial estimate and its distance to each neighbour
                                $p = ($row_left + $row_up - $row_upleft);
                                $pa = abs($p - $row_left);
                                $pb = abs($p - $row_up);
                                $pc = abs($p - $row_upleft);
                                // nearest neighbour wins; ties go left, up, upper-left
                                if ($pa <= $pb && $pa <= $pc) {
                                    $nearest = $row_left;
                                } elseif ($pb <= $pc) {
                                    $nearest = $row_up;
                                } else {
                                    $nearest = $row_upleft;
                                }
                                $ddata[$k][$j] = (($row[$i] + $nearest) & 0xff);
                                break;

                            default:
                                throw new Exception('Unknown PNG predictor: '.$rowPredictor);
                        }
                    }
                    $prev_row = $ddata[$k];
                }
            } else {
                // no row prefix: a row is exactly one entry of array_sum(/W) bytes
                $ddata = array_chunk($bytes, $entryLength);
            }

            // combine the bytes of every row into the three big-endian fields
            $sdata = [];
            foreach ($ddata as $k => $row) {
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // a type field of width 0 defaults to type 1
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                for ($c = 0; $c < 3; ++$c) {
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = $index_first;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 1:  // (n) objects that are in use but are not compressed
                        // unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // an offset that is already known is never overwritten
                        if (!isset($xref['xref'][$index])) {
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    // type 0 (free objects) and unknown types add nothing
                }
                ++$obj_num;
            }
        }

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
465
466
    /**
467
     * Get content of indirect object.
468
     *
469
     * @param string $pdfData  PDF data
470
     * @param array  $xref
471
     * @param string $objRef   Object number and generation number separated by underscore character
472
     * @param int    $offset   Object offset
473
     * @param bool   $decoding If true decode streams
474
     *
475
     * @return array containing object data
476
     *
477
     * @throws Exception if invalid object reference found
478
     */
479 27
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        /*
         * build indirect object header
         */
        // $objRef is "[object number]_[generation number]"
        $objRefArr = explode('_', $objRef);
        if (2 !== \count($objRefArr)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        // $objHeader = "[object number] [generation number] obj"
        $objHeader = $objRefArr[0].' '.$objRefArr[1].' obj';

        /*
         * check if we are in position
         */
        // ignore whitespace characters at offset (NUL, HT, LF, FF, CR, SP)
        $offset += strspn($pdfData, "\0\t\n\f\r ", $offset);
        // ignore leading zeros for object number
        $offset += strspn($pdfData, '0', $offset);
        // Newer pdf may use \n instead of whitespace. Both numbers are quoted so
        // that no character of the reference is interpreted as regex syntax.
        $objHeaderPattern = '/'.preg_quote($objRefArr[0], '/').'[ \n]'.preg_quote($objRefArr[1], '/').'[ \n]obj/';
        if (1 !== preg_match($objHeaderPattern, substr($pdfData, $offset, \strlen($objHeader)))) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        /*
         * get content
         */
        // starting position of object content
        $offset += \strlen($objHeader);
        $objContentArr = [];
        $i = 0; // object main index
        do {
            // remembered to stop the loop if the parser makes no progress
            $oldOffset = $offset;
            // get element
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];
            // a stream directly preceded by a dictionary is decoded with the
            // information of that dictionary; the result is stored as element 3
            if ($decoding && ('stream' === $element[0]) && (isset($objContentArr[($i - 1)][0])) && ('<<' === $objContentArr[($i - 1)][0])) {
                $element[3] = $this->decodeStream($pdfData, $xref, $objContentArr[($i - 1)][1], $element[1]);
            }
            $objContentArr[$i] = $element;
            ++$i;
        } while (('endobj' !== $element[0]) && ($offset !== $oldOffset));
        // remove closing delimiter
        array_pop($objContentArr);

        /*
         * return raw object content
         */
        return $objContentArr;
    }
532
533
    /**
534
     * Get the content of object, resolving indirect object reference if necessary.
535
     *
536
     * @param string $pdfData PDF data
537
     * @param array  $obj     Object value
538
     *
539
     * @return array containing object data
540
     *
541
     * @throws Exception
542
     */
543 27
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        // anything that is not an object reference is already the value
        if ('objref' != $obj[0]) {
            return $obj;
        }

        $ref = $obj[1];

        // served from the cache if this reference was parsed before
        if (isset($this->objects[$ref])) {
            return $this->objects[$ref];
        }

        // NOTE(review): decodeXref() and decodeXrefStream() store offsets under
        // $xref['xref'][...], while this lookup reads $xref[...] directly.
        // Confirm which array callers pass in here.
        if (!isset($xref[$ref])) {
            // unknown reference: hand the reference element back unchanged
            return $obj;
        }

        // parse the object (without decoding its streams) and cache the result
        $this->objects[$ref] = $this->getIndirectObject($pdfData, $xref, $ref, $xref[$ref], false);

        return $this->objects[$ref];
    }
560
561
    /**
562
     * Get object type, raw value and offset to next object
563
     *
564
     * @param int $offset Object offset
565
     *
566
     * @return array containing object type, raw value and offset to next object
567
     */
568 28
    protected function getRawObject($pdfData, $offset = 0)
569
    {
570 28
        $objtype = ''; // object type to be returned
571 28
        $objval = ''; // object value to be returned
572
573
        /*
574
         * skip initial white space chars:
575
         *      \x00 null (NUL)
576
         *      \x09 horizontal tab (HT)
577
         *      \x0A line feed (LF)
578
         *      \x0C form feed (FF)
579
         *      \x0D carriage return (CR)
580
         *      \x20 space (SP)
581
         */
582 28
        $offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
583
584
        // get first char
585 28
        $char = $pdfData[$offset];
586
        // get object type
587 28
        switch ($char) {
588 28
            case '%':  // \x25 PERCENT SIGN
589
                    // skip comment and search for next token
590
                    $next = strcspn($pdfData, "\r\n", $offset);
591
                    if ($next > 0) {
592
                        $offset += $next;
593
594
                        return $this->getRawObject($pdfData, $offset);
595
                    }
596
                    break;
597
598 28
            case '/':  // \x2F SOLIDUS
599
                    // name object
600 28
                    $objtype = $char;
601 28
                    ++$offset;
602 28
                    $pregResult = preg_match(
603 28
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
604 28
                        substr($pdfData, $offset, 256),
605
                        $matches
606
                    );
607 28
                    if (1 == $pregResult) {
608 28
                        $objval = $matches[1]; // unescaped value
609 28
                        $offset += \strlen($objval);
610
                    }
611 28
                    break;
612
613 28
            case '(':   // \x28 LEFT PARENTHESIS
614 28
            case ')':  // \x29 RIGHT PARENTHESIS
615
                    // literal string object
616 25
                    $objtype = $char;
617 25
                    ++$offset;
618 25
                    $strpos = $offset;
619 25
                    if ('(' == $char) {
620 25
                        $open_bracket = 1;
621 25
                        while ($open_bracket > 0) {
622 25
                            if (!isset($pdfData[$strpos])) {
623
                                break;
624
                            }
625 25
                            $ch = $pdfData[$strpos];
626 25
                            switch ($ch) {
627 25
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
628
                                        // skip next character
629 15
                                        ++$strpos;
630 15
                                        break;
631
632 25
                                case '(':  // LEFT PARENHESIS (28h)
633
                                        ++$open_bracket;
634
                                        break;
635
636 25
                                case ')':  // RIGHT PARENTHESIS (29h)
637 25
                                        --$open_bracket;
638 25
                                        break;
639
                            }
640 25
                            ++$strpos;
641
                        }
642 25
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
643 25
                        $offset = $strpos;
644
                    }
645 25
                    break;
646
647 28
            case '[':   // \x5B LEFT SQUARE BRACKET
648 28
            case ']':  // \x5D RIGHT SQUARE BRACKET
649
                // array object
650 27
                $objtype = $char;
651 27
                ++$offset;
652 27
                if ('[' == $char) {
653
                    // get array content
654 27
                    $objval = [];
655
                    do {
656 27
                        $oldOffset = $offset;
657
                        // get element
658 27
                        $element = $this->getRawObject($pdfData, $offset);
659 27
                        $offset = $element[2];
660 27
                        $objval[] = $element;
661 27
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
662
                    // remove closing delimiter
663 27
                    array_pop($objval);
664
                }
665 27
                break;
666
667 28
            case '<':  // \x3C LESS-THAN SIGN
668 28
            case '>':  // \x3E GREATER-THAN SIGN
669 28
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
670
                    // dictionary object
671 28
                    $objtype = $char.$char;
672 28
                    $offset += 2;
673 28
                    if ('<' == $char) {
674
                        // get array content
675 28
                        $objval = [];
676
                        do {
677 28
                            $oldOffset = $offset;
678
                            // get element
679 28
                            $element = $this->getRawObject($pdfData, $offset);
680 28
                            $offset = $element[2];
681 28
                            $objval[] = $element;
682 28
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
683
                        // remove closing delimiter
684 28
                        array_pop($objval);
685
                    }
686
                } else {
687
                    // hexadecimal string object
688 10
                    $objtype = $char;
689 10
                    ++$offset;
690 10
                    $pregResult = preg_match(
691 10
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
692 10
                            substr($pdfData, $offset),
693
                            $matches
694
                        );
695 10
                    if (('<' == $char) && 1 == $pregResult) {
696
                        // remove white space characters
697 10
                        $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
698 10
                        $offset += \strlen($matches[0]);
699
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
700
                        $offset = $endpos + 1;
701
                    }
702
                }
703 28
                    break;
704
705
            default:
706 28
                    if ('endobj' == substr($pdfData, $offset, 6)) {
707
                        // indirect object
708 27
                        $objtype = 'endobj';
709 27
                        $offset += 6;
710 28
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
711
                        // null object
712 3
                        $objtype = 'null';
713 3
                        $offset += 4;
714 3
                        $objval = 'null';
715 28
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
716
                        // boolean true object
717 8
                        $objtype = 'boolean';
718 8
                        $offset += 4;
719 8
                        $objval = 'true';
720 28
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
721
                        // boolean false object
722 2
                        $objtype = 'boolean';
723 2
                        $offset += 5;
724 2
                        $objval = 'false';
725 28
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
726
                        // start stream object
727 27
                        $objtype = 'stream';
728 27
                        $offset += 6;
729 27
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
730 27
                            $offset += \strlen($matches[0]);
731 27
                            $pregResult = preg_match(
732 27
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
733 27
                                substr($pdfData, $offset),
734
                                $matches,
735 27
                                \PREG_OFFSET_CAPTURE
736
                            );
737 27
                            if (1 == $pregResult) {
738 27
                                $objval = substr($pdfData, $offset, $matches[0][1]);
739 27
                                $offset += $matches[1][1];
740
                            }
741
                        }
742 28
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
743
                        // end stream object
744 27
                        $objtype = 'endstream';
745 27
                        $offset += 9;
746 28
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
747
                        // indirect object reference
748 27
                        $objtype = 'objref';
749 27
                        $offset += \strlen($matches[0]);
750 27
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
751 28
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
752
                        // object start
753 6
                        $objtype = 'obj';
754 6
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
755 6
                        $offset += \strlen($matches[0]);
756 28
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
757
                        // numeric object
758 27
                        $objtype = 'numeric';
759 27
                        $objval = substr($pdfData, $offset, $numlen);
760 27
                        $offset += $numlen;
761
                    }
762 28
                    break;
763
        }
764
765 28
        return [$objtype, $objval, $offset];
766
    }
767
768
    /**
769
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
770
     *
771
     * @param string $pdfData
772
     * @param int    $offset  xref offset (if known)
773
     * @param array  $xref    previous xref array (if any)
774
     *
775
     * @return array containing xref and trailer data
776
     *
777
     * @throws Exception if it was unable to find startxref
778
     * @throws Exception if it was unable to find xref
779
     */
780 28
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
781
    {
782 28
        $startxrefPreg = preg_match(
783 28
            '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
784
            $pdfData,
785
            $matches,
786 28
            \PREG_OFFSET_CAPTURE,
787
            $offset
788
        );
789
790 28
        if (0 == $offset) {
791
            // find last startxref
792 28
            $pregResult = preg_match_all(
793 28
                '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
794
                $pdfData, $matches,
795 28
                \PREG_SET_ORDER,
796
                $offset
797
            );
798 28
            if (0 == $pregResult) {
799
                throw new Exception('Unable to find startxref');
800
            }
801 28
            $matches = array_pop($matches);
802 28
            $startxref = $matches[1];
803 8
        } elseif (strpos($pdfData, 'xref', $offset) == $offset) {
804
            // Already pointing at the xref table
805 5
            $startxref = $offset;
806 4
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) {
807
            // Cross-Reference Stream object
808 4
            $startxref = $offset;
809
        } elseif ($startxrefPreg) {
810
            // startxref found
811
            $startxref = $matches[1][0];
812
        } else {
813
            throw new Exception('Unable to find startxref');
814
        }
815
816 28
        if ($startxref > \strlen($pdfData)) {
817 1
            throw new Exception('Unable to find xref (PDF corrupted?)');
818
        }
819
820
        // check xref position
821 27
        if (strpos($pdfData, 'xref', $startxref) == $startxref) {
822
            // Cross-Reference
823 22
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
824
        } else {
825
            // Cross-Reference Stream
826 6
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
827
        }
828 27
        if (empty($xref)) {
829
            throw new Exception('Unable to find xref');
830
        }
831
832 27
        return $xref;
833
    }
834
835
    /**
836
     * Parses PDF data and returns extracted data as array.
837
     *
838
     * @param string $data PDF data to parse
839
     *
840
     * @return array array of parsed PDF document objects
841
     *
842
     * @throws Exception if empty PDF data given
843
     * @throws Exception if PDF data missing %PDF header
844
     */
845 28
    public function parseData($data)
    {
        // Entry point: validates the input, cuts it down to the part starting
        // at the "%PDF-" marker, then decodes every object listed in the xref.
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // everything in front of the "%PDF-" marker is dropped
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }
        $pdfData = substr($data, $headerPos);

        // cross-reference and trailer information
        $xref = $this->getXrefData($pdfData);

        // decode each referenced object once; entries whose offset is not
        // positive are left out
        $objects = [];
        foreach ($xref['xref'] as $obj => $offset) {
            if (isset($objects[$obj]) || !($offset > 0)) {
                continue;
            }

            $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true);
        }

        return [$xref, $objects];
    }
872
}
873