Passed
Pull Request — master (#400)
by Sebastien
04:40 queued 02:27
created

RawDataParser::getRawObject()   F

Complexity

Conditions 39
Paths 36

Size

Total Lines 198
Code Lines 134

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 118
CRAP Score 39.7245

Importance

Changes 4
Bugs 2 Features 1
Metric Value
cc 39
eloc 134
c 4
b 2
f 1
nc 36
nop 2
dl 0
loc 198
ccs 118
cts 128
cp 0.9219
crap 39.7245
rs 3.3333

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
48
     * Configuration array.
49
     */
50
    protected $cfg = [
51
        // if `true` ignore filter decoding errors
52
        'ignore_filter_decoding_errors' => true,
53
        // if `true` ignore missing filter decoding errors
54
        'ignore_missing_filter_decoders' => true,
55
    ];
56
57
    protected $filterHelper;
58
    protected $objects;
59
60
    /**
61
     * @param array $cfg Configuration array, default is []
62
     */
63 31
    public function __construct($cfg = [])
    {
        // caller-supplied options win over the built-in defaults on key collision
        $defaults = $this->cfg;
        $this->cfg = array_merge($defaults, $cfg);

        // helper used by decodeStream() to apply stream filters
        $this->filterHelper = new FilterHelper();
    }
70
71
    /**
72
     * Decode the specified stream.
73
     *
74
     * @param string $pdfData PDF data
75
     * @param array  $xref
76
     * @param array  $sdic    Stream's dictionary array
77
     * @param string $stream  Stream to decode
78
     *
79
     * @return array containing decoded stream data and remaining filters
80
     */
81 27
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // an empty stream has nothing to decode and no filters left to apply
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        // Walk the stream dictionary. It is a flat list in which a name element
        // ('/') is followed by its value, so every key needs a successor element.
        $filters = [];
        foreach ($sdic as $k => $v) {
            if ('/' !== $v[0] || !isset($sdic[($k + 1)])) {
                continue;
            }
            $value = $sdic[($k + 1)];

            if (('Length' === $v[1]) && ('numeric' === $value[0])) {
                // Truncate to the declared length when it is shorter than the raw data.
                // A negative length is ignored: passing it to substr() would silently
                // chop bytes off the end of the stream instead.
                $declength = (int) ($value[1]);
                if ($declength >= 0 && $declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' === $v[1]) {
                // the filter entry may be an indirect reference
                $objval = $this->getObjectVal($pdfData, $xref, $value);
                if ('/' === $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' === $objval[0]) {
                    // array of filters: keep the name elements only
                    foreach ($objval[1] as $flt) {
                        if ('/' === $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // apply the filters in declaration order; unsupported ones are reported back
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters(), true)) {
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // A message starting with '~' is governed by 'ignore_missing_filter_decoders',
                // any other message by 'ignore_filter_decoding_errors'. An empty message is
                // treated as an ordinary decoding error instead of reading offset 0 of ''.
                $missingDecoder = ('' !== $emsg) && ('~' === $emsg[0]);
                $ignore = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // same message as before, but keep the original exception for its trace
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
138
139
    /**
140
     * Decode the Cross-Reference section
141
     *
142
     * @param string $pdfData   PDF data
143
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
144
     * @param array  $xref      Previous xref array (if any)
145
     *
146
     * @return array containing xref and trailer data
147
     */
148 22
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        $startxref += 4; // 4 is the length of the word 'xref'
        // skip initial white space chars: NUL, HT, LF, FF, CR, SP
        $offset = $startxref + strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $startxref);

        // object number the next in-use/free entry belongs to
        $obj_num = 0;

        // Matches either an entry ("offset generation n|f") or a subsection header
        // ("first count"). The `A` modifier anchors the match at $offset, so the loop
        // stops as soon as the text at the current position is not an entry; without it
        // preg_match() would scan the rest of the document for a later match that the
        // loop would then have to reject.
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/A';
        while (preg_match($entryPattern, $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            $offset += \strlen($matches[0][0]);
            $entryType = $matches[3][0];

            if ('n' === $entryType) {
                // in-use entry; unique object index: [object number]_[generation number]
                $index = $obj_num.'_'.(int) ($matches[2][0]);
                // an offset that is already known is never overwritten
                if (!isset($xref['xref'][$index])) {
                    $xref['xref'][$index] = (int) ($matches[1][0]);
                }
                ++$obj_num;
            } elseif ('f' === $entryType) {
                // free entry: consumes an object number but stores nothing
                ++$obj_num;
            } else {
                // subsection header: first number is the object number of its first entry
                $obj_num = (int) ($matches[1][0]);
            }
        }

        // the trailer dictionary must follow the entries
        if (1 !== preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) {
            throw new Exception('Unable to find trailer');
        }
        $trailer_data = $matches[1][0];

        if (empty($xref['trailer'])) {
            // only fill the trailer once, so an already stored trailer is kept
            $xref['trailer'] = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
                $xref['trailer']['size'] = (int) ($matches[1]);
            }

            // indirect references are stored as "[object number]_[generation number]"
            foreach (['root' => 'Root', 'encrypt' => 'Encrypt', 'info' => 'Info'] as $key => $name) {
                if (preg_match('/'.$name.'[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) {
                    $xref['trailer'][$key] = (int) ($matches[1]).'_'.(int) ($matches[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $matches) > 0) {
                $xref['trailer']['id'] = [$matches[1], $matches[2]];
            }
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
            // continue with the previous cross-reference section
            $xref = $this->getXrefData($pdfData, (int) ($matches[1]), $xref);
        }

        return $xref;
    }
213
214
    /**
215
     * Decode the Cross-Reference Stream section
216
     *
217
     * @param string $pdfData   PDF data
218
     * @param int    $startxref Offset at which the xref section starts
219
     * @param array  $xref      Previous xref array (if any)
220
     *
221
     * @return array containing xref and trailer data
222
     *
223
     * @throws Exception if unknown PNG predictor detected
224
     */
225 6
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // read the element at $startxref, then the whole stream object (with decoding)
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // the trailer is only filled once, so an already stored trailer is kept
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        // stream dictionary: a flat list in which a name element is followed by its value
        $sarr = $xrefcrs[0][1];
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        $valid_crs = false;
        $columns = 0;
        // byte widths of the three entry fields; zeros when /W is absent
        $wb = [0, 0, 0];

        foreach ($sarr as $k => $v) {
            // every key handled below is a name element with a following value
            if ('/' != $v[0] || !isset($sarr[($k + 1)])) {
                continue;
            }
            $value = $sarr[($k + 1)];

            switch ($v[1]) {
                case 'Type':
                    if ('/' == $value[0] && 'XRef' == $value[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // first object number in the subsection
                    // NOTE(review): only the first /Index pair is read; a stream with several
                    // subsections would be numbered as one contiguous run - confirm.
                    $index_first = (int) ($value[1][0][1]);
                    break;

                case 'Prev':
                    if ('numeric' == $value[0]) {
                        // offset of the previous cross-reference section
                        $prevxref = (int) ($value[1]);
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of the corresponding field
                    $wb[0] = (int) ($value[1][0][1]);
                    $wb[1] = (int) ($value[1][1][1]);
                    $wb[2] = (int) ($value[1][2][1]);
                    break;

                case 'DecodeParms':
                    if (isset($value[1]) && \is_array($value[1])) {
                        $decpar = $value[1];
                        foreach ($decpar as $kdc => $vdc) {
                            if (
                                '/' != $vdc[0]
                                || !isset($decpar[($kdc + 1)])
                                || 'numeric' != $decpar[($kdc + 1)][0]
                            ) {
                                continue;
                            }
                            if ('Columns' == $vdc[1]) {
                                $columns = (int) ($decpar[($kdc + 1)][1]);
                            } elseif ('Predictor' == $vdc[1]) {
                                $predictor = (int) ($decpar[($kdc + 1)][1]);
                            }
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && 'numeric' == $value[0]) {
                        $xref['trailer']['size'] = $value[1];
                    }
                    break;

                case 'Root':
                    if ($filltrailer && 'objref' == $value[0]) {
                        $xref['trailer']['root'] = $value[1];
                    }
                    break;

                case 'Info':
                    if ($filltrailer && 'objref' == $value[0]) {
                        $xref['trailer']['info'] = $value[1];
                    }
                    break;

                case 'Encrypt':
                    if ($filltrailer && 'objref' == $value[0]) {
                        $xref['trailer']['encrypt'] = $value[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [];
                        $xref['trailer']['id'][0] = $value[1][0][1];
                        $xref['trailer']['id'][1] = $value[1][1][1];
                    }
                    break;
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            $stream = $xrefcrs[1][3][0];

            if (isset($predictor)) {
                // rows carry a leading predictor byte that has to be undone first
                $ddata = $this->undoXrefStreamPngPrediction($stream, $columns);
            } else {
                // Plain rows of sum(W) bytes. Without a usable /W the row length is 0 and
                // array_chunk() would reject it, so no entries are read in that case.
                $rowlen = array_sum($wb);
                $ddata = ($rowlen > 0) ? array_chunk(unpack('C*', $stream), $rowlen) : [];
            }

            $obj_num = isset($index_first) ? $index_first : 0;

            foreach ($ddata as $row) {
                // big-endian decode of the three fields; a zero-width type field means type 1
                $fields = [(0 == $wb[0]) ? 1 : 0, 0, 0];
                $i = 0; // count bytes in the row
                for ($c = 0; $c < 3; ++$c) {
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $fields[$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }

                switch ($fields[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$fields[2];
                        // an offset that is already known is never overwritten
                        if (!isset($xref['xref'][$index])) {
                            $xref['xref'][$index] = $fields[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $fields[1] = object number of the object stream in which this object is stored
                        // $fields[2] = index of this object within the object stream
                        $index = $fields[1].'_0_'.$fields[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
            }
        }

        if (isset($prevxref)) {
            // continue with the previous cross-reference section
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Undo the per-row PNG prediction of a decoded cross-reference stream.
     *
     * Each row is one predictor byte (0-4, handled as 10-14) followed by $columns data bytes.
     *
     * @param string $stream  Decoded stream data
     * @param int    $columns Number of data bytes per row
     *
     * @return array list of rows, each a list of reconstructed byte values
     *
     * @throws Exception if unknown PNG predictor detected
     */
    private function undoXrefStreamPngPrediction($stream, $columns)
    {
        // number of bytes in a row, including the leading predictor byte
        $rowlen = ($columns + 1);
        if ($rowlen < 1) {
            return [];
        }

        // convert the stream into rows of integers (array_chunk re-indexes each row from 0)
        $rows = array_chunk(unpack('C*', $stream), $rowlen);

        $decoded = [];
        // the row above the first one is all zeros
        $prev_row = array_fill(0, $rowlen, 0);

        foreach ($rows as $k => $row) {
            $decoded[$k] = [];
            $predictor = (10 + $row[0]);

            for ($i = 1; $i <= $columns; ++$i) {
                $j = ($i - 1);
                // a truncated last row is padded with zeros
                $byte = isset($row[$i]) ? $row[$i] : 0;
                $row_up = $prev_row[$j];
                if (0 == $j) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    // the filters are defined on the reconstructed left neighbour,
                    // not on the still-encoded byte of the current row
                    $row_left = $decoded[$k][($j - 1)];
                    $row_upleft = $prev_row[($j - 1)];
                }

                switch ($predictor) {
                    case 10:  // PNG None
                        $decoded[$k][$j] = $byte;
                        break;

                    case 11:  // PNG Sub
                        $decoded[$k][$j] = (($byte + $row_left) & 0xff);
                        break;

                    case 12:  // PNG Up
                        $decoded[$k][$j] = (($byte + $row_up) & 0xff);
                        break;

                    case 13:  // PNG Average: floor of the mean, kept in integer arithmetic
                        $decoded[$k][$j] = (($byte + (($row_left + $row_up) >> 1)) & 0xff);
                        break;

                    case 14:  // PNG Paeth
                        // initial estimate and its distance to each neighbour
                        $p = ($row_left + $row_up - $row_upleft);
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        // nearest neighbour wins; ties resolve in the order left, up, upper-left
                        if ($pa <= $pb && $pa <= $pc) {
                            $nearest = $row_left;
                        } elseif ($pb <= $pc) {
                            $nearest = $row_up;
                        } else {
                            $nearest = $row_upleft;
                        }
                        $decoded[$k][$j] = (($byte + $nearest) & 0xff);
                        break;

                    default:
                        throw new Exception('Unknown PNG predictor: '.$predictor);
                }
            }
            $prev_row = $decoded[$k];
        }

        return $decoded;
    }
464
465
    /**
466
     * Get content of indirect object.
467
     *
468
     * @param string $pdfData  PDF data
469
     * @param array  $xref
470
     * @param string $objRef   Object number and generation number separated by underscore character
471
     * @param int    $offset   Object offset
472
     * @param bool   $decoding If true decode streams
473
     *
474
     * @return array containing object data
475
     *
476
     * @throws Exception if invalid object reference found
477
     */
478 27
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        // expected header: "[object number] [generation number] obj"
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $header = $refParts[0].' '.$refParts[1].' obj';
        $headerLength = \strlen($header);

        // move past white space (NUL, HT, LF, FF, CR, SP) and leading zeros of the object number
        $offset += strspn($pdfData, "\0\t\n\f\r ", $offset);
        $offset += strspn($pdfData, '0', $offset);

        if (substr($pdfData, $offset, $headerLength) !== $header) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // content starts right after the header
        $offset += $headerLength;

        $content = [];
        $previous = null; // element collected just before the current one
        while (true) {
            $startedAt = $offset;

            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // a stream directly preceded by a dictionary is decoded using that dictionary
            if ($decoding && ('stream' === $element[0]) && isset($previous[0]) && ('<<' === $previous[0])) {
                $element[3] = $this->decodeStream($pdfData, $xref, $previous[1], $element[1]);
            }

            $content[] = $element;
            $previous = $element;

            // stop at the closing keyword, or when the parser made no progress
            if (('endobj' === $element[0]) || ($offset === $startedAt)) {
                break;
            }
        }

        // the last collected element is the closing delimiter
        array_pop($content);

        return $content;
    }
529
530
    /**
531
     * Get the content of object, resolving indirect object reference if necessary.
532
     *
533
     * @param string $pdfData PDF data
534
     * @param array  $obj     Object value
535
     *
536
     * @return array containing object data
537
     *
538
     * @throws Exception
539
     */
540 27
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        // anything that is not an indirect reference is returned untouched
        if ('objref' != $obj[0]) {
            return $obj;
        }

        $ref = $obj[1];

        // already parsed earlier: serve it from the cache
        if (isset($this->objects[$ref])) {
            return $this->objects[$ref];
        }

        // NOTE(review): the offset is looked up directly in $xref, while decodeXref() stores
        // offsets under $xref['xref'] - confirm which array shape callers pass in.
        if (!isset($xref[$ref])) {
            // unknown reference: hand back the reference element itself
            return $obj;
        }

        // parse the object (without stream decoding) and remember it
        $this->objects[$ref] = $this->getIndirectObject($pdfData, $xref, $ref, $xref[$ref], false);

        return $this->objects[$ref];
    }
557
558
    /**
559
     * Get object type, raw value and offset to next object
560
     *
561
     * @param int $offset Object offset
562
     *
563
     * @return array containing object type, raw value and offset to next object
564
     */
565 28
    protected function getRawObject($pdfData, $offset = 0)
566
    {
567 28
        $objtype = ''; // object type to be returned
568 28
        $objval = ''; // object value to be returned
569
570
        /*
571
         * skip initial white space chars:
572
         *      \x00 null (NUL)
573
         *      \x09 horizontal tab (HT)
574
         *      \x0A line feed (LF)
575
         *      \x0C form feed (FF)
576
         *      \x0D carriage return (CR)
577
         *      \x20 space (SP)
578
         */
579 28
        $offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
580
581
        // get first char
582 28
        $char = $pdfData[$offset];
583
        // get object type
584 28
        switch ($char) {
585 28
            case '%':  // \x25 PERCENT SIGN
586
                    // skip comment and search for next token
587
                    $next = strcspn($pdfData, "\r\n", $offset);
588
                    if ($next > 0) {
589
                        $offset += $next;
590
591
                        return $this->getRawObject($pdfData, $offset);
592
                    }
593
                    break;
594
595 28
            case '/':  // \x2F SOLIDUS
596
                    // name object
597 28
                    $objtype = $char;
598 28
                    ++$offset;
599 28
                    $pregResult = preg_match(
600 28
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
601 28
                        substr($pdfData, $offset, 256),
602
                        $matches
603
                    );
604 28
                    if (1 == $pregResult) {
605 28
                        $objval = $matches[1]; // unescaped value
606 28
                        $offset += \strlen($objval);
607
                    }
608 28
                    break;
609
610 28
            case '(':   // \x28 LEFT PARENTHESIS
611 28
            case ')':  // \x29 RIGHT PARENTHESIS
612
                    // literal string object
613 25
                    $objtype = $char;
614 25
                    ++$offset;
615 25
                    $strpos = $offset;
616 25
                    if ('(' == $char) {
617 25
                        $open_bracket = 1;
618 25
                        while ($open_bracket > 0) {
619 25
                            if (!isset($pdfData[$strpos])) {
620
                                break;
621
                            }
622 25
                            $ch = $pdfData[$strpos];
623 25
                            switch ($ch) {
624 25
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
625
                                        // skip next character
626 15
                                        ++$strpos;
627 15
                                        break;
628
629 25
                                case '(':  // LEFT PARENHESIS (28h)
630
                                        ++$open_bracket;
631
                                        break;
632
633 25
                                case ')':  // RIGHT PARENTHESIS (29h)
634 25
                                        --$open_bracket;
635 25
                                        break;
636
                            }
637 25
                            ++$strpos;
638
                        }
639 25
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
640 25
                        $offset = $strpos;
641
                    }
642 25
                    break;
643
644 28
            case '[':   // \x5B LEFT SQUARE BRACKET
645 28
            case ']':  // \x5D RIGHT SQUARE BRACKET
646
                // array object
647 27
                $objtype = $char;
648 27
                ++$offset;
649 27
                if ('[' == $char) {
650
                    // get array content
651 27
                    $objval = [];
652
                    do {
653 27
                        $oldOffset = $offset;
654
                        // get element
655 27
                        $element = $this->getRawObject($pdfData, $offset);
656 27
                        $offset = $element[2];
657 27
                        $objval[] = $element;
658 27
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
659
                    // remove closing delimiter
660 27
                    array_pop($objval);
661
                }
662 27
                break;
663
664 28
            case '<':  // \x3C LESS-THAN SIGN
665 28
            case '>':  // \x3E GREATER-THAN SIGN
666 28
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
667
                    // dictionary object
668 28
                    $objtype = $char.$char;
669 28
                    $offset += 2;
670 28
                    if ('<' == $char) {
671
                        // get array content
672 28
                        $objval = [];
673
                        do {
674 28
                            $oldOffset = $offset;
675
                            // get element
676 28
                            $element = $this->getRawObject($pdfData, $offset);
677 28
                            $offset = $element[2];
678 28
                            $objval[] = $element;
679 28
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
680
                        // remove closing delimiter
681 28
                        array_pop($objval);
682
                    }
683
                } else {
684
                    // hexadecimal string object
685 10
                    $objtype = $char;
686 10
                    ++$offset;
687 10
                    $pregResult = preg_match(
688 10
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
689 10
                            substr($pdfData, $offset),
690
                            $matches
691
                        );
692 10
                    if (('<' == $char) && 1 == $pregResult) {
693
                        // remove white space characters
694 10
                        $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
695 10
                        $offset += \strlen($matches[0]);
696
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
697
                        $offset = $endpos + 1;
698
                    }
699
                }
700 28
                    break;
701
702
            default:
703 28
                    if ('endobj' == substr($pdfData, $offset, 6)) {
704
                        // indirect object
705 27
                        $objtype = 'endobj';
706 27
                        $offset += 6;
707 28
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
708
                        // null object
709 3
                        $objtype = 'null';
710 3
                        $offset += 4;
711 3
                        $objval = 'null';
712 28
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
713
                        // boolean true object
714 8
                        $objtype = 'boolean';
715 8
                        $offset += 4;
716 8
                        $objval = 'true';
717 28
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
718
                        // boolean false object
719 2
                        $objtype = 'boolean';
720 2
                        $offset += 5;
721 2
                        $objval = 'false';
722 28
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
723
                        // start stream object
724 27
                        $objtype = 'stream';
725 27
                        $offset += 6;
726 27
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
727 27
                            $offset += \strlen($matches[0]);
728 27
                            $pregResult = preg_match(
729 27
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
730 27
                                substr($pdfData, $offset),
731
                                $matches,
732 27
                                \PREG_OFFSET_CAPTURE
733
                            );
734 27
                            if (1 == $pregResult) {
735 27
                                $objval = substr($pdfData, $offset, $matches[0][1]);
736 27
                                $offset += $matches[1][1];
737
                            }
738
                        }
739 28
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
740
                        // end stream object
741 27
                        $objtype = 'endstream';
742 27
                        $offset += 9;
743 28
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
744
                        // indirect object reference
745 27
                        $objtype = 'objref';
746 27
                        $offset += \strlen($matches[0]);
747 27
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
748 28
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
749
                        // object start
750 6
                        $objtype = 'obj';
751 6
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
752 6
                        $offset += \strlen($matches[0]);
753 28
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
754
                        // numeric object
755 27
                        $objtype = 'numeric';
756 27
                        $objval = substr($pdfData, $offset, $numlen);
757 27
                        $offset += $numlen;
758
                    }
759 28
                    break;
760
        }
761
762 28
        return [$objtype, $objval, $offset];
763
    }
764
765
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the xref section is resolved as follows:
     *  - offset 0: the number announced by the LAST "startxref ... %%EOF" marker is used;
     *  - offset pointing at the "xref" keyword: the offset itself is used;
     *  - an "N G obj" header found at or after the offset: the offset itself is used
     *    (treated as a cross-reference stream object);
     *  - otherwise: the number announced by the first "startxref ... %%EOF" marker
     *    found at or after the offset is used.
     *
     * @param string $pdfData
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        $offset = (int) $offset;
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 === $offset) {
            // find last startxref
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $allMatches, \PREG_SET_ORDER);
            if (!$pregResult) {
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($allMatches);
            $startxref = (int) $lastMatch[1];
        } elseif ($offset > 0 && 'xref' === substr($pdfData, $offset, 4)) {
            // Already pointing at the xref table.
            // substr() only inspects the four bytes at $offset and, unlike strpos(),
            // does not raise an error when $offset lies beyond the end of the data.
            $startxref = $offset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (preg_match($startxrefPattern, $pdfData, $startxrefMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found.
            // The result lives in its own variable: every preg_match() call resets the
            // matches array it is given, so sharing one array between the conditions of
            // this chain would leave nothing to read here.
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position ($startxref is an int here, so the comparison is strict)
        if ($startxref >= 0 && 'xref' === substr($pdfData, $startxref, 4)) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
831
832
    /**
     * Parses raw PDF data and returns the extracted structures as an array.
     *
     * Everything in front of the first "%PDF-" marker is discarded before parsing.
     *
     * @param string $data PDF data to parse
     *
     * @return array two-element list: the xref/trailer data, followed by the parsed
     *               document objects indexed by their xref entry key
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData($data)
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the PDF header
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }

        // PDF content starts at the header
        $pdfData = substr($data, $headerPos);

        // xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // decode every xref entry that has a positive offset and was not decoded yet
        $objects = [];
        foreach ($xref['xref'] as $objId => $objOffset) {
            if (isset($objects[$objId]) || !($objOffset > 0)) {
                continue;
            }

            $objects[$objId] = $this->getIndirectObject($pdfData, $xref, $objId, $objOffset, true);
        }

        return [$xref, $objects];
    }
869
}
870