Completed
Pull Request — master (#312)
by
unknown
09:12
created

RawDataParser::getIndirectObject()   B

Complexity

Conditions 10
Paths 6

Size

Total Lines 44
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 21
CRAP Score 10.0658

Importance

Changes 5
Bugs 2 Features 1
Metric Value
cc 10
eloc 23
c 5
b 2
f 1
nc 6
nop 5
dl 0
loc 44
ccs 21
cts 23
cp 0.913
crap 10.0658
rs 7.6666

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
48
     * Configuration array.
49
     */
50
    protected $cfg = [
51
        // if `true`, a failure raised while applying a stream filter is
        // swallowed by decodeStream() instead of being rethrown
52
        'ignore_filter_decoding_errors' => true,
53
        // if `true`, a filter failure whose message starts with `~`
        // (treated by decodeStream() as "decoder missing") is swallowed
54
        'ignore_missing_filter_decoders' => true,
55
    ];
56
57
    /**
     * Helper created in the constructor; decodeStream() asks it for the
     * list of available filters and delegates the actual decoding to it.
     */
    protected $filterHelper;
58
    /**
     * Objects already resolved by getObjectVal(), keyed by object reference,
     * so that the same indirect object is parsed only once.
     */
    protected $objects;
59
60
    /**
     * Create the parser with its filter helper and effective configuration.
     *
     * @param array $cfg Option overrides; entries given here take precedence
     *                   over the built-in defaults, default is []
     */
    public function __construct($cfg = [])
    {
        $this->filterHelper = new FilterHelper();

        // caller-supplied options win over the defaults declared on the class
        $defaults = $this->cfg;
        $this->cfg = array_merge($defaults, $cfg);
    }
70
71
    /**
     * Decode the specified stream.
     *
     * The stream dictionary is scanned for `/Length` (the raw stream is cut
     * down to the declared size when that is shorter) and for `/Filter`
     * (either a single name or an array of names). Every filter reported as
     * available by the filter helper is applied in order; all other filter
     * names are returned untouched so the caller can deal with them.
     *
     * @param string $pdfData PDF data
     * @param array  $xref
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching `ignore_*` option is disabled
     */
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        // get stream length and filters
        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name objects that are followed by a value are of interest
            if ('/' !== $v[0] || !isset($sdic[($k + 1)])) {
                continue;
            }
            $next = $sdic[($k + 1)];

            if ('Length' === $v[1]) {
                if ('numeric' === $next[0]) {
                    // get declared stream length; a negative value is malformed and
                    // must not reach substr(), where it would cut from the end
                    $declength = (int) ($next[1]);
                    if ($declength >= 0 && $declength < $slength) {
                        $stream = substr($stream, 0, $declength);
                        $slength = $declength;
                    }
                }
            } elseif ('Filter' === $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                $objtype = isset($objval[0]) ? $objval[0] : null;
                if ('/' === $objtype) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' === $objtype) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' === $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $available = $this->filterHelper->getAvailableFilters();
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $available, true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // a leading `~` marks a missing decoder; the message may be empty,
                // so test for that before looking at its first character
                $missingDecoder = ('' !== $emsg) && ('~' === $emsg[0]);
                $ignored = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignored) {
                    // keep the original exception reachable via getPrevious()
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
138
139
    /**
     * Parse a classic cross-reference table together with its trailer.
     *
     * Offsets of in-use entries are stored under `$xref['xref']`, keyed by
     * "[object number]_[generation number]"; entries that are already present
     * are left untouched. Trailer values are only taken over while no trailer
     * has been stored yet. A `/Prev` entry is followed via getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found after the table
     */
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the keyword 'xref' (4 characters) ...
        $startxref += 4;
        // ... and over any white space behind it: NUL, HT, LF, FF, CR, SP
        $offset = $startxref + strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $startxref);

        // walk through subsection headers and entries
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';
        $obj_num = 0;
        while (true) {
            $found = preg_match($entryPattern, $pdfData, $matches, PREG_OFFSET_CAPTURE, $offset);
            if ($found <= 0 || $matches[0][1] != $offset) {
                // nothing left, or the next match belongs to another section
                break;
            }
            $offset += \strlen($matches[0][0]);

            switch ($matches[3][0]) {
                case 'n':
                    // in-use entry, index is [object number]_[generation number]
                    $index = $obj_num.'_'.(int) ($matches[2][0]);
                    if (!isset($xref['xref'][$index])) {
                        // first definition wins: remember the object offset
                        $xref['xref'][$index] = (int) ($matches[1][0]);
                    }
                    ++$obj_num;
                    break;

                case 'f':
                    // free entry, only advances the object counter
                    ++$obj_num;
                    break;

                default:
                    // subsection header: first object number of the subsection
                    $obj_num = (int) ($matches[1][0]);
            }
        }

        // get trailer data
        if (preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, PREG_OFFSET_CAPTURE, $offset) <= 0) {
            throw new Exception('Unable to find trailer');
        }
        $trailer_data = $matches[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $trailer = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
                $trailer['size'] = (int) ($matches[1]);
            }

            // entries holding an indirect reference "N G R"
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailer_data, $matches) > 0) {
                    $trailer[$key] = (int) ($matches[1]).'_'.(int) ($matches[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $matches) > 0) {
                $trailer['id'] = [$matches[1], $matches[2]];
            }

            $xref['trailer'] = $trailer;
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) ($matches[1]), $xref);
        }

        return $xref;
    }
213
214
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream object located at $startxref, takes over trailer
     * values from its dictionary (only while no trailer has been stored yet),
     * converts the decoded stream into xref entries and finally follows
     * `/Prev` via getXrefData().
     *
     * Differences to a plain port of the TCPDF routine:
     *  - rows are only run through the PNG un-predictor when the dictionary
     *    declares a PNG `/Predictor` (>= 10); otherwise the stream is split
     *    into rows of W[0]+W[1]+W[2] bytes without any tag byte
     *  - every `/Index` subsection is honoured, not just the first one
     *  - a missing or short `/W` array no longer causes undefined offsets
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $prevxref = null;
        $index_blocks = []; // list of [first object number, number of entries]
        $wb = [0, 0, 0]; // byte width of the three fields of an entry

        $sarr = (isset($xrefcrs[0][1]) && \is_array($xrefcrs[0][1])) ? $xrefcrs[0][1] : [];
        foreach ($sarr as $k => $v) {
            // every entry of interest is a name object followed by its value
            if (!isset($v[0], $v[1], $sarr[($k + 1)]) || '/' !== $v[0]) {
                continue;
            }
            $val = $sarr[($k + 1)];
            $valType = isset($val[0]) ? $val[0] : '';
            $valData = isset($val[1]) ? $val[1] : null;

            switch ($v[1]) {
                case 'Type':
                    if ('/' === $valType && 'XRef' === $valData) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // pairs of: first object number in the subsection / number of entries
                    $index_blocks = [];
                    if (\is_array($valData)) {
                        foreach (array_chunk($valData, 2) as $pair) {
                            if (isset($pair[0][1])) {
                                $index_blocks[] = [
                                    (int) ($pair[0][1]),
                                    // a missing count leaves the subsection open-ended
                                    isset($pair[1][1]) ? (int) ($pair[1][1]) : PHP_INT_MAX,
                                ];
                            }
                        }
                    }
                    break;

                case 'Prev':
                    if ('numeric' === $valType) {
                        // get previous xref offset
                        $prevxref = (int) $valData;
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of the corresponding field
                    for ($c = 0; $c < 3; ++$c) {
                        $wb[$c] = isset($valData[$c][1]) ? max(0, (int) ($valData[$c][1])) : 0;
                    }
                    break;

                case 'DecodeParms':
                    if (\is_array($valData)) {
                        foreach ($valData as $kdc => $vdc) {
                            if (!isset($vdc[0], $vdc[1], $valData[($kdc + 1)][0], $valData[($kdc + 1)][1])
                                || '/' !== $vdc[0]
                                || 'numeric' !== $valData[($kdc + 1)][0]
                            ) {
                                continue;
                            }
                            if ('Columns' === $vdc[1]) {
                                $columns = (int) ($valData[($kdc + 1)][1]);
                            } elseif ('Predictor' === $vdc[1]) {
                                $predictor = (int) ($valData[($kdc + 1)][1]);
                            }
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && 'numeric' === $valType) {
                        $xref['trailer']['size'] = $valData;
                    }
                    break;

                case 'Root':
                case 'Info':
                case 'Encrypt':
                    if ($filltrailer && 'objref' === $valType) {
                        // stored under the lower-cased name: root, info, encrypt
                        $xref['trailer'][strtolower($v[1])] = $valData;
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [
                            isset($valData[0][1]) ? $valData[0][1] : null,
                            isset($valData[1][1]) ? $valData[1][1] : null,
                        ];
                    }
                    break;
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (!\is_array($bytes)) {
                $bytes = [];
            }

            if (null !== $predictor && $predictor >= 10) {
                // PNG prediction: each row is one tag byte plus `Columns` data bytes
                // (Columns defaults to 1 when the dictionary does not state it)
                $columns = max(1, $columns);
                $ddata = $this->unpredictPngRows(array_chunk($bytes, $columns + 1), $columns);
            } else {
                // no prediction: a row is exactly one entry of W[0]+W[1]+W[2] bytes
                $rowlen = array_sum($wb);
                $ddata = ($rowlen > 0) ? array_chunk($bytes, $rowlen) : [];
            }

            // fill xref
            $obj_num = 0;
            $remaining = null; // entries left in the current /Index subsection
            foreach ($this->packXrefFields($ddata, $wb) as $row) {
                // switch to the next /Index subsection once the current one is used up
                while ((null === $remaining || $remaining <= 0) && [] !== $index_blocks) {
                    list($obj_num, $remaining) = array_shift($index_blocks);
                }

                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }

                ++$obj_num;
                if (null !== $remaining) {
                    --$remaining;
                }
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Undo PNG row prediction on rows that each start with a filter tag byte.
     *
     * @param array $rows    Rows of byte values; index 0 holds the PNG filter type (0-4)
     * @param int   $columns Number of data bytes per row
     *
     * @return array rows of reconstructed byte values, without the tag byte
     *
     * @throws Exception if unknown PNG predictor detected
     */
    private function unpredictPngRows($rows, $columns)
    {
        $decoded = [];
        // the row above the first one counts as all zeros
        $prev_row = array_fill(0, $columns, 0);

        foreach ($rows as $k => $row) {
            $decoded[$k] = [];
            // get PNG predictor value
            $predictor = (10 + $row[0]);

            for ($i = 1; $i <= $columns; ++$i) {
                if (!isset($row[$i])) {
                    // truncated last row: nothing left to reconstruct
                    break;
                }
                $j = ($i - 1);
                $row_up = isset($prev_row[$j]) ? $prev_row[$j] : 0;
                if (1 == $i) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    // PNG reconstruction works on the already reconstructed
                    // left neighbour, not on the still-filtered input byte
                    $row_left = $decoded[$k][($j - 1)];
                    $row_upleft = isset($prev_row[($j - 1)]) ? $prev_row[($j - 1)] : 0;
                }

                switch ($predictor) {
                    case 10:  // PNG prediction (on encoding, PNG None on all rows)
                        $value = $row[$i];
                        break;

                    case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                        $value = $row[$i] + $row_left;
                        break;

                    case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                        $value = $row[$i] + $row_up;
                        break;

                    case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                        // integer shift = floor of the mean, no float involved
                        $value = $row[$i] + (($row_left + $row_up) >> 1);
                        break;

                    case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                        // initial estimate
                        $p = ($row_left + $row_up - $row_upleft);
                        // distances
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        // nearest neighbour wins, ties resolved in the order left, up, upper-left
                        if ($pa <= $pb && $pa <= $pc) {
                            $value = $row[$i] + $row_left;
                        } elseif ($pb <= $pc) {
                            $value = $row[$i] + $row_up;
                        } else {
                            $value = $row[$i] + $row_upleft;
                        }
                        break;

                    default:  // PNG prediction (on encoding, PNG optimum)
                        throw new Exception('Unknown PNG predictor');
                }

                $decoded[$k][$j] = ($value & 0xff);
            }

            $prev_row = $decoded[$k];
        }

        return $decoded;
    }

    /**
     * Combine the bytes of every row into the three big-endian xref fields.
     *
     * @param array $rows Rows of byte values
     * @param array $wb   Byte width of field 0, 1 and 2
     *
     * @return array rows of [type, field 2, field 3], keyed like $rows
     */
    private function packXrefFields($rows, $wb)
    {
        $entries = [];
        foreach ($rows as $k => $row) {
            // a zero-width type field means the default type 1
            $entry = [(0 == $wb[0]) ? 1 : 0, 0, 0];
            $i = 0; // count bytes in the row
            for ($c = 0; $c < 3; ++$c) {
                for ($b = 0; $b < $wb[$c]; ++$b) {
                    if (isset($row[$i])) {
                        $entry[$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                    }
                    ++$i;
                }
            }
            $entries[$k] = $entry;
        }

        return $entries;
    }
452
453
    /**
     * Get content of indirect object.
     *
     * The object header "[object number] [generation number] obj" is expected
     * at $offset (after optional white space and leading zeros). The elements
     * that follow are read one by one until `endobj` is reached or the reader
     * stops advancing.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        // --- build indirect object header
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $header = implode(' ', $refParts).' obj';
        $headerLength = \strlen($header);

        // --- check if we are in position
        // white space first, then leading zeros of the object number
        $offset += strspn($pdfData, "\0\t\n\f\r ", $offset);
        $offset += strspn($pdfData, '0', $offset);
        if (substr($pdfData, $offset, $headerLength) !== $header) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // --- get content, starting right behind the header
        $offset += $headerLength;
        $content = [];
        $previous = null;
        while (true) {
            $startedAt = $offset;
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // a stream directly preceded by a dictionary is decoded with that dictionary
            if ($decoding && ('stream' === $element[0]) && isset($previous[0]) && ('<<' === $previous[0])) {
                $element[3] = $this->decodeStream($pdfData, $xref, $previous[1], $element[1]);
            }

            if ('endobj' === $element[0]) {
                // closing delimiter is not part of the returned content
                break;
            }

            $content[] = $element;
            $previous = $element;

            if ($offset === $startedAt) {
                // no progress was made, stop reading
                break;
            }
        }

        // --- return raw object content
        return $content;
    }
511
512
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Anything that is not an `objref` is returned unchanged. A reference is
     * answered from the internal cache when possible; otherwise its offset is
     * looked up and the object is parsed (without stream decoding) and cached.
     * References without a known offset are returned unchanged.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Xref data; offsets are read from `$xref['xref']`, the
     *                        layout produced by decodeXref() and decodeXrefStream().
     *                        A flat map of reference => offset is accepted as well.
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws Exception
     */
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        if ('objref' !== $obj[0]) {
            return $obj;
        }

        // reference to indirect object
        $ref = $obj[1];
        if (isset($this->objects[$ref])) {
            // this object has been already parsed
            return $this->objects[$ref];
        }

        if (isset($xref['xref'][$ref])) {
            // offsets live one level down, next to the 'trailer' entry
            $offset = $xref['xref'][$ref];
        } elseif (isset($xref[$ref])) {
            // flat offset map, kept working for callers that pass one
            $offset = $xref[$ref];
        } else {
            return $obj;
        }

        // parse new object
        $this->objects[$ref] = $this->getIndirectObject($pdfData, $xref, $ref, $offset, false);

        return $this->objects[$ref];
    }
539
540
    /**
541
     * Get object type, raw value and offset to next object
542
     *
543
     * @param int $offset Object offset
544
     *
545
     * @return array containing object type, raw value and offset to next object
546
     */
547 17
    protected function getRawObject($pdfData, $offset = 0)
548
    {
549 17
        $objtype = ''; // object type to be returned
550 17
        $objval = ''; // object value to be returned
551
552
        /*
553
         * skip initial white space chars:
554
         *      \x00 null (NUL)
555
         *      \x09 horizontal tab (HT)
556
         *      \x0A line feed (LF)
557
         *      \x0C form feed (FF)
558
         *      \x0D carriage return (CR)
559
         *      \x20 space (SP)
560
         */
561 17
        $offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
562
563
        // get first char
564 17
        $char = $pdfData[$offset];
565
        // get object type
566 17
        switch ($char) {
567 17
            case '%':  // \x25 PERCENT SIGN
568
                    // skip comment and search for next token
569
                    $next = strcspn($pdfData, "\r\n", $offset);
570
                    if ($next > 0) {
571
                        $offset += $next;
572
573
                        return $this->getRawObject($pdfData, $offset);
574
                    }
575
                    break;
576
577 17
            case '/':  // \x2F SOLIDUS
578
                    // name object
579 17
                    $objtype = $char;
580 17
                    ++$offset;
581 17
                    $pregResult = preg_match(
582 17
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
583 17
                        substr($pdfData, $offset, 256),
584
                        $matches
585
                    );
586 17
                    if (1 == $pregResult) {
587 17
                        $objval = $matches[1]; // unescaped value
588 17
                        $offset += \strlen($objval);
589
                    }
590 17
                    break;
591
592 17
            case '(':   // \x28 LEFT PARENTHESIS
593 17
            case ')':  // \x29 RIGHT PARENTHESIS
594
                    // literal string object
595 17
                    $objtype = $char;
596 17
                    ++$offset;
597 17
                    $strpos = $offset;
598 17
                    if ('(' == $char) {
599 17
                        $open_bracket = 1;
600 17
                        while ($open_bracket > 0) {
601 17
                            if (!isset($pdfData[$strpos])) {
602
                                break;
603
                            }
604 17
                            $ch = $pdfData[$strpos];
605 17
                            switch ($ch) {
606 17
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
607
                                        // skip next character
608 14
                                        ++$strpos;
609 14
                                        break;
610
611 17
                                case '(':  // LEFT PARENHESIS (28h)
612
                                        ++$open_bracket;
613
                                        break;
614
615 17
                                case ')':  // RIGHT PARENTHESIS (29h)
616 17
                                        --$open_bracket;
617 17
                                        break;
618
                            }
619 17
                            ++$strpos;
620
                        }
621 17
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
622 17
                        $offset = $strpos;
623
                    }
624 17
                    break;
625
626 17
            case '[':   // \x5B LEFT SQUARE BRACKET
627 17
            case ']':  // \x5D RIGHT SQUARE BRACKET
628
                    // array object
629 17
                    $objtype = $char;
630 17
                    ++$offset;
631 17
                    if ('[' == $char) {
632
                        // get array content
633 17
                        $objval = [];
634
                        do {
635
                            // get element
636 17
                            $element = $this->getRawObject($pdfData, $offset);
637 17
                            $offset = $element[2];
638 17
                            $objval[] = $element;
639 17
                        } while (']' != $element[0]);
640
                        // remove closing delimiter
641 17
                        array_pop($objval);
642
                    }
643 17
                    break;
644
645 17
            case '<':  // \x3C LESS-THAN SIGN
646 17
            case '>':  // \x3E GREATER-THAN SIGN
647 17
                    if (isset($pdfData[($offset + 1)]) and ($pdfData[($offset + 1)] == $char)) {
648
                        // dictionary object
649 17
                        $objtype = $char.$char;
650 17
                        $offset += 2;
651 17
                        if ('<' == $char) {
652
                            // get array content
653 17
                            $objval = [];
654
                            do {
655
                                // get element
656 17
                                $element = $this->getRawObject($pdfData, $offset);
657 17
                                $offset = $element[2];
658 17
                                $objval[] = $element;
659 17
                            } while ('>>' != $element[0]);
660
                            // remove closing delimiter
661 17
                            array_pop($objval);
662
                        }
663
                    } else {
664
                        // hexadecimal string object
665 4
                        $objtype = $char;
666 4
                        ++$offset;
667 4
                        $pregResult = preg_match(
668 4
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
669 4
                            substr($pdfData, $offset),
670
                            $matches
671
                        );
672 4
                        if (('<' == $char) && 1 == $pregResult) {
673
                            // remove white space characters
674 4
                            $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
675 4
                            $offset += \strlen($matches[0]);
676
                        } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
677
                            $offset = $endpos + 1;
678
                        }
679
                    }
680 17
                    break;
681
682
            default:
683 17
                    if ('endobj' == substr($pdfData, $offset, 6)) {
684
                        // indirect object
685 17
                        $objtype = 'endobj';
686 17
                        $offset += 6;
687 17
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
688
                        // null object
689 2
                        $objtype = 'null';
690 2
                        $offset += 4;
691 2
                        $objval = 'null';
692 17
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
693
                        // boolean true object
694 4
                        $objtype = 'boolean';
695 4
                        $offset += 4;
696 4
                        $objval = 'true';
697 17
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
698
                        // boolean false object
699 1
                        $objtype = 'boolean';
700 1
                        $offset += 5;
701 1
                        $objval = 'false';
702 17
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
703
                        // start stream object
704 17
                        $objtype = 'stream';
705 17
                        $offset += 6;
706 17
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
707 17
                            $offset += \strlen($matches[0]);
708 17
                            $pregResult = preg_match(
709 17
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
710 17
                                substr($pdfData, $offset),
711
                                $matches,
712 17
                                PREG_OFFSET_CAPTURE
713
                            );
714 17
                            if (1 == $pregResult) {
715 17
                                $objval = substr($pdfData, $offset, $matches[0][1]);
716 17
                                $offset += $matches[1][1];
717
                            }
718
                        }
719 17
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
720
                        // end stream object
721 17
                        $objtype = 'endstream';
722 17
                        $offset += 9;
723 17
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
724
                        // indirect object reference
725 17
                        $objtype = 'objref';
726 17
                        $offset += \strlen($matches[0]);
727 17
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
728 17
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
729
                        // object start
730 2
                        $objtype = 'obj';
731 2
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
732 2
                        $offset += \strlen($matches[0]);
733 17
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
734
                        // numeric object
735 17
                        $objtype = 'numeric';
736 17
                        $objval = substr($pdfData, $offset, $numlen);
737 17
                        $offset += $numlen;
738
                    }
739 17
                    break;
740
        }
741
742 17
        return [$objtype, $objval, $offset];
743
    }
744
745
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the xref section is resolved in this order:
     *  - $offset is 0: the last "startxref <n> %%EOF" marker of the document is used;
     *  - $offset points exactly at the "xref" keyword: $offset itself is used;
     *  - an "<n> <n> obj" header is found at or after $offset: $offset is used and
     *    handled as a cross-reference stream object;
     *  - otherwise the first "startxref <n> %%EOF" marker at or after $offset is used.
     *
     * @param string $pdfData
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        // Offsets may arrive as numeric strings captured by a regex; normalise
        // once so the strict comparisons against strpos() results are reliable.
        $offset = (int) $offset;
        $pdfLength = \strlen($pdfData);
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 === $offset) {
            // find last startxref
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $startxrefMatches, PREG_SET_ORDER, $offset);
            if (!$pregResult) {
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($startxrefMatches);
            $startxref = (int) $lastMatch[1];
        } elseif ($offset < 0 || $offset > $pdfLength) {
            // An offset outside of the data cannot point at anything; bail out
            // here instead of letting strpos() fail on an invalid offset.
            throw new Exception('Unable to find startxref');
        } elseif (strpos($pdfData, 'xref', $offset) === $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, 0, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatch, 0, $offset)) {
            // startxref found; a dedicated match variable is used because the
            // preceding preg_match() resets its own match array when it fails.
            $startxref = (int) $startxrefMatch[1];
        } else {
            throw new Exception('Unable to find startxref');
        }

        // A startxref value pointing past the end of the data means the
        // document is corrupted; there is no xref section to decode.
        if ($startxref > $pdfLength) {
            throw new Exception('Unable to find xref');
        }

        // check xref position
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
807
808
    /**
     * Parses raw PDF data and returns its xref information together with the
     * decoded document objects.
     *
     * Everything in front of the first "%PDF-" marker is dropped before parsing.
     *
     * @param string $data PDF data to parse
     *
     * @return array two entries: the xref/trailer data, followed by an array
     *               holding the result of getIndirectObject() for every xref
     *               entry with a positive offset (indexed by the xref key)
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData($data)
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the PDF header; content before it is not part of the document
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }
        $content = substr($data, $headerPos);

        // xref and trailer data drive the object lookup below
        $xrefData = $this->getXrefData($content);

        $decoded = [];
        foreach ($xrefData['xref'] as $key => $position) {
            // only entries with a positive offset that were not decoded yet
            if (isset($decoded[$key]) || !($position > 0)) {
                continue;
            }

            $decoded[$key] = $this->getIndirectObject($content, $xrefData, $key, $position, true);
        }

        return [$xrefData, $decoded];
    }
845
}
846