Passed
Push — master ( 8b8a15...b47f26 )
by Konrad
02:41
created

RawDataParser::decodeStream()   D

Complexity

Conditions 21
Paths 51

Size

Total Lines 56
Code Lines 32

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 30
CRAP Score 21.0148

Importance

Changes 2
Bugs 1 Features 1
Metric Value
cc 21
eloc 32
c 2
b 1
f 1
nc 51
nop 4
dl 0
loc 56
ccs 30
cts 31
cp 0.9677
crap 21.0148
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
48
     * Configuration array.
49
     */
50
    protected $cfg = [
51
        // if `true` ignore filter decoding errors
52
        'ignore_filter_decoding_errors' => true,
53
        // if `true` ignore missing filter decoding errors
54
        'ignore_missing_filter_decoders' => true,
55
    ];
56
57
    protected $filterHelper;
58
    protected $objects;
59
60
    /**
     * Set up the parser: combine the caller's options with the built-in defaults
     * and create the filter helper used for stream decoding.
     *
     * @param array $cfg Configuration array, default is []
     */
    public function __construct($cfg = [])
    {
        // values given by the caller win over the defaults declared on the property
        $defaults = $this->cfg;
        $this->cfg = array_merge($defaults, $cfg);

        $this->filterHelper = new FilterHelper();
    }
70
71
    /**
     * Decode the specified stream.
     *
     * Reads /Length and /Filter from the stream dictionary, cuts the stream down to the declared
     * length when that is shorter than the data at hand, and then runs every filter the filter
     * helper reports as available, in the order the filters were declared.
     *
     * @param string $pdfData PDF data
     * @param array  $xref
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching `ignore_*` configuration flag is off
     */
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name entries that are followed by a value are of interest
            if ('/' != $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $next = $sdic[$k + 1];

            if ('Length' == $v[1] && 'numeric' == $next[0]) {
                // get declared stream length
                $declength = (int) $next[1];
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                // a resolved indirect object is a list of elements (the shape getIndirectObject
                // returns); the filter definition is its first element
                if (isset($objval[0]) && \is_array($objval[0])) {
                    $objval = $objval[0];
                }
                $type = isset($objval[0]) ? $objval[0] : null;

                if ('/' === $type) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' === $type && \is_array($objval[1])) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters())) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with '~' is handled under the "missing decoder" flag;
                // the empty-string check avoids reading an uninitialized string offset
                $missingDecoder = ('' !== $emsg) && ('~' === $emsg[0]);
                $ignore = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];

                if (!$ignore) {
                    // keep code and original exception so the root cause stays traceable
                    throw new Exception($emsg, (int) $e->getCode(), $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
138
139
    /**
     * Decode the Cross-Reference section
     *
     * Walks the entries that directly follow the 'xref' keyword, records the offset of every
     * in-use object that is not known yet, then reads the trailer dictionary. Trailer values are
     * only stored when $xref carries no trailer so far; a /Prev entry is always followed.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found after the entries
     */
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the word 'xref' (4 bytes) ...
        $afterKeyword = $startxref + 4;
        // ... and over the white space behind it: NUL, HT, LF, FF, CR, SP
        $offset = $afterKeyword + strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $afterKeyword);

        // object number the next entry belongs to
        $objectNumber = 0;

        // search for cross-reference entries or subsection
        while (preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $entry, PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($entry[0][1] != $offset) {
                // the match does not start right here: we are on another section
                break;
            }
            $offset += \strlen($entry[0][0]);

            switch ($entry[3][0]) {
                case 'n':
                    // in-use entry, keyed as [object number]_[generation number];
                    // an offset that is already known is left untouched
                    $key = $objectNumber.'_'.((int) $entry[2][0]);
                    if (!isset($xref['xref'][$key])) {
                        $xref['xref'][$key] = (int) $entry[1][0];
                    }
                    ++$objectNumber;
                    break;

                case 'f':
                    // free entry: nothing to store, it only consumes an object number
                    ++$objectNumber;
                    break;

                default:
                    // subsection header: its first number is the next object number
                    $objectNumber = (int) $entry[1][0];
                    break;
            }
        }

        // get trailer data
        if (1 !== preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailerMatch, PREG_OFFSET_CAPTURE, $offset)) {
            throw new Exception('Unable to find trailer');
        }
        $trailer_data = $trailerMatch[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $m) > 0) {
                $xref['trailer']['size'] = (int) $m[1];
            }

            // indirect references, stored as [object number]_[generation number]
            $references = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($references as $name => $pattern) {
                if (preg_match($pattern, $trailer_data, $m) > 0) {
                    $xref['trailer'][$name] = ((int) $m[1]).'_'.((int) $m[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $m) > 0) {
                $xref['trailer']['id'] = [$m[1], $m[2]];
            }
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $m) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) $m[1], $xref);
        }

        return $xref;
    }
213
214
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream dictionary (/Type, /Index, /Prev, /W, /DecodeParms and, when $xref has no
     * trailer yet, /Size, /Root, /Info, /Encrypt, /ID), reverses the PNG row filter on the decoded
     * stream bytes, folds every row into its three fields as described by /W and registers the
     * resulting entries in $xref['xref'].
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        // stream dictionary as a flat list: name, value, name, value, ...
        $sarr = (isset($xrefcrs[0][1]) && \is_array($xrefcrs[0][1])) ? $xrefcrs[0][1] : [];

        $valid_crs = false;
        $columns = 0;
        // field widths in bytes; zeros give the same result a missing /W gave before, minus the notices
        $wb = [0, 0, 0];
        $index_first = 0;
        $prevxref = null;

        foreach ($sarr as $k => $v) {
            // every entry handled below is a name followed by a value
            if ('/' != $v[0] || !isset($sarr[$k + 1])) {
                continue;
            }
            $next = $sarr[$k + 1];

            switch ($v[1]) {
                case 'Type':
                    if ('/' == $next[0] && 'XRef' == $next[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // first object number in the subsection
                    $index_first = isset($next[1][0][1]) ? (int) $next[1][0][1] : 0;
                    break;

                case 'Prev':
                    if ('numeric' == $next[0]) {
                        // get previous xref offset
                        $prevxref = (int) $next[1];
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of the corresponding field
                    for ($c = 0; $c < 3; ++$c) {
                        $wb[$c] = isset($next[1][$c][1]) ? (int) $next[1][$c][1] : 0;
                    }
                    break;

                case 'DecodeParms':
                    if (isset($next[1]) && \is_array($next[1])) {
                        $decpar = $next[1];
                        foreach ($decpar as $kdc => $vdc) {
                            // /Predictor is not read: the filter type is taken from each row's first byte
                            if (
                                '/' == $vdc[0]
                                && 'Columns' == $vdc[1]
                                && isset($decpar[$kdc + 1])
                                && 'numeric' == $decpar[$kdc + 1][0]
                            ) {
                                // never negative, so the row length below stays >= 1
                                $columns = max(0, (int) $decpar[$kdc + 1][1]);
                            }
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && 'numeric' == $next[0]) {
                        // stored as parsed (decodeXref casts its size to int) to keep the existing return shape
                        $xref['trailer']['size'] = $next[1];
                    }
                    break;

                case 'Root':
                case 'Info':
                case 'Encrypt':
                    if ($filltrailer && 'objref' == $next[0]) {
                        $xref['trailer'][strtolower($v[1])] = $next[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [];
                        $xref['trailer']['id'][0] = isset($next[1][0][1]) ? $next[1][0][1] : null;
                        $xref['trailer']['id'][1] = isset($next[1][1][1]) ? $next[1][1][1] : null;
                    }
                    break;
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            // number of bytes in a row: one filter-type byte plus the data columns
            $rowlen = $columns + 1;
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (!\is_array($bytes)) {
                // unpack() signals failure with false
                $bytes = [];
            }

            // initialize decoded array
            $ddata = [];
            // the row above the first one counts as all zeros
            $prev_row = array_fill(0, $rowlen, 0);

            // split the rows and reverse the PNG filter on each of them
            foreach (array_chunk($bytes, $rowlen) as $k => $row) {
                $ddata[$k] = [];
                // get PNG predictor value
                $predictor = 10 + $row[0];

                for ($i = 1; $i <= $columns; ++$i) {
                    $j = $i - 1; // index in the decoded row
                    // a truncated last row is read as if it were zero padded
                    $cur = isset($row[$i]) ? $row[$i] : 0;
                    $row_up = $prev_row[$j];
                    if (1 == $i) {
                        $row_left = 0;
                        $row_upleft = 0;
                    } else {
                        // PNG filters refer to the reconstructed byte on the left, not the encoded one
                        $row_left = $ddata[$k][$j - 1];
                        $row_upleft = $prev_row[$j - 1];
                    }

                    switch ($predictor) {
                        case 10:  // PNG prediction (on encoding, PNG None on all rows)
                            $ddata[$k][$j] = $cur;
                            break;

                        case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                            $ddata[$k][$j] = (($cur + $row_left) & 0xff);
                            break;

                        case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                            $ddata[$k][$j] = (($cur + $row_up) & 0xff);
                            break;

                        case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                            // integer halving, no float gets into the bit mask
                            $ddata[$k][$j] = (($cur + (($row_left + $row_up) >> 1)) & 0xff);
                            break;

                        case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                            // initial estimate
                            $p = $row_left + $row_up - $row_upleft;
                            // distances
                            $pa = abs($p - $row_left);
                            $pb = abs($p - $row_up);
                            $pc = abs($p - $row_upleft);
                            // nearest neighbour wins, ties resolved in the order left, up, upper left
                            if ($pa <= $pb && $pa <= $pc) {
                                $ddata[$k][$j] = (($cur + $row_left) & 0xff);
                            } elseif ($pb <= $pc) {
                                $ddata[$k][$j] = (($cur + $row_up) & 0xff);
                            } else {
                                $ddata[$k][$j] = (($cur + $row_upleft) & 0xff);
                            }
                            break;

                        default:  // PNG prediction (on encoding, PNG optimum)
                            throw new Exception('Unknown PNG predictor');
                    }
                }
                $prev_row = $ddata[$k];
            }

            // complete decoding: fold the bytes of every row into its three fields
            $sdata = [];
            foreach ($ddata as $k => $row) {
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                for ($c = 0; $c < 3; ++$c) {
                    // the bytes of a field are stored most significant first
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = $index_first;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // an object that already exists keeps its offset
                        if (!isset($xref['xref'][$index])) {
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // 0 = (f) linked list of free objects, anything else = null objects
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
452
453
    /**
     * Get content of indirect object.
     *
     * Checks that the header "[object number] [generation number] obj" is located at $offset
     * (after white space and leading zeros) and then collects the raw elements that follow
     * until 'endobj' is read or the parser stops advancing.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        // build the expected header: "[object number] [generation number] obj"
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $header = $refParts[0].' '.$refParts[1].' obj';
        $headerLength = \strlen($header);

        // check if we are in position:
        // ignore whitespace characters at offset (NUL, HT, LF, FF, CR, SP)
        $offset += strspn($pdfData, "\0\t\n\f\r ", $offset);
        // ignore leading zeros for object number
        $offset += strspn($pdfData, '0', $offset);
        if ($header !== substr($pdfData, $offset, $headerLength)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // starting position of object content
        $offset += $headerLength;

        $elements = [];
        do {
            $startedAt = $offset;

            // get element; its third entry is the offset behind it
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // decode stream using the dictionary read right before it
            $prior = end($elements);
            $followsDictionary = \is_array($prior) && isset($prior[0]) && ('<<' === $prior[0]);
            if ($decoding && ('stream' === $element[0]) && $followsDictionary) {
                $element[3] = $this->decodeStream($pdfData, $xref, $prior[1], $element[1]);
            }

            $elements[] = $element;
            // stop at the closing keyword or as soon as no progress is made any more
        } while (('endobj' !== $element[0]) && ($offset !== $startedAt));

        // remove closing delimiter
        array_pop($elements);

        return $elements;
    }
517
518
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Anything that is not an 'objref' is returned as is. A reference is answered from the
     * cache of parsed objects when possible, otherwise the object is parsed (without decoding
     * its streams) and cached. A reference whose offset is unknown is returned unresolved.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    xref data; the object offset is looked up in $xref['xref'] (the
     *                        structure decodeXref and decodeXrefStream build) and, for backward
     *                        compatibility, directly in $xref
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws Exception
     */
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        if ('objref' != $obj[0]) {
            return $obj;
        }

        // reference to indirect object
        $ref = $obj[1];
        if (isset($this->objects[$ref])) {
            // this object has been already parsed
            return $this->objects[$ref];
        }

        if (isset($xref['xref'][$ref])) {
            $offset = $xref['xref'][$ref];
        } elseif (isset($xref[$ref])) {
            // flat [reference => offset] map
            $offset = $xref[$ref];
        } else {
            // offset unknown: hand the reference back untouched
            return $obj;
        }

        // parse new object
        $this->objects[$ref] = $this->getIndirectObject($pdfData, $xref, $ref, $offset, false);

        return $this->objects[$ref];
    }
545
546
    /**
547
     * Get object type, raw value and offset to next object
548
     *
549
     * @param int $offset Object offset
550
     *
551
     * @return array containing object type, raw value and offset to next object
552
     */
553 26
    protected function getRawObject($pdfData, $offset = 0)
554
    {
555 26
        $objtype = ''; // object type to be returned
556 26
        $objval = ''; // object value to be returned
557
558
        /*
559
         * skip initial white space chars:
560
         *      \x00 null (NUL)
561
         *      \x09 horizontal tab (HT)
562
         *      \x0A line feed (LF)
563
         *      \x0C form feed (FF)
564
         *      \x0D carriage return (CR)
565
         *      \x20 space (SP)
566
         */
567 26
        $offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
568
569
        // get first char
570 26
        $char = $pdfData[$offset];
571
        // get object type
572 26
        switch ($char) {
573 26
            case '%':  // \x25 PERCENT SIGN
574
                    // skip comment and search for next token
575
                    $next = strcspn($pdfData, "\r\n", $offset);
576
                    if ($next > 0) {
577
                        $offset += $next;
578
579
                        return $this->getRawObject($pdfData, $offset);
580
                    }
581
                    break;
582
583 26
            case '/':  // \x2F SOLIDUS
584
                    // name object
585 26
                    $objtype = $char;
586 26
                    ++$offset;
587 26
                    $pregResult = preg_match(
588 26
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
589 26
                        substr($pdfData, $offset, 256),
590
                        $matches
591
                    );
592 26
                    if (1 == $pregResult) {
593 26
                        $objval = $matches[1]; // unescaped value
594 26
                        $offset += \strlen($objval);
595
                    }
596 26
                    break;
597
598 26
            case '(':   // \x28 LEFT PARENTHESIS
599 26
            case ')':  // \x29 RIGHT PARENTHESIS
600
                    // literal string object
601 24
                    $objtype = $char;
602 24
                    ++$offset;
603 24
                    $strpos = $offset;
604 24
                    if ('(' == $char) {
605 24
                        $open_bracket = 1;
606 24
                        while ($open_bracket > 0) {
607 24
                            if (!isset($pdfData[$strpos])) {
608
                                break;
609
                            }
610 24
                            $ch = $pdfData[$strpos];
611 24
                            switch ($ch) {
612 24
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
613
                                        // skip next character
614 15
                                        ++$strpos;
615 15
                                        break;
616
617 24
                                case '(':  // LEFT PARENHESIS (28h)
618
                                        ++$open_bracket;
619
                                        break;
620
621 24
                                case ')':  // RIGHT PARENTHESIS (29h)
622 24
                                        --$open_bracket;
623 24
                                        break;
624
                            }
625 24
                            ++$strpos;
626
                        }
627 24
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
628 24
                        $offset = $strpos;
629
                    }
630 24
                    break;
631
632 26
            case '[':   // \x5B LEFT SQUARE BRACKET
633 26
            case ']':  // \x5D RIGHT SQUARE BRACKET
634
                // array object
635 25
                $objtype = $char;
636 25
                ++$offset;
637 25
                if ('[' == $char) {
638
                    // get array content
639 25
                    $objval = [];
640
                    do {
641 25
                        $oldOffset = $offset;
642
                        // get element
643 25
                        $element = $this->getRawObject($pdfData, $offset);
644 25
                        $offset = $element[2];
645 25
                        $objval[] = $element;
646 25
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
647
                    // remove closing delimiter
648 25
                    array_pop($objval);
649
                }
650 25
                break;
651
652 26
            case '<':  // \x3C LESS-THAN SIGN
653 26
            case '>':  // \x3E GREATER-THAN SIGN
654 26
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
655
                    // dictionary object
656 26
                    $objtype = $char.$char;
657 26
                    $offset += 2;
658 26
                    if ('<' == $char) {
659
                        // get array content
660 26
                        $objval = [];
661
                        do {
662 26
                            $oldOffset = $offset;
663
                            // get element
664 26
                            $element = $this->getRawObject($pdfData, $offset);
665 26
                            $offset = $element[2];
666 26
                            $objval[] = $element;
667 26
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
668
                        // remove closing delimiter
669 26
                        array_pop($objval);
670
                    }
671
                } else {
672
                    // hexadecimal string object
673 8
                    $objtype = $char;
674 8
                    ++$offset;
675 8
                    $pregResult = preg_match(
676 8
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
677 8
                            substr($pdfData, $offset),
678
                            $matches
679
                        );
680 8
                    if (('<' == $char) && 1 == $pregResult) {
681
                        // remove white space characters
682 8
                        $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
683 8
                        $offset += \strlen($matches[0]);
684
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
685
                        $offset = $endpos + 1;
686
                    }
687
                }
688 26
                    break;
689
690
            default:
691 26
                    if ('endobj' == substr($pdfData, $offset, 6)) {
692
                        // indirect object
693 25
                        $objtype = 'endobj';
694 25
                        $offset += 6;
695 26
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
696
                        // null object
697 3
                        $objtype = 'null';
698 3
                        $offset += 4;
699 3
                        $objval = 'null';
700 26
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
701
                        // boolean true object
702 7
                        $objtype = 'boolean';
703 7
                        $offset += 4;
704 7
                        $objval = 'true';
705 26
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
706
                        // boolean false object
707 2
                        $objtype = 'boolean';
708 2
                        $offset += 5;
709 2
                        $objval = 'false';
710 26
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
711
                        // start stream object
712 25
                        $objtype = 'stream';
713 25
                        $offset += 6;
714 25
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
715 25
                            $offset += \strlen($matches[0]);
716 25
                            $pregResult = preg_match(
717 25
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
718 25
                                substr($pdfData, $offset),
719
                                $matches,
720 25
                                PREG_OFFSET_CAPTURE
721
                            );
722 25
                            if (1 == $pregResult) {
723 25
                                $objval = substr($pdfData, $offset, $matches[0][1]);
724 25
                                $offset += $matches[1][1];
725
                            }
726
                        }
727 26
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
728
                        // end stream object
729 25
                        $objtype = 'endstream';
730 25
                        $offset += 9;
731 26
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
732
                        // indirect object reference
733 25
                        $objtype = 'objref';
734 25
                        $offset += \strlen($matches[0]);
735 25
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
736 26
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
737
                        // object start
738 5
                        $objtype = 'obj';
739 5
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
740 5
                        $offset += \strlen($matches[0]);
741 26
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
742
                        // numeric object
743 25
                        $objtype = 'numeric';
744 25
                        $objval = substr($pdfData, $offset, $numlen);
745 25
                        $offset += $numlen;
746
                    }
747 26
                    break;
748
        }
749
750 26
        return [$objtype, $objval, $offset];
751
    }
752
753
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the xref section is resolved as follows:
     *  - offset 0: the value announced by the last "startxref ... %%EOF" marker is used;
     *  - offset pointing directly at the "xref" keyword: the offset itself is used;
     *  - an "N G obj" header exists at or after the offset: the offset itself is used
     *    (treated as a Cross-Reference Stream object);
     *  - otherwise: the value of the first "startxref" marker at or after the offset is used.
     *
     * @param string $pdfData
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 == $offset) {
            // find last startxref
            $pregResult = preg_match_all(
                $startxrefPattern,
                $pdfData,
                $allStartxrefMatches,
                PREG_SET_ORDER,
                $offset
            );
            if (0 == $pregResult) {
                throw new Exception('Unable to find startxref');
            }
            $lastStartxrefMatch = array_pop($allStartxrefMatches);
            $startxref = (int) $lastStartxrefMatch[1];
        } elseif (strpos($pdfData, 'xref', $offset) === (int) $offset) {
            // Already pointing at the xref table
            $startxref = (int) $offset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = (int) $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatches, PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found
            // Each preg_match() call resets the match array it is given, so this
            // lookup uses its own array instead of sharing one with the "obj" test above.
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position
        // Strict comparison: strpos() returns false when nothing is found, which must
        // not be confused with a match at position 0.
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
819
820
    /**
     * Parses PDF data and returns the extracted cross-reference data and objects.
     *
     * Any bytes preceding the first "%PDF-" marker are dropped before parsing.
     * Only xref entries with a positive offset are decoded into objects.
     *
     * @param string $data PDF data to parse
     *
     * @return array two elements: the xref/trailer data, followed by the array of
     *               parsed PDF document objects (keyed like the xref entries)
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData($data)
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the PDF header; everything in front of it is ignored
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }
        $pdfData = substr($data, $headerPos);

        // xref and trailer data drive the object lookup below
        $xref = $this->getXrefData($pdfData);

        $objects = [];
        foreach ($xref['xref'] as $objectKey => $objectOffset) {
            // skip entries already decoded and entries without a positive offset
            if (isset($objects[$objectKey]) || !($objectOffset > 0)) {
                continue;
            }

            $objects[$objectKey] = $this->getIndirectObject($pdfData, $xref, $objectKey, $objectOffset, true);
        }

        return [$xref, $objects];
    }
857
}
858