Passed
Push — master ( ef0204...57667e )
by Konrad
02:16
created

RawDataParser::decodeStream()   D

Complexity

Conditions 21
Paths 51

Size

Total Lines 56
Code Lines 32

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 30
CRAP Score 21.0148

Importance

Changes 2
Bugs 1 Features 1
Metric Value
cc 21
eloc 32
c 2
b 1
f 1
nc 51
nop 4
dl 0
loc 56
ccs 30
cts 31
cp 0.9677
crap 21.0148
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
use Smalot\PdfParser\Config;
45
46
class RawDataParser
47
{
48
    /**
49
     * @var \Smalot\PdfParser\Config
50
     */
51
    private $config;
52
53
    /**
54
     * Configuration array.
55
     */
56
    protected $cfg = [
57
        // if `true` ignore filter decoding errors
58
        'ignore_filter_decoding_errors' => true,
59
        // if `true` ignore missing filter decoding errors
60
        'ignore_missing_filter_decoders' => true,
61
    ];
62
63
    protected $filterHelper;
64
    protected $objects;
65
66
    /**
67
     * @param array $cfg Configuration array, default is []
68
     */
69 33
    public function __construct($cfg = [], Config $config = null)
    {
        // options passed by the caller override the defaults declared on $cfg
        $mergedCfg = array_merge($this->cfg, $cfg);
        $this->cfg = $mergedCfg;

        $this->filterHelper = new FilterHelper();

        // use a default Config instance when the caller did not provide one
        if (!$config) {
            $config = new Config();
        }
        $this->config = $config;
    }
77
78
    /**
79
     * Decode the specified stream.
80
     *
81
     * @param string $pdfData PDF data
82
     * @param array  $xref
83
     * @param array  $sdic    Stream's dictionary array
84
     * @param string $stream  Stream to decode
85
     *
86
     * @return array containing decoded stream data and remaining filters
87
     */
88 29
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // Walks the stream dictionary to pick up the declared /Length and the
        // /Filter chain, then runs the stream through every filter the filter
        // helper knows. Filters without a decoder are returned untouched in the
        // second element of the result so the caller can see what is left.
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name entries that are followed by a value are of interest
            if (('/' != $v[0]) || !isset($sdic[($k + 1)])) {
                continue;
            }
            $next = $sdic[($k + 1)];

            if (('Length' == $v[1]) && ('numeric' == $next[0])) {
                // trim the stream to its declared length
                $declength = (int) ($next[1]);
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        // the list of decoders does not change while looping, fetch it once
        $availableFilters = $this->filterHelper->getAvailableFilters();
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // a leading '~' marks a "missing decoder" error; guard against
                // an empty message so no invalid string offset is read
                $missingDecoder = ('' !== $emsg) && ('~' == $emsg[0]);
                $ignore = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // keep the original exception attached for debugging
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
145
146
    /**
147
     * Decode the Cross-Reference section
148
     *
149
     * @param string $pdfData   PDF data
150
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
151
     * @param array  $xref      Previous xref array (if any)
152
     *
153
     * @return array containing xref and trailer data
154
     */
155 24
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the 'xref' keyword (4 bytes) and the white space behind it
        $startxref += 4;
        $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref);

        // current object number, set by subsection headers and advanced per entry
        $objNum = 0;
        while (true) {
            $found = preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $entry, \PREG_OFFSET_CAPTURE, $offset);
            // stop when nothing matches or the match is not directly at the
            // current position (i.e. another section starts here)
            if (!($found > 0) || ($entry[0][1] != $offset)) {
                break;
            }
            $offset += \strlen($entry[0][0]);

            switch ($entry[3][0]) {
                case 'n':
                    // in-use entry, indexed as [object number]_[generation number];
                    // an offset that is already known is never overwritten
                    $index = $objNum.'_'.(int) ($entry[2][0]);
                    if (!isset($xref['xref'][$index])) {
                        $xref['xref'][$index] = (int) ($entry[1][0]);
                    }
                    ++$objNum;
                    break;

                case 'f':
                    // free entry: only counts towards the object number
                    ++$objNum;
                    break;

                default:
                    // subsection header: first number is the starting object number
                    $objNum = (int) ($entry[1][0]);
                    break;
            }
        }

        // the trailer dictionary has to follow the entries
        $hasTrailer = preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailerMatch, \PREG_OFFSET_CAPTURE, $offset);
        if (!($hasTrailer > 0)) {
            throw new Exception('Unable to find trailer');
        }
        $trailerData = $trailerMatch[1][0];

        // only fill the trailer when none has been collected so far
        if (empty($xref['trailer'])) {
            $trailer = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $m) > 0) {
                $trailer['size'] = (int) ($m[1]);
            }

            // indirect references, stored as "[object number]_[generation number]"
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailerData, $m) > 0) {
                    $trailer[$key] = (int) ($m[1]).'_'.(int) ($m[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $m) > 0) {
                $trailer['id'] = [$m[1], $m[2]];
            }

            $xref['trailer'] = $trailer;
        }

        // follow the link to the previous cross-reference section, if any
        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $m) > 0) {
            $xref = $this->getXrefData($pdfData, (int) ($m[1]), $xref);
        }

        return $xref;
    }
220
221
    /**
222
     * Decode the Cross-Reference Stream section
223
     *
224
     * @param string $pdfData   PDF data
225
     * @param int    $startxref Offset at which the xref section starts
226
     * @param array  $xref      Previous xref array (if any)
227
     *
228
     * @return array containing xref and trailer data
229
     *
230
     * @throws Exception if unknown PNG predictor detected
231
     */
232 6
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // Reads the cross-reference stream object found at $startxref, collects
        // trailer-like keys from its dictionary, undoes the PNG predictor (when
        // one is declared), splits the rows into the three /W fields and merges
        // the resulting entries into $xref. A /Prev entry is followed at the end.

        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);
        if (!isset($xref['trailer']) || empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = [];
            $filltrailer = true;
        } else {
            $filltrailer = false;
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }
        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $sarr = $xrefcrs[0][1];
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        // byte widths of the three fields of an entry (from /W)
        $wb = [];

        foreach ($sarr as $k => $v) {
            if (
                ('/' == $v[0])
                && ('Type' == $v[1])
                && (
                    isset($sarr[($k + 1)])
                    && '/' == $sarr[($k + 1)][0]
                    && 'XRef' == $sarr[($k + 1)][1]
                )
            ) {
                $valid_crs = true;
            } elseif (('/' == $v[0]) && ('Index' == $v[1]) && (isset($sarr[($k + 1)]))) {
                // first object number in the subsection
                $index_first = (int) ($sarr[($k + 1)][1][0][1]);
            } elseif (('/' == $v[0]) && ('Prev' == $v[1]) && (isset($sarr[($k + 1)]) && ('numeric' == $sarr[($k + 1)][0]))) {
                // get previous xref offset
                $prevxref = (int) ($sarr[($k + 1)][1]);
            } elseif (('/' == $v[0]) && ('W' == $v[1]) && (isset($sarr[($k + 1)]))) {
                // number of bytes (in the decoded stream) of the corresponding field
                $wb[0] = (int) ($sarr[($k + 1)][1][0][1]);
                $wb[1] = (int) ($sarr[($k + 1)][1][1][1]);
                $wb[2] = (int) ($sarr[($k + 1)][1][2][1]);
            } elseif (('/' == $v[0]) && ('DecodeParms' == $v[1]) && (isset($sarr[($k + 1)][1]))) {
                $decpar = $sarr[($k + 1)][1];
                foreach ($decpar as $kdc => $vdc) {
                    if (
                        '/' == $vdc[0]
                        && 'Columns' == $vdc[1]
                        && (
                            isset($decpar[($kdc + 1)])
                            && 'numeric' == $decpar[($kdc + 1)][0]
                        )
                    ) {
                        $columns = (int) ($decpar[($kdc + 1)][1]);
                    } elseif (
                        '/' == $vdc[0]
                        && 'Predictor' == $vdc[1]
                        && (
                            isset($decpar[($kdc + 1)])
                            && 'numeric' == $decpar[($kdc + 1)][0]
                        )
                    ) {
                        $predictor = (int) ($decpar[($kdc + 1)][1]);
                    }
                }
            } elseif ($filltrailer) {
                if (('/' == $v[0]) && ('Size' == $v[1]) && (isset($sarr[($k + 1)]) && ('numeric' == $sarr[($k + 1)][0]))) {
                    $xref['trailer']['size'] = $sarr[($k + 1)][1];
                } elseif (('/' == $v[0]) && ('Root' == $v[1]) && (isset($sarr[($k + 1)]) && ('objref' == $sarr[($k + 1)][0]))) {
                    $xref['trailer']['root'] = $sarr[($k + 1)][1];
                } elseif (('/' == $v[0]) && ('Info' == $v[1]) && (isset($sarr[($k + 1)]) && ('objref' == $sarr[($k + 1)][0]))) {
                    $xref['trailer']['info'] = $sarr[($k + 1)][1];
                } elseif (('/' == $v[0]) && ('Encrypt' == $v[1]) && (isset($sarr[($k + 1)]) && ('objref' == $sarr[($k + 1)][0]))) {
                    $xref['trailer']['encrypt'] = $sarr[($k + 1)][1];
                } elseif (('/' == $v[0]) && ('ID' == $v[1]) && (isset($sarr[($k + 1)]))) {
                    $xref['trailer']['id'] = [];
                    $xref['trailer']['id'][0] = $sarr[($k + 1)][1][0][1];
                    $xref['trailer']['id'][1] = $sarr[($k + 1)][1][1][1];
                }
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            // convert the stream into an array of integers
            $sdata = unpack('C*', $xrefcrs[1][3][0]);

            if (null !== $predictor) {
                // number of bytes in a row: one filter-type byte plus the data columns
                $rowlen = ($columns + 1);
                // split the rows
                $sdata = array_chunk($sdata, $rowlen);

                // initialize decoded array
                $ddata = [];
                // initialize first row with zeros
                $prev_row = array_fill(0, $rowlen, 0);
                // for each row apply PNG unpredictor
                foreach ($sdata as $k => $row) {
                    // initialize new row
                    $ddata[$k] = [];
                    // the first byte of every row selects the PNG filter of that row
                    $rowPredictor = (10 + $row[0]);
                    // for each byte on the row
                    for ($i = 1; $i <= $columns; ++$i) {
                        // new index
                        $j = ($i - 1);
                        // a truncated last row is padded with zero bytes
                        $cur = $row[$i] ?? 0;
                        $row_up = $prev_row[$j];
                        if (1 == $i) {
                            $row_left = 0;
                            $row_upleft = 0;
                        } else {
                            // PNG filters refer to the already *decoded* byte on
                            // the left, not to the still-encoded byte of the row
                            $row_left = $ddata[$k][($j - 1)];
                            $row_upleft = $prev_row[($j - 1)];
                        }
                        switch ($rowPredictor) {
                            case 10:  // PNG prediction (on encoding, PNG None on all rows)
                                $ddata[$k][$j] = $cur;
                                break;

                            case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                                $ddata[$k][$j] = (($cur + $row_left) & 0xff);
                                break;

                            case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                                $ddata[$k][$j] = (($cur + $row_up) & 0xff);
                                break;

                            case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                                // integer average (floor) keeps the operands of '&' integral
                                $ddata[$k][$j] = (($cur + (($row_left + $row_up) >> 1)) & 0xff);
                                break;

                            case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                                // initial estimate
                                $p = ($row_left + $row_up - $row_upleft);
                                // distances
                                $pa = abs($p - $row_left);
                                $pb = abs($p - $row_up);
                                $pc = abs($p - $row_upleft);
                                // pick the nearest neighbour; ties resolve left, up, upper-left
                                if (($pa <= $pb) && ($pa <= $pc)) {
                                    $ddata[$k][$j] = (($cur + $row_left) & 0xff);
                                } elseif ($pb <= $pc) {
                                    $ddata[$k][$j] = (($cur + $row_up) & 0xff);
                                } else {
                                    $ddata[$k][$j] = (($cur + $row_upleft) & 0xff);
                                }
                                break;

                            default:  // PNG prediction (on encoding, PNG optimum)
                                throw new Exception('Unknown PNG predictor: '.$rowPredictor);
                        }
                    }
                    $prev_row = $ddata[$k];
                } // end for each row
            } else {
                // number of bytes in a row; array_sum() may return a float,
                // array_chunk() expects an integer length
                $rowlen = (int) array_sum($wb);
                // split the rows
                $ddata = array_chunk($sdata, $rowlen);
            }

            $sdata = [];

            // for every row
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column (big-endian field)
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref, starting at the first object number given by /Index (if any)
            $obj_num = isset($index_first) ? $index_first : 0;
            foreach ($sdata as $k => $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data

        if (isset($prevxref)) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
472
473 29
    protected function getObjectHeaderPattern($objRefArr): string
    {
        // object number and generation number, each followed by the configured
        // white space sub-pattern, then the keyword "obj"
        $pieces = [
            $objRefArr[0],
            $this->config->getPdfWhitespacesRegex(),
            $objRefArr[1],
            $this->config->getPdfWhitespacesRegex(),
            'obj',
        ];

        return '/'.implode('', $pieces).'/';
    }
478
479 29
    protected function getObjectHeaderLen($objRefArr): int
    {
        // e.g. "4 0 obj": the two numbers plus 2 separator characters and
        // the 3 characters of "obj"
        $numberLen = \strlen($objRefArr[0]) + \strlen($objRefArr[1]);

        return $numberLen + 5;
    }
485
486
    /**
487
     * Get content of indirect object.
488
     *
489
     * @param string $pdfData  PDF data
490
     * @param array  $xref
491
     * @param string $objRef   Object number and generation number separated by underscore character
492
     * @param int    $offset   Object offset
493
     * @param bool   $decoding If true decode streams
494
     *
495
     * @return array containing object data
496
     *
497
     * @throws Exception if invalid object reference found
498
     */
499 29
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        // $objRef has the form "[object number]_[generation number]"
        $objRefArr = explode('_', $objRef);
        if (2 !== \count($objRefArr)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $objHeaderLen = $this->getObjectHeaderLen($objRefArr);

        // move to the header: skip white space, then leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        $headerPattern = $this->getObjectHeaderPattern($objRefArr);
        $headerCandidate = substr($pdfData, $offset, $objHeaderLen);
        if (0 == preg_match($headerPattern, $headerCandidate)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // the object content starts right behind the header
        $offset += $objHeaderLen;
        $objContentArr = [];
        $previous = null; // element read in the previous iteration
        while (true) {
            $oldOffset = $offset;
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // a stream directly preceded by a dictionary gets decoded with the
            // help of that dictionary
            $followsDictionary = isset($previous[0]) && ('<<' === $previous[0]);
            if ($decoding && ('stream' === $element[0]) && $followsDictionary) {
                $element[3] = $this->decodeStream($pdfData, $xref, $previous[1], $element[1]);
            }

            $objContentArr[] = $element;
            $previous = $element;

            // stop at the closing keyword or when the parser made no progress
            if (('endobj' === $element[0]) || ($offset === $oldOffset)) {
                break;
            }
        }

        // the last element read (the closing delimiter) is not part of the content
        array_pop($objContentArr);

        return $objContentArr;
    }
551
552
    /**
553
     * Get the content of object, resolving indirect object reference if necessary.
554
     *
555
     * @param string $pdfData PDF data
556
     * @param array  $obj     Object value
557
     *
558
     * @return array containing object data
559
     *
560
     * @throws Exception
561
     */
562 29
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        // anything that is not an indirect reference is already the value
        if ('objref' != $obj[0]) {
            return $obj;
        }

        $ref = $obj[1];
        if (!isset($this->objects[$ref])) {
            // NOTE(review): decodeXref() stores offsets under $xref['xref'],
            // while this lookup checks the top level of $xref - confirm which
            // shape callers pass in.
            if (!isset($xref[$ref])) {
                // unknown reference: hand the reference back unchanged
                return $obj;
            }
            // parse the object once and keep it for later lookups
            $this->objects[$ref] = $this->getIndirectObject($pdfData, $xref, $ref, $xref[$ref], false);
        }

        return $this->objects[$ref];
    }
579
580
    /**
581
     * Get object type, raw value and offset to next object
582
     *
583
     * @param int $offset Object offset
584
     *
585
     * @return array containing object type, raw value and offset to next object
586
     */
587 30
    protected function getRawObject($pdfData, $offset = 0)
588
    {
589 30
        $objtype = ''; // object type to be returned
590 30
        $objval = ''; // object value to be returned
591
592
        // skip initial white space chars
593 30
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
594
595
        // get first char
596 30
        $char = $pdfData[$offset];
597
        // get object type
598 30
        switch ($char) {
599 30
            case '%':  // \x25 PERCENT SIGN
600
                    // skip comment and search for next token
601
                    $next = strcspn($pdfData, "\r\n", $offset);
602
                    if ($next > 0) {
603
                        $offset += $next;
604
605
                        return $this->getRawObject($pdfData, $offset);
606
                    }
607
                    break;
608
609 30
            case '/':  // \x2F SOLIDUS
610
                    // name object
611 30
                    $objtype = $char;
612 30
                    ++$offset;
613 30
                    $pregResult = preg_match(
614 30
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
615 30
                        substr($pdfData, $offset, 256),
616
                        $matches
617
                    );
618 30
                    if (1 == $pregResult) {
619 30
                        $objval = $matches[1]; // unescaped value
620 30
                        $offset += \strlen($objval);
621
                    }
622 30
                    break;
623
624 30
            case '(':   // \x28 LEFT PARENTHESIS
625 30
            case ')':  // \x29 RIGHT PARENTHESIS
626
                    // literal string object
627 27
                    $objtype = $char;
628 27
                    ++$offset;
629 27
                    $strpos = $offset;
630 27
                    if ('(' == $char) {
631 27
                        $open_bracket = 1;
632 27
                        while ($open_bracket > 0) {
633 27
                            if (!isset($pdfData[$strpos])) {
634
                                break;
635
                            }
636 27
                            $ch = $pdfData[$strpos];
637 27
                            switch ($ch) {
638 27
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
639
                                        // skip next character
640 15
                                        ++$strpos;
641 15
                                        break;
642
643 27
                                case '(':  // LEFT PARENHESIS (28h)
644
                                        ++$open_bracket;
645
                                        break;
646
647 27
                                case ')':  // RIGHT PARENTHESIS (29h)
648 27
                                        --$open_bracket;
649 27
                                        break;
650
                            }
651 27
                            ++$strpos;
652
                        }
653 27
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
654 27
                        $offset = $strpos;
655
                    }
656 27
                    break;
657
658 30
            case '[':   // \x5B LEFT SQUARE BRACKET
659 30
            case ']':  // \x5D RIGHT SQUARE BRACKET
660
                // array object
661 29
                $objtype = $char;
662 29
                ++$offset;
663 29
                if ('[' == $char) {
664
                    // get array content
665 29
                    $objval = [];
666
                    do {
667 29
                        $oldOffset = $offset;
668
                        // get element
669 29
                        $element = $this->getRawObject($pdfData, $offset);
670 29
                        $offset = $element[2];
671 29
                        $objval[] = $element;
672 29
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
673
                    // remove closing delimiter
674 29
                    array_pop($objval);
675
                }
676 29
                break;
677
678 30
            case '<':  // \x3C LESS-THAN SIGN
679 30
            case '>':  // \x3E GREATER-THAN SIGN
680 30
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
681
                    // dictionary object
682 30
                    $objtype = $char.$char;
683 30
                    $offset += 2;
684 30
                    if ('<' == $char) {
685
                        // get array content
686 30
                        $objval = [];
687
                        do {
688 30
                            $oldOffset = $offset;
689
                            // get element
690 30
                            $element = $this->getRawObject($pdfData, $offset);
691 30
                            $offset = $element[2];
692 30
                            $objval[] = $element;
693 30
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
694
                        // remove closing delimiter
695 30
                        array_pop($objval);
696
                    }
697
                } else {
698
                    // hexadecimal string object
699 12
                    $objtype = $char;
700 12
                    ++$offset;
701 12
                    $pregResult = preg_match(
702 12
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
703 12
                            substr($pdfData, $offset),
704
                            $matches
705
                        );
706 12
                    if (('<' == $char) && 1 == $pregResult) {
707
                        // remove white space characters
708 12
                        $objval = strtr($matches[1], $this->config->getPdfWhitespaces(), '');
709 12
                        $offset += \strlen($matches[0]);
710
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
711
                        $offset = $endpos + 1;
712
                    }
713
                }
714 30
                    break;
715
716
            default:
717 30
                    if ('endobj' == substr($pdfData, $offset, 6)) {
718
                        // indirect object
719 29
                        $objtype = 'endobj';
720 29
                        $offset += 6;
721 30
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
722
                        // null object
723 4
                        $objtype = 'null';
724 4
                        $offset += 4;
725 4
                        $objval = 'null';
726 30
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
727
                        // boolean true object
728 9
                        $objtype = 'boolean';
729 9
                        $offset += 4;
730 9
                        $objval = 'true';
731 30
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
732
                        // boolean false object
733 2
                        $objtype = 'boolean';
734 2
                        $offset += 5;
735 2
                        $objval = 'false';
736 30
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
737
                        // start stream object
738 29
                        $objtype = 'stream';
739 29
                        $offset += 6;
740 29
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
741 29
                            $offset += \strlen($matches[0]);
742 29
                            $pregResult = preg_match(
743 29
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
744 29
                                substr($pdfData, $offset),
745
                                $matches,
746 29
                                \PREG_OFFSET_CAPTURE
747
                            );
748 29
                            if (1 == $pregResult) {
749 29
                                $objval = substr($pdfData, $offset, $matches[0][1]);
750 29
                                $offset += $matches[1][1];
751
                            }
752
                        }
753 30
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
754
                        // end stream object
755 29
                        $objtype = 'endstream';
756 29
                        $offset += 9;
757 30
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
758
                        // indirect object reference
759 29
                        $objtype = 'objref';
760 29
                        $offset += \strlen($matches[0]);
761 29
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
762 30
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
763
                        // object start
764 6
                        $objtype = 'obj';
765 6
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
766 6
                        $offset += \strlen($matches[0]);
767 30
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
768
                        // numeric object
769 29
                        $objtype = 'numeric';
770 29
                        $objval = substr($pdfData, $offset, $numlen);
771 29
                        $offset += $numlen;
772
                    }
773 30
                    break;
774
        }
775
776 30
        return [$objtype, $objval, $offset];
777
    }
778
779
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the xref section is resolved in this order:
     *  - $offset is 0: the number following the last "startxref ... %%EOF"
     *    marker found in the document is used;
     *  - $offset points directly at the keyword "xref": $offset is used;
     *  - an "N N obj" header is found at or after $offset: $offset is used
     *    and handled as a cross-reference stream;
     *  - otherwise: the number following the first "startxref ... %%EOF"
     *    marker found at or after $offset is used.
     *
     * @param string $pdfData
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 == $offset) {
            // find last startxref
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $allMatches, \PREG_SET_ORDER, $offset);
            if (0 == $pregResult) {
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($allMatches);
            $startxref = (int) $lastMatch[1];
        } elseif (strpos($pdfData, 'xref', $offset) == $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (preg_match($startxrefPattern, $pdfData, $startxrefMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found.
            // The pattern is evaluated here, with its own result variable: a
            // failed preg_match() empties the array it is given, so sharing one
            // variable with the "obj" probe above would discard this match.
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position
        // (loose comparison kept as before: a failed search returns false,
        // which compares equal to position 0)
        if (strpos($pdfData, 'xref', $startxref) == $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
845
846
    /**
     * Parses raw PDF content into its cross-reference data and document objects.
     *
     * Anything preceding the first "%PDF-" marker is discarded before parsing.
     * Only xref entries with a positive offset are decoded.
     *
     * @param string $data PDF data to parse
     *
     * @return array two elements: the xref and trailer data first, then the
     *               parsed objects, keyed like the entries of the xref table
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData($data)
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the PDF header; the document content starts there
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }
        $pdfData = substr($data, $headerPos);

        // cross-reference table and trailer
        $xref = $this->getXrefData($pdfData);

        // decode every object the xref table points at
        $objects = [];
        foreach ($xref['xref'] as $objectId => $objectOffset) {
            if (isset($objects[$objectId]) || !($objectOffset > 0)) {
                continue;
            }
            $objects[$objectId] = $this->getIndirectObject($pdfData, $xref, $objectId, $objectOffset, true);
        }

        return [$xref, $objects];
    }
883
}
884