Passed
Push — master ( 1048d7...43e436 )
by Konrad
02:47
created

RawDataParser::decodeStream()   D

Complexity

Conditions 21
Paths 51

Size

Total Lines 56
Code Lines 32

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 22
CRAP Score 31.7889

Importance

Changes 2
Bugs 1 Features 1
Metric Value
cc 21
eloc 32
c 2
b 1
f 1
nc 51
nop 4
dl 0
loc 56
ccs 22
cts 31
cp 0.7097
crap 31.7889
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
use Smalot\PdfParser\Config;
45
46
class RawDataParser
47
{
48
    /**
49
     * @var \Smalot\PdfParser\Config
50
     */
51
    private $config;
52
53
    /**
54
     * Configuration array.
55
     */
56
    protected $cfg = [
57
        // if `true` ignore filter decoding errors
58
        'ignore_filter_decoding_errors' => true,
59
        // if `true` ignore missing filter decoding errors
60
        'ignore_missing_filter_decoders' => true,
61
    ];
62
63
    protected $filterHelper;
64
    protected $objects;
65
66
    /**
     * Set up the parser with its option array and its configuration object.
     *
     * @param array       $cfg    Configuration array, default is []; its entries override
     *                            the defaults declared in $this->cfg
     * @param Config|null $config Parser configuration; a fresh Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        // merge given array with default values
        $this->cfg = array_merge($this->cfg, $cfg);

        $this->filterHelper = new FilterHelper();
        // explicit nullable type + null coalescing: implicit nullability is deprecated in PHP 8.4
        $this->config = $config ?? new Config();
    }
77
78
    /**
     * Decode the specified stream.
     *
     * The stream is first cut down to the length declared by /Length (when that is shorter
     * than the raw data), then every filter named by /Filter is applied in order. Filters
     * that are not listed by the filter helper are not applied; they are handed back to the
     * caller instead.
     *
     * @param string $pdfData PDF data
     * @param array  $xref
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching "ignore_*" option is disabled
     */
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name objects that are followed by a value are of interest
            if ('/' !== $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $next = $sdic[$k + 1];

            if ('Length' === $v[1]) {
                if ('numeric' === $next[0]) {
                    // get declared stream length; a negative value would make substr()
                    // trim from the end of the stream, so it is ignored
                    $declength = (int) $next[1];
                    if ($declength >= 0 && $declength < $slength) {
                        $stream = substr($stream, 0, $declength);
                        $slength = $declength;
                    }
                }
            } elseif ('Filter' === $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                if (isset($objval[0]) && \is_array($objval[0])) {
                    // a resolved indirect object is a list of elements: the filter
                    // specification is its first element
                    $objval = $objval[0];
                }
                if ('/' === $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' === $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' === $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        $availableFilters = $this->filterHelper->getAvailableFilters();
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters, true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // messages starting with '~' are treated as "missing decoder" errors;
                // substr() keeps an empty message from triggering a string offset warning
                $missingDecoder = '~' === substr($emsg, 0, 1);
                $ignore = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // keep the original exception reachable through getPrevious()
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
145
146
    /**
     * Decode the Cross-Reference section
     *
     * Walks the entries of a classic cross-reference table and records the offset of every
     * in-use object that is not known yet. The trailer dictionary behind the table is then
     * read (only when no trailer has been stored so far) and, if it carries a /Prev entry,
     * the previous cross-reference section is processed through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found behind the table
     */
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the word 'xref' (4 characters) and the white space that follows it
        $tableStart = $startxref + 4;
        $offset = $tableStart + strspn($pdfData, $this->config->getPdfWhitespaces(), $tableStart);

        // object number described by the next entry
        $objNum = 0;

        // search for cross-reference entries or subsection
        while (preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $entry, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($entry[0][1] != $offset) {
                // the match starts further ahead: we are on another section
                break;
            }
            $offset += \strlen($entry[0][0]);

            switch ($entry[3][0]) {
                case 'n':
                    // in-use entry, indexed as [object number]_[generation number]
                    $index = $objNum.'_'.(int) $entry[2][0];
                    if (!isset($xref['xref'][$index])) {
                        // first definition wins: store object offset position
                        $xref['xref'][$index] = (int) $entry[1][0];
                    }
                    ++$objNum;
                    break;

                case 'f':
                    // free entry: nothing to store, only the numbering advances
                    ++$objNum;
                    break;

                default:
                    // subsection header: its first number is the object number (index)
                    $objNum = (int) $entry[1][0];
                    break;
            }
        }

        // get trailer data
        $trailerFound = preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailerMatch, \PREG_OFFSET_CAPTURE, $offset) > 0;
        if (!$trailerFound) {
            throw new Exception('Unable to find trailer');
        }
        $trailerData = $trailerMatch[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $trailer = [];
            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $hit) > 0) {
                $trailer['size'] = (int) $hit[1];
            }
            if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $hit) > 0) {
                $trailer['root'] = (int) $hit[1].'_'.(int) $hit[2];
            }
            if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $hit) > 0) {
                $trailer['encrypt'] = (int) $hit[1].'_'.(int) $hit[2];
            }
            if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $hit) > 0) {
                $trailer['info'] = (int) $hit[1].'_'.(int) $hit[2];
            }
            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $hit) > 0) {
                $trailer['id'] = [$hit[1], $hit[2]];
            }
            $xref['trailer'] = $trailer;
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $hit) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) $hit[1], $xref);
        }

        return $xref;
    }
220
221
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream object found at $startxref, collects the dictionary entries that
     * describe it (/Type, /Index, /Prev, /W, /DecodeParms and, when no trailer has been
     * stored yet, the trailer entries), reverses the PNG prediction if one is declared and
     * turns every row of the stream into a cross-reference entry.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $index_first = null;
        $prevxref = null;
        // number of bytes (in the decoded stream) of the three fields of an entry
        $wb = [];

        $sarr = $xrefcrs[0][1] ?? [];
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            // every entry of interest is a name followed by its value
            if ('/' !== $v[0] || !isset($sarr[$k + 1])) {
                continue;
            }
            $next = $sarr[$k + 1];

            switch ($v[1]) {
                case 'Type':
                    if ('/' === $next[0] && 'XRef' === $next[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // first object number in the subsection
                    // NOTE(review): only the first pair of /Index is read here; streams with
                    // several subsections would need the remaining pairs as well - confirm
                    if (isset($next[1][0][1])) {
                        $index_first = (int) $next[1][0][1];
                    }
                    break;

                case 'Prev':
                    if ('numeric' === $next[0]) {
                        // get previous xref offset
                        $prevxref = (int) $next[1];
                    }
                    break;

                case 'W':
                    for ($c = 0; $c < 3; ++$c) {
                        $wb[$c] = (int) ($next[1][$c][1] ?? 0);
                    }
                    break;

                case 'DecodeParms':
                    $decpar = $next[1] ?? null;
                    if (!\is_array($decpar)) {
                        break;
                    }
                    foreach ($decpar as $kdc => $vdc) {
                        if ('/' !== $vdc[0] || !isset($decpar[$kdc + 1]) || 'numeric' !== $decpar[$kdc + 1][0]) {
                            continue;
                        }
                        if ('Columns' === $vdc[1]) {
                            $columns = (int) $decpar[$kdc + 1][1];
                        } elseif ('Predictor' === $vdc[1]) {
                            $predictor = (int) $decpar[$kdc + 1][1];
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && 'numeric' === $next[0]) {
                        $xref['trailer']['size'] = $next[1];
                    }
                    break;

                case 'Root':
                    if ($filltrailer && 'objref' === $next[0]) {
                        $xref['trailer']['root'] = $next[1];
                    }
                    break;

                case 'Info':
                    if ($filltrailer && 'objref' === $next[0]) {
                        $xref['trailer']['info'] = $next[1];
                    }
                    break;

                case 'Encrypt':
                    if ($filltrailer && 'objref' === $next[0]) {
                        $xref['trailer']['encrypt'] = $next[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [
                            $next[1][0][1] ?? null,
                            $next[1][1][1] ?? null,
                        ];
                    }
                    break;
            }
        }

        // decode data; without the /W field widths the rows cannot be interpreted
        if ($valid_crs && isset($xrefcrs[1][3][0]) && 3 === \count($wb)) {
            if (null !== $predictor) {
                $ddata = $this->unpredictPngRows($xrefcrs[1][3][0], $columns);
            } else {
                // number of bytes in a row
                $rowlen = (int) array_sum($wb);
                // convert the stream into an array of integers and split the rows;
                // array_chunk() rejects a size below 1, so such a stream yields no rows
                $bytes = unpack('C*', $xrefcrs[1][3][0]) ?: [];
                $ddata = $rowlen > 0 ? array_chunk($bytes, $rowlen) : [];
            }

            // combine the bytes of each row into its three (big-endian) field values
            $sdata = [];
            foreach ($ddata as $k => $row) {
                $sdata[$k] = [0, 0, 0];
                if (0 === $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                for ($c = 0; $c < 3; ++$c) {
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = $index_first ?? 0;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Reverse the PNG row prediction of a cross-reference stream.
     *
     * Every row consists of one filter-type byte followed by $columns data bytes; one byte
     * per pixel is assumed. Bytes missing from a short final row are read as 0.
     *
     * @param string $data    Stream data after the stream filters have been applied
     * @param int    $columns Number of data bytes in a row
     *
     * @return array list of rows, each row being a list of reconstructed byte values
     *
     * @throws Exception if unknown PNG predictor detected
     */
    private function unpredictPngRows(string $data, int $columns): array
    {
        // number of bytes in a row (filter-type byte included)
        $rowlen = $columns + 1;
        if ($rowlen < 1) {
            return [];
        }

        // convert the stream into an array of integers and split the rows
        $rows = array_chunk(unpack('C*', $data) ?: [], $rowlen);

        $decoded = [];
        // the row above the first one is all zeros
        $prev_row = array_fill(0, $rowlen, 0);

        foreach ($rows as $k => $row) {
            $decoded[$k] = [];
            // get PNG predictor value
            $predictor = 10 + $row[0];

            for ($i = 1; $i <= $columns; ++$i) {
                $j = $i - 1;
                $raw = $row[$i] ?? 0;
                $row_up = $prev_row[$j];
                if (1 === $i) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    // PNG filters refer to the already reconstructed left neighbour,
                    // not to the still filtered byte of the input row
                    $row_left = $decoded[$k][$j - 1];
                    $row_upleft = $prev_row[$j - 1];
                }

                switch ($predictor) {
                    case 10:  // PNG prediction (on encoding, PNG None on all rows)
                        $decoded[$k][$j] = $raw;
                        break;

                    case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                        $decoded[$k][$j] = (($raw + $row_left) & 0xff);
                        break;

                    case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                        $decoded[$k][$j] = (($raw + $row_up) & 0xff);
                        break;

                    case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                        // integer division: the average is rounded down and stays an int
                        $decoded[$k][$j] = (($raw + intdiv($row_left + $row_up, 2)) & 0xff);
                        break;

                    case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                        // initial estimate
                        $p = $row_left + $row_up - $row_upleft;
                        // distances
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        // nearest neighbour wins, ties resolved in the order left, up, upper left
                        if ($pa <= $pb && $pa <= $pc) {
                            $nearest = $row_left;
                        } elseif ($pb <= $pc) {
                            $nearest = $row_up;
                        } else {
                            $nearest = $row_upleft;
                        }
                        $decoded[$k][$j] = (($raw + $nearest) & 0xff);
                        break;

                    default:  // PNG prediction (on encoding, PNG optimum)
                        throw new Exception('Unknown PNG predictor: '.$predictor);
                }
            }
            $prev_row = $decoded[$k];
        }

        return $decoded;
    }
472
473 28
    /**
     * Build the regular expression matching the header of an indirect object,
     * i.e. "[object number] [generation number] obj".
     *
     * @param array $objRefArr Object number at index 0, generation number at index 1
     *
     * @return string Pattern including its delimiters
     */
    protected function getObjectHeaderPattern($objRefArr): string
    {
        // consider all whitespace character (PDF specifications)
        $pieces = [
            '/',
            $objRefArr[0],
            $this->config->getPdfWhitespacesRegex(),
            $objRefArr[1],
            $this->config->getPdfWhitespacesRegex(),
            'obj',
            '/',
        ];

        return implode('', $pieces);
    }
478
479 28
    /**
     * Compute the length of an indirect object header such as "4 0 obj".
     *
     * @param array $objRefArr Object number at index 0, generation number at index 1
     *
     * @return int Number of characters of the header
     */
    protected function getObjectHeaderLen($objRefArr): int
    {
        $objectNumberLen = \strlen($objRefArr[0]);
        $generationNumberLen = \strlen($objRefArr[1]);

        // "4 0 obj": 2 whitespaces + strlen("obj") = 5
        return $objectNumberLen + $generationNumberLen + 5;
    }
485
486
    /**
     * Get content of indirect object.
     *
     * The object header is expected at $offset (after optional white space and leading
     * zeros). When it is not found there, the null object is returned. Otherwise the raw
     * elements of the object are collected up to its closing delimiter; a stream that
     * directly follows a dictionary is decoded with that dictionary when $decoding is set.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        // split "[object number]_[generation number]" into its two parts
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $headerLen = $this->getObjectHeaderLen($refParts);

        // check if we are in position: ignore whitespace characters at offset,
        // then leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        $headerMatched = preg_match($this->getObjectHeaderPattern($refParts), substr($pdfData, $offset, $headerLen));
        if (!$headerMatched) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // starting position of object content
        $offset += $headerLen;

        $elements = [];
        $previous = null; // element read in the preceding iteration
        do {
            $startedAt = $offset;

            // get element
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // decode stream using stream's dictionary information
            $followsDictionary = isset($previous[0]) && ('<<' === $previous[0]);
            if ($decoding && ('stream' === $element[0]) && $followsDictionary) {
                $element[3] = $this->decodeStream($pdfData, $xref, $previous[1], $element[1]);
            }

            $elements[] = $element;
            $previous = $element;
            // stop at the end of the object, or when the parser makes no progress
        } while (('endobj' !== $element[0]) && ($offset !== $startedAt));

        // remove closing delimiter
        array_pop($elements);

        return $elements;
    }
551
552
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Objects resolved once are kept in $this->objects and served from there afterwards.
     * Anything that is not an object reference, or whose offset is unknown, is returned
     * unchanged.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data; offsets are looked up directly by
     *                        reference and under its 'xref' key
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws Exception
     */
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        if ('objref' != $obj[0]) {
            return $obj;
        }

        // reference to indirect object
        $ref = $obj[1];
        if (isset($this->objects[$ref])) {
            // this object has been already parsed
            return $this->objects[$ref];
        }

        // The decoders of this class store offsets in $xref['xref'][<reference>]; the flat
        // lookup is tried first so that callers passing a plain offset map keep working.
        $offset = $xref[$ref] ?? $xref['xref'][$ref] ?? null;
        if (null === $offset) {
            return $obj;
        }

        // parse new object
        $this->objects[$ref] = $this->getIndirectObject($pdfData, $xref, $ref, $offset, false);

        return $this->objects[$ref];
    }
579
580
    /**
581
     * Get object type, raw value and offset to next object
582
     *
583
     * @param int $offset Object offset
584
     *
585
     * @return array containing object type, raw value and offset to next object
586
     */
587 29
    protected function getRawObject($pdfData, $offset = 0)
588
    {
589 29
        $objtype = ''; // object type to be returned
590 29
        $objval = ''; // object value to be returned
591
592
        // skip initial white space chars
593 29
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
594
595
        // get first char
596 29
        $char = $pdfData[$offset];
597
        // get object type
598 29
        switch ($char) {
599 29
            case '%':  // \x25 PERCENT SIGN
600
                    // skip comment and search for next token
601
                    $next = strcspn($pdfData, "\r\n", $offset);
602
                    if ($next > 0) {
603
                        $offset += $next;
604
605
                        return $this->getRawObject($pdfData, $offset);
606
                    }
607
                    break;
608
609 29
            case '/':  // \x2F SOLIDUS
610
                    // name object
611 29
                    $objtype = $char;
612 29
                    ++$offset;
613 29
                    $pregResult = preg_match(
614 29
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
615 29
                        substr($pdfData, $offset, 256),
616
                        $matches
617
                    );
618 29
                    if (1 == $pregResult) {
619 29
                        $objval = $matches[1]; // unescaped value
620 29
                        $offset += \strlen($objval);
621
                    }
622 29
                    break;
623
624 29
            case '(':   // \x28 LEFT PARENTHESIS
625 29
            case ')':  // \x29 RIGHT PARENTHESIS
626
                    // literal string object
627 26
                    $objtype = $char;
628 26
                    ++$offset;
629 26
                    $strpos = $offset;
630 26
                    if ('(' == $char) {
631 26
                        $open_bracket = 1;
632 26
                        while ($open_bracket > 0) {
633 26
                            if (!isset($pdfData[$strpos])) {
634
                                break;
635
                            }
636 26
                            $ch = $pdfData[$strpos];
637 26
                            switch ($ch) {
638 26
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
639
                                        // skip next character
640 15
                                        ++$strpos;
641 15
                                        break;
642
643 26
                                case '(':  // LEFT PARENHESIS (28h)
644
                                        ++$open_bracket;
645
                                        break;
646
647 26
                                case ')':  // RIGHT PARENTHESIS (29h)
648 26
                                        --$open_bracket;
649 26
                                        break;
650
                            }
651 26
                            ++$strpos;
652
                        }
653 26
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
654 26
                        $offset = $strpos;
655
                    }
656 26
                    break;
657
658 29
            case '[':   // \x5B LEFT SQUARE BRACKET
659 29
            case ']':  // \x5D RIGHT SQUARE BRACKET
660
                // array object
661 28
                $objtype = $char;
662 28
                ++$offset;
663 28
                if ('[' == $char) {
664
                    // get array content
665 28
                    $objval = [];
666
                    do {
667 28
                        $oldOffset = $offset;
668
                        // get element
669 28
                        $element = $this->getRawObject($pdfData, $offset);
670 28
                        $offset = $element[2];
671 28
                        $objval[] = $element;
672 28
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
673
                    // remove closing delimiter
674 28
                    array_pop($objval);
675
                }
676 28
                break;
677
678 29
            case '<':  // \x3C LESS-THAN SIGN
679 29
            case '>':  // \x3E GREATER-THAN SIGN
680 29
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
681
                    // dictionary object
682 29
                    $objtype = $char.$char;
683 29
                    $offset += 2;
684 29
                    if ('<' == $char) {
685
                        // get array content
686 29
                        $objval = [];
687
                        do {
688 29
                            $oldOffset = $offset;
689
                            // get element
690 29
                            $element = $this->getRawObject($pdfData, $offset);
691 29
                            $offset = $element[2];
692 29
                            $objval[] = $element;
693 29
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
694
                        // remove closing delimiter
695 29
                        array_pop($objval);
696
                    }
697
                } else {
698
                    // hexadecimal string object
699 11
                    $objtype = $char;
700 11
                    ++$offset;
701 11
                    $pregResult = preg_match(
702 11
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
703 11
                            substr($pdfData, $offset),
704
                            $matches
705
                        );
706 11
                    if (('<' == $char) && 1 == $pregResult) {
707
                        // remove white space characters
708 11
                        $objval = strtr($matches[1], $this->config->getPdfWhitespaces(), '');
709 11
                        $offset += \strlen($matches[0]);
710
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
711
                        $offset = $endpos + 1;
712
                    }
713
                }
714 29
                    break;
715
716
            default:
717 29
                    if ('endobj' == substr($pdfData, $offset, 6)) {
718
                        // indirect object
719 28
                        $objtype = 'endobj';
720 28
                        $offset += 6;
721 29
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
722
                        // null object
723 3
                        $objtype = 'null';
724 3
                        $offset += 4;
725 3
                        $objval = 'null';
726 29
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
727
                        // boolean true object
728 8
                        $objtype = 'boolean';
729 8
                        $offset += 4;
730 8
                        $objval = 'true';
731 29
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
732
                        // boolean false object
733 1
                        $objtype = 'boolean';
734 1
                        $offset += 5;
735 1
                        $objval = 'false';
736 29
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
737
                        // start stream object
738 28
                        $objtype = 'stream';
739 28
                        $offset += 6;
740 28
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
741 28
                            $offset += \strlen($matches[0]);
742 28
                            $pregResult = preg_match(
743 28
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
744 28
                                substr($pdfData, $offset),
745
                                $matches,
746 28
                                \PREG_OFFSET_CAPTURE
747
                            );
748 28
                            if (1 == $pregResult) {
749 28
                                $objval = substr($pdfData, $offset, $matches[0][1]);
750 28
                                $offset += $matches[1][1];
751
                            }
752
                        }
753 29
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
754
                        // end stream object
755 28
                        $objtype = 'endstream';
756 28
                        $offset += 9;
757 29
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
758
                        // indirect object reference
759 28
                        $objtype = 'objref';
760 28
                        $offset += \strlen($matches[0]);
761 28
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
762 29
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
763
                        // object start
764 5
                        $objtype = 'obj';
765 5
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
766 5
                        $offset += \strlen($matches[0]);
767 29
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
768
                        // numeric object
769 28
                        $objtype = 'numeric';
770 28
                        $objval = substr($pdfData, $offset, $numlen);
771 28
                        $offset += $numlen;
772
                    }
773 29
                    break;
774
        }
775
776 29
        return [$objtype, $objval, $offset];
777
    }
778
779
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * Resolution order for the position of the cross-reference section:
     *  - $offset is 0: the number announced by the last "startxref ... %%EOF" in the data is used;
     *  - $offset points directly at the "xref" keyword: $offset is used;
     *  - an "<num> <num> obj" header is found at or after $offset: $offset is used
     *    (cross-reference stream);
     *  - otherwise the number announced by the first "startxref ... %%EOF" found
     *    at or after $offset is used.
     *
     * @param string $pdfData
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';
        $dataLength = \strlen($pdfData);

        if (0 == $offset) {
            // find last startxref
            $found = preg_match_all($startxrefPattern, $pdfData, $allStartxref, \PREG_SET_ORDER);
            if (!$found) {
                throw new Exception('Unable to find startxref');
            }
            $lastStartxref = end($allStartxref);
            $startxref = (int) $lastStartxref[1];
        } else {
            $offset = (int) $offset;

            // An offset past the end of the data can not point at anything; bail out
            // before strpos() is called, which raises a ValueError (not an Exception)
            // for such offsets on PHP 8.
            if ($offset > $dataLength) {
                throw new Exception('Unable to find xref (PDF corrupted?)');
            }

            if (strpos($pdfData, 'xref', $offset) === $offset) {
                // Already pointing at the xref table
                $startxref = $offset;
            } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatch, \PREG_OFFSET_CAPTURE, $offset)) {
                // Cross-Reference Stream object
                $startxref = $offset;
            } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatch, \PREG_OFFSET_CAPTURE, $offset)) {
                // startxref found; a dedicated match variable is used because a failed
                // preg_match() resets its $matches argument to an empty array
                $startxref = (int) $startxrefMatch[1][0];
            } else {
                throw new Exception('Unable to find startxref');
            }
        }

        if ($startxref > $dataLength) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position; strict comparison so that a failed search (false)
        // is never mistaken for position 0
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
845
846
    /**
     * Parses PDF data and returns extracted data as array.
     *
     * Everything in front of the "%PDF-" marker is dropped before the
     * cross-reference data is read and the referenced objects are decoded.
     *
     * @param string $data PDF data to parse
     *
     * @return array two elements: the xref/trailer data, followed by the
     *               parsed PDF document objects keyed like the xref entries
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData($data)
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the PDF header
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }

        // PDF content starts at the header
        $pdfData = substr($data, $headerPos);

        // xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // decode every object that has a positive offset and was not decoded yet
        $objects = [];
        foreach ($xref['xref'] as $obj => $offset) {
            if (isset($objects[$obj]) || !($offset > 0)) {
                continue;
            }
            $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true);
        }

        return [$xref, $objects];
    }
883
}
884