Test Failed
Pull Request — master (#411)
by
unknown
02:25
created

RawDataParser::getIndirectObject()   B

Complexity

Conditions 9
Paths 4

Size

Total Lines 51
Code Lines 22

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 18
CRAP Score 9.081

Importance

Changes 5
Bugs 2 Features 1
Metric Value
cc 9
eloc 22
c 5
b 2
f 1
nc 4
nop 5
dl 0
loc 51
ccs 18
cts 20
cp 0.9
crap 9.081
rs 8.0555

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
use Smalot\PdfParser\Config;
45
46
class RawDataParser
47
{
48
    /**
49
     * @var \Smalot\PdfParser\Config
50
     */
51
    private $config;
52
53
    /**
54
     * Configuration array.
55
     */
56
    protected $cfg = [
57
        // if `true` ignore filter decoding errors
58
        'ignore_filter_decoding_errors' => true,
59
        // if `true` ignore missing filter decoding errors
60
        'ignore_missing_filter_decoders' => true,
61
    ];
62
63 33
    protected $filterHelper;
64
    protected $objects;
65
66 33
    /**
     * Create a raw-data parser.
     *
     * @param array       $cfg    Configuration overrides; merged over the defaults in $this->cfg,
     *                            so keys given here win. Default is []
     * @param Config|null $config Parser configuration; a new Config instance is created when
     *                            null (or nothing) is passed
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        // merge given array with default values
        $this->cfg = array_merge($this->cfg, $cfg);

        $this->filterHelper = new FilterHelper();
        // explicit null check: fall back to a default Config only when none was injected
        $this->config = $config ?? new Config();
    }
77
78
    /**
     * Decode the specified stream.
     *
     * The stream dictionary is scanned for two keys: `Length` (the stream is cut down to the
     * declared length when that is shorter than the raw data) and `Filter` (a single name or an
     * array of names). Every filter known to the filter helper is applied in order; unknown
     * filters are collected and returned untouched.
     *
     * @param string $pdfData PDF data
     * @param array  $xref
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails to decode and the matching `ignore_*` option is disabled
     */
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only a name object followed by a value can be one of the keys we need
            if ('/' !== $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $value = $sdic[$k + 1];

            if ('Length' === $v[1] && 'numeric' === $value[0]) {
                // get declared stream length; a negative value is invalid and is ignored,
                // otherwise substr() would silently trim bytes from the end of the stream
                $declength = (int) $value[1];
                if ($declength >= 0 && $declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' === $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $value);
                if ('/' === $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' === $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' === $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $availableFilters = $this->filterHelper->getAvailableFilters();
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters, true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with '~' marks a missing decoder; anything else
                // (including an empty message) is treated as a decoding error
                $missingDecoder = '' !== $emsg && '~' === $emsg[0];
                $ignore = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // keep the original exception as "previous" so its trace is not lost
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
145
146
    /**
     * Parse a classic cross-reference table together with the trailer that follows it.
     *
     * In-use entries (flag `n`) are stored in $xref['xref'] under the key
     * "[object number]_[generation number]" unless that key already exists. Trailer values
     * (size, root, encrypt, info, id) are only read when $xref carries no trailer yet. When the
     * trailer names a `Prev` offset, that earlier section is loaded through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found after the table
     */
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the word 'xref' (4 characters) ...
        $startxref += 4;
        // ... and over the white space that follows it
        $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref);

        // running object number of the current subsection
        $objNum = 0;
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';
        while (preg_match($entryPattern, $pdfData, $entry, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($entry[0][1] != $offset) {
                // the next match is not adjacent: we are on another section
                break;
            }
            $offset += \strlen($entry[0][0]);

            $flag = $entry[3][0];
            if ('n' != $flag && 'f' != $flag) {
                // subsection header: first number is the starting object number (index)
                $objNum = (int) ($entry[1][0]);
                continue;
            }

            if ('n' == $flag) {
                // unique object index: [object number]_[generation number]
                $index = $objNum.'_'.(int) ($entry[2][0]);
                // never overwrite an object that is already known
                if (!isset($xref['xref'][$index])) {
                    // store object offset position
                    $xref['xref'][$index] = (int) ($entry[1][0]);
                }
            }
            // both in-use and free entries consume one object number
            ++$objNum;
        }

        // get trailer data
        $trailerFound = preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailer, \PREG_OFFSET_CAPTURE, $offset) > 0;
        if (!$trailerFound) {
            throw new Exception('Unable to find trailer');
        }
        $trailerData = $trailer[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $m) > 0) {
                $xref['trailer']['size'] = (int) ($m[1]);
            }
            if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $m) > 0) {
                $xref['trailer']['root'] = (int) ($m[1]).'_'.(int) ($m[2]);
            }
            if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $m) > 0) {
                $xref['trailer']['encrypt'] = (int) ($m[1]).'_'.(int) ($m[2]);
            }
            if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $m) > 0) {
                $xref['trailer']['info'] = (int) ($m[1]).'_'.(int) ($m[2]);
            }
            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $m) > 0) {
                $xref['trailer']['id'] = [];
                $xref['trailer']['id'][0] = $m[1];
                $xref['trailer']['id'][1] = $m[2];
            }
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $m) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) ($m[1]), $xref);
        }

        return $xref;
    }
220
221
    /**
     * Decode the Cross-Reference Stream section.
     *
     * Reads the object located at $startxref, interprets its dictionary (Type, Index, Prev, W,
     * DecodeParms and, when $xref has no trailer yet, Size/Root/Info/Encrypt/ID), reverses the
     * PNG predictor when DecodeParms declares one, and merges the decoded entries into
     * $xref['xref']. When the dictionary names a `Prev` offset, that earlier section is loaded
     * through getXrefData().
     *
     * Nothing is decoded when the dictionary is not of type XRef, when no decoded stream data is
     * available, or when the `W` field widths are missing / sum up to zero.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $index_first = null;
        $prevxref = null;
        // field widths default to zero so a missing /W never yields undefined offsets
        $wb = [0, 0, 0];

        $sarr = $xrefcrs[0][1] ?? null;
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            // every key handled below is a name object followed by its value
            if ('/' != $v[0] || !isset($sarr[$k + 1])) {
                continue;
            }
            $val = $sarr[$k + 1];

            switch ($v[1]) {
                case 'Type':
                    if ('/' == $val[0] && 'XRef' == $val[1]) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // first object number in the subsection
                    $index_first = (int) ($val[1][0][1] ?? 0);
                    break;

                case 'Prev':
                    if ('numeric' == $val[0]) {
                        // get previous xref offset
                        $prevxref = (int) ($val[1]);
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of the corresponding field
                    $wb[0] = (int) ($val[1][0][1] ?? 0);
                    $wb[1] = (int) ($val[1][1][1] ?? 0);
                    $wb[2] = (int) ($val[1][2][1] ?? 0);
                    break;

                case 'DecodeParms':
                    $decpar = $val[1] ?? null;
                    if (!\is_array($decpar)) {
                        break;
                    }
                    foreach ($decpar as $kdc => $vdc) {
                        if ('/' != $vdc[0] || !isset($decpar[$kdc + 1]) || 'numeric' != $decpar[$kdc + 1][0]) {
                            continue;
                        }
                        if ('Columns' == $vdc[1]) {
                            $columns = (int) ($decpar[$kdc + 1][1]);
                        } elseif ('Predictor' == $vdc[1]) {
                            $predictor = (int) ($decpar[$kdc + 1][1]);
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && 'numeric' == $val[0]) {
                        $xref['trailer']['size'] = $val[1];
                    }
                    break;

                case 'Root':
                case 'Info':
                case 'Encrypt':
                    if ($filltrailer && 'objref' == $val[0]) {
                        // stored under the lower-cased key: 'root', 'info' or 'encrypt'
                        $xref['trailer'][strtolower($v[1])] = $val[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [];
                        $xref['trailer']['id'][0] = $val[1][0][1] ?? null;
                        $xref['trailer']['id'][1] = $val[1][1][1] ?? null;
                    }
                    break;
            }
        }

        // total number of bytes of one decoded entry; array_chunk() rejects a size below 1
        $entryLen = (int) array_sum($wb);

        // decode data
        if ($valid_crs && $entryLen > 0 && isset($xrefcrs[1][3][0])) {
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (!\is_array($bytes)) {
                $bytes = [];
            }

            if (null !== $predictor) {
                // number of bytes in a row: one predictor tag byte plus the data columns
                $rowlen = ($columns + 1);
                // split the rows
                $rows = array_chunk($bytes, $rowlen);
                // initialize decoded array
                $ddata = [];
                // initialize first row with zeros
                $prev_row = array_fill(0, $rowlen, 0);
                // for each row apply PNG unpredictor
                foreach ($rows as $k => $row) {
                    // initialize new row
                    $ddata[$k] = [];
                    // get PNG predictor value of this row
                    $rowPredictor = (10 + $row[0]);
                    // for each byte on the row
                    for ($i = 1; $i <= $columns; ++$i) {
                        // new index
                        $j = ($i - 1);
                        // a truncated final row is padded with zeros
                        $cur = $row[$i] ?? 0;
                        $row_up = $prev_row[$j] ?? 0;
                        if (1 == $i) {
                            $row_left = 0;
                            $row_upleft = 0;
                        } else {
                            // PNG reconstruction uses the already *decoded* byte to the left
                            $row_left = $ddata[$k][$j - 1];
                            $row_upleft = $prev_row[$j - 1] ?? 0;
                        }

                        switch ($rowPredictor) {
                            case 10:  // PNG prediction (on encoding, PNG None on all rows)
                                $ddata[$k][$j] = $cur;
                                break;

                            case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                                $ddata[$k][$j] = (($cur + $row_left) & 0xff);
                                break;

                            case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                                $ddata[$k][$j] = (($cur + $row_up) & 0xff);
                                break;

                            case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                                // integer division: a float operand to `&` is deprecated
                                $ddata[$k][$j] = (($cur + intdiv($row_left + $row_up, 2)) & 0xff);
                                break;

                            case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                                // initial estimate
                                $p = ($row_left + $row_up - $row_upleft);
                                // distances
                                $pa = abs($p - $row_left);
                                $pb = abs($p - $row_up);
                                $pc = abs($p - $row_upleft);
                                $pmin = min($pa, $pb, $pc);
                                // pick the neighbour with minimum distance (ties: left, up, up-left)
                                if ($pmin == $pa) {
                                    $ddata[$k][$j] = (($cur + $row_left) & 0xff);
                                } elseif ($pmin == $pb) {
                                    $ddata[$k][$j] = (($cur + $row_up) & 0xff);
                                } else {
                                    $ddata[$k][$j] = (($cur + $row_upleft) & 0xff);
                                }
                                break;

                            default:  // PNG prediction (on encoding, PNG optimum)
                                throw new Exception('Unknown PNG predictor: '.$rowPredictor);
                        }
                    }
                    $prev_row = $ddata[$k];
                } // end for each row
            } else {
                // no predictor: rows are simply the concatenated fields
                $ddata = array_chunk($bytes, $entryLen);
            }

            $sdata = [];

            // for every row
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column (big-endian)
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = $index_first ?? 0;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
472
473
    /**
     * Build the regular expression matching the header of an indirect object.
     *
     * The pattern is "[object number][whitespace][generation number][whitespace]obj", where the
     * whitespace part is whatever the configuration returns from getPdfWhitespacesRegex().
     *
     * @param array $objRefArr Object number at index 0, generation number at index 1
     *
     * @return string Delimited regular expression
     */
    protected function getObjectHeaderPattern($objRefArr): string
    {
        // consider all whitespace character (PDF specifications)
        $whitespace = $this->config->getPdfWhitespacesRegex();
        $pattern = $objRefArr[0].$whitespace.$objRefArr[1].$whitespace.'obj';

        return '/'.$pattern.'/';
    }
478
479 29
    /**
     * Compute the length of an indirect object header such as "4 0 obj".
     *
     * @param array $objRefArr Object number at index 0, generation number at index 1
     *
     * @return int Length of both numbers plus 5 (two separators and the word "obj")
     */
    protected function getObjectHeaderLen($objRefArr): int
    {
        $objectNumberLen = \strlen($objRefArr[0]);
        $generationNumberLen = \strlen($objRefArr[1]);
        // 2 whitespaces + strlen("obj") = 5
        $fixedLen = 5;

        return $fixedLen + $objectNumberLen + $generationNumberLen;
    }
485 29
486 29
    /**
     * Get content of indirect object.
     *
     * After skipping whitespace and leading zeros at $offset, the data must start with the
     * header "[object number] [generation number] obj"; otherwise the null object is returned.
     * The elements that follow are read one by one with getRawObject() until `endobj` is met or
     * the read position stops advancing. The last element read is dropped from the result.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        // split "[object number]_[generation number]" into its two parts
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new Exception('Invalid object reference for $obj.');
        }

        $headerLen = $this->getObjectHeaderLen($refParts);

        // move past whitespace characters at offset, then past leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        // check that the expected header really sits at this position
        $headerPattern = $this->getObjectHeaderPattern($refParts);
        $headerCandidate = substr($pdfData, $offset, $headerLen);
        if (0 == preg_match($headerPattern, $headerCandidate)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // starting position of object content
        $offset += $headerLen;
        $content = [];
        while (true) {
            $previousOffset = $offset;
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // a stream directly preceded by a dictionary is decoded with that dictionary
            $lastIndex = \count($content) - 1;
            $followsDictionary = isset($content[$lastIndex][0]) && ('<<' === $content[$lastIndex][0]);
            if ($decoding && ('stream' === $element[0]) && $followsDictionary) {
                $element[3] = $this->decodeStream($pdfData, $xref, $content[$lastIndex][1], $element[1]);
            }
            $content[] = $element;

            // stop on the closing keyword or when no progress was made
            if (('endobj' === $element[0]) || ($offset === $previousOffset)) {
                break;
            }
        }
        // remove closing delimiter
        array_pop($content);

        return $content;
    }
551
552
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Anything that is not an `objref` is returned unchanged. A reference is answered from the
     * $this->objects cache when possible; otherwise, if $xref has an entry for it, the object is
     * parsed (without stream decoding), cached and returned. An unresolvable reference is
     * returned unchanged.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Looked up directly by object reference
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws Exception
     */
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        if ('objref' != $obj[0]) {
            // plain value: nothing to resolve
            return $obj;
        }

        $ref = $obj[1];
        if (isset($this->objects[$ref])) {
            // this object has been already parsed
            return $this->objects[$ref];
        }

        // NOTE(review): offsets are looked up as $xref[$ref]; other methods store them under
        // $xref['xref'][...] - confirm which shape callers pass here
        if (!isset($xref[$ref])) {
            return $obj;
        }

        // parse new object and remember it
        $this->objects[$ref] = $this->getIndirectObject($pdfData, $xref, $ref, $xref[$ref], false);

        return $this->objects[$ref];
    }
579
580
    /**
581
     * Get object type, raw value and offset to next object
582 30
     *
583
     * @param int $offset Object offset
584
     *
585 30
     * @return array containing object type, raw value and offset to next object
586
     */
587 30
    protected function getRawObject($pdfData, $offset = 0)
588 30
    {
589
        $objtype = ''; // object type to be returned
590
        $objval = ''; // object value to be returned
591
592
        // skip initial white space chars
593
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
594
595
        // get first char
596
        $char = $pdfData[$offset];
597
        // get object type
598 30
        switch ($char) {
599
            case '%':  // \x25 PERCENT SIGN
600 30
                    // skip comment and search for next token
601 30
                    $next = strcspn($pdfData, "\r\n", $offset);
602 30
                    if ($next > 0) {
603 30
                        $offset += $next;
604 30
605
                        return $this->getRawObject($pdfData, $offset);
606
                    }
607 30
                    break;
608 30
609 30
            case '/':  // \x2F SOLIDUS
610
                    // name object
611 30
                    $objtype = $char;
612
                    ++$offset;
613 30
                    $pregResult = preg_match(
614 30
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
615
                        substr($pdfData, $offset, 256),
616 27
                        $matches
617 27
                    );
618 27
                    if (1 == $pregResult) {
619 27
                        $objval = $matches[1]; // unescaped value
620 27
                        $offset += \strlen($objval);
621 27
                    }
622 27
                    break;
623
624
            case '(':   // \x28 LEFT PARENTHESIS
625 27
            case ')':  // \x29 RIGHT PARENTHESIS
626 27
                    // literal string object
627 27
                    $objtype = $char;
628
                    ++$offset;
629 15
                    $strpos = $offset;
630 15
                    if ('(' == $char) {
631
                        $open_bracket = 1;
632 27
                        while ($open_bracket > 0) {
633
                            if (!isset($pdfData[$strpos])) {
634
                                break;
635
                            }
636 27
                            $ch = $pdfData[$strpos];
637 27
                            switch ($ch) {
638 27
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
639
                                        // skip next character
640 27
                                        ++$strpos;
641
                                        break;
642 27
643 27
                                case '(':  // LEFT PARENHESIS (28h)
644
                                        ++$open_bracket;
645 27
                                        break;
646
647 30
                                case ')':  // RIGHT PARENTHESIS (29h)
648 30
                                        --$open_bracket;
649
                                        break;
650 29
                            }
651 29
                            ++$strpos;
652 29
                        }
653
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
654 29
                        $offset = $strpos;
655
                    }
656 29
                    break;
657
658 29
            case '[':   // \x5B LEFT SQUARE BRACKET
659 29
            case ']':  // \x5D RIGHT SQUARE BRACKET
660 29
                // array object
661 29
                $objtype = $char;
662
                ++$offset;
663 29
                if ('[' == $char) {
664
                    // get array content
665 29
                    $objval = [];
666
                    do {
667 30
                        $oldOffset = $offset;
668 30
                        // get element
669 30
                        $element = $this->getRawObject($pdfData, $offset);
670
                        $offset = $element[2];
671 30
                        $objval[] = $element;
672 30
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
673 30
                    // remove closing delimiter
674
                    array_pop($objval);
675 30
                }
676
                break;
677 30
678
            case '<':  // \x3C LESS-THAN SIGN
679 30
            case '>':  // \x3E GREATER-THAN SIGN
680 30
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
681 30
                    // dictionary object
682 30
                    $objtype = $char.$char;
683
                    $offset += 2;
684 30
                    if ('<' == $char) {
685
                        // get array content
686
                        $objval = [];
687
                        do {
688 12
                            $oldOffset = $offset;
689 12
                            // get element
690 12
                            $element = $this->getRawObject($pdfData, $offset);
691 12
                            $offset = $element[2];
692 12
                            $objval[] = $element;
693
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
694
                        // remove closing delimiter
695 12
                        array_pop($objval);
696
                    }
697 12
                } else {
698 12
                    // hexadecimal string object
699
                    $objtype = $char;
700
                    ++$offset;
701
                    $pregResult = preg_match(
702
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
703 30
                            substr($pdfData, $offset),
704
                            $matches
705
                        );
706 30
                    if (('<' == $char) && 1 == $pregResult) {
707
                        // remove white space characters
708 29
                        $objval = strtr($matches[1], $this->config->getPdfWhitespaces(), '');
709 29
                        $offset += \strlen($matches[0]);
710 30
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
711
                        $offset = $endpos + 1;
712 4
                    }
713 4
                }
714 4
                    break;
715 30
716
            default:
717 9
                    if ('endobj' == substr($pdfData, $offset, 6)) {
718 9
                        // indirect object
719 9
                        $objtype = 'endobj';
720 30
                        $offset += 6;
721
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
722 2
                        // null object
723 2
                        $objtype = 'null';
724 2
                        $offset += 4;
725 30
                        $objval = 'null';
726
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
727 29
                        // boolean true object
728 29
                        $objtype = 'boolean';
729 29
                        $offset += 4;
730 29
                        $objval = 'true';
731 29
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
732 29
                        // boolean false object
733 29
                        $objtype = 'boolean';
734
                        $offset += 5;
735 29
                        $objval = 'false';
736
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
737 29
                        // start stream object
738 29
                        $objtype = 'stream';
739 29
                        $offset += 6;
740
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
741
                            $offset += \strlen($matches[0]);
742 30
                            $pregResult = preg_match(
743
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
744 29
                                substr($pdfData, $offset),
745 29
                                $matches,
746 30
                                \PREG_OFFSET_CAPTURE
747
                            );
748 29
                            if (1 == $pregResult) {
749 29
                                $objval = substr($pdfData, $offset, $matches[0][1]);
750 29
                                $offset += $matches[1][1];
751 30
                            }
752
                        }
753 6
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
754 6
                        // end stream object
755 6
                        $objtype = 'endstream';
756 30
                        $offset += 9;
757
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
758 29
                        // indirect object reference
759 29
                        $objtype = 'objref';
760 29
                        $offset += \strlen($matches[0]);
761
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
762 30
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
763
                        // object start
764
                        $objtype = 'obj';
765 30
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
766
                        $offset += \strlen($matches[0]);
767
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
768
                        // numeric object
769
                        $objtype = 'numeric';
770
                        $objval = substr($pdfData, $offset, $numlen);
771
                        $offset += $numlen;
772
                    }
773
                    break;
774
        }
775
776
        return [$objtype, $objval, $offset];
777
    }
778
779
    /**
780 30
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
781
     *
782 30
     * @param string $pdfData
783 30
     * @param int    $offset  xref offset (if known)
784
     * @param array  $xref    previous xref array (if any)
785
     *
786 30
     * @return array containing xref and trailer data
787
     *
788
     * @throws Exception if it was unable to find startxref
789
     * @throws Exception if it was unable to find xref
790 30
     */
791
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        // Work with an integer offset so the strict position checks below
        // cannot be fooled by numeric strings or null.
        $offset = (int) $offset;

        // Pattern matching a "startxref <number> %%EOF" section; the captured
        // group is the byte position the section points to.
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 === $offset) {
            // No offset given: take the last startxref section of the data.
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $allStartxref, \PREG_SET_ORDER);
            if (!$pregResult) {
                throw new Exception('Unable to find startxref');
            }
            $lastStartxref = array_pop($allStartxref);
            $startxref = (int) $lastStartxref[1];
        } elseif (strpos($pdfData, 'xref', $offset) === $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, 0, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatches, 0, $offset)) {
            // startxref found after $offset. The result lives in its own
            // variable: preg_match() resets the matches array even when it
            // finds nothing, so sharing one array with the "obj" check above
            // would wipe the captured position before it is read here.
            $startxref = (int) $startxrefMatches[1];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position; the strict comparison keeps a failed search
        // (false) from being treated as a hit at position 0
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }

        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
845 30
846
    /**
847 30
     * Parses PDF data and returns extracted data as array.
848
     *
849
     * @param string $data PDF data to parse
850
     *
851 30
     * @return array array of parsed PDF document objects
852
     *
853
     * @throws Exception if empty PDF data given
854
     * @throws Exception if PDF data missing %PDF header
855
     */
856 30
    public function parseData($data)
    {
        // Refuse anything PHP treats as empty before looking for the header.
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // Locate the "%PDF-" marker; everything in front of it is dropped.
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }
        $pdfData = substr($data, $headerPos);

        // Cross-reference and trailer data for the trimmed document.
        $xref = $this->getXrefData($pdfData);

        // Decode every object the xref lists with a positive offset.
        $objects = [];
        foreach ($xref['xref'] as $objectId => $objectOffset) {
            if (isset($objects[$objectId]) || !($objectOffset > 0)) {
                continue;
            }

            $objects[$objectId] = $this->getIndirectObject($pdfData, $xref, $objectId, $objectOffset, true);
        }

        return [$xref, $objects];
    }
883
}
884