Test Failed
Pull Request — master (#411)
by
unknown
02:00
created

RawDataParser::getRawObject()   F

Complexity

Conditions 39
Paths 36

Size

Total Lines 190
Code Lines 134

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 114
CRAP Score 39.7964

Importance

Changes 5
Bugs 2 Features 1
Metric Value
cc 39
eloc 134
c 5
b 2
f 1
nc 36
nop 2
dl 0
loc 190
ccs 114
cts 124
cp 0.9194
crap 39.7964
rs 3.3333

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
48
     * Configuration array.
49
     */
50
    protected $cfg = [
51
        // if `true` ignore filter decoding errors
52
        'ignore_filter_decoding_errors' => true,
53
        // if `true` ignore missing filter decoding errors
54
        'ignore_missing_filter_decoders' => true,
55
    ];
56
57
    protected $filterHelper;
58
    protected $objects;
59
60
    // (NUL, HT, LF, FF, CR, SP)
61
    protected $pdfWhitespaces = "\0\t\n\f\r ";
62
    protected $pdfWhitespacesRegex = '[\0\t\n\f\r ]';
63 33
64
    /**
     * Create a parser whose settings are the class defaults overlaid with the given values.
     *
     * @param array $cfg Configuration overrides; entries that are not supplied keep their default
     */
    public function __construct($cfg = [])
    {
        // caller-supplied entries win over the built-in defaults
        $defaults = $this->cfg;
        $this->cfg = array_merge($defaults, $cfg);

        $this->filterHelper = new FilterHelper();
    }
74
75
    /**
     * Decode the specified stream.
     *
     * Reads /Length and /Filter from the stream dictionary, trims the raw stream to the
     * declared length when that is shorter than the data at hand, then applies every filter
     * that the filter helper reports as available, in the order they were declared.
     * Filters the helper does not know are returned untouched in the second element.
     *
     * @param string $pdfData PDF data
     * @param array  $xref
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching `ignore_*` configuration entry is disabled
     */
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only name entries that are followed by a value are of interest
            if (('/' != $v[0]) || !isset($sdic[($k + 1)])) {
                continue;
            }
            $value = $sdic[($k + 1)];

            if (('Length' == $v[1]) && ('numeric' == $value[0])) {
                // get declared stream length
                $declength = (int) ($value[1]);
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $value);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        // the list of decoders does not change while looping, so ask for it once
        $availableFilters = [] === $filters ? [] : $this->filterHelper->getAvailableFilters();
        foreach ($filters as $filter) {
            if (!\in_array($filter, $availableFilters, true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // a leading '~' marks a "decoder not implemented" failure; an empty message
                // is an ordinary decoding error (and must not be indexed)
                $missingDecoder = ('' !== $emsg) && ('~' == $emsg[0]);
                $ignore = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // keep the original exception reachable through getPrevious()
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
142
143
    /**
     * Parse a classic (table style) Cross-Reference section and the trailer that follows it.
     *
     * Offsets of in-use objects are stored in `$xref['xref']` under "[object number]_[generation number]";
     * an entry that is already present is never overwritten. The trailer values are only stored when
     * `$xref['trailer']` is still empty. If the trailer has a /Prev entry, the referenced section is
     * read through getXrefData() as well.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer is found after the table
     */
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the keyword 'xref' (4 bytes) and the whitespace behind it
        $tableStart = $startxref + 4;
        $offset = $tableStart + strspn($pdfData, $this->pdfWhitespaces, $tableStart);

        // walk over the entries and subsection headers that directly follow each other
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';
        $objectNumber = 0;
        while (preg_match($entryPattern, $pdfData, $hit, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($hit[0][1] != $offset) {
                // the next match is not adjacent, so the table ends here
                break;
            }
            $offset += \strlen($hit[0][0]);

            switch ($hit[3][0]) {
                case 'n':
                    // in-use entry, keyed by [object number]_[generation number]
                    $key = $objectNumber.'_'.(int) ($hit[2][0]);
                    if (!isset($xref['xref'][$key])) {
                        $xref['xref'][$key] = (int) ($hit[1][0]);
                    }
                    ++$objectNumber;
                    break;

                case 'f':
                    // free entry: nothing to store, but it still uses up an object number
                    ++$objectNumber;
                    break;

                default:
                    // subsection header: its first number is the next object number
                    $objectNumber = (int) ($hit[1][0]);
                    break;
            }
        }

        // get trailer data
        $hasTrailer = preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailerHit, \PREG_OFFSET_CAPTURE, $offset) > 0;
        if (!$hasTrailer) {
            throw new Exception('Unable to find trailer');
        }
        $trailerData = $trailerHit[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $trailer = [];
            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
                $trailer['size'] = (int) ($found[1]);
            }

            // indirect references, stored as [object number]_[generation number]
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $name => $pattern) {
                if (preg_match($pattern, $trailerData, $found) > 0) {
                    $trailer[$name] = (int) ($found[1]).'_'.(int) ($found[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $found) > 0) {
                $trailer['id'] = [$found[1], $found[2]];
            }
            $xref['trailer'] = $trailer;
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) ($found[1]), $xref);
        }

        return $xref;
    }
217
218
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream object at $startxref, collects /Type, /Index, /Prev, /W and /DecodeParms
     * from its dictionary (plus the trailer keys when no trailer has been stored yet), undoes the
     * PNG predictor if one is declared, and converts every row into an xref entry:
     * type 1 rows become "[object number]_[generation]" => offset, type 2 rows become
     * "[stream object number]_0_[index]" => -1. Existing entries are never overwritten by type 1 rows.
     *
     * NOTE: only the first subsection declared in /Index is taken into account.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        // field widths (type, field 2, field 3); zeros until a /W entry is seen
        $wb = [0, 0, 0];
        $sarr = (isset($xrefcrs[0][1]) && \is_array($xrefcrs[0][1])) ? $xrefcrs[0][1] : [];

        foreach ($sarr as $k => $v) {
            // every entry of interest is a name followed by its value
            if (('/' != $v[0]) || !isset($sarr[($k + 1)])) {
                continue;
            }
            $next = $sarr[($k + 1)];

            switch ($v[1]) {
                case 'Type':
                    if (('/' == $next[0]) && ('XRef' == $next[1])) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // first object number in the subsection
                    if (isset($next[1][0][1])) {
                        $index_first = (int) ($next[1][0][1]);
                    }
                    break;

                case 'Prev':
                    if ('numeric' == $next[0]) {
                        // get previous xref offset
                        $prevxref = (int) ($next[1]);
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of the corresponding field;
                    // a missing width counts as zero instead of raising a warning
                    for ($c = 0; $c < 3; ++$c) {
                        $wb[$c] = isset($next[1][$c][1]) ? (int) ($next[1][$c][1]) : 0;
                    }
                    break;

                case 'DecodeParms':
                    if (isset($next[1]) && \is_array($next[1])) {
                        $decpar = $next[1];
                        foreach ($decpar as $kdc => $vdc) {
                            if (
                                ('/' != $vdc[0])
                                || !isset($decpar[($kdc + 1)])
                                || ('numeric' != $decpar[($kdc + 1)][0])
                            ) {
                                continue;
                            }
                            if ('Columns' == $vdc[1]) {
                                $columns = (int) ($decpar[($kdc + 1)][1]);
                            } elseif ('Predictor' == $vdc[1]) {
                                $predictor = (int) ($decpar[($kdc + 1)][1]);
                            }
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && ('numeric' == $next[0])) {
                        $xref['trailer']['size'] = $next[1];
                    }
                    break;

                case 'Root':
                    if ($filltrailer && ('objref' == $next[0])) {
                        $xref['trailer']['root'] = $next[1];
                    }
                    break;

                case 'Info':
                    if ($filltrailer && ('objref' == $next[0])) {
                        $xref['trailer']['info'] = $next[1];
                    }
                    break;

                case 'Encrypt':
                    if ($filltrailer && ('objref' == $next[0])) {
                        $xref['trailer']['encrypt'] = $next[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer) {
                        $xref['trailer']['id'] = [];
                        $xref['trailer']['id'][0] = $next[1][0][1];
                        $xref['trailer']['id'][1] = $next[1][1][1];
                    }
                    break;
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            // a Predictor value of 1 means "no prediction", so the rows carry no filter byte
            if ((null !== $predictor) && (1 !== $predictor)) {
                $ddata = $this->unpredictPngRows($xrefcrs[1][3][0], $columns);
            } else {
                // number of bytes in a row
                $rowlen = (int) array_sum($wb);
                // without a usable /W there is nothing to split (array_chunk rejects a size of 0)
                $ddata = ($rowlen > 0) ? array_chunk(unpack('C*', $xrefcrs[1][3][0]), $rowlen) : [];
            }

            // combine the bytes of every row into its three big-endian fields
            $sdata = [];
            foreach ($ddata as $k => $row) {
                // the type field defaults to 1 when its width is zero
                $sdata[$k] = [(0 == $wb[0]) ? 1 : 0, 0, 0];
                $i = 0; // count bytes in the row
                for ($c = 0; $c < 3; ++$c) {
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = isset($index_first) ? $index_first : 0;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // (f) free objects (type 0) and null objects: nothing to store
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data

        if (isset($prevxref)) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Undo the PNG predictor on the rows of a decoded stream.
     *
     * Every row consists of one filter-type byte (0 to 4) followed by $columns data bytes.
     * Reconstruction uses the already reconstructed bytes to the left, above and above-left;
     * neighbours outside the data count as zero.
     *
     * @param string $data    Decoded stream bytes
     * @param int    $columns Number of data bytes per row
     *
     * @return array list of rows, each a list of reconstructed byte values
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function unpredictPngRows($data, $columns)
    {
        // number of bytes in a row, including the leading filter-type byte
        $rowlen = ($columns + 1);
        // convert the stream into an array of integers and split the rows
        $rows = array_chunk(unpack('C*', $data), $rowlen);

        $decoded = [];
        // the row above the first one is all zeros
        $prev_row = array_fill(0, $rowlen, 0);
        foreach ($rows as $k => $row) {
            $decoded[$k] = [];
            // get PNG predictor value
            $predictor = (10 + $row[0]);
            for ($i = 1; $i <= $columns; ++$i) {
                $j = ($i - 1);
                // a truncated last row is treated as padded with zeros
                $raw = isset($row[$i]) ? $row[$i] : 0;
                $row_up = $prev_row[$j];
                if (1 == $i) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    // the left neighbour must be the reconstructed byte, not the encoded one
                    $row_left = $decoded[$k][($j - 1)];
                    $row_upleft = $prev_row[($j - 1)];
                }

                switch ($predictor) {
                    case 10:  // PNG None
                        $value = $raw;
                        break;

                    case 11:  // PNG Sub
                        $value = (($raw + $row_left) & 0xff);
                        break;

                    case 12:  // PNG Up
                        $value = (($raw + $row_up) & 0xff);
                        break;

                    case 13:  // PNG Average (integer division, rounding down)
                        $value = (($raw + (($row_left + $row_up) >> 1)) & 0xff);
                        break;

                    case 14:  // PNG Paeth
                        // initial estimate
                        $p = ($row_left + $row_up - $row_upleft);
                        // distances
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        // nearest neighbour wins; ties are resolved in the order left, up, up-left
                        if (($pa <= $pb) && ($pa <= $pc)) {
                            $value = (($raw + $row_left) & 0xff);
                        } elseif ($pb <= $pc) {
                            $value = (($raw + $row_up) & 0xff);
                        } else {
                            $value = (($raw + $row_upleft) & 0xff);
                        }
                        break;

                    default:
                        throw new Exception('Unknown PNG predictor: '.$predictor);
                }
                $decoded[$k][$j] = $value;
            }
            $prev_row = $decoded[$k];
        }

        return $decoded;
    }
469
470
    /**
     * Build the regular expression that matches the header of an indirect object,
     * i.e. "[object number] [generation number] obj" with one PDF whitespace character
     * between the parts.
     *
     * @param array $objRefArr Object number at index 0, generation number at index 1
     *
     * @return string regular expression including delimiters
     */
    protected function getObjectHeaderPattern($objRefArr)
    {
        // the parts come from the document, so keep them from being read as regex syntax
        $objectNumber = preg_quote($objRefArr[0], '/');
        $generationNumber = preg_quote($objRefArr[1], '/');

        // consider all whitespace character (PDF specifications)
        return '/'.$objectNumber.$this->pdfWhitespacesRegex.$generationNumber.$this->pdfWhitespacesRegex.'obj/';
    }
475
476
    /**
     * Length in bytes of an object header such as "4 0 obj".
     *
     * @param array $objRefArr Object number at index 0, generation number at index 1
     *
     * @return int length of both numbers plus two separators plus the keyword "obj"
     */
    protected function getObjectHeaderLen($objRefArr)
    {
        // two single-character separators and the three letters of "obj"
        $fixedLength = 2 + \strlen('obj');

        return $fixedLength + \strlen($objRefArr[0]) + \strlen($objRefArr[1]);
    }
482
483
    /**
     * Get content of indirect object.
     *
     * Checks that the header "[object number] [generation number] obj" is found at $offset
     * (after whitespace and leading zeros) and then collects raw elements until `endobj`
     * is reached or the parser stops advancing.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data; `['null', 'null', $offset]` if the header is not at $offset
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        // a reference has exactly two parts: object number and generation number
        $refParts = explode('_', $objRef);
        if (2 !== \count($refParts)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $headerLength = $this->getObjectHeaderLen($refParts);

        // move past whitespace, then past zeros padding the object number
        $offset += strspn($pdfData, $this->pdfWhitespaces, $offset);
        $offset += strspn($pdfData, '0', $offset);

        $headerPattern = $this->getObjectHeaderPattern($refParts);
        $header = substr($pdfData, $offset, $headerLength);
        if (0 == preg_match($headerPattern, $header)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // the content starts right behind the header
        $offset += $headerLength;
        $content = [];
        while (true) {
            $startedAt = $offset;
            $element = $this->getRawObject($pdfData, $startedAt);
            $offset = $element[2];

            // a stream is decoded with the dictionary that directly precedes it
            $last = \count($content) - 1;
            $followsDictionary = isset($content[$last][0]) && ('<<' === $content[$last][0]);
            if ($decoding && ('stream' === $element[0]) && $followsDictionary) {
                $element[3] = $this->decodeStream($pdfData, $xref, $content[$last][1], $element[1]);
            }
            $content[] = $element;

            // stop at the end of the object, or when no progress was made
            if (('endobj' === $element[0]) || ($offset === $startedAt)) {
                break;
            }
        }
        // remove closing delimiter
        array_pop($content);

        return $content;
    }
547
548
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Anything that is not an `objref`, and any reference without an entry in $xref,
     * is handed back unchanged. Resolved objects are kept in `$this->objects` and reused.
     *
     * NOTE(review): the lookup uses `$xref[$obj[1]]`, while decodeXref() stores offsets under
     * `$xref['xref'][...]` - confirm which shape callers pass here.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Looked up by object reference to find the object's offset
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws Exception
     */
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        if ('objref' != $obj[0]) {
            // a direct value needs no resolving
            return $obj;
        }

        $reference = $obj[1];
        if (isset($this->objects[$reference])) {
            // this object has been already parsed
            return $this->objects[$reference];
        }
        if (!isset($xref[$reference])) {
            // unknown reference: leave it as it is
            return $obj;
        }

        // parse new object (streams are not decoded) and remember it
        $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);

        return $this->objects[$reference];
    }
575
576
    /**
577
     * Get object type, raw value and offset to next object
578
     *
579
     * @param int $offset Object offset
580
     *
581
     * @return array containing object type, raw value and offset to next object
582 30
     */
583
    protected function getRawObject($pdfData, $offset = 0)
584
    {
585 30
        $objtype = ''; // object type to be returned
586
        $objval = ''; // object value to be returned
587 30
588 30
        //skip initial white space chars
589
        $offset += strspn($pdfData, $this->pdfWhitespaces, $offset);
590
591
        // get first char
592
        $char = $pdfData[$offset];
593
        // get object type
594
        switch ($char) {
595
            case '%':  // \x25 PERCENT SIGN
596
                    // skip comment and search for next token
597
                    $next = strcspn($pdfData, "\r\n", $offset);
598 30
                    if ($next > 0) {
599
                        $offset += $next;
600 30
601 30
                        return $this->getRawObject($pdfData, $offset);
602 30
                    }
603 30
                    break;
604 30
605
            case '/':  // \x2F SOLIDUS
606
                    // name object
607 30
                    $objtype = $char;
608 30
                    ++$offset;
609 30
                    $pregResult = preg_match(
610
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
611 30
                        substr($pdfData, $offset, 256),
612
                        $matches
613 30
                    );
614 30
                    if (1 == $pregResult) {
615
                        $objval = $matches[1]; // unescaped value
616 27
                        $offset += \strlen($objval);
617 27
                    }
618 27
                    break;
619 27
620 27
            case '(':   // \x28 LEFT PARENTHESIS
621 27
            case ')':  // \x29 RIGHT PARENTHESIS
622 27
                    // literal string object
623
                    $objtype = $char;
624
                    ++$offset;
625 27
                    $strpos = $offset;
626 27
                    if ('(' == $char) {
627 27
                        $open_bracket = 1;
628
                        while ($open_bracket > 0) {
629 15
                            if (!isset($pdfData[$strpos])) {
630 15
                                break;
631
                            }
632 27
                            $ch = $pdfData[$strpos];
633
                            switch ($ch) {
634
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
635
                                        // skip next character
636 27
                                        ++$strpos;
637 27
                                        break;
638 27
639
                                case '(':  // LEFT PARENHESIS (28h)
640 27
                                        ++$open_bracket;
641
                                        break;
642 27
643 27
                                case ')':  // RIGHT PARENTHESIS (29h)
644
                                        --$open_bracket;
645 27
                                        break;
646
                            }
647 30
                            ++$strpos;
648 30
                        }
649
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
650 29
                        $offset = $strpos;
651 29
                    }
652 29
                    break;
653
654 29
            case '[':   // \x5B LEFT SQUARE BRACKET
655
            case ']':  // \x5D RIGHT SQUARE BRACKET
656 29
                // array object
657
                $objtype = $char;
658 29
                ++$offset;
659 29
                if ('[' == $char) {
660 29
                    // get array content
661 29
                    $objval = [];
662
                    do {
663 29
                        $oldOffset = $offset;
664
                        // get element
665 29
                        $element = $this->getRawObject($pdfData, $offset);
666
                        $offset = $element[2];
667 30
                        $objval[] = $element;
668 30
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
669 30
                    // remove closing delimiter
670
                    array_pop($objval);
671 30
                }
672 30
                break;
673 30
674
            case '<':  // \x3C LESS-THAN SIGN
675 30
            case '>':  // \x3E GREATER-THAN SIGN
676
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
677 30
                    // dictionary object
678
                    $objtype = $char.$char;
679 30
                    $offset += 2;
680 30
                    if ('<' == $char) {
681 30
                        // get dictionary content
682 30
                        $objval = [];
683
                        do {
684 30
                            $oldOffset = $offset;
685
                            // get element
686
                            $element = $this->getRawObject($pdfData, $offset);
687
                            $offset = $element[2];
688 12
                            $objval[] = $element;
689 12
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
690 12
                        // remove closing delimiter
691 12
                        array_pop($objval);
692 12
                    }
693
                } else {
694
                    // hexadecimal string object
695 12
                    $objtype = $char;
696
                    ++$offset;
697 12
                    $pregResult = preg_match(
698 12
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
699
                            substr($pdfData, $offset),
700
                            $matches
701
                        );
702
                    if (('<' == $char) && 1 == $pregResult) {
703 30
                        // remove white space characters
704
                        $objval = strtr($matches[1], $this->pdfWhitespaces, '');
705
                        $offset += \strlen($matches[0]);
706 30
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
707
                        $offset = $endpos + 1;
708 29
                    }
709 29
                }
710 30
                    break;
711
712 4
            default:
713 4
                    if ('endobj' == substr($pdfData, $offset, 6)) {
714 4
                        // indirect object
715 30
                        $objtype = 'endobj';
716
                        $offset += 6;
717 9
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
718 9
                        // null object
719 9
                        $objtype = 'null';
720 30
                        $offset += 4;
721
                        $objval = 'null';
722 2
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
723 2
                        // boolean true object
724 2
                        $objtype = 'boolean';
725 30
                        $offset += 4;
726
                        $objval = 'true';
727 29
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
728 29
                        // boolean false object
729 29
                        $objtype = 'boolean';
730 29
                        $offset += 5;
731 29
                        $objval = 'false';
732 29
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
733 29
                        // start stream object
734
                        $objtype = 'stream';
735 29
                        $offset += 6;
736
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
737 29
                            $offset += \strlen($matches[0]);
738 29
                            $pregResult = preg_match(
739 29
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
740
                                substr($pdfData, $offset),
741
                                $matches,
742 30
                                \PREG_OFFSET_CAPTURE
743
                            );
744 29
                            if (1 == $pregResult) {
745 29
                                $objval = substr($pdfData, $offset, $matches[0][1]);
746 30
                                $offset += $matches[1][1];
747
                            }
748 29
                        }
749 29
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
750 29
                        // end stream object
751 30
                        $objtype = 'endstream';
752
                        $offset += 9;
753 6
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
754 6
                        // indirect object reference
755 6
                        $objtype = 'objref';
756 30
                        $offset += \strlen($matches[0]);
757
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
758 29
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
759 29
                        // object start
760 29
                        $objtype = 'obj';
761
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
762 30
                        $offset += \strlen($matches[0]);
763
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
764
                        // numeric object
765 30
                        $objtype = 'numeric';
766
                        $objval = substr($pdfData, $offset, $numlen);
767
                        $offset += $numlen;
768
                    }
769
                    break;
770
        }
771
772
        return [$objtype, $objval, $offset];
773
    }
774
775
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the cross-reference section is resolved as follows:
     *  - with an offset of 0, the last "startxref <n> %%EOF" marker of the document is used;
     *  - otherwise, if the "xref" keyword sits exactly at the offset, or an "<n> <g> obj"
     *    header can be found at or after the offset, the offset itself is used;
     *  - otherwise the first "startxref <n> %%EOF" marker at or after the offset is used.
     *
     * The resolved position is then handed to decodeXref() when the "xref" keyword
     * is located there, and to decodeXrefStream() in every other case.
     *
     * @param string $pdfData
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 == $offset) {
            // find last startxref
            $pregResult = preg_match_all(
                $startxrefPattern,
                $pdfData,
                $allMatches,
                \PREG_SET_ORDER,
                $offset
            );
            if (0 == $pregResult) {
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($allMatches);
            // the capture is a digit string; normalise it to an integer position
            $startxref = (int) $lastMatch[1];
        } elseif ($offset > 0 && 'xref' === substr($pdfData, $offset, 4)) {
            // Already pointing at the xref table.
            // Only the 4 bytes at $offset are inspected: no scan of the whole
            // document and no error when $offset lies beyond the end of the data.
            $startxref = $offset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found.
            // This search uses its own match array: preg_match() resets the array
            // it is given even when it fails, so sharing one variable with the
            // "obj" search above would wipe out these captures.
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position (strict comparison of the bytes at that position,
        // so a "not found" result can never be confused with position 0)
        if ($startxref >= 0 && 'xref' === substr($pdfData, $startxref, 4)) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
841
842
    /**
     * Parses raw PDF data into its cross-reference information and its objects.
     *
     * Everything in front of the first "%PDF-" marker is discarded. The
     * cross-reference data is then read from the remaining data, and every entry
     * of its "xref" list that has a positive offset is decoded as an indirect object.
     *
     * @param string $data PDF data to parse
     *
     * @return array two elements: the xref/trailer data, followed by the decoded
     *               objects keyed by their xref entry key
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData($data)
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the PDF header; anything before it is not part of the document
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }

        // PDF content string, starting at the header
        $pdfData = substr($data, $headerPos);

        // xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // decode every document object that has a positive offset
        $objects = [];
        foreach ($xref['xref'] as $objectKey => $objectOffset) {
            if (isset($objects[$objectKey]) || !($objectOffset > 0)) {
                continue;
            }

            $objects[$objectKey] = $this->getIndirectObject($pdfData, $xref, $objectKey, $objectOffset, true);
        }

        return [$xref, $objects];
    }
879
}
880