Test Failed
Pull Request — master (#411)
by
unknown
02:00
created

RawDataParser::getObjectHeaderPattern()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
c 1
b 0
f 0
nc 1
nop 1
dl 0
loc 4
ccs 0
cts 0
cp 0
crap 2
rs 10
1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
48
     * Configuration array.
49
     */
50
    protected $cfg = [
51
        // if `true` ignore filter decoding errors
52
        'ignore_filter_decoding_errors' => true,
53
        // if `true` ignore missing filter decoding errors
54
        'ignore_missing_filter_decoders' => true,
55
    ];
56
57
    protected $filterHelper;
58
    protected $objects;
59
60
    // (NUL, HT, LF, FF, CR, SP)
61
    protected $pdfWhitespaces = "\0\t\n\f\r ";
62
    protected $pdfWhitespacesRegex = '[\0\t\n\f\r ]';
63 33
64
    /**
     * Create a parser, layering caller-supplied options over the built-in defaults.
     *
     * @param array $cfg Option overrides; keys not given keep the default declared on $this->cfg
     */
    public function __construct($cfg = [])
    {
        $this->filterHelper = new FilterHelper();

        // entries passed by the caller take precedence over the defaults
        $defaults = $this->cfg;
        $this->cfg = array_merge($defaults, $cfg);
    }
74
75
    /**
     * Decode the specified stream.
     *
     * Reads /Length and /Filter from the stream dictionary, truncates the stream to the
     * declared length when that is shorter than the data, then applies every filter that
     * the filter helper reports as available. Filters that are not available are returned
     * to the caller untouched.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data, forwarded to getObjectVal() when resolving /Filter
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching "ignore_*" option is disabled;
     *                   the original exception is attached as the previous exception
     */
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }
        $filters = [];
        foreach ($sdic as $k => $v) {
            if ('/' == $v[0]) {
                if (('Length' == $v[1]) && (isset($sdic[($k + 1)])) && ('numeric' == $sdic[($k + 1)][0])) {
                    // get declared stream length
                    $declength = (int) ($sdic[($k + 1)][1]);
                    if ($declength < $slength) {
                        $stream = substr($stream, 0, $declength);
                        $slength = $declength;
                    }
                } elseif (('Filter' == $v[1]) && (isset($sdic[($k + 1)]))) {
                    // resolve indirect object
                    $objval = $this->getObjectVal($pdfData, $xref, $sdic[($k + 1)]);
                    if ('/' == $objval[0]) {
                        // single filter
                        $filters[] = $objval[1];
                    } elseif ('[' == $objval[0]) {
                        // array of filters
                        foreach ($objval[1] as $flt) {
                            if ('/' == $flt[0]) {
                                $filters[] = $flt[1];
                            }
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters())) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // A message starting with '~' is governed by 'ignore_missing_filter_decoders',
                // any other message by 'ignore_filter_decoding_errors'. The empty-string check
                // avoids reading offset 0 of an empty message.
                $isMissingDecoder = ('' !== $emsg) && ('~' == $emsg[0]);
                $ignore = $isMissingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // keep the original exception reachable for debugging
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
142
143
    /**
     * Decode a classic (table-style) Cross-Reference section and its trailer.
     *
     * Entries already present in $xref are never overwritten, and the trailer is only
     * parsed when $xref carries no trailer yet. A /Prev entry in the trailer is followed
     * through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found after the entries
     */
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the keyword 'xref' (4 characters) and the white space following it
        $afterKeyword = $startxref + 4;
        $offset = $afterKeyword + strspn($pdfData, $this->pdfWhitespaces, $afterKeyword);

        // object number the next entry line belongs to
        $obj_num = 0;
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';

        // consume entry lines and subsection headers for as long as they are contiguous
        while (true) {
            $found = preg_match($entryPattern, $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset);
            if (!($found > 0) || $matches[0][1] != $offset) {
                // nothing left, or the next match belongs to another section
                break;
            }
            $offset += \strlen($matches[0][0]);

            switch ($matches[3][0]) {
                case 'n':
                    // in-use entry, keyed as [object number]_[generation number]
                    $index = $obj_num.'_'.(int) ($matches[2][0]);
                    if (!isset($xref['xref'][$index])) {
                        // first definition wins: store object offset position
                        $xref['xref'][$index] = (int) ($matches[1][0]);
                    }
                    ++$obj_num;
                    break;

                case 'f':
                    // free entry: only advances the object counter
                    ++$obj_num;
                    break;

                default:
                    // subsection header: first number is the starting object number
                    $obj_num = (int) ($matches[1][0]);
                    break;
            }
        }

        // get trailer data
        $hasTrailer = preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset);
        if (!($hasTrailer > 0)) {
            throw new Exception('Unable to find trailer');
        }
        $trailer_data = $matches[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $trailer = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
                $trailer['size'] = (int) ($matches[1]);
            }

            // indirect references, stored as "[object number]_[generation number]"
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailer_data, $matches) > 0) {
                    $trailer[$key] = (int) ($matches[1]).'_'.(int) ($matches[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $matches) > 0) {
                $trailer['id'] = [$matches[1], $matches[2]];
            }

            $xref['trailer'] = $trailer;
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) ($matches[1]), $xref);
        }

        return $xref;
    }
217
218
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream object at $startxref, collects /Type, /Index, /Prev, /W and
     * /DecodeParms from its dictionary (plus the trailer keys when $xref has no trailer
     * yet), undoes the PNG predictor if one is declared, and turns every decoded row
     * into an xref entry. A /Prev entry is followed through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);
        if (!isset($xref['trailer']) || empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = [];
            $filltrailer = true;
        } else {
            $filltrailer = false;
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }
        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        // first object number of the subsection; stays 0 when /Index is absent
        $index_first = 0;
        // offset of the previous xref section; stays null when /Prev is absent
        $prevxref = null;
        $sarr = $xrefcrs[0][1];
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        // field widths; zero-filled so that a missing /W does not leave the indexes undefined
        $wb = [0, 0, 0];

        foreach ($sarr as $k => $v) {
            if (
                ('/' == $v[0])
                && ('Type' == $v[1])
                && (
                    isset($sarr[($k + 1)])
                    && '/' == $sarr[($k + 1)][0]
                    && 'XRef' == $sarr[($k + 1)][1]
                )
            ) {
                $valid_crs = true;
            } elseif (('/' == $v[0]) && ('Index' == $v[1]) && (isset($sarr[($k + 1)]))) {
                // first object number in the subsection
                $index_first = (int) ($sarr[($k + 1)][1][0][1]);
            } elseif (('/' == $v[0]) && ('Prev' == $v[1]) && (isset($sarr[($k + 1)]) && ('numeric' == $sarr[($k + 1)][0]))) {
                // get previous xref offset
                $prevxref = (int) ($sarr[($k + 1)][1]);
            } elseif (('/' == $v[0]) && ('W' == $v[1]) && (isset($sarr[($k + 1)]))) {
                // number of bytes (in the decoded stream) of the corresponding field
                $wb[0] = (int) ($sarr[($k + 1)][1][0][1]);
                $wb[1] = (int) ($sarr[($k + 1)][1][1][1]);
                $wb[2] = (int) ($sarr[($k + 1)][1][2][1]);
            } elseif (('/' == $v[0]) && ('DecodeParms' == $v[1]) && (isset($sarr[($k + 1)][1]))) {
                $decpar = $sarr[($k + 1)][1];
                foreach ($decpar as $kdc => $vdc) {
                    if (
                        '/' == $vdc[0]
                        && 'Columns' == $vdc[1]
                        && (
                            isset($decpar[($kdc + 1)])
                            && 'numeric' == $decpar[($kdc + 1)][0]
                        )
                    ) {
                        $columns = (int) ($decpar[($kdc + 1)][1]);
                    } elseif (
                        '/' == $vdc[0]
                        && 'Predictor' == $vdc[1]
                        && (
                            isset($decpar[($kdc + 1)])
                            && 'numeric' == $decpar[($kdc + 1)][0]
                        )
                    ) {
                        $predictor = (int) ($decpar[($kdc + 1)][1]);
                    }
                }
            } elseif ($filltrailer) {
                if (('/' == $v[0]) && ('Size' == $v[1]) && (isset($sarr[($k + 1)]) && ('numeric' == $sarr[($k + 1)][0]))) {
                    $xref['trailer']['size'] = $sarr[($k + 1)][1];
                } elseif (('/' == $v[0]) && ('Root' == $v[1]) && (isset($sarr[($k + 1)]) && ('objref' == $sarr[($k + 1)][0]))) {
                    $xref['trailer']['root'] = $sarr[($k + 1)][1];
                } elseif (('/' == $v[0]) && ('Info' == $v[1]) && (isset($sarr[($k + 1)]) && ('objref' == $sarr[($k + 1)][0]))) {
                    $xref['trailer']['info'] = $sarr[($k + 1)][1];
                } elseif (('/' == $v[0]) && ('Encrypt' == $v[1]) && (isset($sarr[($k + 1)]) && ('objref' == $sarr[($k + 1)][0]))) {
                    $xref['trailer']['encrypt'] = $sarr[($k + 1)][1];
                } elseif (('/' == $v[0]) && ('ID' == $v[1]) && (isset($sarr[($k + 1)]))) {
                    $xref['trailer']['id'] = [];
                    $xref['trailer']['id'][0] = $sarr[($k + 1)][1][0][1];
                    $xref['trailer']['id'][1] = $sarr[($k + 1)][1][1][1];
                }
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0])) {
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]);

            if (null !== $predictor) {
                // number of bytes in a row: one filter-type byte plus the data columns
                $rowlen = ($columns + 1);
                // split the rows
                $rows = array_chunk($bytes, $rowlen);

                // initialize decoded array
                $ddata = [];
                // initialize first row with zeros
                $prev_row = array_fill(0, $rowlen, 0);
                // for each row apply PNG unpredictor
                foreach ($rows as $k => $row) {
                    // initialize new row
                    $ddata[$k] = [];
                    // the first byte of each row selects the PNG filter for that row
                    $rowPredictor = (10 + $row[0]);
                    // for each byte on the row
                    for ($i = 1; $i <= $columns; ++$i) {
                        // new index
                        $j = ($i - 1);
                        $row_up = $prev_row[$j];
                        if (1 == $i) {
                            $row_left = 0;
                            $row_upleft = 0;
                        } else {
                            // PNG reconstruction works on the already reconstructed
                            // neighbour, not on the still-filtered input byte
                            $row_left = $ddata[$k][($j - 1)];
                            $row_upleft = $prev_row[($j - 1)];
                        }
                        switch ($rowPredictor) {
                            case 10:  // PNG prediction (on encoding, PNG None on all rows)
                                $ddata[$k][$j] = $row[$i];
                                break;

                            case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                                $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff);
                                break;

                            case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                                $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff);
                                break;

                            case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                                // integer halving (floor); both operands are non-negative
                                $ddata[$k][$j] = (($row[$i] + (($row_left + $row_up) >> 1)) & 0xff);
                                break;

                            case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                                // initial estimate
                                $p = ($row_left + $row_up - $row_upleft);
                                // distances
                                $pa = abs($p - $row_left);
                                $pb = abs($p - $row_up);
                                $pc = abs($p - $row_upleft);
                                $pmin = min($pa, $pb, $pc);
                                // pick the neighbour with minimum distance (ties: left, up, up-left)
                                if ($pmin == $pa) {
                                    $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff);
                                } elseif ($pmin == $pb) {
                                    $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff);
                                } else {
                                    $ddata[$k][$j] = (($row[$i] + $row_upleft) & 0xff);
                                }
                                break;

                            default:  // PNG prediction (on encoding, PNG optimum)
                                throw new Exception('Unknown PNG predictor: '.$rowPredictor);
                        }
                    }
                    $prev_row = $ddata[$k];
                } // end for each row
                // complete decoding
            } else {
                // number of bytes in a row
                $rowlen = (int) array_sum($wb);
                // split the rows; array_chunk() rejects a length below 1, which is what a
                // missing or all-zero /W would produce
                $ddata = ($rowlen > 0) ? array_chunk($bytes, $rowlen) : [];
            }

            $sdata = [];

            // for every row
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column (big-endian)
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = $index_first;
            foreach ($sdata as $k => $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data
        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
469
470
    /**
     * Build the regular expression matching an indirect object header
     * ("[object number] [generation number] obj").
     *
     * Both parts of the reference are escaped, so a malformed reference can only match
     * itself literally and cannot inject regular expression syntax.
     *
     * @param array $objRefArr Object number at index 0, generation number at index 1
     *
     * @return string delimited regular expression (not anchored)
     */
    protected function getObjectHeaderPattern($objRefArr)
    {
        $objectNumber = preg_quote((string) $objRefArr[0], '/');
        $generationNumber = preg_quote((string) $objRefArr[1], '/');

        // consider all whitespace character (PDF specifications)
        return '/'.$objectNumber.$this->pdfWhitespacesRegex.$generationNumber.$this->pdfWhitespacesRegex.'obj/';
    }
475
476
    /**
     * Compute the length of an indirect object header such as "4 0 obj".
     *
     * @param array $objRefArr Object number at index 0, generation number at index 1
     *
     * @return int length of both numbers plus 5 (two separators and the keyword "obj")
     */
    protected function getObjectHeaderLen($objRefArr)
    {
        $objectNumberLen = \strlen($objRefArr[0]);
        $generationNumberLen = \strlen($objRefArr[1]);

        // 2 whitespaces + strlen("obj") = 5
        return $objectNumberLen + $generationNumberLen + 5;
    }
482
483
    /**
     * Get content of indirect object.
     *
     * Checks that the header "[object number] [generation number] obj" sits at $offset
     * (after white space and leading zeros), then reads raw elements until 'endobj' is
     * reached or the parser stops advancing. When $decoding is true, a stream element
     * that directly follows a dictionary gets its decoded form stored at index 3.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Cross-reference data, forwarded to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data; ['null', 'null', $offset] when the header is not found
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        // split "[object number]_[generation number]" into its two parts
        $objRefArr = explode('_', $objRef);
        if (2 !== \count($objRefArr)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $objHeaderLen = $this->getObjectHeaderLen($objRefArr);

        // move past white space, then past leading zeros of the object number
        $offset += strspn($pdfData, $this->pdfWhitespaces, $offset);
        $offset += strspn($pdfData, '0', $offset);

        $headerCandidate = substr($pdfData, $offset, $objHeaderLen);
        if (0 == preg_match($this->getObjectHeaderPattern($objRefArr), $headerCandidate)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // the object content starts right after the header
        $cursor = $offset + $objHeaderLen;
        $objContentArr = [];
        while (true) {
            $cursorBefore = $cursor;
            $element = $this->getRawObject($pdfData, $cursor);
            $cursor = $element[2];

            // decode stream using the dictionary that immediately precedes it
            $previous = \count($objContentArr) - 1;
            if (
                $decoding
                && ('stream' === $element[0])
                && isset($objContentArr[$previous][0])
                && ('<<' === $objContentArr[$previous][0])
            ) {
                $element[3] = $this->decodeStream($pdfData, $xref, $objContentArr[$previous][1], $element[1]);
            }
            $objContentArr[] = $element;

            // stop at the closing keyword, or when no progress was made
            if (('endobj' === $element[0]) || ($cursor === $cursorBefore)) {
                break;
            }
        }

        // drop the last element read (the closing delimiter)
        array_pop($objContentArr);

        return $objContentArr;
    }
547
548
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Resolved objects are cached in $this->objects and reused on later calls.
     *
     * NOTE(review): the offset is looked up as $xref[$obj[1]], while decodeXref() and
     * decodeXrefStream() store offsets under $xref['xref'][...] - confirm which shape
     * callers pass in.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Map consulted for the offset of the referenced object
     * @param array  $obj     Object value
     *
     * @return array containing object data; $obj itself when it is not a resolvable reference
     *
     * @throws Exception
     */
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        if (!('objref' == $obj[0])) {
            // direct value, nothing to resolve
            return $obj;
        }

        $reference = $obj[1];

        if (isset($this->objects[$reference])) {
            // this object has been already parsed
            return $this->objects[$reference];
        }

        if (!isset($xref[$reference])) {
            // no known offset for this reference: hand the reference back unchanged
            return $obj;
        }

        // parse new object (streams are left undecoded) and remember it
        $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);

        return $this->objects[$reference];
    }
575
576
    /**
577
     * Get object type, raw value and offset to next object
578
     *
579
     * @param int $offset Object offset
580
     *
581
     * @return array containing object type, raw value and offset to next object
582 30
     */
583
    protected function getRawObject($pdfData, $offset = 0)
584
    {
585 30
        $objtype = ''; // object type to be returned
586
        $objval = ''; // object value to be returned
587 30
588 30
        //skip initial white space chars
589
        $offset += strspn($pdfData, $this->pdfWhitespaces, $offset);
590
591
        // get first char
592
        $char = $pdfData[$offset];
593
        // get object type
594
        switch ($char) {
595
            case '%':  // \x25 PERCENT SIGN
596
                    // skip comment and search for next token
597
                    $next = strcspn($pdfData, "\r\n", $offset);
598 30
                    if ($next > 0) {
599
                        $offset += $next;
600 30
601 30
                        return $this->getRawObject($pdfData, $offset);
602 30
                    }
603 30
                    break;
604 30
605
            case '/':  // \x2F SOLIDUS
606
                    // name object
607 30
                    $objtype = $char;
608 30
                    ++$offset;
609 30
                    $pregResult = preg_match(
610
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
611 30
                        substr($pdfData, $offset, 256),
612
                        $matches
613 30
                    );
614 30
                    if (1 == $pregResult) {
615
                        $objval = $matches[1]; // unescaped value
616 27
                        $offset += \strlen($objval);
617 27
                    }
618 27
                    break;
619 27
620 27
            case '(':   // \x28 LEFT PARENTHESIS
621 27
            case ')':  // \x29 RIGHT PARENTHESIS
622 27
                    // literal string object
623
                    $objtype = $char;
624
                    ++$offset;
625 27
                    $strpos = $offset;
626 27
                    if ('(' == $char) {
627 27
                        $open_bracket = 1;
628
                        while ($open_bracket > 0) {
629 15
                            if (!isset($pdfData[$strpos])) {
630 15
                                break;
631
                            }
632 27
                            $ch = $pdfData[$strpos];
633
                            switch ($ch) {
634
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
635
                                        // skip next character
636 27
                                        ++$strpos;
637 27
                                        break;
638 27
639
                                case '(':  // LEFT PARENHESIS (28h)
640 27
                                        ++$open_bracket;
641
                                        break;
642 27
643 27
                                case ')':  // RIGHT PARENTHESIS (29h)
644
                                        --$open_bracket;
645 27
                                        break;
646
                            }
647 30
                            ++$strpos;
648 30
                        }
649
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
650 29
                        $offset = $strpos;
651 29
                    }
652 29
                    break;
653
654 29
            case '[':   // \x5B LEFT SQUARE BRACKET
655
            case ']':  // \x5D RIGHT SQUARE BRACKET
656 29
                // array object
657
                $objtype = $char;
658 29
                ++$offset;
659 29
                if ('[' == $char) {
660 29
                    // get array content
661 29
                    $objval = [];
662
                    do {
663 29
                        $oldOffset = $offset;
664
                        // get element
665 29
                        $element = $this->getRawObject($pdfData, $offset);
666
                        $offset = $element[2];
667 30
                        $objval[] = $element;
668 30
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
669 30
                    // remove closing delimiter
670
                    array_pop($objval);
671 30
                }
672 30
                break;
673 30
674
            case '<':  // \x3C LESS-THAN SIGN
675 30
            case '>':  // \x3E GREATER-THAN SIGN
676
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
677 30
                    // dictionary object
678
                    $objtype = $char.$char;
679 30
                    $offset += 2;
680 30
                    if ('<' == $char) {
681 30
                        // get array content
682 30
                        $objval = [];
683
                        do {
684 30
                            $oldOffset = $offset;
685
                            // get element
686
                            $element = $this->getRawObject($pdfData, $offset);
687
                            $offset = $element[2];
688 12
                            $objval[] = $element;
689 12
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
690 12
                        // remove closing delimiter
691 12
                        array_pop($objval);
692 12
                    }
693
                } else {
694
                    // hexadecimal string object
695 12
                    $objtype = $char;
696
                    ++$offset;
697 12
                    $pregResult = preg_match(
698 12
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
699
                            substr($pdfData, $offset),
700
                            $matches
701
                        );
702
                    if (('<' == $char) && 1 == $pregResult) {
703 30
                        // remove white space characters
704
                        $objval = strtr($matches[1], $this->pdfWhitespaces, '');
705
                        $offset += \strlen($matches[0]);
706 30
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
707
                        $offset = $endpos + 1;
708 29
                    }
709 29
                }
710 30
                    break;
711
712 4
            default:
713 4
                    if ('endobj' == substr($pdfData, $offset, 6)) {
714 4
                        // indirect object
715 30
                        $objtype = 'endobj';
716
                        $offset += 6;
717 9
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
718 9
                        // null object
719 9
                        $objtype = 'null';
720 30
                        $offset += 4;
721
                        $objval = 'null';
722 2
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
723 2
                        // boolean true object
724 2
                        $objtype = 'boolean';
725 30
                        $offset += 4;
726
                        $objval = 'true';
727 29
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
728 29
                        // boolean false object
729 29
                        $objtype = 'boolean';
730 29
                        $offset += 5;
731 29
                        $objval = 'false';
732 29
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
733 29
                        // start stream object
734
                        $objtype = 'stream';
735 29
                        $offset += 6;
736
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
737 29
                            $offset += \strlen($matches[0]);
738 29
                            $pregResult = preg_match(
739 29
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
740
                                substr($pdfData, $offset),
741
                                $matches,
742 30
                                \PREG_OFFSET_CAPTURE
743
                            );
744 29
                            if (1 == $pregResult) {
745 29
                                $objval = substr($pdfData, $offset, $matches[0][1]);
746 30
                                $offset += $matches[1][1];
747
                            }
748 29
                        }
749 29
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
750 29
                        // end stream object
751 30
                        $objtype = 'endstream';
752
                        $offset += 9;
753 6
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
754 6
                        // indirect object reference
755 6
                        $objtype = 'objref';
756 30
                        $offset += \strlen($matches[0]);
757
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
758 29
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
759 29
                        // object start
760 29
                        $objtype = 'obj';
761
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
762 30
                        $offset += \strlen($matches[0]);
763
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
764
                        // numeric object
765 30
                        $objtype = 'numeric';
766
                        $objval = substr($pdfData, $offset, $numlen);
767
                        $offset += $numlen;
768
                    }
769
                    break;
770
        }
771
772
        return [$objtype, $objval, $offset];
773
    }
774
775
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the cross-reference section is resolved as follows:
     *  - offset 0: use the value of the last "startxref ... %%EOF" marker in the data,
     *  - the "xref" keyword sits exactly at the offset: use the offset itself,
     *  - an "N G obj" header is found at or after the offset: use the offset itself
     *    (treated as a cross-reference stream object),
     *  - otherwise: use the value of the first "startxref ... %%EOF" marker at or
     *    after the offset.
     *
     * @param string $pdfData
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';
        $offset = (int) $offset;

        // strpos() rejects an offset beyond the end of the haystack (ValueError on PHP 8),
        // so report such an offset through the exception type this method documents.
        if ($offset > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        if (0 === $offset) {
            // find last startxref
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $startxrefMatches, \PREG_SET_ORDER);
            if (0 == $pregResult) {
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($startxrefMatches);
            $startxref = (int) $lastMatch[1];
        } elseif (strpos($pdfData, 'xref', $offset) === $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            // (the probe writes into its own array so it cannot wipe the startxref matches)
            $startxref = $offset;
        } elseif (preg_match($startxrefPattern, $pdfData, $startxrefMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found; with PREG_OFFSET_CAPTURE each group is a [text, position] pair
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position
        // (strict comparison: a "not found" false must never be mistaken for position 0)
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
841
842
    /**
     * Parses raw PDF data into its cross-reference data and its document objects.
     *
     * Anything in front of the first "%PDF-" marker is cut off before parsing.
     * Only xref entries with a positive offset are decoded into objects.
     *
     * @param string $data PDF data to parse
     *
     * @return array two elements: the xref/trailer data and the parsed objects,
     *               the latter keyed like the entries of the xref table
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData($data)
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the PDF header; the document starts there
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }
        $pdfData = substr($data, $headerPos);

        // xref and trailer data drive the object lookup below
        $xref = $this->getXrefData($pdfData);

        $objects = [];
        foreach ($xref['xref'] as $objId => $objOffset) {
            // skip entries already decoded and entries without a positive offset
            if (isset($objects[$objId]) || $objOffset <= 0) {
                continue;
            }

            $objects[$objId] = $this->getIndirectObject($pdfData, $xref, $objId, $objOffset, true);
        }

        return [$xref, $objects];
    }
879
}
880