Passed
Push — master ( ef0204...57667e )
by Konrad
02:16
created

RawDataParser::getObjectHeaderPattern()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 4
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
use Smalot\PdfParser\Config;
45
46
class RawDataParser
47
{
48
    /**
49
     * @var \Smalot\PdfParser\Config
50
     */
51
    private $config;
52
53
    /**
54
     * Configuration array.
55
     */
56
    protected $cfg = [
57
        // if `true` ignore filter decoding errors
58
        'ignore_filter_decoding_errors' => true,
59
        // if `true` ignore missing filter decoding errors
60
        'ignore_missing_filter_decoders' => true,
61
    ];
62
63
    /**
     * Helper used to list the available stream filters and to decode filtered streams;
     * instantiated as a FilterHelper in the constructor.
     */
    protected $filterHelper;
64
    /**
     * Indirect objects already parsed by getObjectVal(), indexed by their object reference string.
     */
    protected $objects;
65
66
    /**
     * Set up the option array, the filter helper and the configuration object.
     *
     * @param array       $cfg    Options overriding the defaults declared in $this->cfg, default is []
     * @param Config|null $config Configuration object; a new Config is created when none is given
     */
    public function __construct($cfg = [], Config $config = null)
    {
        // values passed by the caller take precedence over the defaults
        $options = array_merge($this->cfg, $cfg);
        $this->cfg = $options;

        $this->filterHelper = new FilterHelper();

        if ($config) {
            $this->config = $config;
        } else {
            $this->config = new Config();
        }
    }
77
78
    /**
     * Decode the specified stream.
     *
     * The stream dictionary is scanned for `/Length` (a smaller declared length trims the
     * stream) and for `/Filter` (either a single name or an array of names). Filters known
     * to the filter helper are applied in the order they were found; all other filter names
     * are collected and handed back to the caller.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data, forwarded to getObjectVal()
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching `ignore_*` option is disabled
     */
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only a name object followed by a value is of interest
            if ('/' != $v[0] || !isset($sdic[($k + 1)])) {
                continue;
            }
            $next = $sdic[($k + 1)];

            if (('Length' == $v[1]) && ('numeric' == $next[0])) {
                // get declared stream length
                $declength = (int) ($next[1]);
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters(), true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // A message starting with '~' is governed by `ignore_missing_filter_decoders`,
                // every other message by `ignore_filter_decoding_errors`. The null coalescing
                // keeps an empty message from triggering an undefined string offset.
                $startsWithTilde = '~' === ($emsg[0] ?? '');
                $ignored = $startsWithTilde
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignored) {
                    // keep the original exception reachable through getPrevious()
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
145
146
    /**
     * Decode a table based Cross-Reference section together with its trailer.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found after the table
     */
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the keyword 'xref' (4 characters) and the white space following it
        $startxref += 4;
        $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref);

        // walk through the cross-reference entries and subsection headers
        $objectNumber = 0;
        while (preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $entry, \PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($entry[0][1] != $offset) {
                // the next match is not adjacent: we are on another section
                break;
            }
            $offset += \strlen($entry[0][0]);
            $flag = $entry[3][0];

            if ('n' == $flag) {
                // in-use entry, indexed by [object number]_[generation number];
                // an offset that is already known is not overwritten
                $index = $objectNumber.'_'.(int) ($entry[2][0]);
                if (!isset($xref['xref'][$index])) {
                    $xref['xref'][$index] = (int) ($entry[1][0]);
                }
                ++$objectNumber;
            } elseif ('f' == $flag) {
                // free entry: only consumes an object number
                ++$objectNumber;
            } else {
                // subsection header: first number is the object number to continue with
                $objectNumber = (int) ($entry[1][0]);
            }
        }

        // get trailer data
        $trailerFound = preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailerMatch, \PREG_OFFSET_CAPTURE, $offset) > 0;
        if (!$trailerFound) {
            throw new Exception('Unable to find trailer');
        }
        $trailerData = $trailerMatch[1][0];

        // get only the last updated version: an already filled trailer is kept
        if (empty($xref['trailer'])) {
            $xref['trailer'] = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
                $xref['trailer']['size'] = (int) ($found[1]);
            }

            // indirect references, stored as [object number]_[generation number]
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $trailerKey => $pattern) {
                if (preg_match($pattern, $trailerData, $found) > 0) {
                    $xref['trailer'][$trailerKey] = (int) ($found[1]).'_'.(int) ($found[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $found) > 0) {
                $xref['trailer']['id'] = [$found[1], $found[2]];
            }
        }

        // follow the link to the previous cross-reference section, if any
        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
            $xref = $this->getXrefData($pdfData, (int) ($found[1]), $xref);
        }

        return $xref;
    }
220
221
    /**
     * Decode the Cross-Reference Stream section
     *
     * The stream dictionary is scanned for /Type, /Index, /Prev, /W and /DecodeParms, plus the
     * trailer keys /Size, /Root, /Info, /Encrypt and /ID when no trailer has been collected yet.
     * The decoded stream is split into rows, PNG prediction is undone when a predictor is
     * declared, and every row is turned into an xref entry.
     *
     * Only the first object number of /Index is used; further subsections are not evaluated.
     * Decoding is skipped when /W is missing or the resulting row length is not positive.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version: an already filled trailer is kept
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $predictor = null;
        $index_first = 0;
        $prevxref = null;
        // byte widths of the three fields of an entry (filled from /W)
        $wb = [];

        // dictionary of the cross-reference stream object
        $sarr = $xrefcrs[0][1] ?? [];
        if (!\is_array($sarr)) {
            $sarr = [];
        }

        foreach ($sarr as $k => $v) {
            // every key handled below is a name object
            if ('/' != $v[0]) {
                continue;
            }
            $key = $v[1];
            $next = $sarr[($k + 1)] ?? null;

            if (('Type' == $key) && (null !== $next) && ('/' == $next[0]) && ('XRef' == $next[1])) {
                $valid_crs = true;
            } elseif (('Index' == $key) && (null !== $next)) {
                // first object number in the subsection
                $index_first = (int) ($next[1][0][1] ?? 0);
            } elseif (('Prev' == $key) && (null !== $next) && ('numeric' == $next[0])) {
                // get previous xref offset
                $prevxref = (int) ($next[1]);
            } elseif (('W' == $key) && (null !== $next)) {
                // number of bytes (in the decoded stream) of the corresponding field;
                // a missing width is read as 0
                for ($c = 0; $c < 3; ++$c) {
                    $wb[$c] = (int) ($next[1][$c][1] ?? 0);
                }
            } elseif (('DecodeParms' == $key) && isset($next[1])) {
                $decpar = $next[1];
                foreach ($decpar as $kdc => $vdc) {
                    $param = $decpar[($kdc + 1)] ?? null;
                    if ('/' != $vdc[0] || null === $param || 'numeric' != $param[0]) {
                        continue;
                    }
                    if ('Columns' == $vdc[1]) {
                        $columns = (int) ($param[1]);
                    } elseif ('Predictor' == $vdc[1]) {
                        $predictor = (int) ($param[1]);
                    }
                }
            } elseif ($filltrailer && (null !== $next)) {
                if (('Size' == $key) && ('numeric' == $next[0])) {
                    $xref['trailer']['size'] = $next[1];
                } elseif (('Root' == $key) && ('objref' == $next[0])) {
                    $xref['trailer']['root'] = $next[1];
                } elseif (('Info' == $key) && ('objref' == $next[0])) {
                    $xref['trailer']['info'] = $next[1];
                } elseif (('Encrypt' == $key) && ('objref' == $next[0])) {
                    $xref['trailer']['encrypt'] = $next[1];
                } elseif ('ID' == $key) {
                    $xref['trailer']['id'] = [];
                    $xref['trailer']['id'][0] = $next[1][0][1] ?? null;
                    $xref['trailer']['id'][1] = $next[1][1][1] ?? null;
                }
            }
        }

        // number of bytes in a row; predicted rows start with one extra filter type byte.
        // array_sum() may return a float, array_chunk() needs an integer.
        $rowlen = (null !== $predictor) ? ($columns + 1) : (int) array_sum($wb);

        // decode data: requires a /Type /XRef stream, its decoded content, all three
        // field widths and a usable row length
        if ($valid_crs && isset($xrefcrs[1][3][0]) && (3 === \count($wb)) && ($rowlen > 0)) {
            // convert the stream into an array of integers and split the rows
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            $rows = array_chunk($bytes ?: [], $rowlen);

            if (null !== $predictor) {
                // initialize decoded array
                $ddata = [];
                // initialize first row with zeros
                $prev_row = array_fill(0, $rowlen, 0);
                // for each row apply PNG unpredictor
                foreach ($rows as $k => $row) {
                    // initialize new row
                    $ddata[$k] = [];
                    // get PNG predictor value (first byte of the row)
                    $predictor = (10 + $row[0]);
                    // for each byte on the row
                    for ($i = 1; $i <= $columns; ++$i) {
                        // new index
                        $j = ($i - 1);
                        // bytes missing from a truncated last row count as 0
                        $current = $row[$i] ?? 0;
                        $row_up = $prev_row[$j];
                        // PNG reconstruction works on already reconstructed neighbours, so
                        // "left" comes from the decoded row, not from the filtered input.
                        // Assumes one byte per pixel (/Colors and /BitsPerComponent are not read here).
                        $row_left = ($j > 0) ? $ddata[$k][($j - 1)] : 0;
                        $row_upleft = ($j > 0) ? $prev_row[($j - 1)] : 0;

                        switch ($predictor) {
                            case 10:  // PNG prediction (on encoding, PNG None on all rows)
                                $ddata[$k][$j] = $current;
                                break;

                            case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                                $ddata[$k][$j] = (($current + $row_left) & 0xff);
                                break;

                            case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                                $ddata[$k][$j] = (($current + $row_up) & 0xff);
                                break;

                            case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                                // integer (floor) average, avoids a float operand for the bit mask
                                $ddata[$k][$j] = (($current + (($row_left + $row_up) >> 1)) & 0xff);
                                break;

                            case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                                // initial estimate
                                $p = ($row_left + $row_up - $row_upleft);
                                // distances
                                $pa = abs($p - $row_left);
                                $pb = abs($p - $row_up);
                                $pc = abs($p - $row_upleft);
                                // pick the nearest neighbour; ties resolve to left, then up
                                if ($pa <= $pb && $pa <= $pc) {
                                    $nearest = $row_left;
                                } elseif ($pb <= $pc) {
                                    $nearest = $row_up;
                                } else {
                                    $nearest = $row_upleft;
                                }
                                $ddata[$k][$j] = (($current + $nearest) & 0xff);
                                break;

                            default:  // PNG prediction (on encoding, PNG optimum)
                                throw new Exception('Unknown PNG predictor: '.$predictor);
                        }
                    }
                    $prev_row = $ddata[$k];
                } // end for each row
            } else {
                // no predictor: the rows can be used as they are
                $ddata = $rows;
            }

            // combine the bytes of every row into the three entry fields
            $sdata = [];
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column (most significant byte first)
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = $index_first;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
472
473 29
    /**
     * Build the regular expression that matches an indirect object header
     * ("[object number] [generation number] obj").
     *
     * @param array $objRefArr Object number at index 0 and generation number at index 1
     *
     * @return string PCRE pattern delimited by '/'
     */
    protected function getObjectHeaderPattern($objRefArr): string
    {
        // consider all whitespace character (PDF specifications)
        $whitespace = $this->config->getPdfWhitespacesRegex();

        // Both parts are taken from an object reference string that may stem from parsed
        // PDF data; quoting keeps regex meta characters and the delimiter from breaking
        // the pattern.
        $number = preg_quote($objRefArr[0], '/');
        $generation = preg_quote($objRefArr[1], '/');

        return '/'.$number.$whitespace.$generation.$whitespace.'obj/';
    }
478
479 29
    /**
     * Compute the length of an indirect object header such as "4 0 obj".
     *
     * @param array $objRefArr Object number at index 0 and generation number at index 1
     *
     * @return int length of both numbers plus 5 (two whitespaces and the keyword "obj")
     */
    protected function getObjectHeaderLen($objRefArr): int
    {
        $numberLen = \strlen($objRefArr[0]);
        $generationLen = \strlen($objRefArr[1]);

        // 2 whitespaces + strlen("obj") = 5
        $fixedLen = 5;

        return $fixedLen + $numberLen + $generationLen;
    }
485
486
    /**
     * Get content of indirect object.
     *
     * The object header is expected at $offset (after optional white space and leading
     * zeros). When it is not found there, the null object is returned. Otherwise the raw
     * elements up to the closing 'endobj' are collected; a stream that directly follows a
     * dictionary is decoded with that dictionary when $decoding is true.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Cross-reference data, forwarded to decodeStream()
     * @param string $objRef   Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject($pdfData, $xref, $objRef, $offset = 0, $decoding = true)
    {
        // split "[object number]_[generation number]" into its two parts
        $objRefArr = explode('_', $objRef);
        if (2 !== \count($objRefArr)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $objHeaderLen = $this->getObjectHeaderLen($objRefArr);

        // move past white space and the leading zeros of the object number
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
        $offset += strspn($pdfData, '0', $offset);

        // check if we are in position
        $headerPattern = $this->getObjectHeaderPattern($objRefArr);
        $headerCandidate = substr($pdfData, $offset, $objHeaderLen);
        if (0 == preg_match($headerPattern, $headerCandidate)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // the object content starts right behind the header
        $offset += $objHeaderLen;
        $objContentArr = [];
        $previous = null;

        while (true) {
            $oldOffset = $offset;

            // read the next element and continue behind it
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // decode stream using stream's dictionary information
            $followsDictionary = isset($previous[0]) && ('<<' === $previous[0]);
            if ($decoding && ('stream' === $element[0]) && $followsDictionary) {
                $element[3] = $this->decodeStream($pdfData, $xref, $previous[1], $element[1]);
            }

            $objContentArr[] = $element;
            $previous = $element;

            // stop at the closing keyword, or when the parser no longer advances
            if (('endobj' === $element[0]) || ($offset === $oldOffset)) {
                break;
            }
        }

        // remove closing delimiter
        array_pop($objContentArr);

        return $objContentArr;
    }
551
552
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Values that are not of type 'objref' are returned untouched. A reference is answered
     * from $this->objects when it has been parsed before; otherwise it is parsed (without
     * stream decoding) and remembered, provided $xref holds an entry for it. A reference
     * that cannot be resolved is returned as given.
     *
     * NOTE(review): the lookup uses $xref[<reference>], while the xref decoders in this class
     * store offsets under $xref['xref'][<reference>] - confirm which array callers pass in.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Array searched for the offset of the referenced object
     * @param array  $obj     Object value
     *
     * @return array containing object data
     *
     * @throws Exception
     */
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        // only references to indirect objects need to be resolved
        if ('objref' != $obj[0]) {
            return $obj;
        }

        $reference = $obj[1];

        // this object has been already parsed
        if (isset($this->objects[$reference])) {
            return $this->objects[$reference];
        }

        // unknown reference: hand the value back as it is
        if (!isset($xref[$reference])) {
            return $obj;
        }

        // parse new object and remember it
        $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);

        return $this->objects[$reference];
    }
579
580
    /**
581
     * Get object type, raw value and offset to next object
582
     *
583
     * @param int $offset Object offset
584
     *
585
     * @return array containing object type, raw value and offset to next object
586
     */
587 30
    protected function getRawObject($pdfData, $offset = 0)
588
    {
589 30
        $objtype = ''; // object type to be returned
590 30
        $objval = ''; // object value to be returned
591
592
        // skip initial white space chars
593 30
        $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
594
595
        // get first char
596 30
        $char = $pdfData[$offset];
597
        // get object type
598 30
        switch ($char) {
599 30
            case '%':  // \x25 PERCENT SIGN
600
                    // skip comment and search for next token
601
                    $next = strcspn($pdfData, "\r\n", $offset);
602
                    if ($next > 0) {
603
                        $offset += $next;
604
605
                        return $this->getRawObject($pdfData, $offset);
606
                    }
607
                    break;
608
609 30
            case '/':  // \x2F SOLIDUS
610
                    // name object
611 30
                    $objtype = $char;
612 30
                    ++$offset;
613 30
                    $pregResult = preg_match(
614 30
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
615 30
                        substr($pdfData, $offset, 256),
616
                        $matches
617
                    );
618 30
                    if (1 == $pregResult) {
619 30
                        $objval = $matches[1]; // unescaped value
620 30
                        $offset += \strlen($objval);
621
                    }
622 30
                    break;
623
624 30
            case '(':   // \x28 LEFT PARENTHESIS
625 30
            case ')':  // \x29 RIGHT PARENTHESIS
626
                    // literal string object
627 27
                    $objtype = $char;
628 27
                    ++$offset;
629 27
                    $strpos = $offset;
630 27
                    if ('(' == $char) {
631 27
                        $open_bracket = 1;
632 27
                        while ($open_bracket > 0) {
633 27
                            if (!isset($pdfData[$strpos])) {
634
                                break;
635
                            }
636 27
                            $ch = $pdfData[$strpos];
637 27
                            switch ($ch) {
638 27
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
639
                                        // skip next character
640 15
                                        ++$strpos;
641 15
                                        break;
642
643 27
                                case '(':  // LEFT PARENHESIS (28h)
644
                                        ++$open_bracket;
645
                                        break;
646
647 27
                                case ')':  // RIGHT PARENTHESIS (29h)
648 27
                                        --$open_bracket;
649 27
                                        break;
650
                            }
651 27
                            ++$strpos;
652
                        }
653 27
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
654 27
                        $offset = $strpos;
655
                    }
656 27
                    break;
657
658 30
            case '[':   // \x5B LEFT SQUARE BRACKET
659 30
            case ']':  // \x5D RIGHT SQUARE BRACKET
660
                // array object
661 29
                $objtype = $char;
662 29
                ++$offset;
663 29
                if ('[' == $char) {
664
                    // get array content
665 29
                    $objval = [];
666
                    do {
667 29
                        $oldOffset = $offset;
668
                        // get element
669 29
                        $element = $this->getRawObject($pdfData, $offset);
670 29
                        $offset = $element[2];
671 29
                        $objval[] = $element;
672 29
                    } while ((']' != $element[0]) && ($offset != $oldOffset));
673
                    // remove closing delimiter
674 29
                    array_pop($objval);
675
                }
676 29
                break;
677
678 30
            case '<':  // \x3C LESS-THAN SIGN
679 30
            case '>':  // \x3E GREATER-THAN SIGN
680 30
                if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
681
                    // dictionary object
682 30
                    $objtype = $char.$char;
683 30
                    $offset += 2;
684 30
                    if ('<' == $char) {
685
                        // get array content
686 30
                        $objval = [];
687
                        do {
688 30
                            $oldOffset = $offset;
689
                            // get element
690 30
                            $element = $this->getRawObject($pdfData, $offset);
691 30
                            $offset = $element[2];
692 30
                            $objval[] = $element;
693 30
                        } while (('>>' != $element[0]) && ($offset != $oldOffset));
694
                        // remove closing delimiter
695 30
                        array_pop($objval);
696
                    }
697
                } else {
698
                    // hexadecimal string object
699 12
                    $objtype = $char;
700 12
                    ++$offset;
701 12
                    $pregResult = preg_match(
702 12
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
703 12
                            substr($pdfData, $offset),
704
                            $matches
705
                        );
706 12
                    if (('<' == $char) && 1 == $pregResult) {
707
                        // remove white space characters
708 12
                        $objval = strtr($matches[1], $this->config->getPdfWhitespaces(), '');
709 12
                        $offset += \strlen($matches[0]);
710
                    } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
711
                        $offset = $endpos + 1;
712
                    }
713
                }
714 30
                    break;
715
716
            default:
717 30
                    if ('endobj' == substr($pdfData, $offset, 6)) {
718
                        // indirect object
719 29
                        $objtype = 'endobj';
720 29
                        $offset += 6;
721 30
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
722
                        // null object
723 4
                        $objtype = 'null';
724 4
                        $offset += 4;
725 4
                        $objval = 'null';
726 30
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
727
                        // boolean true object
728 9
                        $objtype = 'boolean';
729 9
                        $offset += 4;
730 9
                        $objval = 'true';
731 30
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
732
                        // boolean false object
733 2
                        $objtype = 'boolean';
734 2
                        $offset += 5;
735 2
                        $objval = 'false';
736 30
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
737
                        // start stream object
738 29
                        $objtype = 'stream';
739 29
                        $offset += 6;
740 29
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
741 29
                            $offset += \strlen($matches[0]);
742 29
                            $pregResult = preg_match(
743 29
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
744 29
                                substr($pdfData, $offset),
745
                                $matches,
746 29
                                \PREG_OFFSET_CAPTURE
747
                            );
748 29
                            if (1 == $pregResult) {
749 29
                                $objval = substr($pdfData, $offset, $matches[0][1]);
750 29
                                $offset += $matches[1][1];
751
                            }
752
                        }
753 30
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
754
                        // end stream object
755 29
                        $objtype = 'endstream';
756 29
                        $offset += 9;
757 30
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
758
                        // indirect object reference
759 29
                        $objtype = 'objref';
760 29
                        $offset += \strlen($matches[0]);
761 29
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
762 30
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
763
                        // object start
764 6
                        $objtype = 'obj';
765 6
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
766 6
                        $offset += \strlen($matches[0]);
767 30
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
768
                        // numeric object
769 29
                        $objtype = 'numeric';
770 29
                        $objval = substr($pdfData, $offset, $numlen);
771 29
                        $offset += $numlen;
772
                    }
773 30
                    break;
774
        }
775
776 30
        return [$objtype, $objval, $offset];
777
    }
778
779
    /**
780
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
781
     *
782
     * @param string $pdfData
783
     * @param int    $offset  xref offset (if known)
784
     * @param array  $xref    previous xref array (if any)
785
     *
786
     * @return array containing xref and trailer data
787
     *
788
     * @throws Exception if it was unable to find startxref
789
     * @throws Exception if it was unable to find xref
790
     */
791 30
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        // Determines where the cross-reference section starts, decodes it
        // (classic table or cross-reference stream) and returns the resulting
        // xref/trailer array. An $offset of 0 means "not known": in that case
        // the last startxref marker found in the document is used.

        // Normalise to int so the strict position checks below stay reliable
        // even when a caller hands over a numeric string.
        $offset = (int) $offset;

        // Matches "startxref <number> %%EOF"; group 1 is the announced offset.
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 === $offset) {
            // find last startxref
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $allStartxref, \PREG_SET_ORDER);
            if (false === $pregResult || 0 === $pregResult) {
                throw new Exception('Unable to find startxref');
            }
            $lastStartxref = array_pop($allStartxref);
            $startxref = (int) $lastStartxref[1];
        } elseif (strpos($pdfData, 'xref', $offset) === $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, 0, $offset)) {
            // Cross-Reference Stream object
            // ($objMatches is a throw-away: preg_match always overwrites its
            // match array, so it must not share a variable with the
            // startxref match used by the next branch.)
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatches, \PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found: with PREG_OFFSET_CAPTURE every group is a
            // [text, position] pair, so [1][0] is the captured number.
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > \strlen($pdfData)) {
            throw new Exception('Unable to find xref (PDF corrupted?)');
        }

        // check xref position
        // Strict comparison on purpose: strpos() returns false when nothing is
        // found, and false == 0 would wrongly report a table at offset 0.
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
845
846
    /**
847
     * Parses PDF data and returns extracted data as array.
848
     *
849
     * @param string $data PDF data to parse
850
     *
851
     * @return array array of parsed PDF document objects
852
     *
853
     * @throws Exception if empty PDF data given
854
     * @throws Exception if PDF data missing %PDF header
855
     */
856 30
    public function parseData($data)
    {
        // Entry point: validates the raw input, cuts everything in front of
        // the "%PDF-" header, reads the xref/trailer data and then decodes
        // every object the xref lists with a positive offset.
        // Returns a two-element array: [xref data, decoded objects].
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the beginning of the PDF header
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }

        // drop any bytes that precede the header
        $pdfData = substr($data, $headerPos);

        // xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // walk over all xref entries and decode the referenced objects
        $objects = [];
        foreach ($xref['xref'] as $objKey => $objOffset) {
            // skip entries already decoded or without a positive offset
            if (isset($objects[$objKey]) || !($objOffset > 0)) {
                continue;
            }

            $objects[$objKey] = $this->getIndirectObject($pdfData, $xref, $objKey, $objOffset, true);
        }

        return [$xref, $objects];
    }
883
}
884