Completed
Push — pr/238 ( 9ef5ec )
by Konrad
03:54
created

RawDataParser::decodeXref()   C

Complexity

Conditions 15
Paths 134

Size

Total Lines 64
Code Lines 37

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 34
CRAP Score 15.0386

Importance

Changes 3
Bugs 1 Features 1
Metric Value
cc 15
eloc 37
c 3
b 1
f 1
nc 134
nop 3
dl 0
loc 64
ccs 34
cts 36
cp 0.9444
crap 15.0386
rs 5.6333

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
48
     * Configuration array.
49
     */
50
    protected $cfg = [
51
        // if `true` ignore filter decoding errors
52
        'ignore_filter_decoding_errors' => true,
53
        // if `true` ignore missing filter decoding errors
54
        'ignore_missing_filter_decoders' => true,
55
    ];
56
57
    /**
     * Filter helper instantiated in the constructor; decodeStream() asks it
     * which filters are available and delegates the actual decoding to it.
     */
    protected $filterHelper;
58
    /**
     * Indirect objects already parsed by getObjectVal(), keyed by the object
     * reference they were requested with.
     */
    protected $objects;
59
60
    /**
     * Create the parser and overlay the caller's options on the built-in defaults.
     *
     * @param array $cfg Configuration array, default is []; an entry given here
     *                   replaces the default stored under the same string key
     */
    public function __construct($cfg = [])
    {
        $this->filterHelper = new FilterHelper();

        // defaults first, caller-supplied values second, so the latter win
        $defaults = $this->cfg;
        $this->cfg = array_merge($defaults, $cfg);
    }
70
71
    /**
     * Decode the specified stream.
     *
     * The stream dictionary is scanned for /Length (the stream is cut down to the
     * declared length when that is shorter than the data received) and /Filter
     * (a single name or an array of names). Every filter the filter helper
     * reports as available is then applied in order; the others are collected
     * and handed back to the caller.
     *
     * @param string $pdfData PDF data
     * @param array  $xref
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching `ignore_*` configuration flag is disabled
     */
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only a name object that is followed by a value is of interest
            if ('/' !== $v[0] || !isset($sdic[($k + 1)])) {
                continue;
            }
            $value = $sdic[($k + 1)];

            if ('Length' === $v[1]) {
                if ('numeric' === $value[0]) {
                    // get declared stream length
                    $declength = (int) ($value[1]);
                    if ($declength < $slength) {
                        $stream = substr($stream, 0, $declength);
                        $slength = $declength;
                    }
                }
            } elseif ('Filter' === $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $value);
                if ('/' === $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' === $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' === $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        if ([] === $filters) {
            return [$stream, []];
        }

        // decode the stream
        $remaining_filters = [];
        // the list of decoders does not change while looping, so fetch it once
        $available = $this->filterHelper->getAvailableFilters();
        foreach ($filters as $filter) {
            if (!\in_array($filter, $available, true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with '~' is handled as "decoder missing",
                // anything else (including an empty message) as a decoding error
                $missingDecoder = ('' !== $emsg) && ('~' === $emsg[0]);
                $ignored = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignored) {
                    // keep the original exception reachable through getPrevious()
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
138
139
    /**
     * Decode the Cross-Reference section
     *
     * Reads the entries that start right after the `xref` keyword, then the
     * trailer dictionary that follows them. Offsets already present in $xref
     * are never overwritten, and the trailer is only filled in when $xref does
     * not carry one yet. A /Prev entry in the trailer is followed through
     * getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found after the cross-reference entries
     */
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        $startxref += 4; // 4 is the length of the word 'xref'
        // skip initial white space chars: \x00 null (NUL), \x09 horizontal tab (HT), \x0A line feed (LF), \x0C form feed (FF), \x0D carriage return (CR), \x20 space (SP)
        $offset = $startxref + strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $startxref);
        // initialize object number
        $obj_num = 0;

        // \G pins the match to $offset: a line that is not an entry ends the loop
        // immediately instead of making the regex engine scan the rest of the
        // document for a later match that would be discarded anyway
        $entryPattern = '/\G([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';

        // search for cross-reference entries or subsection
        while (preg_match($entryPattern, $pdfData, $matches, PREG_OFFSET_CAPTURE, $offset) > 0) {
            $offset += \strlen($matches[0][0]);
            $flag = $matches[3][0];

            if ('' === $flag) {
                // line without n/f flag: its first number becomes the current object number (index)
                $obj_num = (int) ($matches[1][0]);
                continue;
            }

            if ('n' === $flag) {
                // create unique object index: [object number]_[generation number]
                $index = $obj_num.'_'.(int) ($matches[2][0]);
                // check if object already exist
                if (!isset($xref['xref'][$index])) {
                    // store object offset position
                    $xref['xref'][$index] = (int) ($matches[1][0]);
                }
            }

            // both 'n' and 'f' entries advance the object number
            ++$obj_num;
        }

        // get trailer data
        if (1 !== preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, PREG_OFFSET_CAPTURE, $offset)) {
            throw new Exception('Unable to find trailer');
        }
        $trailer_data = $matches[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = $this->extractTrailerEntries($trailer_data);
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) ($matches[1]), $xref);
        }

        return $xref;
    }

    /**
     * Pick the Size, Root, Encrypt, Info and ID entries out of a trailer dictionary body.
     *
     * @param string $trailerData Text captured between the trailer's `<<` and `>>`
     *
     * @return array with whichever of the keys size, root, encrypt, info and id were found
     */
    private function extractTrailerEntries($trailerData)
    {
        $trailer = [];

        if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
            $trailer['size'] = (int) ($found[1]);
        }

        // indirect references are stored as [object number]_[generation number]
        $references = ['root' => 'Root', 'encrypt' => 'Encrypt', 'info' => 'Info'];
        foreach ($references as $key => $name) {
            if (preg_match('/'.$name.'[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $found) > 0) {
                $trailer[$key] = (int) ($found[1]).'_'.(int) ($found[2]);
            }
        }

        if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $found) > 0) {
            $trailer['id'] = [$found[1], $found[2]];
        }

        return $trailer;
    }
213
214
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream object found at $startxref, collects /Type, /Index,
     * /Prev, /W, /DecodeParms and (when $xref has no trailer yet) the trailer
     * entries from its dictionary, then turns the decoded stream bytes into
     * xref entries. When /DecodeParms declares /Columns, every row is expected
     * to start with a PNG filter byte and is un-predicted first; otherwise the
     * bytes are split directly into rows of W[0]+W[1]+W[2] bytes. A /Prev
     * entry is followed through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        // dictionary of the stream object; anything that is not an array is treated as empty
        $sarr = (isset($xrefcrs[0][1]) && \is_array($xrefcrs[0][1])) ? $xrefcrs[0][1] : [];

        $valid_crs = false;
        $columns = 0;
        $wb = [0, 0, 0];
        $index_first = null;
        $prevxref = null;

        foreach ($sarr as $k => $v) {
            // every entry handled below is a name object followed by its value
            if ('/' != $v[0] || !isset($sarr[($k + 1)])) {
                continue;
            }
            $name = $v[1];
            $next = $sarr[($k + 1)];

            if ('Type' == $name) {
                if ('/' == $next[0] && 'XRef' == $next[1]) {
                    $valid_crs = true;
                }
            } elseif ('Index' == $name) {
                // first object number in the subsection
                // NOTE(review): only the first pair of /Index is read here; streams
                // declaring several subsections do not look supported - confirm
                $index_first = (int) ($next[1][0][1]);
            } elseif ('Prev' == $name) {
                if ('numeric' == $next[0]) {
                    // get previous xref offset
                    $prevxref = (int) ($next[1]);
                }
            } elseif ('W' == $name) {
                // number of bytes (in the decoded stream) of the corresponding field
                $wb[0] = (int) ($next[1][0][1]);
                $wb[1] = (int) ($next[1][1][1]);
                $wb[2] = (int) ($next[1][2][1]);
            } elseif ('DecodeParms' == $name) {
                if (isset($next[1]) && \is_array($next[1])) {
                    $decpar = $next[1];
                    foreach ($decpar as $kdc => $vdc) {
                        if (
                            '/' == $vdc[0]
                            && 'Columns' == $vdc[1]
                            && isset($decpar[($kdc + 1)])
                            && 'numeric' == $decpar[($kdc + 1)][0]
                        ) {
                            $columns = (int) ($decpar[($kdc + 1)][1]);
                        }
                    }
                }
            } elseif ($filltrailer) {
                if ('Size' == $name) {
                    if ('numeric' == $next[0]) {
                        $xref['trailer']['size'] = $next[1];
                    }
                } elseif (\in_array($name, ['Root', 'Info', 'Encrypt'], true)) {
                    if ('objref' == $next[0]) {
                        $xref['trailer'][strtolower($name)] = $next[1];
                    }
                } elseif ('ID' == $name) {
                    $xref['trailer']['id'] = [$next[1][0][1], $next[1][1][1]];
                }
            }
        }

        // decode data; without usable /W field widths no entry can be read
        if ($valid_crs && isset($xrefcrs[1][3][0]) && array_sum($wb) > 0) {
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (!\is_array($bytes)) {
                // unpack() may return false; treat that as "no data"
                $bytes = [];
            }

            if ($columns > 0) {
                // each row is one filter byte followed by $columns data bytes
                $ddata = $this->undoPngPrediction(array_chunk($bytes, $columns + 1), $columns);
            } else {
                // no /Columns declared: rows are the plain concatenation of the three fields
                $ddata = array_chunk($bytes, array_sum($wb));
            }

            $obj_num = (null === $index_first) ? 0 : $index_first;

            foreach ($ddata as $row) {
                // assemble the three big-endian fields of the entry
                $fields = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $fields[0] = 1;
                }
                $i = 0; // count bytes in the row
                for ($c = 0; $c < 3; ++$c) {
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $fields[$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }

                // fill xref
                if (1 == $fields[0]) {
                    // (n) objects that are in use but are not compressed
                    // create unique object index: [object number]_[generation number]
                    $index = $obj_num.'_'.$fields[2];
                    // check if object already exist
                    if (!isset($xref['xref'][$index])) {
                        // store object offset position
                        $xref['xref'][$index] = $fields[1];
                    }
                } elseif (2 == $fields[0]) {
                    // compressed objects
                    // $fields[1] = object number of the object stream in which this object is stored
                    // $fields[2] = index of this object within the object stream
                    $index = $fields[1].'_0_'.$fields[2];
                    $xref['xref'][$index] = -1;
                }
                // type 0 (free objects) and unknown types only advance the object number
                ++$obj_num;
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }

    /**
     * Reverse the per-row PNG filtering of a cross-reference stream.
     *
     * @param array $rows    Rows of byte values; element 0 of each row is the PNG filter type (0-4)
     * @param int   $columns Number of data bytes following the filter byte in each row
     *
     * @return array rows of reconstructed byte values, without the filter byte
     *
     * @throws Exception if unknown PNG predictor detected
     */
    private function undoPngPrediction(array $rows, $columns)
    {
        $decoded = [];
        // the row above the first one counts as all zeros
        $prev_row = array_fill(0, $columns, 0);

        foreach ($rows as $k => $row) {
            $decoded[$k] = [];
            // get PNG predictor value
            $predictor = (10 + $row[0]);

            for ($i = 1; $i <= $columns; ++$i) {
                $j = ($i - 1);
                // a truncated final row is padded with zeros
                $value = isset($row[$i]) ? $row[$i] : 0;
                $row_up = $prev_row[$j];
                if (0 == $j) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    // the left neighbour is the already reconstructed byte, not the encoded one
                    $row_left = $decoded[$k][($j - 1)];
                    $row_upleft = $prev_row[($j - 1)];
                }

                switch ($predictor) {
                    case 10:  // PNG None
                        $byte = $value;
                        break;

                    case 11:  // PNG Sub
                        $byte = $value + $row_left;
                        break;

                    case 12:  // PNG Up
                        $byte = $value + $row_up;
                        break;

                    case 13:  // PNG Average (integer halving, no float involved)
                        $byte = $value + (($row_left + $row_up) >> 1);
                        break;

                    case 14:  // PNG Paeth
                        // initial estimate
                        $p = ($row_left + $row_up - $row_upleft);
                        // distances
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        // nearest neighbour wins; ties resolve left, then up, then upper-left
                        if ($pa <= $pb && $pa <= $pc) {
                            $byte = $value + $row_left;
                        } elseif ($pb <= $pc) {
                            $byte = $value + $row_up;
                        } else {
                            $byte = $value + $row_upleft;
                        }
                        break;

                    default:
                        throw new Exception('Unknown PNG predictor');
                }

                $decoded[$k][$j] = ($byte & 0xff);
            }

            $prev_row = $decoded[$k];
        }

        return $decoded;
    }
452
453
    /**
     * Get content of indirect object.
     *
     * Expects the text "<number> <generation> obj" at $offset (leading '0'
     * characters are skipped first) and then reads raw objects until `endobj`
     * is reached or the reader stops advancing.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref
     * @param string $obj_ref  Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data; ['null', 'null', $offset] when the object header is not at $offset
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject($pdfData, $xref, $obj_ref, $offset = 0, $decoding = true)
    {
        $obj = explode('_', $obj_ref);
        if (2 != \count($obj)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $objref = $obj[0].' '.$obj[1].' obj';

        // ignore leading zeros
        $offset += strspn($pdfData, '0', $offset);

        // Compare just the bytes at $offset. A strict comparison is required:
        // with strpos() a miss (false) loosely equals offset 0, and strpos()
        // would also search the whole remaining document for nothing.
        if (substr($pdfData, $offset, \strlen($objref)) !== $objref) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // starting position of object content
        $offset += \strlen($objref);

        // get array of object content
        $objdata = [];
        $i = 0; // object main index
        do {
            $oldoffset = $offset;
            // get element
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // decode stream using stream's dictionary information
            $followsDictionary = isset($objdata[($i - 1)][0]) && ('<<' === $objdata[($i - 1)][0]);
            if ($decoding && ('stream' === $element[0]) && $followsDictionary) {
                $element[3] = $this->decodeStream($pdfData, $xref, $objdata[($i - 1)][1], $element[1]);
            }

            $objdata[$i] = $element;
            ++$i;
            // stop at the closing keyword, or when no progress is made any more
        } while (('endobj' !== $element[0]) && ($offset != $oldoffset));

        // remove closing delimiter
        array_pop($objdata);

        // return raw object content
        return $objdata;
    }
503
504
    /**
     * Get the content of object, resolving indirect object reference if necessary.
     *
     * Values that are not of type `objref` are returned as they are. A
     * reference is answered from the $objects cache when possible; otherwise it
     * is parsed (without stream decoding) at the offset found in $xref and
     * cached. A reference that is in neither place is returned unresolved.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Offsets looked up by object reference
     * @param array  $obj     Object value
     *
     * @return array containing object data
     */
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        // anything that is not a reference is already a value
        if ('objref' != $obj[0]) {
            return $obj;
        }

        $reference = $obj[1];

        if (!isset($this->objects[$reference])) {
            // NOTE(review): decodeXref()/decodeXrefStream() store offsets under
            // $xref['xref'][...], while this lookup reads $xref[...] directly -
            // confirm which layout the callers pass in
            if (!isset($xref[$reference])) {
                // unknown reference: hand it back untouched
                return $obj;
            }

            // parse new object
            $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);
        }

        // this object has been already parsed
        return $this->objects[$reference];
    }
529
530
    /**
531
     * Get object type, raw value and offset to next object
532
     *
533
     * @param int $offset Object offset
534
     *
535
     * @return array containing object type, raw value and offset to next object
536
     */
537 16
    protected function getRawObject($pdfData, $offset = 0)
538
    {
539 16
        $objtype = ''; // object type to be returned
540 16
        $objval = ''; // object value to be returned
541
542
        /*
543
         * skip initial white space chars:
544
         *      \x00 null (NUL)
545
         *      \x09 horizontal tab (HT)
546
         *      \x0A line feed (LF)
547
         *      \x0C form feed (FF)
548
         *      \x0D carriage return (CR)
549
         *      \x20 space (SP)
550
         */
551 16
        $offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
552
553
        // get first char
554 16
        $char = $pdfData[$offset];
555
        // get object type
556 16
        switch ($char) {
557 16
            case '%':  // \x25 PERCENT SIGN
558
                    // skip comment and search for next token
559
                    $next = strcspn($pdfData, "\r\n", $offset);
560
                    if ($next > 0) {
561
                        $offset += $next;
562
563
                        return $this->getRawObject($pdfData, $offset);
564
                    }
565
                    break;
566
567 16
            case '/':  // \x2F SOLIDUS
568
                    // name object
569 16
                    $objtype = $char;
570 16
                    ++$offset;
571 16
                    $pregResult = preg_match(
572 16
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
573 16
                        substr($pdfData, $offset, 256),
574
                        $matches
575
                    );
576 16
                    if (1 == $pregResult) {
577 16
                        $objval = $matches[1]; // unescaped value
578 16
                        $offset += \strlen($objval);
579
                    }
580 16
                    break;
581
582 16
            case '(':   // \x28 LEFT PARENTHESIS
583 16
            case ')':  // \x29 RIGHT PARENTHESIS
584
                    // literal string object
585 16
                    $objtype = $char;
586 16
                    ++$offset;
587 16
                    $strpos = $offset;
588 16
                    if ('(' == $char) {
589 16
                        $open_bracket = 1;
590 16
                        while ($open_bracket > 0) {
591 16
                            if (!isset($pdfData[$strpos])) {
592
                                break;
593
                            }
594 16
                            $ch = $pdfData[$strpos];
595 16
                            switch ($ch) {
596 16
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
597
                                        // skip next character
598 14
                                        ++$strpos;
599 14
                                        break;
600
601 16
                                case '(':  // LEFT PARENHESIS (28h)
602
                                        ++$open_bracket;
603
                                        break;
604
605 16
                                case ')':  // RIGHT PARENTHESIS (29h)
606 16
                                        --$open_bracket;
607 16
                                        break;
608
                            }
609 16
                            ++$strpos;
610
                        }
611 16
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
612 16
                        $offset = $strpos;
613
                    }
614 16
                    break;
615
616 16
            case '[':   // \x5B LEFT SQUARE BRACKET
617 16
            case ']':  // \x5D RIGHT SQUARE BRACKET
618
                    // array object
619 16
                    $objtype = $char;
620 16
                    ++$offset;
621 16
                    if ('[' == $char) {
622
                        // get array content
623 16
                        $objval = [];
624
                        do {
625
                            // get element
626 16
                            $element = $this->getRawObject($pdfData, $offset);
627 16
                            $offset = $element[2];
628 16
                            $objval[] = $element;
629 16
                        } while (']' != $element[0]);
630
                        // remove closing delimiter
631 16
                        array_pop($objval);
632
                    }
633 16
                    break;
634
635 16
            case '<':  // \x3C LESS-THAN SIGN
636 16
            case '>':  // \x3E GREATER-THAN SIGN
637 16
                    if (isset($pdfData[($offset + 1)]) and ($pdfData[($offset + 1)] == $char)) {
638
                        // dictionary object
639 16
                        $objtype = $char.$char;
640 16
                        $offset += 2;
641 16
                        if ('<' == $char) {
642
                            // get array content
643 16
                            $objval = [];
644
                            do {
645
                                // get element
646 16
                                $element = $this->getRawObject($pdfData, $offset);
647 16
                                $offset = $element[2];
648 16
                                $objval[] = $element;
649 16
                            } while ('>>' != $element[0]);
650
                            // remove closing delimiter
651 16
                            array_pop($objval);
652
                        }
653
                    } else {
654
                        // hexadecimal string object
655 4
                        $objtype = $char;
656 4
                        ++$offset;
657 4
                        $pregResult = preg_match(
658 4
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
659 4
                            substr($pdfData, $offset),
660
                            $matches
661
                        );
662 4
                        if (('<' == $char) && 1 == $pregResult) {
663
                            // remove white space characters
664 4
                            $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
665 4
                            $offset += \strlen($matches[0]);
666
                        } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
667
                            $offset = $endpos + 1;
668
                        }
669
                    }
670 16
                    break;
671
672
            default:
673 16
                    if ('endobj' == substr($pdfData, $offset, 6)) {
674
                        // indirect object
675 16
                        $objtype = 'endobj';
676 16
                        $offset += 6;
677 16
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
678
                        // null object
679 2
                        $objtype = 'null';
680 2
                        $offset += 4;
681 2
                        $objval = 'null';
682 16
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
683
                        // boolean true object
684 4
                        $objtype = 'boolean';
685 4
                        $offset += 4;
686 4
                        $objval = 'true';
687 16
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
688
                        // boolean false object
689 1
                        $objtype = 'boolean';
690 1
                        $offset += 5;
691 1
                        $objval = 'false';
692 16
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
693
                        // start stream object
694 16
                        $objtype = 'stream';
695 16
                        $offset += 6;
696 16
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
697 16
                            $offset += \strlen($matches[0]);
698 16
                            $pregResult = preg_match(
699 16
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
700 16
                                substr($pdfData, $offset),
701
                                $matches,
702 16
                                PREG_OFFSET_CAPTURE
703
                            );
704 16
                            if (1 == $pregResult) {
705 16
                                $objval = substr($pdfData, $offset, $matches[0][1]);
706 16
                                $offset += $matches[1][1];
707
                            }
708
                        }
709 16
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
710
                        // end stream object
711 16
                        $objtype = 'endstream';
712 16
                        $offset += 9;
713 16
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
714
                        // indirect object reference
715 16
                        $objtype = 'objref';
716 16
                        $offset += \strlen($matches[0]);
717 16
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
718 16
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
719
                        // object start
720 2
                        $objtype = 'obj';
721 2
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
722 2
                        $offset += \strlen($matches[0]);
723 16
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
724
                        // numeric object
725 16
                        $objtype = 'numeric';
726 16
                        $objval = substr($pdfData, $offset, $numlen);
727 16
                        $offset += $numlen;
728
                    }
729 16
                    break;
730
        }
731
732 16
        return [$objtype, $objval, $offset];
733
    }
734
735
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the cross-reference data is resolved in this order:
     *  - $offset is 0: the value of the last "startxref ... %%EOF" marker in the data is used;
     *  - $offset points directly at the "xref" keyword: $offset itself is used;
     *  - an "N N obj" header is found at or after $offset: $offset itself is used and
     *    treated as a cross-reference stream object;
     *  - otherwise the value of the first "startxref" marker at or after $offset is used.
     *
     * @param string $pdfData
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';
        $pdfLength = \strlen($pdfData);
        $offset = (int) $offset;

        // an offset outside of the data can never lead to a startxref marker;
        // fail with the documented exception instead of letting strpos() choke on it
        if ($offset < 0 || $offset > $pdfLength) {
            throw new Exception('Unable to find startxref');
        }

        if (0 === $offset) {
            // find last startxref
            $pregResult = preg_match_all(
                $startxrefPattern,
                $pdfData,
                $allStartxref,
                PREG_SET_ORDER,
                $offset
            );
            if (0 == $pregResult) {
                throw new Exception('Unable to find startxref');
            }
            $lastStartxref = array_pop($allStartxref);
            $startxref = (int) $lastStartxref[1];
        } elseif (strpos($pdfData, 'xref', $offset) === $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatches, PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found; a dedicated match array is used on purpose, because a
            // failed preg_match() resets the array it was given to an empty one
            $startxref = (int) $startxrefMatches[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        // a startxref value pointing past the end of the data cannot reference an xref
        if ($startxref > $pdfLength) {
            throw new Exception('Unable to find xref');
        }

        // check xref position (strict comparison: strpos() may return false)
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
797
798
    /**
     * Parses raw PDF data into its cross-reference information and its objects.
     *
     * Everything in front of the first "%PDF-" marker is discarded, the xref and
     * trailer data are read from the remaining data, and every xref entry with
     * a positive offset is decoded into an object.
     *
     * @param string $data PDF data to parse
     *
     * @return array two-element array: the xref data and the array of parsed PDF document objects
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData($data)
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the PDF header; anything before it is not part of the document
        $headerPosition = strpos($data, '%PDF-');
        if (false === $headerPosition) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }
        $pdfData = substr($data, $headerPosition);

        // xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // decode every object that has a positive offset and was not decoded yet
        $objects = [];
        foreach ($xref['xref'] as $obj => $offset) {
            if (isset($objects[$obj]) || !($offset > 0)) {
                continue;
            }

            $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true);
        }

        return [$xref, $objects];
    }
835
}
836