Completed
Push — master ( 66ad27...2fab78 )
by Konrad
12:00 queued 08:18
created

RawDataParser::decodeXref()   C

Complexity

Conditions 15
Paths 134

Size

Total Lines 64
Code Lines 37

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 34
CRAP Score 15.0386

Importance

Changes 3
Bugs 1 Features 1
Metric Value
cc 15
eloc 37
c 3
b 1
f 1
nc 134
nop 3
dl 0
loc 64
ccs 34
cts 36
cp 0.9444
crap 15.0386
rs 5.6333

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
48
     * Configuration array.
49
     */
50
    protected $cfg = [
        // if `true` ignore filter decoding errors
        'ignore_filter_decoding_errors' => true,
        // if `true` ignore missing filter decoding errors
        'ignore_missing_filter_decoders' => true,
    ];

    /**
     * Helper used by decodeStream() to list the available filters and to apply them.
     */
    protected $filterHelper;

    /**
     * Indirect objects already resolved by getObjectVal(), keyed by object reference.
     *
     * Declared explicitly so that the first write in getObjectVal() does not create
     * a dynamic property (deprecated since PHP 8.2).
     *
     * @var array
     */
    protected $objects = [];

    protected $xrefCache;
60
61
    /**
62
     * @param array $cfg Configuration array, default is []
63
     */
64 16
    public function __construct($cfg = [])
    {
        $this->filterHelper = new FilterHelper();

        // values supplied by the caller override the built-in defaults key by key
        $defaults = $this->cfg;
        $this->cfg = array_merge($defaults, $cfg);
    }
71
72
    /**
73
     * Decode the specified stream.
74
     *
75
     * @param string $pdfData PDF data
76
     * @param array  $xref
77
     * @param array  $sdic    Stream's dictionary array
78
     * @param string $stream  Stream to decode
79
     *
80
     * @return array containing decoded stream data and remaining filters
81
     */
82 16
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // Result shape: [decoded stream, names of the filters that were not applied].
        // An empty stream has nothing to decode.
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        // Each dictionary key is a name element ('/') and its value is read from the
        // element that follows it, i.e. $sdic[$k + 1].
        $filters = [];
        foreach ($sdic as $k => $v) {
            if ('/' !== $v[0] || !isset($sdic[$k + 1])) {
                continue;
            }
            $next = $sdic[$k + 1];

            if ('Length' === $v[1] && 'numeric' === $next[0]) {
                // cut the stream down to its declared length when that is shorter
                $declength = (int) ($next[1]);
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' === $v[1]) {
                // the filter entry may be given indirectly; resolve it first
                $objval = $this->getObjectVal($pdfData, $xref, $next);
                if ('/' === $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' === $objval[0]) {
                    // array of filters: keep the name elements only
                    foreach ($objval[1] as $flt) {
                        if ('/' === $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // apply the filters in the order they were declared
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters(), true)) {
                // no decoder for this filter: report it back to the caller
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // A message starting with '~' is handled as a "missing decoder" error.
                // The emptiness check avoids reading offset 0 of an empty message.
                $missingDecoder = ('' !== $emsg) && ('~' === $emsg[0]);
                $ignore = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignore) {
                    // same type and message as before, but keep the original exception chained
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
139
140
    /**
141
     * Decode the Cross-Reference section
142
     *
143
     * @param string $pdfData   PDF data
144
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
145
     * @param array  $xref      Previous xref array (if any)
146
     *
147
     * @return array containing xref and trailer data
148
     */
149 15
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the word 'xref' (4 bytes) and the white space that follows it:
        // \x00 NUL, \x09 HT, \x0A LF, \x0C FF, \x0D CR, \x20 SP
        $cursor = $startxref + 4;
        $cursor += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $cursor);

        // object number that the next table entry belongs to
        $objectNumber = 0;
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';

        // consume entries and subsection headers for as long as they directly follow each other;
        // a match that starts anywhere else belongs to another section
        while (
            preg_match($entryPattern, $pdfData, $hit, PREG_OFFSET_CAPTURE, $cursor) > 0
            && $hit[0][1] == $cursor
        ) {
            $cursor += \strlen($hit[0][0]);

            switch ($hit[3][0]) {
                case 'n':
                    // in-use entry, indexed as [object number]_[generation number];
                    // an offset that is already known is never overwritten
                    $index = $objectNumber.'_'.(int) ($hit[2][0]);
                    if (!isset($xref['xref'][$index])) {
                        $xref['xref'][$index] = (int) ($hit[1][0]);
                    }
                    ++$objectNumber;
                    break;

                case 'f':
                    // free entry: only advances the object number
                    ++$objectNumber;
                    break;

                default:
                    // subsection header: the first number is the starting object number
                    $objectNumber = (int) ($hit[1][0]);
                    break;
            }
        }

        // the trailer dictionary has to follow the table
        if (!(preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $hit, PREG_OFFSET_CAPTURE, $cursor) > 0)) {
            throw new Exception('Unable to find trailer');
        }
        $trailerData = $hit[1][0];

        // trailer values are only taken when none have been stored yet
        if (empty($xref['trailer'])) {
            $xref['trailer'] = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
                $xref['trailer']['size'] = (int) ($found[1]);
            }

            // entries that hold an indirect reference "<num> <gen> R", stored as "<num>_<gen>"
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailerData, $found) > 0) {
                    $xref['trailer'][$key] = (int) ($found[1]).'_'.(int) ($found[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $found) > 0) {
                $xref['trailer']['id'] = [$found[1], $found[2]];
            }
        }

        // continue with the previous cross-reference section, if the trailer points to one
        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailerData, $found) > 0) {
            $xref = $this->getXrefData($pdfData, (int) ($found[1]), $xref);
        }

        return $xref;
    }
214
215
    /**
216
     * Decode the Cross-Reference Stream section
217
     *
218
     * @param string $pdfData   PDF data
219
     * @param int    $startxref Offset at which the xref section starts
220
     * @param array  $xref      Previous xref array (if any)
221
     *
222
     * @return array containing xref and trailer data
223
     *
224
     * @throws Exception if unknown PNG predictor detected
225
     */
226 2
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // read the object found at $startxref and decode its stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // trailer values are only taken when none have been stored yet
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;  // true once "/Type /XRef" has been seen
        $columns = 0;        // /Columns from /DecodeParms
        $predictor = null;   // /Predictor from /DecodeParms, null when not declared
        $index_first = 0;    // first object number, taken from /Index
        $prevxref = null;    // offset of the previous xref section, taken from /Prev
        $wb = [];            // byte widths of the three entry fields, taken from /W

        // first element of the object is expected to be its dictionary
        $sarr = (isset($xrefcrs[0][1]) && \is_array($xrefcrs[0][1])) ? $xrefcrs[0][1] : [];

        // every key is a name element ('/'); its value is the element that follows it
        foreach ($sarr as $k => $v) {
            if ('/' !== $v[0]) {
                continue;
            }
            $key = $v[1];
            $next = isset($sarr[$k + 1]) ? $sarr[$k + 1] : null;

            if ('Type' === $key && null !== $next && '/' === $next[0] && 'XRef' === $next[1]) {
                $valid_crs = true;
            } elseif ('Index' === $key && null !== $next) {
                // first object number in the subsection
                // (only the first pair of the /Index array is read here)
                if (\is_array($next[1]) && isset($next[1][0][1])) {
                    $index_first = (int) ($next[1][0][1]);
                }
            } elseif ('Prev' === $key && null !== $next && 'numeric' === $next[0]) {
                // get previous xref offset
                $prevxref = (int) ($next[1]);
            } elseif ('W' === $key && null !== $next) {
                // number of bytes (in the decoded stream) of the corresponding field;
                // kept only when all three widths are present
                $widths = $next[1];
                if (\is_array($widths) && isset($widths[0][1], $widths[1][1], $widths[2][1])) {
                    $wb = [(int) ($widths[0][1]), (int) ($widths[1][1]), (int) ($widths[2][1])];
                }
            } elseif ('DecodeParms' === $key && isset($next[1])) {
                $decpar = \is_array($next[1]) ? $next[1] : [];
                foreach ($decpar as $kdc => $vdc) {
                    if ('/' !== $vdc[0] || !isset($decpar[$kdc + 1]) || 'numeric' !== $decpar[$kdc + 1][0]) {
                        continue;
                    }
                    if ('Columns' === $vdc[1]) {
                        $columns = (int) ($decpar[$kdc + 1][1]);
                    } elseif ('Predictor' === $vdc[1]) {
                        $predictor = (int) ($decpar[$kdc + 1][1]);
                    }
                }
            } elseif ($filltrailer && null !== $next) {
                if ('Size' === $key && 'numeric' === $next[0]) {
                    $xref['trailer']['size'] = $next[1];
                } elseif ('Root' === $key && 'objref' === $next[0]) {
                    $xref['trailer']['root'] = $next[1];
                } elseif ('Info' === $key && 'objref' === $next[0]) {
                    $xref['trailer']['info'] = $next[1];
                } elseif ('Encrypt' === $key && 'objref' === $next[0]) {
                    $xref['trailer']['encrypt'] = $next[1];
                } elseif ('ID' === $key) {
                    $xref['trailer']['id'] = [];
                    $xref['trailer']['id'][0] = $next[1][0][1];
                    $xref['trailer']['id'][1] = $next[1][1][1];
                }
            }
        }

        // decode data; without the three /W widths the rows cannot be interpreted
        if ($valid_crs && 3 === \count($wb) && isset($xrefcrs[1][3][0])) {
            // convert the stream into a list of byte values
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (false === $bytes) {
                $bytes = [];
            }

            if (null !== $predictor && $predictor >= 10) {
                // PNG prediction: every row is one filter-type byte followed by $columns data bytes
                $rowlen = max(1, $columns + 1);
                $ddata = [];
                // the row above the first row counts as all zeros
                $prev_row = array_fill(0, $rowlen, 0);

                foreach (array_chunk($bytes, $rowlen) as $k => $row) {
                    $ddata[$k] = [];
                    // filter used for this row (10 = None ... 14 = Paeth)
                    $rowPredictor = 10 + $row[0];

                    for ($i = 1; $i <= $columns; ++$i) {
                        $j = $i - 1; // index in the decoded row
                        $cur = isset($row[$i]) ? $row[$i] : 0;
                        $row_up = $prev_row[$j];
                        if (1 === $i) {
                            $row_left = 0;
                            $row_upleft = 0;
                        } else {
                            // "left" is the already reconstructed byte, not the filtered one
                            $row_left = $ddata[$k][$j - 1];
                            $row_upleft = $prev_row[$j - 1];
                        }

                        switch ($rowPredictor) {
                            case 10:  // PNG None
                                $ddata[$k][$j] = $cur;
                                break;

                            case 11:  // PNG Sub
                                $ddata[$k][$j] = (($cur + $row_left) & 0xff);
                                break;

                            case 12:  // PNG Up
                                $ddata[$k][$j] = (($cur + $row_up) & 0xff);
                                break;

                            case 13:  // PNG Average (integer division, rounds down)
                                $ddata[$k][$j] = (($cur + (($row_left + $row_up) >> 1)) & 0xff);
                                break;

                            case 14:  // PNG Paeth
                                // initial estimate and its distance to each neighbour
                                $p = ($row_left + $row_up - $row_upleft);
                                $pa = abs($p - $row_left);
                                $pb = abs($p - $row_up);
                                $pc = abs($p - $row_upleft);
                                // nearest neighbour wins; ties are resolved in the order left, up, upper-left
                                if ($pa <= $pb && $pa <= $pc) {
                                    $nearest = $row_left;
                                } elseif ($pb <= $pc) {
                                    $nearest = $row_up;
                                } else {
                                    $nearest = $row_upleft;
                                }
                                $ddata[$k][$j] = (($cur + $nearest) & 0xff);
                                break;

                            default:
                                throw new Exception('Unknown PNG predictor');
                        }
                    }
                    $prev_row = $ddata[$k];
                }
            } else {
                // no PNG predictor declared: rows carry no filter-type byte and are exactly
                // as long as the three fields together (TIFF prediction is not handled here)
                $rowlen = array_sum($wb);
                $ddata = ($rowlen > 0) ? array_chunk($bytes, $rowlen) : [];
            }

            // combine the bytes of every row into the three big-endian entry fields
            $sdata = [];
            foreach ($ddata as $k => $row) {
                $sdata[$k] = [0, 0, 0];
                if (0 === $wb[0]) {
                    // the type field is absent: default to type 1
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                for ($c = 0; $c < 3; ++$c) {
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = $index_first;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // an offset that is already known is never overwritten
                        if (!isset($xref['xref'][$index])) {
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
            }
        }

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
453
454
    /**
455
     * Get content of indirect object.
456
     *
457
     * @param string $pdfData  PDF data
458
     * @param array  $xref
459
     * @param string $obj_ref  Object number and generation number separated by underscore character
460
     * @param int    $offset   Object offset
461
     * @param bool   $decoding If true decode streams
462
     *
463
     * @return array containing object data
464
     *
465
     * @throws Exception if invalid object reference found
466
     */
467 16
    protected function getIndirectObject($pdfData, $xref, $obj_ref, $offset = 0, $decoding = true)
    {
        // $obj_ref has to be "<object number>_<generation number>"
        $obj = explode('_', $obj_ref);
        if (2 !== \count($obj)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $objref = $obj[0].' '.$obj[1].' obj';

        // ignore leading zeros
        $offset += strspn($pdfData, '0', $offset);

        // The object header has to start exactly at $offset. Only those bytes are compared:
        // this avoids scanning the rest of the document, and a missing header can no longer
        // be mistaken for a match at offset 0 (false != 0 is false under loose comparison).
        if (substr($pdfData, $offset, \strlen($objref)) !== $objref) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // starting position of object content
        $offset += \strlen($objref);

        // collect the elements of the object until 'endobj' is read or parsing stops advancing
        $objdata = [];
        $i = 0; // object main index
        do {
            $oldoffset = $offset;

            // get element
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];

            // a stream that directly follows a dictionary is decoded with that dictionary;
            // the result is stored as the fourth item of the stream element
            if (
                $decoding
                && 'stream' === $element[0]
                && isset($objdata[$i - 1][0])
                && '<<' === $objdata[$i - 1][0]
            ) {
                $element[3] = $this->decodeStream($pdfData, $xref, $objdata[$i - 1][1], $element[1]);
            }

            $objdata[$i] = $element;
            ++$i;
        } while ('endobj' !== $element[0] && $offset != $oldoffset);

        // remove closing delimiter
        array_pop($objdata);

        // return raw object content
        return $objdata;
    }
504
505
    /**
506
     * Get the content of an object, resolving an indirect object reference if necessary.
507
     *
508
     * @param string $pdfData PDF data
509
     * @param array  $obj     Object element as [type, value, ...]
510
     *
511
     * @return array containing object data
512
     */
513 16
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        // anything that is not an indirect reference is handed back untouched
        if ('objref' != $obj[0]) {
            return $obj;
        }

        $reference = $obj[1];

        if (!isset($this->objects[$reference])) {
            // NOTE(review): decodeXref()/decodeXrefStream() store offsets under $xref['xref'],
            // so this top-level lookup may never match - confirm what callers pass as $xref.
            if (!isset($xref[$reference])) {
                // unknown reference: the reference element itself is returned
                return $obj;
            }

            // parse the object once (streams are left undecoded) and keep it for later calls
            $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);
        }

        return $this->objects[$reference];
    }
530
531
    /**
532
     * Get object type, raw value and offset to next object
533
     *
534
     * @param int $offset Object offset
535
     *
536
     * @return array containing object type, raw value and offset to next object
537
     */
538 16
    protected function getRawObject($pdfData, $offset = 0)
539
    {
540 16
        $objtype = ''; // object type to be returned
541 16
        $objval = ''; // object value to be returned
542
543
        /*
544
         * skip initial white space chars:
545
         *      \x00 null (NUL)
546
         *      \x09 horizontal tab (HT)
547
         *      \x0A line feed (LF)
548
         *      \x0C form feed (FF)
549
         *      \x0D carriage return (CR)
550
         *      \x20 space (SP)
551
         */
552 16
        $offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
553
554
        // get first char
555 16
        $char = $pdfData[$offset];
556
        // get object type
557
        switch ($char) {
558 16
            case '%':  // \x25 PERCENT SIGN
559
                    // skip comment and search for next token
560
                    $next = strcspn($pdfData, "\r\n", $offset);
561
                    if ($next > 0) {
562
                        $offset += $next;
563
564
                        return $this->getRawObject($pdfData, $offset);
565
                    }
566
                    break;
567
568 16
            case '/':  // \x2F SOLIDUS
569
                    // name object
570 16
                    $objtype = $char;
571 16
                    ++$offset;
572 16
                    $pregResult = preg_match(
573 16
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
574 16
                        substr($pdfData, $offset, 256),
575 16
                        $matches
576
                    );
577 16
                    if (1 == $pregResult) {
578 16
                        $objval = $matches[1]; // unescaped value
579 16
                        $offset += \strlen($objval);
580
                    }
581 16
                    break;
582
583 16
            case '(':   // \x28 LEFT PARENTHESIS
584 16
            case ')':  // \x29 RIGHT PARENTHESIS
585
                    // literal string object
586 16
                    $objtype = $char;
587 16
                    ++$offset;
588 16
                    $strpos = $offset;
589 16
                    if ('(' == $char) {
590 16
                        $open_bracket = 1;
591 16
                        while ($open_bracket > 0) {
592 16
                            if (!isset($pdfData[$strpos])) {
593
                                break;
594
                            }
595 16
                            $ch = $pdfData[$strpos];
596
                            switch ($ch) {
597 16
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
598
                                        // skip next character
599 14
                                        ++$strpos;
600 14
                                        break;
601
602 16
                                case '(':  // LEFT PARENHESIS (28h)
603
                                        ++$open_bracket;
604
                                        break;
605
606 16
                                case ')':  // RIGHT PARENTHESIS (29h)
607 16
                                        --$open_bracket;
608 16
                                        break;
609
                            }
610 16
                            ++$strpos;
611
                        }
612 16
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
613 16
                        $offset = $strpos;
614
                    }
615 16
                    break;
616
617 16
            case '[':   // \x5B LEFT SQUARE BRACKET
618 16
            case ']':  // \x5D RIGHT SQUARE BRACKET
619
                    // array object
620 16
                    $objtype = $char;
621 16
                    ++$offset;
622 16
                    if ('[' == $char) {
623
                        // get array content
624 16
                        $objval = [];
625
                        do {
626
                            // get element
627 16
                            $element = $this->getRawObject($pdfData, $offset);
628 16
                            $offset = $element[2];
629 16
                            $objval[] = $element;
630 16
                        } while (']' != $element[0]);
631
                        // remove closing delimiter
632 16
                        array_pop($objval);
633
                    }
634 16
                    break;
635
636 16
            case '<':  // \x3C LESS-THAN SIGN
637 16
            case '>':  // \x3E GREATER-THAN SIGN
638 16
                    if (isset($pdfData[($offset + 1)]) and ($pdfData[($offset + 1)] == $char)) {
639
                        // dictionary object
640 16
                        $objtype = $char.$char;
641 16
                        $offset += 2;
642 16
                        if ('<' == $char) {
643
                            // get array content
644 16
                            $objval = [];
645
                            do {
646
                                // get element
647 16
                                $element = $this->getRawObject($pdfData, $offset);
648 16
                                $offset = $element[2];
649 16
                                $objval[] = $element;
650 16
                            } while ('>>' != $element[0]);
651
                            // remove closing delimiter
652 16
                            array_pop($objval);
653
                        }
654
                    } else {
655
                        // hexadecimal string object
656 4
                        $objtype = $char;
657 4
                        ++$offset;
658 4
                        $pregResult = preg_match(
659 4
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
660 4
                            substr($pdfData, $offset),
661 4
                            $matches
662
                        );
663 4
                        if (('<' == $char) && 1 == $pregResult) {
664
                            // remove white space characters
665 4
                            $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
666 4
                            $offset += \strlen($matches[0]);
667
                        } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
668
                            $offset = $endpos + 1;
669
                        }
670
                    }
671 16
                    break;
672
673
            default:
674 16
                    if ('endobj' == substr($pdfData, $offset, 6)) {
675
                        // indirect object
676 16
                        $objtype = 'endobj';
677 16
                        $offset += 6;
678 16
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
679
                        // null object
680 2
                        $objtype = 'null';
681 2
                        $offset += 4;
682 2
                        $objval = 'null';
683 16
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
684
                        // boolean true object
685 4
                        $objtype = 'boolean';
686 4
                        $offset += 4;
687 4
                        $objval = 'true';
688 16
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
689
                        // boolean false object
690
                        $objtype = 'boolean';
691
                        $offset += 5;
692
                        $objval = 'false';
693 16
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
694
                        // start stream object
695 16
                        $objtype = 'stream';
696 16
                        $offset += 6;
697 16
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
698 16
                            $offset += \strlen($matches[0]);
699 16
                            $pregResult = preg_match(
700 16
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
701 16
                                substr($pdfData, $offset),
702 16
                                $matches,
703 16
                                PREG_OFFSET_CAPTURE
704
                            );
705 16
                            if (1 == $pregResult) {
706 16
                                $objval = substr($pdfData, $offset, $matches[0][1]);
707 16
                                $offset += $matches[1][1];
708
                            }
709
                        }
710 16
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
711
                        // end stream object
712 16
                        $objtype = 'endstream';
713 16
                        $offset += 9;
714 16
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
715
                        // indirect object reference
716 16
                        $objtype = 'objref';
717 16
                        $offset += \strlen($matches[0]);
718 16
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
719 16
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
720
                        // object start
721 2
                        $objtype = 'obj';
722 2
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
723 2
                        $offset += \strlen($matches[0]);
724 16
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
725
                        // numeric object
726 16
                        $objtype = 'numeric';
727 16
                        $objval = substr($pdfData, $offset, $numlen);
728 16
                        $offset += $numlen;
729
                    }
730 16
                    break;
731
        }
732
733 16
        return [$objtype, $objval, $offset];
734
    }
735
736
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the xref section is resolved as follows:
     *  - offset 0: the last "startxref ... %%EOF" marker of the document is used;
     *  - offset pointing at the "xref" keyword: the offset itself is used;
     *  - an "N N obj" header is found at or after the offset: the offset is
     *    treated as the start of a cross-reference stream object;
     *  - otherwise the next "startxref ... %%EOF" marker after the offset is used.
     *
     * @param string $pdfData
     * @param int    $offset  xref offset (if known), 0 to search for the last startxref
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';
        $dataLength = \strlen($pdfData);

        // normalise once so the strict comparisons below are reliable
        $offset = (int) $offset;

        if (0 === $offset) {
            // find last startxref
            $pregResult = preg_match_all($startxrefPattern, $pdfData, $startxrefMatches, PREG_SET_ORDER);
            if (0 == $pregResult) {
                // covers both "no match" (0) and "regex error" (false)
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($startxrefMatches);
            $startxref = (int) $lastMatch[1];
        } elseif ($offset < 0 || $offset > $dataLength) {
            // an offset outside the document cannot point at anything;
            // bail out before strpos()/preg_match() choke on it
            throw new Exception('Unable to find startxref');
        } elseif (strpos($pdfData, 'xref', $offset) === $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatch, PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found; a dedicated match array is used here because
            // the "obj" probe above resets its own match array on failure
            $startxref = (int) $startxrefMatch[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        // a startxref value pointing past the end of the data means the
        // document is truncated or corrupted
        if ($startxref > $dataLength) {
            throw new Exception('Unable to find xref');
        }

        // check xref position
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
798
799
    /**
     * Parses raw PDF data into its cross-reference information and objects.
     *
     * Everything in front of the "%PDF-" header is discarded, the xref and
     * trailer data are resolved and every object listed in the xref table
     * with a positive offset is decoded.
     *
     * @param string $data PDF data to parse
     *
     * @return array two-element array: the xref data and the array of parsed PDF document objects
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData($data)
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // locate the PDF header
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }

        // drop anything that precedes the header
        $pdfData = substr($data, $headerPos);

        // resolve xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // decode every object referenced with a positive offset
        $objects = [];
        foreach ($xref['xref'] as $objKey => $objOffset) {
            if (isset($objects[$objKey]) || !($objOffset > 0)) {
                continue;
            }

            $objects[$objKey] = $this->getIndirectObject($pdfData, $xref, $objKey, $objOffset, true);
        }

        return [$xref, $objects];
    }
836
}
837