Completed
Push — feature/remove-tcpdf-lib ( d32e07 )
by Konrad
02:34
created

RawDataParser::decodeStream()   D

Complexity

Conditions 21
Paths 51

Size

Total Lines 56
Code Lines 32

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 1 Features 1
Metric Value
cc 21
eloc 32
c 2
b 1
f 1
nc 51
nop 3
dl 0
loc 56
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
     * Configuration array.
     *
     * @var array
     */
    protected $cfg = [
        // if `true` ignore filter decoding errors
        'ignore_filter_decoding_errors' => true,
        // if `true` ignore missing filter decoding errors
        'ignore_missing_filter_decoders' => true,
    ];

    /**
     * Helper used by decodeStream() to list the available filters and to apply them.
     *
     * Declared explicitly: creating it on the fly in the constructor relies on
     * dynamic properties, which are deprecated as of PHP 8.2.
     *
     * @var FilterHelper
     */
    protected $filterHelper;

    /**
     * Cache of indirect objects already resolved by getObjectVal(), keyed by
     * object reference.
     *
     * @var array
     */
    protected $objects = [];

    /**
     * Offsets consulted by getObjectVal() when resolving an object reference,
     * keyed by object reference.
     *
     * NOTE(review): nothing in the visible part of this class fills this array;
     * confirm whether another method is expected to populate it.
     *
     * @var array
     */
    protected $xref = [];

    /**
     * @param array $cfg Configuration array, default is []; given keys override the defaults
     */
    public function __construct($cfg = [])
    {
        // merge given array with default values
        $this->cfg = array_merge($this->cfg, $cfg);

        $this->filterHelper = new FilterHelper();
    }
67
68
    /**
     * Decode the specified stream.
     *
     * Reads /Length and /Filter from the stream dictionary, cuts the stream down to the
     * declared length when that is shorter than the data given, and then applies, in
     * the listed order, every filter the filter helper reports as available. Filters
     * that are not available are returned untouched so the caller can deal with them.
     *
     * @param string $pdfData PDF data
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching `ignore_*` option is disabled
     */
    public function decodeStream($pdfData, $sdic, $stream)
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only a name object that is followed by a value is of interest
            if (('/' !== $v[0]) || !isset($sdic[($k + 1)])) {
                continue;
            }
            $value = $sdic[($k + 1)];

            if ('Length' === $v[1]) {
                if ('numeric' === $value[0]) {
                    // get declared stream length
                    $declength = (int) ($value[1]);
                    if ($declength < $slength) {
                        $stream = substr($stream, 0, $declength);
                        $slength = $declength;
                    }
                }
            } elseif ('Filter' === $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $value);
                if ('/' === $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' === $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' === $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters(), true)) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with '~' marks a missing decoder, anything else
                // (including an empty message) counts as a decoding error
                $missingDecoder = ('' !== $emsg) && ('~' === $emsg[0]);
                $ignored = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignored) {
                    // keep the original exception reachable through getPrevious()
                    throw new Exception($emsg, 0, $e);
                }
                // error ignored: the stream is left as it was before this filter
            }
        }

        return [$stream, $remaining_filters];
    }
134
135
    /**
     * Decode the Cross-Reference section
     *
     * Reads the entries that follow the 'xref' keyword, then the trailer dictionary
     * behind them. Offsets already present in $xref are never overwritten, and the
     * trailer values are only taken when $xref carries no trailer yet. A /Prev entry
     * in the trailer is followed through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found after the entries
     */
    public function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the word 'xref' (4 characters)
        $startxref += 4;
        // then over any white space: NUL, HT, LF, FF, CR, SP
        $offset = $startxref + strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $startxref);

        // running object number, reset by every subsection header
        $obj_num = 0;
        $entryPattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';
        while (preg_match($entryPattern, $pdfData, $matches, PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($matches[0][1] != $offset) {
                // the next match is not adjacent: we are on another section
                break;
            }
            $offset += \strlen($matches[0][0]);

            switch ($matches[3][0]) {
                case 'n':
                    // object in use, indexed as [object number]_[generation number]
                    $index = $obj_num.'_'.(int) ($matches[2][0]);
                    if (!isset($xref['xref'][$index])) {
                        // first offset seen for this object wins
                        $xref['xref'][$index] = (int) ($matches[1][0]);
                    }
                    ++$obj_num;
                    break;

                case 'f':
                    // free object: only advance the counter
                    ++$obj_num;
                    break;

                default:
                    // subsection header: first number is the starting object number
                    $obj_num = (int) ($matches[1][0]);
                    break;
            }
        }

        // get trailer data
        if (!(preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, PREG_OFFSET_CAPTURE, $offset) > 0)) {
            throw new Exception('Unable to find trailer');
        }
        $trailer_data = $matches[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $found) > 0) {
                $xref['trailer']['size'] = (int) ($found[1]);
            }

            // entries holding an indirect reference, stored as [object number]_[generation number]
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailer_data, $found) > 0) {
                    $xref['trailer'][$key] = (int) ($found[1]).'_'.(int) ($found[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $found) > 0) {
                $xref['trailer']['id'] = [$found[1], $found[2]];
            }
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $found) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) ($found[1]), $xref);
        }

        return $xref;
    }
209
210
    /**
     * Decode the Cross-Reference Stream section
     *
     * Reads the stream object found at $startxref, collects the relevant dictionary
     * entries (/Type, /Index, /Prev, /W, /DecodeParms and, when $xref has no trailer
     * yet, the trailer entries), rebuilds the table rows and adds them to $xref.
     * Offsets already present in $xref are never overwritten. A /Prev entry is
     * followed through getXrefData().
     *
     * Rows are rebuilt in one of two ways:
     *  - /DecodeParms gives a positive /Columns: every row starts with a PNG filter
     *    byte and is reconstructed accordingly;
     *  - otherwise: the decoded stream is cut into rows of W[0] + W[1] + W[2] bytes.
     *
     * Nothing is decoded when /W is missing or incomplete, as the field widths are
     * then unknown. Only the first pair of /Index is honoured.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    public function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        $wb = null;          // field widths from /W, null until a complete /W is seen
        $index_first = 0;    // first object number, from /Index
        $prevxref = null;    // offset of the previous xref, from /Prev

        $sarr = (isset($xrefcrs[0][1]) && \is_array($xrefcrs[0][1])) ? $xrefcrs[0][1] : [];
        foreach ($sarr as $k => $v) {
            // every entry of interest is a name object followed by a value
            if (('/' !== $v[0]) || !isset($sarr[($k + 1)])) {
                continue;
            }
            $value = $sarr[($k + 1)];

            switch ($v[1]) {
                case 'Type':
                    if (('/' === $value[0]) && ('XRef' === $value[1])) {
                        $valid_crs = true;
                    }
                    break;

                case 'Index':
                    // first object number in the subsection
                    if (isset($value[1][0][1])) {
                        $index_first = (int) ($value[1][0][1]);
                    }
                    break;

                case 'Prev':
                    if ('numeric' === $value[0]) {
                        // get previous xref offset
                        $prevxref = (int) ($value[1]);
                    }
                    break;

                case 'W':
                    // number of bytes (in the decoded stream) of the corresponding field
                    if (isset($value[1][0][1], $value[1][1][1], $value[1][2][1])) {
                        $wb = [
                            (int) ($value[1][0][1]),
                            (int) ($value[1][1][1]),
                            (int) ($value[1][2][1]),
                        ];
                    }
                    break;

                case 'DecodeParms':
                    if (isset($value[1]) && \is_array($value[1])) {
                        $decpar = $value[1];
                        foreach ($decpar as $kdc => $vdc) {
                            if (('/' === $vdc[0]) && ('Columns' === $vdc[1])
                                && isset($decpar[($kdc + 1)]) && ('numeric' === $decpar[($kdc + 1)][0])
                            ) {
                                $columns = (int) ($decpar[($kdc + 1)][1]);
                            }
                        }
                    }
                    break;

                case 'Size':
                    if ($filltrailer && ('numeric' === $value[0])) {
                        $xref['trailer']['size'] = $value[1];
                    }
                    break;

                case 'Root':
                    if ($filltrailer && ('objref' === $value[0])) {
                        $xref['trailer']['root'] = $value[1];
                    }
                    break;

                case 'Info':
                    if ($filltrailer && ('objref' === $value[0])) {
                        $xref['trailer']['info'] = $value[1];
                    }
                    break;

                case 'Encrypt':
                    if ($filltrailer && ('objref' === $value[0])) {
                        $xref['trailer']['encrypt'] = $value[1];
                    }
                    break;

                case 'ID':
                    if ($filltrailer && isset($value[1][0][1], $value[1][1][1])) {
                        $xref['trailer']['id'] = [$value[1][0][1], $value[1][1][1]];
                    }
                    break;
            }
        }

        // decode data
        if ($valid_crs && isset($xrefcrs[1][3][0]) && (null !== $wb)) {
            // convert the stream into an array of integers
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (false === $bytes) {
                $bytes = [];
            }

            $ddata = [];
            if ($columns > 0) {
                // each row is one PNG filter byte followed by $columns data bytes
                $rowlen = ($columns + 1);
                // the row above the first one is all zeros
                $prev_row = array_fill(0, $rowlen, 0);
                foreach (array_chunk($bytes, $rowlen) as $k => $row) {
                    // get PNG predictor value
                    $predictor = (10 + $row[0]);
                    if (($predictor < 10) || ($predictor > 14)) {
                        throw new Exception('Unknown PNG predictor');
                    }

                    $ddata[$k] = [];
                    for ($i = 1; $i <= $columns; ++$i) {
                        $j = ($i - 1);
                        // a truncated last row is read as if padded with zeros
                        $cur = isset($row[$i]) ? $row[$i] : 0;
                        $row_up = $prev_row[$j];
                        if (1 == $i) {
                            $row_left = 0;
                            $row_upleft = 0;
                        } else {
                            // the filters refer to the reconstructed byte on the left,
                            // not to the still-filtered one
                            $row_left = $ddata[$k][($j - 1)];
                            $row_upleft = $prev_row[($j - 1)];
                        }

                        switch ($predictor) {
                            case 10:  // PNG None
                                $ddata[$k][$j] = $cur;
                                break;

                            case 11:  // PNG Sub
                                $ddata[$k][$j] = (($cur + $row_left) & 0xff);
                                break;

                            case 12:  // PNG Up
                                $ddata[$k][$j] = (($cur + $row_up) & 0xff);
                                break;

                            case 13:  // PNG Average (integer halving, no float involved)
                                $ddata[$k][$j] = (($cur + (($row_left + $row_up) >> 1)) & 0xff);
                                break;

                            case 14:  // PNG Paeth
                                // initial estimate
                                $p = ($row_left + $row_up - $row_upleft);
                                // distances
                                $pa = abs($p - $row_left);
                                $pb = abs($p - $row_up);
                                $pc = abs($p - $row_upleft);
                                // nearest neighbour wins, ties broken in the order left, up, upper left
                                if (($pa <= $pb) && ($pa <= $pc)) {
                                    $nearest = $row_left;
                                } elseif ($pb <= $pc) {
                                    $nearest = $row_up;
                                } else {
                                    $nearest = $row_upleft;
                                }
                                $ddata[$k][$j] = (($cur + $nearest) & 0xff);
                                break;
                        }
                    }
                    $prev_row = $ddata[$k];
                }
            } else {
                // no predictor columns declared: rows are simply W[0] + W[1] + W[2] bytes long
                $rowlen = array_sum($wb);
                if ($rowlen > 0) {
                    $ddata = array_chunk($bytes, $rowlen);
                }
            }

            // combine the bytes of every row into its three big-endian fields
            $sdata = [];
            foreach ($ddata as $k => $row) {
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                for ($c = 0; $c < 3; ++$c) {
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = $index_first;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // 0 = (f) free objects, anything else = null objects
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data

        if (null !== $prevxref) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
426
427
    /**
     * Get content of indirect object.
     *
     * Expects "<number> <generation> obj" at $offset (leading zeros are skipped) and
     * collects every element up to, but not including, the closing `endobj`. When
     * the expected header is not at that position, the null object is returned.
     *
     * @param string $pdfData  PDF data
     * @param string $obj_ref  Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data, or ['null', 'null', $offset] for an undefined object
     *
     * @throws Exception if invalid object reference found
     */
    public function getIndirectObject($pdfData, $obj_ref, $offset = 0, $decoding = true)
    {
        $obj = explode('_', $obj_ref);
        if (2 != \count($obj)) {
            // report the reference as given; the exploded array cannot be printed
            throw new Exception('Invalid object reference: '.$obj_ref);
        }

        $objref = $obj[0].' '.$obj[1].' obj';
        // ignore leading zeros
        $offset += strspn($pdfData, '0', $offset);
        // compare in place: a strict check that neither mistakes "not found" for
        // position 0 nor searches the rest of the document
        if (substr($pdfData, $offset, \strlen($objref)) !== $objref) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // starting position of object content
        $offset += \strlen($objref);
        // get array of object content
        $objdata = [];
        $i = 0; // object main index
        do {
            $oldoffset = $offset;
            // get element
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];
            // decode stream using stream's dictionary information
            if ($decoding && ('stream' == $element[0])
                && isset($objdata[($i - 1)][0]) && ('<<' == $objdata[($i - 1)][0])
            ) {
                $element[3] = $this->decodeStream($pdfData, $objdata[($i - 1)][1], $element[1]);
            }
            $objdata[$i] = $element;
            ++$i;
            // stop at the closing keyword, or when the parser no longer advances
        } while (('endobj' != $element[0]) && ($offset != $oldoffset));

        // remove closing delimiter
        array_pop($objdata);

        // return raw object content
        return $objdata;
    }
476
477
    /**
     * Get the content of an object, resolving an indirect object reference if necessary.
     *
     * Anything that is not an 'objref' is handed back unchanged. A reference is
     * answered from the cache of already parsed objects when possible; otherwise it
     * is parsed (without decoding streams) and cached, provided an offset is known
     * for it. A reference without a known offset is handed back unchanged as well.
     *
     * @param string $pdfData PDF data
     * @param array  $obj     Object value; index 0 is read as its type, index 1 as its value
     *
     * @return array containing object data
     */
    public function getObjectVal($pdfData, $obj)
    {
        if ('objref' != $obj[0]) {
            // not a reference: nothing to resolve
            return $obj;
        }

        $reference = $obj[1];

        if (isset($this->objects[$reference])) {
            // this object has been already parsed
            return $this->objects[$reference];
        }

        if (!isset($this->xref[$reference])) {
            // no offset known for this reference
            return $obj;
        }

        // parse new object and remember it
        $this->objects[$reference] = $this->getIndirectObject(
            $pdfData,
            $reference,
            $this->xref[$reference],
            false
        );

        return $this->objects[$reference];
    }
502
503
    /**
     * Parses a PDF and returns extracted data.
     *
     * Everything in front of the first "%PDF-" marker is discarded, the xref data is
     * read through getXrefData(), and every xref entry with a positive offset is
     * parsed with stream decoding enabled. Entries with an offset of zero or below
     * are left out of the object list.
     *
     * @param string $data PDF data to parse
     *
     * @return array array of parsed PDF document objects: index 0 holds the xref data,
     *               index 1 the parsed objects keyed like the xref entries
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function getParsedData($data)
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // find the pdf header starting position
        $trimpos = strpos($data, '%PDF-');
        if (false === $trimpos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }

        // get PDF content string
        $pdfData = substr($data, $trimpos);

        // get xref and trailer data
        $xref = $this->getXrefData($pdfData);

        // parse all document objects; tolerate xref data without an 'xref' table
        $entries = (isset($xref['xref']) && \is_array($xref['xref'])) ? $xref['xref'] : [];
        $objects = [];
        foreach ($entries as $obj => $offset) {
            if ($offset > 0) {
                // decode objects with positive offset
                $objects[$obj] = $this->getIndirectObject($pdfData, $obj, $offset, true);
            }
        }

        // $pdfData goes out of scope here, so no explicit clean-up is needed
        return [$xref, $objects];
    }
543
544
    /**
545
     * Get object type, raw value and offset to next object
546
     *
547
     * @param int $offset Object offset
548
     *
549
     * @return array containing object type, raw value and offset to next object
550
     */
551
    public function getRawObject($pdfData, $offset = 0)
552
    {
553
        $objtype = ''; // object type to be returned
554
        $objval = ''; // object value to be returned
555
556
        /*
557
         * skip initial white space chars:
558
         *      \x00 null (NUL)
559
         *      \x09 horizontal tab (HT)
560
         *      \x0A line feed (LF)
561
         *      \x0C form feed (FF)
562
         *      \x0D carriage return (CR)
563
         *      \x20 space (SP)
564
         */
565
        $offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
566
567
        // get first char
568
        $char = $pdfData[$offset];
569
        // get object type
570
        switch ($char) {
571
            case '%':  // \x25 PERCENT SIGN
572
                    // skip comment and search for next token
573
                    $next = strcspn($pdfData, "\r\n", $offset);
574
                    if ($next > 0) {
575
                        $offset += $next;
576
577
                        return $this->getRawObject($pdfData, $offset);
578
                    }
579
                    break;
580
581
            case '/':  // \x2F SOLIDUS
582
                    // name object
583
                    $objtype = $char;
584
                    ++$offset;
585
                    $pregResult = preg_match(
586
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
587
                        substr($pdfData, $offset, 256),
588
                        $matches
589
                    );
590
                    if (1 == $pregResult) {
591
                        $objval = $matches[1]; // unescaped value
592
                        $offset += \strlen($objval);
593
                    }
594
                    break;
595
596
            case '(':   // \x28 LEFT PARENTHESIS
597
            case ')':  // \x29 RIGHT PARENTHESIS
598
                    // literal string object
599
                    $objtype = $char;
600
                    ++$offset;
601
                    $strpos = $offset;
602
                    if ('(' == $char) {
603
                        $open_bracket = 1;
604
                        while ($open_bracket > 0) {
605
                            if (!isset($pdfData[$strpos])) {
606
                                break;
607
                            }
608
                            $ch = $pdfData[$strpos];
609
                            switch ($ch) {
610
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
611
                                        // skip next character
612
                                        ++$strpos;
613
                                        break;
614
615
                                case '(':  // LEFT PARENHESIS (28h)
616
                                        ++$open_bracket;
617
                                        break;
618
619
                                case ')':  // RIGHT PARENTHESIS (29h)
620
                                        --$open_bracket;
621
                                        break;
622
                            }
623
                            ++$strpos;
624
                        }
625
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
626
                        $offset = $strpos;
627
                    }
628
                    break;
629
630
            case '[':   // \x5B LEFT SQUARE BRACKET
631
            case ']':  // \x5D RIGHT SQUARE BRACKET
632
                    // array object
633
                    $objtype = $char;
634
                    ++$offset;
635
                    if ('[' == $char) {
636
                        // get array content
637
                        $objval = [];
638
                        do {
639
                            // get element
640
                            $element = $this->getRawObject($pdfData, $offset);
641
                            $offset = $element[2];
642
                            $objval[] = $element;
643
                        } while (']' != $element[0]);
644
                        // remove closing delimiter
645
                        array_pop($objval);
646
                    }
647
                    break;
648
649
            case '<':  // \x3C LESS-THAN SIGN
650
            case '>':  // \x3E GREATER-THAN SIGN
651
                    if (isset($pdfData[($offset + 1)]) and ($pdfData[($offset + 1)] == $char)) {
652
                        // dictionary object
653
                        $objtype = $char.$char;
654
                        $offset += 2;
655
                        if ('<' == $char) {
656
                            // get array content
657
                            $objval = [];
658
                            do {
659
                                // get element
660
                                $element = $this->getRawObject($pdfData, $offset);
661
                                $offset = $element[2];
662
                                $objval[] = $element;
663
                            } while ('>>' != $element[0]);
664
                            // remove closing delimiter
665
                            array_pop($objval);
666
                        }
667
                    } else {
668
                        // hexadecimal string object
669
                        $objtype = $char;
670
                        ++$offset;
671
                        $pregResult = preg_match(
672
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
673
                            substr($pdfData, $offset),
674
                            $matches
675
                        );
676
                        if (('<' == $char) && 1 == $pregResult) {
677
                            // remove white space characters
678
                            $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
679
                            $offset += \strlen($matches[0]);
680
                        } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
681
                            $offset = $endpos + 1;
682
                        }
683
                    }
684
                    break;
685
686
            default:
687
                    if ('endobj' == substr($pdfData, $offset, 6)) {
688
                        // indirect object
689
                        $objtype = 'endobj';
690
                        $offset += 6;
691
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
692
                        // null object
693
                        $objtype = 'null';
694
                        $offset += 4;
695
                        $objval = 'null';
696
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
697
                        // boolean true object
698
                        $objtype = 'boolean';
699
                        $offset += 4;
700
                        $objval = 'true';
701
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
702
                        // boolean false object
703
                        $objtype = 'boolean';
704
                        $offset += 5;
705
                        $objval = 'false';
706
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
707
                        // start stream object
708
                        $objtype = 'stream';
709
                        $offset += 6;
710
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
711
                            $offset += \strlen($matches[0]);
712
                            $pregResult = preg_match(
713
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
714
                                substr($pdfData, $offset),
715
                                $matches,
716
                                PREG_OFFSET_CAPTURE
717
                            );
718
                            if (1 == $pregResult) {
719
                                $objval = substr($pdfData, $offset, $matches[0][1]);
720
                                $offset += $matches[1][1];
721
                            }
722
                        }
723
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
724
                        // end stream object
725
                        $objtype = 'endstream';
726
                        $offset += 9;
727
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
728
                        // indirect object reference
729
                        $objtype = 'objref';
730
                        $offset += \strlen($matches[0]);
731
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
732
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
733
                        // object start
734
                        $objtype = 'obj';
735
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
736
                        $offset += \strlen($matches[0]);
737
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
738
                        // numeric object
739
                        $objtype = 'numeric';
740
                        $objval = substr($pdfData, $offset, $numlen);
741
                        $offset += $numlen;
742
                    }
743
                    break;
744
        }
745
746
        return [$objtype, $objval, $offset];
747
    }
748
749
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position handed to the decoder is resolved in this order:
     *  - $offset is 0: the number captured by the LAST "startxref ... %%EOF" marker in $pdfData;
     *  - the keyword "xref" sits exactly at $offset: $offset itself;
     *  - an "N N obj" header is found at or after $offset: $offset itself (cross-reference stream);
     *  - otherwise: the number captured by the FIRST "startxref ... %%EOF" marker at or after $offset.
     *
     * If the keyword "xref" sits exactly at the resolved position, decodeXref() is used,
     * otherwise decodeXrefStream().
     *
     * @param string $pdfData
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    public function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';

        if (0 == $offset) {
            // No offset given: take the last startxref marker of the document.
            $found = preg_match_all($startxrefPattern, $pdfData, $startxrefMatches, PREG_SET_ORDER);
            if (0 == $found) {
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = end($startxrefMatches);
            $startxref = (int) $lastMatch[1];
        } elseif (strpos($pdfData, 'xref', $offset) === (int) $offset) {
            // Already pointing at the xref table
            $startxref = (int) $offset;
        } elseif (
            1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, PREG_OFFSET_CAPTURE, $offset)
        ) {
            // Cross-Reference Stream object
            $startxref = (int) $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $startxrefMatch, 0, $offset)) {
            // startxref found; it has its own match array because a failed preg_match()
            // resets the array it is given, so sharing one with the "obj" probe above
            // would leave nothing to read here.
            $startxref = (int) $startxrefMatch[1];
        } else {
            throw new Exception('Unable to find startxref');
        }

        // check xref position
        // Strict comparison: strpos() returns false when "xref" is absent, and a loose
        // comparison would treat that as equal to a start position of 0.
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }
        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
811
}
812