Completed
Pull Request — master (#299)
by Konrad
04:47 queued 02:52
created

RawDataParser   F

Complexity

Total Complexity 180

Size/Duplication

Total Lines 790
Duplicated Lines 0 %

Importance

Changes 5
Bugs 2 Features 1
Metric Value
eloc 427
c 5
b 2
f 1
dl 0
loc 790
rs 2
wmc 180

9 Methods

Rating   Name   Duplication   Size   Complexity  
F getRawObject() 0 196 37
D decodeStream() 0 56 21
A getObjectVal() 0 16 4
F decodeXrefStream() 0 226 78
C decodeXref() 0 64 15
B getXrefData() 0 49 8
A parseData() 0 26 6
A __construct() 0 6 1
B getIndirectObject() 0 36 10

How to fix   Complexity   

Complex Class

Complex classes like RawDataParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use RawDataParser, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
/**
4
 * This file is based on code of tecnickcom/TCPDF PDF library.
5
 *
6
 * Original author Nicola Asuni ([email protected]) and
7
 * contributors (https://github.com/tecnickcom/TCPDF/graphs/contributors).
8
 *
9
 * @see https://github.com/tecnickcom/TCPDF
10
 *
11
 * Original code was licensed on the terms of the LGPL v3.
12
 *
13
 * ------------------------------------------------------------------------------
14
 *
15
 * @file This file is part of the PdfParser library.
16
 *
17
 * @author  Konrad Abicht <[email protected]>
18
 * @date    2020-01-06
19
 *
20
 * @license LGPLv3
21
 * @url     <https://github.com/smalot/pdfparser>
22
 *
23
 *  PdfParser is a pdf library written in PHP, extraction oriented.
24
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
25
 *
26
 *  This program is free software: you can redistribute it and/or modify
27
 *  it under the terms of the GNU Lesser General Public License as published by
28
 *  the Free Software Foundation, either version 3 of the License, or
29
 *  (at your option) any later version.
30
 *
31
 *  This program is distributed in the hope that it will be useful,
32
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
33
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
34
 *  GNU Lesser General Public License for more details.
35
 *
36
 *  You should have received a copy of the GNU Lesser General Public License
37
 *  along with this program.
38
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
39
 */
40
41
namespace Smalot\PdfParser\RawData;
42
43
use Exception;
44
45
class RawDataParser
46
{
47
    /**
     * Configuration array.
     *
     * Values passed to the constructor are merged over these defaults.
     */
    protected $cfg = [
        // if `true` ignore filter decoding errors
        'ignore_filter_decoding_errors' => true,
        // if `true` ignore missing filter decoding errors
        'ignore_missing_filter_decoders' => true,
    ];

    /**
     * Helper used to decode stream filters; created in the constructor.
     */
    protected $filterHelper;

    /**
     * Cache of indirect objects already parsed by getObjectVal(), keyed by object reference.
     *
     * Declared explicitly because getObjectVal() reads and writes it; creating it on the fly
     * as a dynamic property is deprecated as of PHP 8.2.
     */
    protected $objects = [];

    /**
     * NOTE(review): not used in the part of the class reviewed here - presumably a cache of
     * decoded xref data; confirm against the remaining methods.
     */
    protected $xrefCache;
60
61
    /**
     * Set up the parser with its configuration and the filter helper.
     *
     * @param array $cfg Configuration array, default is []; string keys given here replace
     *                   the defaults of the same name
     */
    public function __construct($cfg = [])
    {
        // start from the built-in defaults and let the caller's values win
        $defaults = $this->cfg;
        $this->cfg = array_merge($defaults, $cfg);

        $this->filterHelper = new FilterHelper();
    }
71
72
    /**
     * Decode the specified stream.
     *
     * The stream dictionary is scanned as a flat list of tokens: a `Length` name followed by a
     * numeric token may shorten the stream, and a `Filter` name followed by a name (or an array
     * of names) selects the filters. Filters are applied in the order in which they are listed.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Cross-reference data, only handed on to getObjectVal()
     * @param array  $sdic    Stream's dictionary array
     * @param string $stream  Stream to decode
     *
     * @return array containing decoded stream data and remaining filters
     *
     * @throws Exception if a filter fails and the matching `ignore_*` configuration flag is off;
     *                   the original exception is attached as the previous exception
     */
    protected function decodeStream($pdfData, $xref, $sdic, $stream)
    {
        // get stream length and filters
        $slength = \strlen($stream);
        if ($slength <= 0) {
            return ['', []];
        }

        $filters = [];
        foreach ($sdic as $k => $v) {
            // only a name token that is followed by a value is of interest
            if ('/' != $v[0] || !isset($sdic[($k + 1)])) {
                continue;
            }
            $value = $sdic[($k + 1)];

            if (('Length' == $v[1]) && ('numeric' == $value[0])) {
                // get declared stream length and drop anything beyond it
                $declength = (int) ($value[1]);
                if ($declength < $slength) {
                    $stream = substr($stream, 0, $declength);
                    $slength = $declength;
                }
            } elseif ('Filter' == $v[1]) {
                // resolve indirect object
                $objval = $this->getObjectVal($pdfData, $xref, $value);
                if ('/' == $objval[0]) {
                    // single filter
                    $filters[] = $objval[1];
                } elseif ('[' == $objval[0]) {
                    // array of filters
                    foreach ($objval[1] as $flt) {
                        if ('/' == $flt[0]) {
                            $filters[] = $flt[1];
                        }
                    }
                }
            }
        }

        // decode the stream
        $remaining_filters = [];
        foreach ($filters as $filter) {
            if (!\in_array($filter, $this->filterHelper->getAvailableFilters())) {
                // add missing filter to array
                $remaining_filters[] = $filter;
                continue;
            }

            try {
                $stream = $this->filterHelper->decodeFilter($filter, $stream);
            } catch (Exception $e) {
                $emsg = $e->getMessage();
                // a message starting with '~' marks a missing decoder; check for an empty
                // message first so that no uninitialized string offset is read
                $missingDecoder = ('' !== $emsg) && ('~' == $emsg[0]);
                $ignored = $missingDecoder
                    ? $this->cfg['ignore_missing_filter_decoders']
                    : $this->cfg['ignore_filter_decoding_errors'];
                if (!$ignored) {
                    // keep the original exception in the chain for debugging
                    throw new Exception($emsg, 0, $e);
                }
            }
        }

        return [$stream, $remaining_filters];
    }
139
140
    /**
     * Read a classic cross-reference table and the trailer that follows it.
     *
     * Offsets already stored in $xref are kept; only entries that are not known yet are added.
     * The trailer keys are filled only when $xref carries no (or an empty) trailer. When the
     * trailer names a Prev offset, that section is loaded through getXrefData().
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts (position of the 'xref' keyword)
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if no trailer can be found behind the table
     */
    protected function decodeXref($pdfData, $startxref, $xref = [])
    {
        // step over the word 'xref' (4 bytes) and the white space behind it:
        // NUL, HT, LF, FF, CR and SP
        $startxref += 4;
        $offset = $startxref + strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $startxref);

        // object number the next table entry belongs to
        $obj_num = 0;
        $linePattern = '/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/';
        while (preg_match($linePattern, $pdfData, $line, PREG_OFFSET_CAPTURE, $offset) > 0) {
            if ($line[0][1] != $offset) {
                // the match is not adjacent to the previous line: we are on another section
                break;
            }
            $offset += \strlen($line[0][0]);

            switch ($line[3][0]) {
                case 'n':
                    // entry in use, indexed as [object number]_[generation number]
                    $index = $obj_num.'_'.(int) ($line[2][0]);
                    if (!isset($xref['xref'][$index])) {
                        // first offset seen for this object is the one that is kept
                        $xref['xref'][$index] = (int) ($line[1][0]);
                    }
                    ++$obj_num;
                    break;

                case 'f':
                    // free entry: nothing to store, but it still consumes an object number
                    ++$obj_num;
                    break;

                default:
                    // subsection header: its first number is the starting object number
                    $obj_num = (int) ($line[1][0]);
                    break;
            }
        }

        // get trailer data
        $hasTrailer = preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $trailer, PREG_OFFSET_CAPTURE, $offset) > 0;
        if (!$hasTrailer) {
            throw new Exception('Unable to find trailer');
        }
        $trailer_data = $trailer[1][0];

        if (empty($xref['trailer'])) {
            // get only the last updated version
            $xref['trailer'] = [];

            if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $found) > 0) {
                $xref['trailer']['size'] = (int) ($found[1]);
            }

            // indirect references are stored as [object number]_[generation number]
            $referencePatterns = [
                'root' => '/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'encrypt' => '/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
                'info' => '/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i',
            ];
            foreach ($referencePatterns as $key => $pattern) {
                if (preg_match($pattern, $trailer_data, $found) > 0) {
                    $xref['trailer'][$key] = (int) ($found[1]).'_'.(int) ($found[2]);
                }
            }

            if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $found) > 0) {
                $xref['trailer']['id'] = [$found[1], $found[2]];
            }
        }

        if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $found) > 0) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, (int) ($found[1]), $xref);
        }

        return $xref;
    }
214
215
    /**
     * Decode the Cross-Reference Stream section
     *
     * The stream dictionary supplies the field widths (W), the first object number (Index),
     * an optional offset of an older section (Prev) and, when no trailer has been collected
     * yet, the trailer keys. The decoded stream bytes are split into rows: if DecodeParms
     * declares a PNG predictor (value 10 or higher) every row carries a leading filter tag and
     * is un-predicted first, otherwise a row is simply one entry of sum(W) bytes.
     *
     * NOTE(review): only the first start number of Index is honoured, so a stream with
     * several subsections is numbered as if it had one.
     *
     * @param string $pdfData   PDF data
     * @param int    $startxref Offset at which the xref section starts
     * @param array  $xref      Previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if unknown PNG predictor detected
     */
    protected function decodeXrefStream($pdfData, $startxref, $xref = [])
    {
        // try to read Cross-Reference Stream
        $xrefobj = $this->getRawObject($pdfData, $startxref);
        $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true);

        // get only the last updated version of the trailer
        $filltrailer = empty($xref['trailer']);
        if ($filltrailer) {
            $xref['trailer'] = [];
        }
        if (!isset($xref['xref'])) {
            $xref['xref'] = [];
        }

        $valid_crs = false;
        $columns = 0;
        // declared predictor; stays null when DecodeParms does not name one
        $predictor = null;
        // byte widths of the three entry fields; zero until a W array is read
        $wb = [0, 0, 0];

        $sarr = (isset($xrefcrs[0][1]) && \is_array($xrefcrs[0][1])) ? $xrefcrs[0][1] : [];

        foreach ($sarr as $k => $v) {
            // every key handled below is a name token followed by its value
            if ('/' != $v[0] || !isset($sarr[($k + 1)])) {
                continue;
            }
            $name = $v[1];
            $next = $sarr[($k + 1)];

            if (('Type' == $name) && ('/' == $next[0]) && ('XRef' == $next[1])) {
                $valid_crs = true;
            } elseif ('Index' == $name) {
                // first object number in the subsection
                $index_first = isset($next[1][0][1]) ? (int) ($next[1][0][1]) : 0;
            } elseif (('Prev' == $name) && ('numeric' == $next[0])) {
                // get previous xref offset
                $prevxref = (int) ($next[1]);
            } elseif ('W' == $name) {
                // number of bytes (in the decoded stream) of the corresponding field
                for ($c = 0; $c < 3; ++$c) {
                    $wb[$c] = isset($next[1][$c][1]) ? (int) ($next[1][$c][1]) : 0;
                }
            } elseif (('DecodeParms' == $name) && isset($next[1])) {
                $decpar = \is_array($next[1]) ? $next[1] : [];
                foreach ($decpar as $kdc => $vdc) {
                    if (
                        '/' != $vdc[0]
                        || !isset($decpar[($kdc + 1)])
                        || 'numeric' != $decpar[($kdc + 1)][0]
                    ) {
                        continue;
                    }
                    if ('Columns' == $vdc[1]) {
                        $columns = (int) ($decpar[($kdc + 1)][1]);
                    } elseif ('Predictor' == $vdc[1]) {
                        $predictor = (int) ($decpar[($kdc + 1)][1]);
                    }
                }
            } elseif ($filltrailer) {
                if (('Size' == $name) && ('numeric' == $next[0])) {
                    $xref['trailer']['size'] = $next[1];
                } elseif (('Root' == $name) && ('objref' == $next[0])) {
                    $xref['trailer']['root'] = $next[1];
                } elseif (('Info' == $name) && ('objref' == $next[0])) {
                    $xref['trailer']['info'] = $next[1];
                } elseif (('Encrypt' == $name) && ('objref' == $next[0])) {
                    $xref['trailer']['encrypt'] = $next[1];
                } elseif ('ID' == $name) {
                    $xref['trailer']['id'] = [];
                    $xref['trailer']['id'][0] = $next[1][0][1];
                    $xref['trailer']['id'][1] = $next[1][1][1];
                }
            }
        }

        // decode data
        if ($valid_crs and isset($xrefcrs[1][3][0])) {
            // convert the stream into an array of integers; unpack() may fail and return false
            $bytes = unpack('C*', $xrefcrs[1][3][0]);
            if (false === $bytes) {
                $bytes = [];
            }

            if ((null !== $predictor) && ($predictor >= 10)) {
                // PNG prediction: each row is one filter tag byte followed by $columns data bytes
                if ($columns < 1) {
                    // Columns defaults to 1 when a predictor is declared without it
                    $columns = 1;
                }
                $rowlen = ($columns + 1);
                $ddata = [];
                // the row "above" the first row consists of zeros
                $prev_row = array_fill(0, $rowlen, 0);
                foreach (array_chunk($bytes, $rowlen) as $k => $row) {
                    // a truncated last row is padded with zeros instead of read out of bounds
                    $row = array_pad($row, $rowlen, 0);
                    $ddata[$k] = [];
                    // get PNG predictor value of this row
                    $rowPredictor = (10 + $row[0]);
                    for ($i = 1; $i <= $columns; ++$i) {
                        // index in the decoded row
                        $j = ($i - 1);
                        $row_up = $prev_row[$j];
                        if (1 == $i) {
                            $row_left = 0;
                            $row_upleft = 0;
                        } else {
                            // the PNG filters refer to the already reconstructed byte on the
                            // left, not to the still encoded one
                            $row_left = $ddata[$k][($j - 1)];
                            $row_upleft = $prev_row[($j - 1)];
                        }
                        switch ($rowPredictor) {
                            case 10:  // PNG prediction (on encoding, PNG None on all rows)
                                $ddata[$k][$j] = $row[$i];
                                break;

                            case 11:  // PNG prediction (on encoding, PNG Sub on all rows)
                                $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff);
                                break;

                            case 12:  // PNG prediction (on encoding, PNG Up on all rows)
                                $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff);
                                break;

                            case 13:  // PNG prediction (on encoding, PNG Average on all rows)
                                // integer halving; avoids a float operand for the bitwise AND
                                $ddata[$k][$j] = (($row[$i] + (($row_left + $row_up) >> 1)) & 0xff);
                                break;

                            case 14:  // PNG prediction (on encoding, PNG Paeth on all rows)
                                // initial estimate
                                $p = ($row_left + $row_up - $row_upleft);
                                // distances
                                $pa = abs($p - $row_left);
                                $pb = abs($p - $row_up);
                                $pc = abs($p - $row_upleft);
                                $pmin = min($pa, $pb, $pc);
                                // pick the nearest neighbour; ties resolve in the order left, up, upper left
                                if ($pmin == $pa) {
                                    $ddata[$k][$j] = (($row[$i] + $row_left) & 0xff);
                                } elseif ($pmin == $pb) {
                                    $ddata[$k][$j] = (($row[$i] + $row_up) & 0xff);
                                } else {
                                    $ddata[$k][$j] = (($row[$i] + $row_upleft) & 0xff);
                                }
                                break;

                            default:  // PNG prediction (on encoding, PNG optimum)
                                throw new Exception('Unknown PNG predictor');
                        }
                    }
                    $prev_row = $ddata[$k];
                } // end for each row
            } else {
                // no PNG predictor: the stream is a plain sequence of entries of sum(W) bytes
                $entrylen = array_sum($wb);
                $ddata = ($entrylen > 0) ? array_chunk($bytes, $entrylen) : [];
            }

            // complete decoding: assemble the three big-endian fields of every row
            $sdata = [];
            foreach ($ddata as $k => $row) {
                // initialize new row
                $sdata[$k] = [0, 0, 0];
                if (0 == $wb[0]) {
                    // default type field
                    $sdata[$k][0] = 1;
                }
                $i = 0; // count bytes in the row
                // for every column
                for ($c = 0; $c < 3; ++$c) {
                    // for every byte on the column
                    for ($b = 0; $b < $wb[$c]; ++$b) {
                        if (isset($row[$i])) {
                            $sdata[$k][$c] += ($row[$i] << (($wb[$c] - 1 - $b) * 8));
                        }
                        ++$i;
                    }
                }
            }

            // fill xref
            $obj_num = isset($index_first) ? $index_first : 0;
            foreach ($sdata as $row) {
                switch ($row[0]) {
                    case 0:  // (f) linked list of free objects
                        break;

                    case 1:  // (n) objects that are in use but are not compressed
                        // create unique object index: [object number]_[generation number]
                        $index = $obj_num.'_'.$row[2];
                        // check if object already exist
                        if (!isset($xref['xref'][$index])) {
                            // store object offset position
                            $xref['xref'][$index] = $row[1];
                        }
                        break;

                    case 2:  // compressed objects
                        // $row[1] = object number of the object stream in which this object is stored
                        // $row[2] = index of this object within the object stream
                        $index = $row[1].'_0_'.$row[2];
                        $xref['xref'][$index] = -1;
                        break;

                    default:  // null objects
                        break;
                }
                ++$obj_num;
            }
        } // end decoding data

        if (isset($prevxref)) {
            // get previous xref
            $xref = $this->getXrefData($pdfData, $prevxref, $xref);
        }

        return $xref;
    }
453
454
    /**
     * Get content of indirect object.
     *
     * The object header ("<number> <generation> obj") must start exactly at $offset, after any
     * leading '0' characters; otherwise the reference is treated as the null object. The
     * elements of the object are then read one by one with getRawObject() until 'endobj' is
     * reached or the parser stops advancing.
     *
     * @param string $pdfData  PDF data
     * @param array  $xref     Cross-reference data, only handed on to decodeStream()
     * @param string $obj_ref  Object number and generation number separated by underscore character
     * @param int    $offset   Object offset
     * @param bool   $decoding If true decode streams
     *
     * @return array containing object data; for an undefined object the triple
     *               ['null', 'null', $offset]
     *
     * @throws Exception if invalid object reference found
     */
    protected function getIndirectObject($pdfData, $xref, $obj_ref, $offset = 0, $decoding = true)
    {
        // anything that is not "<number>_<generation>" is rejected, including non-scalar input
        $obj = \is_scalar($obj_ref) ? explode('_', (string) $obj_ref) : [];
        if (2 != \count($obj)) {
            throw new Exception('Invalid object reference for $obj.');
        }
        $objref = $obj[0].' '.$obj[1].' obj';

        // ignore leading zeros
        $offset += strspn($pdfData, '0', $offset);

        // strpos() returns false when the header is missing altogether; test for that first,
        // because a loose comparison would treat false as equal to offset 0
        $found = strpos($pdfData, $objref, $offset);
        if ((false === $found) || ($found != $offset)) {
            // an indirect reference to an undefined object shall be considered a reference to the null object
            return ['null', 'null', $offset];
        }

        // starting position of object content
        $offset += \strlen($objref);

        // get array of object content
        $objdata = [];
        $i = 0; // object main index
        do {
            $oldoffset = $offset;
            // get element
            $element = $this->getRawObject($pdfData, $offset);
            $offset = $element[2];
            // decode stream using stream's dictionary information, i.e. the dictionary
            // element read immediately before the stream
            if ($decoding and ('stream' == $element[0]) and (isset($objdata[($i - 1)][0])) and ('<<' == $objdata[($i - 1)][0])) {
                $element[3] = $this->decodeStream($pdfData, $xref, $objdata[($i - 1)][1], $element[1]);
            }
            $objdata[$i] = $element;
            ++$i;
            // the offset check ends the loop when the parser no longer makes progress
        } while (('endobj' != $element[0]) and ($offset != $oldoffset));

        // remove closing delimiter
        array_pop($objdata);

        // return raw object content
        return $objdata;
    }
504
505
    /**
     * Get the content of an object, resolving an indirect object reference if necessary.
     *
     * Anything that is not an 'objref' element is handed back unchanged. A reference is served
     * from the list of already parsed objects when possible; otherwise it is parsed (without
     * stream decoding), remembered and returned. A reference that is not present in $xref is
     * handed back unchanged as well.
     *
     * NOTE(review): the offset is looked up as $xref[<reference>], whereas decodeXref() and
     * decodeXrefStream() store offsets under $xref['xref'] - confirm which shape callers pass.
     *
     * @param string $pdfData PDF data
     * @param array  $xref    Map consulted for the offset of the referenced object
     * @param array  $obj     Object element: type at index 0, value at index 1
     *
     * @return array containing object data
     */
    protected function getObjectVal($pdfData, $xref, $obj)
    {
        if ('objref' != $obj[0]) {
            // plain value, nothing to resolve
            return $obj;
        }

        $reference = $obj[1];

        if (isset($this->objects[$reference])) {
            // this object has been already parsed
            return $this->objects[$reference];
        }

        if (!isset($xref[$reference])) {
            // unknown reference: leave it to the caller
            return $obj;
        }

        // parse new object and keep it for later lookups
        $this->objects[$reference] = $this->getIndirectObject($pdfData, $xref, $reference, $xref[$reference], false);

        return $this->objects[$reference];
    }
530
531
    /**
532
     * Get object type, raw value and offset to next object
533
     *
534
     * @param int $offset Object offset
535
     *
536
     * @return array containing object type, raw value and offset to next object
537
     */
538
    protected function getRawObject($pdfData, $offset = 0)
539
    {
540
        $objtype = ''; // object type to be returned
541
        $objval = ''; // object value to be returned
542
543
        /*
544
         * skip initial white space chars:
545
         *      \x00 null (NUL)
546
         *      \x09 horizontal tab (HT)
547
         *      \x0A line feed (LF)
548
         *      \x0C form feed (FF)
549
         *      \x0D carriage return (CR)
550
         *      \x20 space (SP)
551
         */
552
        $offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
553
554
        // get first char
555
        $char = $pdfData[$offset];
556
        // get object type
557
        switch ($char) {
558
            case '%':  // \x25 PERCENT SIGN
559
                    // skip comment and search for next token
560
                    $next = strcspn($pdfData, "\r\n", $offset);
561
                    if ($next > 0) {
562
                        $offset += $next;
563
564
                        return $this->getRawObject($pdfData, $offset);
565
                    }
566
                    break;
567
568
            case '/':  // \x2F SOLIDUS
569
                    // name object
570
                    $objtype = $char;
571
                    ++$offset;
572
                    $pregResult = preg_match(
573
                        '/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
574
                        substr($pdfData, $offset, 256),
575
                        $matches
576
                    );
577
                    if (1 == $pregResult) {
578
                        $objval = $matches[1]; // unescaped value
579
                        $offset += \strlen($objval);
580
                    }
581
                    break;
582
583
            case '(':   // \x28 LEFT PARENTHESIS
584
            case ')':  // \x29 RIGHT PARENTHESIS
585
                    // literal string object
586
                    $objtype = $char;
587
                    ++$offset;
588
                    $strpos = $offset;
589
                    if ('(' == $char) {
590
                        $open_bracket = 1;
591
                        while ($open_bracket > 0) {
592
                            if (!isset($pdfData[$strpos])) {
593
                                break;
594
                            }
595
                            $ch = $pdfData[$strpos];
596
                            switch ($ch) {
597
                                case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
598
                                        // skip next character
599
                                        ++$strpos;
600
                                        break;
601
602
                                case '(':  // LEFT PARENHESIS (28h)
603
                                        ++$open_bracket;
604
                                        break;
605
606
                                case ')':  // RIGHT PARENTHESIS (29h)
607
                                        --$open_bracket;
608
                                        break;
609
                            }
610
                            ++$strpos;
611
                        }
612
                        $objval = substr($pdfData, $offset, ($strpos - $offset - 1));
613
                        $offset = $strpos;
614
                    }
615
                    break;
616
617
            case '[':   // \x5B LEFT SQUARE BRACKET
618
            case ']':  // \x5D RIGHT SQUARE BRACKET
619
                    // array object
620
                    $objtype = $char;
621
                    ++$offset;
622
                    if ('[' == $char) {
623
                        // get array content
624
                        $objval = [];
625
                        do {
626
                            // get element
627
                            $element = $this->getRawObject($pdfData, $offset);
628
                            $offset = $element[2];
629
                            $objval[] = $element;
630
                        } while (']' != $element[0]);
631
                        // remove closing delimiter
632
                        array_pop($objval);
633
                    }
634
                    break;
635
636
            case '<':  // \x3C LESS-THAN SIGN
637
            case '>':  // \x3E GREATER-THAN SIGN
638
                    if (isset($pdfData[($offset + 1)]) and ($pdfData[($offset + 1)] == $char)) {
639
                        // dictionary object
640
                        $objtype = $char.$char;
641
                        $offset += 2;
642
                        if ('<' == $char) {
643
                            // get array content
644
                            $objval = [];
645
                            do {
646
                                // get element
647
                                $element = $this->getRawObject($pdfData, $offset);
648
                                $offset = $element[2];
649
                                $objval[] = $element;
650
                            } while ('>>' != $element[0]);
651
                            // remove closing delimiter
652
                            array_pop($objval);
653
                        }
654
                    } else {
655
                        // hexadecimal string object
656
                        $objtype = $char;
657
                        ++$offset;
658
                        $pregResult = preg_match(
659
                            '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
660
                            substr($pdfData, $offset),
661
                            $matches
662
                        );
663
                        if (('<' == $char) && 1 == $pregResult) {
664
                            // remove white space characters
665
                            $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
666
                            $offset += \strlen($matches[0]);
667
                        } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
668
                            $offset = $endpos + 1;
669
                        }
670
                    }
671
                    break;
672
673
            default:
674
                    if ('endobj' == substr($pdfData, $offset, 6)) {
675
                        // indirect object
676
                        $objtype = 'endobj';
677
                        $offset += 6;
678
                    } elseif ('null' == substr($pdfData, $offset, 4)) {
679
                        // null object
680
                        $objtype = 'null';
681
                        $offset += 4;
682
                        $objval = 'null';
683
                    } elseif ('true' == substr($pdfData, $offset, 4)) {
684
                        // boolean true object
685
                        $objtype = 'boolean';
686
                        $offset += 4;
687
                        $objval = 'true';
688
                    } elseif ('false' == substr($pdfData, $offset, 5)) {
689
                        // boolean false object
690
                        $objtype = 'boolean';
691
                        $offset += 5;
692
                        $objval = 'false';
693
                    } elseif ('stream' == substr($pdfData, $offset, 6)) {
694
                        // start stream object
695
                        $objtype = 'stream';
696
                        $offset += 6;
697
                        if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
698
                            $offset += \strlen($matches[0]);
699
                            $pregResult = preg_match(
700
                                '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
701
                                substr($pdfData, $offset),
702
                                $matches,
703
                                PREG_OFFSET_CAPTURE
704
                            );
705
                            if (1 == $pregResult) {
706
                                $objval = substr($pdfData, $offset, $matches[0][1]);
707
                                $offset += $matches[1][1];
708
                            }
709
                        }
710
                    } elseif ('endstream' == substr($pdfData, $offset, 9)) {
711
                        // end stream object
712
                        $objtype = 'endstream';
713
                        $offset += 9;
714
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
715
                        // indirect object reference
716
                        $objtype = 'objref';
717
                        $offset += \strlen($matches[0]);
718
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
719
                    } elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
720
                        // object start
721
                        $objtype = 'obj';
722
                        $objval = (int) ($matches[1]).'_'.(int) ($matches[2]);
723
                        $offset += \strlen($matches[0]);
724
                    } elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
725
                        // numeric object
726
                        $objtype = 'numeric';
727
                        $objval = substr($pdfData, $offset, $numlen);
728
                        $offset += $numlen;
729
                    }
730
                    break;
731
        }
732
733
        return [$objtype, $objval, $offset];
734
    }
735
736
    /**
     * Get Cross-Reference (xref) table and trailer data from PDF document data.
     *
     * The position of the cross-reference section is resolved in this order:
     *  - $offset is 0: the value of the LAST "startxref ... %%EOF" block in the data is used;
     *  - $offset points directly at the "xref" keyword: $offset itself is used;
     *  - an "N N obj" header is found at or after $offset: $offset itself is used
     *    (handled as a cross-reference stream object);
     *  - otherwise: the value of the first "startxref ... %%EOF" block at or after $offset.
     *
     * @param string $pdfData
     * @param int    $offset  xref offset (if known)
     * @param array  $xref    previous xref array (if any)
     *
     * @return array containing xref and trailer data
     *
     * @throws Exception if it was unable to find startxref
     * @throws Exception if it was unable to find xref
     */
    protected function getXrefData($pdfData, $offset = 0, $xref = [])
    {
        $startxrefPattern = '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i';
        $dataLength = \strlen($pdfData);

        // Offsets may arrive as numeric strings (e.g. taken from a regex capture);
        // normalise so the strict comparisons below are reliable.
        $offset = (int) $offset;

        if ($offset > $dataLength) {
            // Searching past the end of the data can never succeed and would make
            // strpos() raise a ValueError on PHP 8 instead of the documented Exception.
            throw new Exception('Unable to find startxref');
        }

        if (0 === $offset) {
            // find last startxref
            $found = preg_match_all(
                $startxrefPattern,
                $pdfData,
                $startxrefMatches,
                PREG_SET_ORDER,
                $offset
            );
            if (0 == $found) {
                throw new Exception('Unable to find startxref');
            }
            $lastMatch = array_pop($startxrefMatches);
            $startxref = (int) $lastMatch[1];
        } elseif (strpos($pdfData, 'xref', $offset) === $offset) {
            // Already pointing at the xref table
            $startxref = $offset;
        } elseif (1 === preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $objMatches, PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (1 === preg_match($startxrefPattern, $pdfData, $nextMatch, PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found; a dedicated match array is used here because
            // preg_match() always resets the array it is given, even on failure.
            $startxref = (int) $nextMatch[1][0];
        } else {
            throw new Exception('Unable to find startxref');
        }

        if ($startxref > $dataLength) {
            // The startxref value points outside of the document (corrupted PDF).
            throw new Exception('Unable to find xref');
        }

        // check xref position; strict comparison so that a "not found" (false)
        // is never mistaken for position 0
        if (strpos($pdfData, 'xref', $startxref) === $startxref) {
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
            // Cross-Reference Stream
            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
        }

        if (empty($xref)) {
            throw new Exception('Unable to find xref');
        }

        return $xref;
    }
798
799
    /**
     * Parses PDF data and returns extracted data as array.
     *
     * Anything preceding the first "%PDF-" marker is discarded before parsing.
     *
     * @param string $data PDF data to parse
     *
     * @return array two-element list: the xref and trailer data at index 0 and the
     *               parsed document objects at index 1 (keyed like the entries of
     *               the 'xref' element of the xref data)
     *
     * @throws Exception if empty PDF data given
     * @throws Exception if PDF data missing %PDF header
     */
    public function parseData($data)
    {
        if (empty($data)) {
            throw new Exception('Empty PDF data given.');
        }

        // find the pdf header starting position
        $headerPos = strpos($data, '%PDF-');
        if (false === $headerPos) {
            throw new Exception('Invalid PDF data: missing %PDF header.');
        }

        // get PDF content string
        $pdfData = substr($data, $headerPos);

        // get xref and trailer data
        $xref = $this->getXrefData($pdfData);

        $objects = [];

        // getXrefData() only guarantees a non-empty result, not that an object
        // table is present; without one there is nothing to decode.
        if (!isset($xref['xref']) || !\is_array($xref['xref'])) {
            return [$xref, $objects];
        }

        // parse all document objects
        foreach ($xref['xref'] as $obj => $offset) {
            // decode objects with positive offset
            if ($offset > 0 && !isset($objects[$obj])) {
                $objects[$obj] = $this->getIndirectObject($pdfData, $xref, $obj, $offset, true);
            }
        }

        return [$xref, $objects];
    }
836
}
837