Test Failed
Pull Request — master (#368)
by
unknown
07:25
created

PDFObject::get()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
ccs 2
cts 2
cp 1
cc 1
nc 1
nop 1
crap 1
1
<?php
2
3
/**
 * @file
 *          This file is part of the PdfParser library.
 *
 * @author  Sébastien MALOT <[email protected]>
 * @date    2017-01-03
 *
 * @license LGPLv3
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\XObject\Form;
34
use Smalot\PdfParser\XObject\Image;
35
36
/**
 * Generic PDF object made of a header dictionary and an optional content stream.
 *
 * Besides exposing the header entries, the class tokenises a content stream into
 * text commands and converts those commands into plain text.
 */
class PDFObject
{
    /**
     * Key of a parsed command holding the token type: the delimiter character
     * that introduced the operand, 'n' for a bare number, or '' otherwise.
     */
    const TYPE = 't';

    /**
     * Key of a parsed command holding the operator name (for instance "Tj").
     */
    const OPERATOR = 'o';

    /**
     * Key of a parsed command holding the operand(s) of the operator.
     */
    const COMMAND = 'c';

    /**
     * The recursion stack.
     *
     * Unique ids of the objects whose text is currently being extracted; it is
     * consulted before descending into an XObject to break circular references.
     *
     * @var array
     */
    public static $recursionStack = [];

    /**
     * @var Document
     */
    protected $document = null;

    /**
     * @var Header
     */
    protected $header = null;

    /**
     * @var string
     */
    protected $content = null;

    /**
     * @param Document $document document the object belongs to
     * @param Header   $header   header dictionary; an empty Header is created when omitted
     * @param string   $content  raw content stream, if any
     */
    public function __construct(Document $document, Header $header = null, $content = null)
    {
        $this->document = $document;
        $this->header = null !== $header ? $header : new Header();
        $this->content = $content;
    }

    /**
     * Initialisation hook; intentionally empty in this base class.
     */
    public function init()
    {
    }

    /**
     * @return Header|null
     */
    public function getHeader()
    {
        return $this->header;
    }

    /**
     * Returns the header entry stored under the given name.
     *
     * @param string $name
     *
     * @return Element|PDFObject
     */
    public function get($name)
    {
        return $this->header->get($name);
    }

    /**
     * Tells whether the header holds an entry with the given name.
     *
     * @param string $name
     *
     * @return bool
     */
    public function has($name)
    {
        return $this->header->has($name);
    }

    /**
     * Returns the header details.
     *
     * @param bool $deep forwarded as-is to Header::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        return $this->header->getDetails($deep);
    }

    /**
     * @return string|null
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Masks every part of a content stream that could be mistaken for an operator.
     *
     * Escaped backslashes/parentheses, inline images, the content of square and
     * round brackets, everything nested inside "<" ... ">" and the BDC/EMC markup
     * are overwritten with the filler character. Every replacement has the same
     * length as the text it replaces, so byte offsets found in the returned string
     * are valid in the original string as well.
     *
     * @param string $content content stream to mask
     * @param string $char    filler; only its first character is used ('X' when empty)
     *
     * @return string the masked content, same length as $content
     */
    public function cleanContent($content, $char = 'X')
    {
        // Reading $char[0] on an empty string raises a notice/warning: fall back to the default.
        $char = (string) $char;
        $char = '' === $char ? 'X' : $char[0];

        // Escaped backslashes and parentheses (two characters each).
        $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

        // Remove image bloc with binary content
        $content = $this->maskMatches('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', 0, $content, $char);

        // Clean content in square brackets [.....]
        $content = $this->maskMatches('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', 1, $content, $char);

        // Clean content in round brackets (.....)
        $content = $this->maskMatches('/\((.*?)\)/s', 1, $content, $char);

        // Clean structure: everything from an opening "<" up to its matching ">" is masked.
        if ($parts = preg_split('/(<|>)/s', $content, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE)) {
            $content = '';
            $level = 0;
            foreach ($parts as $part) {
                if ('<' == $part) {
                    ++$level;
                }

                $content .= (0 == $level ? $part : str_repeat($char, \strlen($part)));

                if ('>' == $part) {
                    --$level;
                }
            }
        }

        // Clean BDC and EMC markup. The delimiter is passed to preg_quote() so that a
        // "/" filler cannot terminate the pattern early.
        $content = $this->maskMatches(
            '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
            1,
            $content,
            $char
        );
        $content = $this->maskMatches('/\s(EMC)\s/s', 1, $content, $char);

        return $content;
    }

    /**
     * Overwrites, with the filler character, every occurrence of a capture group.
     *
     * @param string $pattern regular expression to search for
     * @param int    $group   index of the capture group to overwrite (0 = whole match)
     * @param string $content text to search in
     * @param string $char    single filler character
     *
     * @return string text of unchanged length with the matched spans masked
     */
    private function maskMatches($pattern, $group, $content, $char)
    {
        if (!preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE)) {
            return $content;
        }

        foreach ($matches[$group] as $part) {
            $length = \strlen($part[0]);
            $content = substr_replace($content, str_repeat($char, $length), $part[1], $length);
        }

        return $content;
    }

    /**
     * Extracts the sections of a content stream that are relevant for text:
     * the body of every BT ... ET block followed by every "/Name Do" command.
     *
     * Sections are located in a masked copy of the stream (see cleanContent())
     * and then cut out of the original stream using the same offsets.
     *
     * @param string $content
     *
     * @return array list of section strings
     */
    public function getSectionsText($content)
    {
        $sections = [];
        $content = ' '.$content.' ';
        $textCleaned = $this->cleanContent($content, '_');

        // Extract text blocks.
        if (preg_match_all('/\s+BT[\s|\(|\[]+(.*?)\s*ET/s', $textCleaned, $matches, PREG_OFFSET_CAPTURE)) {
            foreach ($matches[1] as $part) {
                $text = $part[0];
                if ('' === $text) {
                    continue;
                }
                $offset = $part[1];
                $section = substr($content, $offset, \strlen($text));

                // Removes BDC and EMC markup.
                $section = preg_replace('/(\/[A-Za-z0-9]+\s*<<.*?)(>>\s*BDC)(.*?)(EMC\s+)/s', '${3}', $section.' ');

                $sections[] = $section;
            }
        }

        // Extract 'do' commands.
        if (preg_match_all('/(\/[A-Za-z0-9\.\-_]+\s+Do)\s/s', $textCleaned, $matches, PREG_OFFSET_CAPTURE)) {
            foreach ($matches[1] as $part) {
                $text = $part[0];
                $offset = $part[1];
                $section = substr($content, $offset, \strlen($text));

                $sections[] = $section;
            }
        }

        return $sections;
    }

    /**
     * Returns the font used until a Tf operator selects another one: the first
     * font of the page, else the first font of the document, else an empty Font.
     *
     * @param Page $page
     *
     * @return Font
     */
    private function getDefaultFont(Page $page = null)
    {
        $fonts = [];
        if (null !== $page) {
            $fonts = $page->getFonts();
        }

        $fonts = array_merge($fonts, array_values($this->document->getFonts()));

        if (\count($fonts) > 0) {
            return reset($fonts);
        }

        return new Font($this->document);
    }

    /**
     * Splits the operand string of a command into its whitespace separated arguments.
     *
     * Runs of whitespace count as a single separator and a failing preg_split()
     * yields an empty list, so the result can always be handed to array_pop().
     *
     * @param mixed $command operand of a parsed command
     *
     * @return array list of non-empty argument strings
     */
    private function splitArguments($command)
    {
        if (!\is_string($command)) {
            return [];
        }

        $args = preg_split('/\s+/', $command, -1, PREG_SPLIT_NO_EMPTY);

        return false === $args ? [] : $args;
    }

    /**
     * Extracts the text of the content stream as a single string.
     *
     * @param Page $page page used to resolve fonts and XObjects
     *
     * @return string
     *
     * @throws \Exception
     */
    public function getText(Page $page = null)
    {
        $text = '';
        $sections = $this->getSectionsText($this->content);
        $current_font = $this->getDefaultFont($page);

        $current_position_tm = ['x' => false, 'y' => false];

        array_push(self::$recursionStack, $this->getUniqueId());

        // The stack is static: it has to be unwound even when decoding throws,
        // otherwise later calls would wrongly see this object as "in progress".
        try {
            foreach ($sections as $section) {
                $commands = $this->getCommandsText($section);

                foreach ($commands as $command) {
                    switch ($command[self::OPERATOR]) {
                        // move text current point
                        case 'Td':
                        case 'TD':
                            $args = $this->splitArguments($command[self::COMMAND]);
                            $y = array_pop($args);
                            $x = array_pop($args);
                            if (0.0 !== (float) $y) {
                                // vertical offset
                                $text .= "\n";
                            } elseif ((float) $x > 0) {
                                // horizontal offset
                                $text .= ' ';
                            }
                            break;

                        // select font
                        case 'Tf':
                            if (null !== $page) {
                                $args = $this->splitArguments($command[self::COMMAND]);
                                $id = isset($args[0]) ? trim($args[0], '/') : '';
                                $new_font = $page->getFont($id);
                                // If an invalid font ID is given, do not update the font.
                                // This should theoretically never happen, as the PDF spec states for the Tf operator:
                                // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
                                // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
                                // But we want to make sure that malformed PDFs do not simply crash.
                                if (null !== $new_font) {
                                    $current_font = $new_font;
                                }
                            }
                            break;

                        // move to next line and show text
                        case "'":
                            $text .= "\n";
                            // no break
                        case 'Tj':
                            $command[self::COMMAND] = [$command];
                            // no break
                        case 'TJ':
                            $sub_text = $current_font->decodeText($command[self::COMMAND]);
                            $text .= $sub_text;
                            break;

                        // set leading
                        case 'TL':
                            $text .= ' ';
                            break;

                        // set text matrix: a jump of more than 10 units starts a new column/line
                        case 'Tm':
                            $args = $this->splitArguments($command[self::COMMAND]);
                            $y = array_pop($args);
                            $x = array_pop($args);
                            if (false !== $current_position_tm['x']) {
                                $delta = abs((float) $x - (float) ($current_position_tm['x']));
                                if ($delta > 10) {
                                    $text .= "\t";
                                }
                            }
                            if (false !== $current_position_tm['y']) {
                                $delta = abs((float) $y - (float) ($current_position_tm['y']));
                                if ($delta > 10) {
                                    $text .= "\n";
                                }
                            }
                            $current_position_tm = ['x' => $x, 'y' => $y];
                            break;

                        // set horizontal scaling
                        case 'Tz':
                            $text .= "\n";
                            break;

                        // move to start of next line
                        case 'T*':
                            $text .= "\n";
                            break;

                        // paint an external object
                        case 'Do':
                            if (null !== $page) {
                                $args = $this->splitArguments($command[self::COMMAND]);
                                $id = trim((string) array_pop($args), '/ ');
                                $xobject = $page->getXObject($id);

                                // @todo $xobject could be a ElementXRef object, which is skipped here
                                if ($xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
                                    // Not a circular reference.
                                    $text .= $xobject->getText($page);
                                }
                            }
                            break;

                        // Tc, Ts, Tw, Da, rg, RG, re, co, cs, gs, en, sc, SC, g, G, V, vo, Vo
                        // and any unknown operator do not contribute to the text.
                        default:
                            break;
                    }
                }
            }
        } finally {
            array_pop(self::$recursionStack);
        }

        return $text.' ';
    }

    /**
     * Extracts the text of the content stream as a list of decoded fragments,
     * one per text-showing operator or painted XObject.
     *
     * @param Page $page page used to resolve fonts and XObjects
     *
     * @return array
     *
     * @throws \Exception
     */
    public function getTextArray(Page $page = null)
    {
        $text = [];
        $sections = $this->getSectionsText($this->content);
        $current_font = $this->getDefaultFont($page);

        array_push(self::$recursionStack, $this->getUniqueId());

        try {
            foreach ($sections as $section) {
                $commands = $this->getCommandsText($section);

                foreach ($commands as $command) {
                    switch ($command[self::OPERATOR]) {
                        // select font
                        case 'Tf':
                            if (null !== $page) {
                                $args = $this->splitArguments($command[self::COMMAND]);
                                $id = isset($args[0]) ? trim($args[0], '/') : '';
                                $new_font = $page->getFont($id);
                                // Keep the current font when the id is unknown so that
                                // malformed PDFs do not crash on the next decodeText().
                                if (null !== $new_font) {
                                    $current_font = $new_font;
                                }
                            }
                            break;

                        case "'":
                        case 'Tj':
                            $command[self::COMMAND] = [$command];
                            // no break
                        case 'TJ':
                            $sub_text = $current_font->decodeText($command[self::COMMAND]);
                            $text[] = $sub_text;
                            break;

                        // paint an external object
                        case 'Do':
                            if (null !== $page) {
                                $args = $this->splitArguments($command[self::COMMAND]);
                                $id = trim((string) array_pop($args), '/ ');
                                $xobject = $page->getXObject($id);

                                if ($xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
                                    // Not a circular reference.
                                    $text[] = $xobject->getText($page);
                                }
                            }
                            break;

                        // Tc, Td, TD, TL, Tm, Ts, Tw, Tz, T*, Da, rg, RG, re, co, cs, gs, en,
                        // sc, SC, g, G, V, vo, Vo and any unknown operator are ignored.
                        default:
                            break;
                    }
                }
            }
        } finally {
            array_pop(self::$recursionStack);
        }

        return $text;
    }

    /**
     * Tokenises a section of a content stream into a list of commands.
     *
     * Each command is an array with the keys self::TYPE, self::OPERATOR and
     * self::COMMAND. For an array operand ("[ ... ]") the COMMAND entry is itself
     * a list of commands. Parsing stops at the first token that cannot be
     * interpreted (including "]" , ">" , ")" and "ET").
     *
     * @param string $text_part section to parse
     * @param int    $offset    position to start at; updated to the position where parsing stopped
     *
     * @return array
     */
    public function getCommandsText($text_part, &$offset = 0)
    {
        $commands = $matches = [];
        $length = \strlen($text_part);

        while ($offset < $length) {
            $offset += strspn($text_part, "\x00\x09\x0a\x0c\x0d\x20", $offset);
            if ($offset >= $length) {
                // Only trailing whitespace was left: nothing more to read.
                break;
            }
            $char = $text_part[$offset];

            $operator = '';
            $type = '';
            $command = false;

            switch ($char) {
                case '/':
                    $type = $char;
                    if (preg_match(
                        '/^\/([A-Z0-9\._,\+]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si',
                        substr($text_part, $offset),
                        $matches
                    )
                    ) {
                        // name followed by a number, e.g. "/F1 12 Tf"
                        $operator = $matches[2];
                        $command = $matches[1];
                        $offset += \strlen($matches[0]);
                    } elseif (preg_match(
                        '/^\/([A-Z0-9\._,\+]+)\s+([A-Z]+)\s*/si',
                        substr($text_part, $offset),
                        $matches
                    )
                    ) {
                        // name only, e.g. "/Im1 Do"
                        $operator = $matches[2];
                        $command = $matches[1];
                        $offset += \strlen($matches[0]);
                    }
                    break;

                case '[':
                case ']':
                    // array object
                    $type = $char;
                    ++$offset;
                    if ('[' == $char) {
                        // get elements; the nested call stops right after the closing "]"
                        $command = $this->getCommandsText($text_part, $offset);

                        if (preg_match('/^\s*[A-Z]{1,2}\s*/si', substr($text_part, $offset), $matches)) {
                            $operator = trim($matches[0]);
                            $offset += \strlen($matches[0]);
                        }
                    }
                    // For "]" $command stays false, which ends this (nested) parsing run.
                    break;

                case '<':
                case '>':
                    // hexadecimal string
                    $type = $char;
                    ++$offset;
                    if ('<' == $char) {
                        $strpos = strpos($text_part, '>', $offset);
                        if (false === $strpos) {
                            // Unterminated string: consume the remainder instead of
                            // rewinding the offset, which would loop forever.
                            $command = (string) substr($text_part, $offset);
                            $offset = $length;
                        } else {
                            $command = substr($text_part, $offset, ($strpos - $offset));
                            $offset = $strpos + 1;
                        }
                    }

                    if (preg_match('/^\s*[A-Z]{1,2}\s*/si', substr($text_part, $offset), $matches)) {
                        $operator = trim($matches[0]);
                        $offset += \strlen($matches[0]);
                    }
                    break;

                case '(':
                case ')':
                    // literal string
                    ++$offset;
                    $type = $char;
                    $strpos = $offset;
                    if ('(' == $char) {
                        $open_bracket = 1;
                        while ($open_bracket > 0) {
                            if (!isset($text_part[$strpos])) {
                                break;
                            }
                            $ch = $text_part[$strpos];
                            switch ($ch) {
                                case '\\':
                                    // REVERSE SOLIDUS (5Ch) (Backslash)
                                    // skip next character
                                    ++$strpos;
                                    break;

                                case '(':
                                    // LEFT PARENTHESIS (28h)
                                    ++$open_bracket;
                                    break;

                                case ')':
                                    // RIGHT PARENTHESIS (29h)
                                    --$open_bracket;
                                    break;
                            }
                            ++$strpos;
                        }
                        $command = substr($text_part, $offset, ($strpos - $offset - 1));
                        $offset = $strpos;

                        if (preg_match('/^\s*([A-Z\']{1,2})\s*/si', substr($text_part, $offset), $matches)) {
                            $operator = $matches[1];
                            $offset += \strlen($matches[0]);
                        }
                    }
                    break;

                default:
                    if ('ET' == substr($text_part, $offset, 2)) {
                        // end of the text block: $command stays false and parsing stops
                        break;
                    } elseif (preg_match(
                        '/^\s*(?P<data>([0-9\.\-]+\s*?)+)\s+(?P<id>[A-Z]{1,3})\s*/si',
                        substr($text_part, $offset),
                        $matches
                    )
                    ) {
                        // numbers followed by an operator, e.g. "1 0 0 1 50 700 Tm"
                        $operator = trim($matches['id']);
                        $command = trim($matches['data']);
                        $offset += \strlen($matches[0]);
                    } elseif (preg_match('/^\s*([0-9\.\-]+\s*?)+\s*/si', substr($text_part, $offset), $matches)) {
                        // numbers without operator (array element)
                        $type = 'n';
                        $command = trim($matches[0]);
                        $offset += \strlen($matches[0]);
                    } elseif (preg_match('/^\s*([A-Z\*]+)\s*/si', substr($text_part, $offset), $matches)) {
                        // operator without operand, e.g. "T*"
                        $type = '';
                        $operator = $matches[1];
                        $command = '';
                        $offset += \strlen($matches[0]);
                    }
            }

            if (false === $command) {
                break;
            }

            $commands[] = [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $command,
            ];
        }

        return $commands;
    }

    /**
     * Builds the object class matching the "Type" (and "Subtype") header entries.
     *
     * @param Document $document document the object belongs to
     * @param Header   $header   header dictionary of the object
     * @param string   $content  raw content stream
     *
     * @return PDFObject
     */
    public static function factory(Document $document, Header $header, $content)
    {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        return new Image($document, $header, $content);

                    case 'Form':
                        return new Form($document, $header, $content);
                }

                return new self($document, $header, $content);

            case 'Pages':
                return new Pages($document, $header, $content);

            case 'Page':
                return new Page($document, $header, $content);

            case 'Encoding':
                return new Encoding($document, $header, $content);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                // Use the dedicated font class when one exists for this subtype.
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content);
                }

                return new Font($document, $header, $content);

            default:
                return new self($document, $header, $content);
        }
    }

    /**
     * Returns unique id identifying the object.
     *
     * @return string
     */
    protected function getUniqueId()
    {
        return spl_object_hash($this);
    }
}
760