Test Failed
Push — pr/257 ( 57d61f )
by Konrad
05:10 queued 13s
created

PDFObject::factory()   B

Complexity

Conditions 9
Paths 9

Size

Total Lines 35
Code Lines 22

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 9
eloc 22
c 0
b 0
f 0
nc 9
nop 3
dl 0
loc 35
rs 8.0555
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\XObject\Form;
34
use Smalot\PdfParser\XObject\Image;
35
36
/**
37
 * Class PDFObject
38
 */
39
class PDFObject
40
{
41
    const TYPE = 't';
42
43
    const OPERATOR = 'o';
44
45
    const COMMAND = 'c';
46
47
    /**
48
     * The recursion stack.
49
     *
50
     * @var array
51
     */
52
    public static $recursionStack = [];
53
54
    /**
55
     * @var Document
56
     */
57
    protected $document = null;
58
59
    /**
60
     * @var Header
61
     */
62
    protected $header = null;
63
64
    /**
65
     * @var string
66
     */
67
    protected $content = null;
68
69
    /**
     * Binds the object to its document and stores its header and raw content.
     *
     * When no header is given, a Header constructed without arguments is used,
     * so the header-delegating accessors always have an instance to call.
     *
     * @param Document    $document document this object belongs to
     * @param Header|null $header   header of the object, or null for a default one
     * @param string|null $content  raw content of the object, if any
     */
    public function __construct(Document $document, Header $header = null, $content = null)
    {
        if (null === $header) {
            $header = new Header();
        }

        $this->document = $document;
        $this->header = $header;
        $this->content = $content;
    }
79
80
    /**
     * Does nothing in this class.
     *
     * NOTE(review): presumably an extension point overridden by subclasses
     * and invoked by the parser after construction — confirm against callers.
     */
    public function init()
    {
        // Intentionally empty.
    }
83
84
    /**
     * Gives access to the header stored on this object.
     *
     * @return Header|null
     */
    public function getHeader()
    {
        $header = $this->header;

        return $header;
    }
91
92
    /**
     * Looks up a header entry by name; the lookup is delegated to the header.
     *
     * @param string $name name of the header entry
     *
     * @return Element|PDFObject
     */
    public function get($name)
    {
        $header = $this->header;

        return $header->get($name);
    }
101
102
    /**
     * Tells whether the header holds an entry with the given name; the check
     * is delegated to the header.
     *
     * @param string $name name of the header entry
     *
     * @return bool
     */
    public function has($name)
    {
        $header = $this->header;

        return $header->has($name);
    }
111
112
    /**
     * Returns the details reported by the header.
     *
     * @param bool $deep forwarded unchanged to Header::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $header = $this->header;

        return $header->getDetails($deep);
    }
121
122
    /**
     * Returns the raw content given to the constructor.
     *
     * @return string|null
     */
    public function getContent()
    {
        $content = $this->content;

        return $content;
    }
129
130
    /**
     * Masks the parts of a content stream that could confuse the regular
     * expressions applied afterwards: escaped characters, inline images,
     * array and string operands, "<...>" structures and BDC/EMC markup.
     *
     * Every masked byte is overwritten by $char, one for one, so the result
     * has the same length as the input and byte offsets found in the cleaned
     * string can be used on the original one.
     *
     * @param string $content content stream to mask
     * @param string $char    mask character; only its first byte is used and
     *                        an empty value falls back to 'X'
     *
     * @return string the masked content
     */
    public function cleanContent($content, $char = 'X')
    {
        // An empty mask would shorten the content and invalidate every offset.
        $char = (string) $char;
        $char = '' === $char ? 'X' : $char[0];

        // Escaped backslashes and parentheses are two bytes long; hide them
        // first so they cannot be taken for string delimiters below.
        $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

        // Remove image bloc with binary content
        $content = $this->maskMatches($content, '/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', 0, $char);

        // Clean content in square brackets [.....]
        $content = $this->maskMatches($content, '/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', 1, $char);

        // Clean content in round brackets (.....)
        $content = $this->maskMatches($content, '/\((.*?)\)/s', 1, $char);

        // Clean structure: everything from an opening "<" up to its matching
        // ">" (nesting included) is masked, delimiters included.
        if ($parts = preg_split('/(<|>)/s', $content, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE)) {
            $content = '';
            $level = 0;
            foreach ($parts as $part) {
                if ('<' === $part) {
                    ++$level;
                }

                $content .= (0 === $level ? $part : str_repeat($char, \strlen($part)));

                if ('>' === $part) {
                    --$level;
                }
            }
        }

        // Clean BDC and EMC markup. The mask character is quoted together
        // with the "/" delimiter, otherwise a "/" mask would end the pattern
        // prematurely and make it invalid.
        $content = $this->maskMatches(
            $content,
            '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
            1,
            $char
        );

        return $this->maskMatches($content, '/\s(EMC)\s/s', 1, $char);
    }

    /**
     * Overwrites every match of one capture group with the mask character,
     * byte for byte.
     *
     * All matches are collected before anything is replaced; this is safe
     * because each replacement has the length of the text it replaces, so
     * the captured offsets stay valid.
     *
     * @param string $content text to mask
     * @param string $pattern PCRE pattern
     * @param int    $group   index of the capture group to mask (0 = whole match)
     * @param string $char    single-byte mask character
     *
     * @return string the masked text, or $content unchanged when nothing
     *                matched or the pattern could not be applied
     */
    private function maskMatches($content, $pattern, $group, $char)
    {
        if (!preg_match_all($pattern, $content, $matches, PREG_OFFSET_CAPTURE)) {
            return $content;
        }

        foreach ($matches[$group] as $part) {
            $length = \strlen($part[0]);
            $content = substr_replace($content, str_repeat($char, $length), $part[1], $length);
        }

        return $content;
    }
191
192
    /**
     * Cuts a content stream into the sections that matter for text
     * extraction: the body of each BT ... ET text block, followed by every
     * "/Name Do" command.
     *
     * Matching is done on a copy masked by cleanContent() (mask '_'); the
     * offsets found there are then used to slice the unmasked content.
     *
     * @param string $content
     *
     * @return array list of section strings; all text blocks first, then all
     *               Do commands
     */
    public function getSectionsText($content)
    {
        // Surrounding spaces let the whitespace-anchored patterns match at
        // the very beginning and end of the stream.
        $content = ' '.$content.' ';
        $textCleaned = $this->cleanContent($content, '_');
        $sections = [];

        // Extract text blocks.
        $found = preg_match_all('/\s+BT[\s|\(|\[]+(.*?)\s*ET/s', $textCleaned, $blocks, PREG_OFFSET_CAPTURE);
        if ($found) {
            foreach ($blocks[1] as list($masked, $position)) {
                if ('' === $masked) {
                    continue;
                }

                $raw = substr($content, $position, \strlen($masked));

                // Removes BDC and EMC markup.
                $sections[] = preg_replace(
                    '/(\/[A-Za-z0-9]+\s*<<.*?)(>>\s*BDC)(.*?)(EMC\s+)/s',
                    '${3}',
                    $raw.' '
                );
            }
        }

        // Extract 'do' commands.
        $found = preg_match_all('/(\/[A-Za-z0-9\.\-_]+\s+Do)\s/s', $textCleaned, $calls, PREG_OFFSET_CAPTURE);
        if ($found) {
            foreach ($calls[1] as list($masked, $position)) {
                $sections[] = substr($content, $position, \strlen($masked));
            }
        }

        return $sections;
    }
233
234
    /**
     * Chooses the font to start with before any Tf operator selects one.
     *
     * The fonts of the page (when a page is given) are put in front of the
     * fonts of the document and the first entry of that list is returned.
     * When the list is empty, a new Font bound to the document is returned.
     *
     * @param Page|null $page
     *
     * @return Font
     */
    private function getDefaultFont(Page $page = null)
    {
        $pageFonts = null !== $page ? $page->getFonts() : [];
        $candidates = array_merge($pageFonts, array_values($this->document->getFonts()));

        if ([] === $candidates) {
            return new Font($this->document);
        }

        return reset($candidates);
    }
250
251
    /**
     * Extracts the text of this object's content as a single string.
     *
     * The content is cut into sections by getSectionsText(), each section is
     * parsed by getCommandsText() and the resulting operators are handled as
     * follows:
     *  - Tj, ' and TJ: the operand is decoded with the current font;
     *  - Tf: switches the current font (only when a page is given);
     *  - Td, TD, Tm, TL, Tz and T*: only insert "\n", "\t" or " " separators;
     *  - Do: appends the text of the referenced XObject (only when a page is
     *    given and the XObject is not already being processed);
     *  - anything else is ignored.
     *
     * @param Page|null $page page used to resolve fonts and XObjects
     *
     * @return string the extracted text followed by one space
     *
     * @throws \Exception
     */
    public function getText(Page $page = null)
    {
        $text = '';
        $sections = $this->getSectionsText($this->content);
        $current_font = $this->getDefaultFont($page);

        $current_position_td = ['x' => false, 'y' => false];
        $current_position_tm = ['x' => false, 'y' => false];

        // Mark this object as "in progress" so that an XObject referring back
        // to it (directly or not) is not expanded again.
        self::$recursionStack[] = $this->getUniqueId();

        try {
            foreach ($sections as $section) {
                $commands = $this->getCommandsText($section);

                foreach ($commands as $command) {
                    switch ($command[self::OPERATOR]) {
                        // move text current point
                        case 'Td':
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $y = array_pop($args);
                            $x = array_pop($args);
                            if (((float) $x <= 0) ||
                                (false !== $current_position_td['y'] && (float) $y < (float) ($current_position_td['y']))
                            ) {
                                // vertical offset
                                $text .= "\n";
                            } elseif (false !== $current_position_td['x']
                                && (float) $x > (float) ($current_position_td['x'])
                            ) {
                                // horizontal offset
                                $text .= ' ';
                            }
                            $current_position_td = ['x' => $x, 'y' => $y];
                            break;

                        // move text current point and set leading
                        case 'TD':
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $y = array_pop($args);
                            $x = array_pop($args);
                            if ((float) $y < 0) {
                                $text .= "\n";
                            } elseif ((float) $x <= 0) {
                                $text .= ' ';
                            }
                            break;

                        // select font
                        case 'Tf':
                            list($id) = preg_split('/\s/s', $command[self::COMMAND]);
                            $id = trim($id, '/');
                            if (null !== $page) {
                                $new_font = $page->getFont($id);
                                // NOTE(review): Page::getFont() is not visible here; if it
                                // yields no font object for an unknown id, keep the previous
                                // font instead of failing on the next decodeText() call.
                                if (\is_object($new_font)) {
                                    $current_font = $new_font;
                                }
                            }
                            break;

                        case "'":
                        case 'Tj':
                            // Wrap the single string so it has the shape of a TJ operand.
                            $command[self::COMMAND] = [$command];
                            // no break
                        case 'TJ':
                            $text .= $current_font->decodeText($command[self::COMMAND]);
                            break;

                        // set leading
                        case 'TL':
                            $text .= ' ';
                            break;

                        // set text matrix: a jump of more than 10 units is
                        // rendered as a tab (x) or a new line (y)
                        case 'Tm':
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $y = array_pop($args);
                            $x = array_pop($args);
                            if (false !== $current_position_tm['x']) {
                                $delta = abs((float) $x - (float) ($current_position_tm['x']));
                                if ($delta > 10) {
                                    $text .= "\t";
                                }
                            }
                            if (false !== $current_position_tm['y']) {
                                $delta = abs((float) $y - (float) ($current_position_tm['y']));
                                if ($delta > 10) {
                                    $text .= "\n";
                                }
                            }
                            $current_position_tm = ['x' => $x, 'y' => $y];
                            break;

                        // set horizontal scaling
                        case 'Tz':
                        // move to start of next line
                        case 'T*':
                            $text .= "\n";
                            break;

                        // invoke a named XObject
                        case 'Do':
                            if (null !== $page) {
                                $args = preg_split('/\s/s', $command[self::COMMAND]);
                                $id = trim(array_pop($args), '/ ');
                                $xobject = $page->getXObject($id);

                                // @todo $xobject could be a ElementXRef object, which would then throw an error
                                if ($xobject instanceof self
                                    && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)
                                ) {
                                    // Not a circular reference.
                                    $text .= $xobject->getText($page);
                                }
                            }
                            break;

                        // Recognized operators that do not contribute to the text:
                        // Tc (character spacing), Ts (text rise), Tw (word spacing)
                        // and the remaining operators listed below.
                        case 'Tc':
                        case 'Ts':
                        case 'Tw':
                        case 'Da':
                        case 'rg':
                        case 'RG':
                        case 're':
                        case 'co':
                        case 'cs':
                        case 'gs':
                        case 'en':
                        case 'sc':
                        case 'SC':
                        case 'g':
                        case 'G':
                        case 'V':
                        case 'vo':
                        case 'Vo':
                        default:
                            break;
                    }
                }
            }
        } finally {
            // Always unregister, even when decoding throws: the stack is
            // static, and a stale entry would make later calls treat this
            // object as a circular reference.
            array_pop(self::$recursionStack);
        }

        return $text.' ';
    }
429
430
	/**
431
	 * @param Page
432
	 *
433
	 * @return array
434
	 * @throws \Exception
435
	 */
436
	public function getTextArray(Page $page = null)
437
	{
438
		$text                = array();
439
		$sections            = $this->getSectionsText($this->content);
440
		$current_font        = $this->getDefaultFont($page);
441
442
		foreach ($sections as $section) {
443
444
			$commands = $this->getCommandsText($section);
445
446
			foreach ($commands as $command) {
447
448
				switch ($command[self::OPERATOR]) {
449
					// set character spacing
450
					case 'Tc':
451
						break;
452
453
					// move text current point
454
					case 'Td':
455
						break;
456
457
					// move text current point and set leading
458
					case 'TD':
459
						break;
460
461
					case 'Tf':
462
						if (!is_null($page)) {
463
							list($id,) = preg_split('/\s/s', $command[self::COMMAND]);
464
							$id           = trim($id, '/');
465
							$current_font = $page->getFont($id);
466
						}
467
						break;
468
469
					case "'":
470
					case 'Tj':
471
						$command[self::COMMAND] = array($command);
0 ignored issues
show
Coding Style Comprehensibility introduced by
Consider adding a comment if this fall-through is intended.
Loading history...
472
					case 'TJ':
473
						$sub_text = $current_font->decodeText($command[self::COMMAND]);
474
						$text[] = $sub_text;
475
						break;
476
477
					// set leading
478
					case 'TL':
479
						break;
480
481
					case 'Tm':
482
						break;
483
484
					// set super/subscripting text rise
485
					case 'Ts':
486
						break;
487
488
					// set word spacing
489
					case 'Tw':
490
						break;
491
492
					// set horizontal scaling
493
					case 'Tz':
494
						//$text .= "\n";
495
						break;
496
497
					// move to start of next line
498
					case 'T*':
499
						//$text .= "\n";
500
						break;
501
502
					case 'Da':
503
						break;
504
505
					case 'Do':
506
						if (!is_null($page)) {
507
							$args = preg_split('/\s/s', $command[self::COMMAND]);
508
							$id   = trim(array_pop($args), '/ ');
0 ignored issues
show
Bug introduced by
It seems like $args can also be of type false; however, parameter $array of array_pop() does only seem to accept array, maybe add an additional type check? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

508
							$id   = trim(array_pop(/** @scrutinizer ignore-type */ $args), '/ ');
Loading history...
509
							if ($xobject = $page->getXObject($id)) {
510
								$text[] = $xobject->getText($page);
511
							}
512
						}
513
						break;
514
515
					case 'rg':
516
					case 'RG':
517
						break;
518
519
					case 're':
520
						break;
521
522
					case 'co':
523
						break;
524
525
					case 'cs':
526
						break;
527
528
					case 'gs':
529
						break;
530
531
					case 'en':
532
						break;
533
534
                    case 'vo':
535
                    case 'Vo':
536
                        break;
537
538
                    default:
539
                }
540
            }
541
        }
542
543
        return $text;
544
    }
545
546
    /**
     * Parses a text section into a list of commands.
     *
     * Each command is an array with three keys:
     *  - self::TYPE:     '/', '[', '<', '(' (first byte of the operand),
     *                    'n' for bare numbers, or '' otherwise;
     *  - self::OPERATOR: the operator following the operand, or '' if none;
     *  - self::COMMAND:  the operand: a string, or for '[' the list of
     *                    commands parsed recursively inside the brackets.
     *
     * Parsing stops at the end of the input, at "ET", at a closing ']', '>'
     * or ')' that is met outside its operand, or at anything unrecognized.
     *
     * @param string $text_part section to parse
     * @param int    $offset    byte offset to start at; updated to the offset
     *                          where parsing stopped
     *
     * @return array
     */
    public function getCommandsText($text_part, &$offset = 0)
    {
        $commands = $matches = [];
        $length = \strlen($text_part);

        while ($offset < $length) {
            $offset += strspn($text_part, "\x00\x09\x0a\x0c\x0d\x20", $offset);
            if ($offset >= $length) {
                // Only whitespace was left; reading $text_part[$offset] would
                // be out of bounds.
                break;
            }
            $char = $text_part[$offset];

            $operator = '';
            $type = '';
            $command = false;

            switch ($char) {
                case '/':
                    // name, optionally followed by a number, then an operator
                    $type = $char;
                    $rest = substr($text_part, $offset);
                    if (preg_match('/^\/([A-Z0-9\._,\+]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si', $rest, $matches)
                        || preg_match('/^\/([A-Z0-9\._,\+]+)\s+([A-Z]+)\s*/si', $rest, $matches)
                    ) {
                        $operator = $matches[2];
                        $command = $matches[1];
                        $offset += \strlen($matches[0]);
                    }
                    break;

                case '[':
                case ']':
                    // array object
                    $type = $char;
                    ++$offset;
                    if ('[' === $char) {
                        // get elements; the nested call stops right after
                        // the closing bracket and moves $offset there
                        $command = $this->getCommandsText($text_part, $offset);

                        if (preg_match('/^\s*[A-Z]{1,2}\s*/si', substr($text_part, $offset), $matches)) {
                            $operator = trim($matches[0]);
                            $offset += \strlen($matches[0]);
                        }
                    }
                    // For ']' $command stays false, which ends this parse.
                    break;

                case '<':
                case '>':
                    // "<...>" operand
                    $type = $char;
                    ++$offset;
                    if ('<' === $char) {
                        $strpos = strpos($text_part, '>', $offset);
                        if (false === $strpos) {
                            // Unterminated: take the rest of the input. Using
                            // the false position in arithmetic would move
                            // $offset back to 1 and could loop forever.
                            $command = (string) substr($text_part, $offset);
                            $offset = $length;
                        } else {
                            $command = substr($text_part, $offset, $strpos - $offset);
                            $offset = $strpos + 1;
                        }
                    }

                    if (preg_match('/^\s*[A-Z]{1,2}\s*/si', substr($text_part, $offset), $matches)) {
                        $operator = trim($matches[0]);
                        $offset += \strlen($matches[0]);
                    }
                    break;

                case '(':
                case ')':
                    // literal string, possibly with nested parentheses
                    ++$offset;
                    $type = $char;
                    $strpos = $offset;
                    if ('(' === $char) {
                        $open_bracket = 1;
                        while ($open_bracket > 0 && isset($text_part[$strpos])) {
                            switch ($text_part[$strpos]) {
                                case '\\':
                                    // REVERSE SOLIDUS (5Ch) (Backslash)
                                    // skip next character
                                    ++$strpos;
                                    break;

                                case '(':
                                    // LEFT PARENTHESIS (28h)
                                    ++$open_bracket;
                                    break;

                                case ')':
                                    // RIGHT PARENTHESIS (29h)
                                    --$open_bracket;
                                    break;
                            }
                            ++$strpos;
                        }

                        // A balanced string ends with ")" which is not part of
                        // the operand; an unterminated one has no closing byte
                        // to strip and simply runs to the end of the input.
                        $end = 0 === $open_bracket ? $strpos - 1 : min($strpos, $length);
                        $command = (string) substr($text_part, $offset, $end - $offset);
                        $offset = min($strpos, $length);

                        if (preg_match('/^\s*([A-Z\']{1,2})\s*/si', substr($text_part, $offset), $matches)) {
                            $operator = $matches[1];
                            $offset += \strlen($matches[0]);
                        }
                    }
                    break;

                default:
                    $rest = substr($text_part, $offset);
                    if ('ET' === substr($text_part, $offset, 2)) {
                        // end of text block: $command stays false
                        break;
                    } elseif (preg_match(
                        '/^\s*(?P<data>([0-9\.\-]+\s*?)+)\s+(?P<id>[A-Z]{1,3})\s*/si',
                        $rest,
                        $matches
                    )) {
                        // numbers followed by an operator
                        $operator = trim($matches['id']);
                        $command = trim($matches['data']);
                        $offset += \strlen($matches[0]);
                    } elseif (preg_match('/^\s*([0-9\.\-]+\s*?)+\s*/si', $rest, $matches)) {
                        // bare numbers
                        $type = 'n';
                        $command = trim($matches[0]);
                        $offset += \strlen($matches[0]);
                    } elseif (preg_match('/^\s*([A-Z\*]+)\s*/si', $rest, $matches)) {
                        // operator without operand
                        $type = '';
                        $operator = $matches[1];
                        $command = '';
                        $offset += \strlen($matches[0]);
                    }
            }

            if (false === $command) {
                // Nothing was recognized at this position.
                break;
            }

            $commands[] = [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $command,
            ];
        }

        return $commands;
    }
703
704
    /**
     * Creates the object class matching the header's Type (and, for XObject
     * and Font, its Subtype).
     *
     *  - XObject: Image or Form depending on the Subtype, else PDFObject;
     *  - Pages, Page, Encoding: the class of the same name;
     *  - Font: the class \Smalot\PdfParser\Font\Font<Subtype> when it exists,
     *    else Font;
     *  - anything else: PDFObject.
     *
     * @param Document $document document the new object belongs to
     * @param Header   $header   header describing the object
     * @param string   $content  raw content of the object
     *
     * @return PDFObject
     */
    public static function factory(Document $document, Header $header, $content)
    {
        // Loose comparisons are kept on purpose: they match what the
        // original switch statements did.
        $type = $header->get('Type')->getContent();

        if ('XObject' == $type) {
            $subtype = $header->get('Subtype')->getContent();

            if ('Image' == $subtype) {
                return new Image($document, $header, $content);
            }

            if ('Form' == $subtype) {
                return new Form($document, $header, $content);
            }

            return new self($document, $header, $content);
        }

        if ('Pages' == $type) {
            return new Pages($document, $header, $content);
        }

        if ('Page' == $type) {
            return new Page($document, $header, $content);
        }

        if ('Encoding' == $type) {
            return new Encoding($document, $header, $content);
        }

        if ('Font' == $type) {
            $classname = '\Smalot\PdfParser\Font\Font'.$header->get('Subtype')->getContent();

            if (class_exists($classname)) {
                return new $classname($document, $header, $content);
            }

            return new Font($document, $header, $content);
        }

        return new self($document, $header, $content);
    }
746
747
    /**
     * Returns the identifier used to recognize this object on the recursion
     * stack: the value of spl_object_hash() for this instance.
     *
     * @return string
     */
    protected function getUniqueId()
    {
        $hash = spl_object_hash($this);

        return $hash;
    }
756
}
757