Passed
Branch master (f7fac8)
by Sebastien
02:47
created

PDFObject::getCommandsText()   F

Complexity

Conditions 27
Paths 65

Size

Total Lines 151
Code Lines 105

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 27
eloc 105
c 0
b 0
f 0
nc 65
nop 2
dl 0
loc 151
rs 3.3333

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 * @license LGPLv3
10
 * @url     <https://github.com/smalot/pdfparser>
11
 *
12
 *  PdfParser is a pdf library written in PHP, extraction oriented.
13
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
14
 *
15
 *  This program is free software: you can redistribute it and/or modify
16
 *  it under the terms of the GNU Lesser General Public License as published by
17
 *  the Free Software Foundation, either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  This program is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU Lesser General Public License for more details.
24
 *
25
 *  You should have received a copy of the GNU Lesser General Public License
26
 *  along with this program.
27
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
28
 *
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\XObject\Form;
34
use Smalot\PdfParser\XObject\Image;
35
36
/**
37
 * Class PDFObject
38
 *
39
 * @package Smalot\PdfParser
40
 */
41
class PDFObject
{
    /** Key of a parsed command holding the delimiter that introduced it. */
    const TYPE = 't';

    /** Key of a parsed command holding its operator (e.g. "Tj", "Td"). */
    const OPERATOR = 'o';

    /** Key of a parsed command holding its operand(s). */
    const COMMAND = 'c';

    /**
     * Unique ids of the objects whose getText() call is currently running.
     * getText() consults it to skip XObjects that reference themselves.
     *
     * @var array
     */
    public static $recursionStack = array();

    /**
     * @var Document
     */
    protected $document = null;

    /**
     * @var Header
     */
    protected $header = null;

    /**
     * @var string
     */
    protected $content = null;

    /**
     * @param Document $document
     * @param Header   $header   Defaults to an empty Header when null.
     * @param string   $content
     */
    public function __construct(Document $document, Header $header = null, $content = null)
    {
        $this->document = $document;
        $this->header   = !is_null($header) ? $header : new Header();
        $this->content  = $content;
    }

    /**
     * Initialisation hook; does nothing in this class.
     */
    public function init()
    {
    }

    /**
     * @return null|Header
     */
    public function getHeader()
    {
        return $this->header;
    }

    /**
     * Returns the header entry registered under the given name.
     *
     * @param string $name
     *
     * @return Element|PDFObject
     */
    public function get($name)
    {
        return $this->header->get($name);
    }

    /**
     * Tells whether the header holds an entry with the given name.
     *
     * @param string $name
     *
     * @return bool
     */
    public function has($name)
    {
        return $this->header->has($name);
    }

    /**
     * Returns the header details.
     *
     * @param bool $deep Forwarded as-is to Header::getDetails().
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        return $this->header->getDetails($deep);
    }

    /**
     * @return null|string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Masks everything in a content stream that is not structural (string
     * literals, inline images, <...> blocks, BDC/EMC markers) with $char.
     *
     * Every replacement has the same byte length as the text it hides, so byte
     * offsets found in the returned string are valid in the original string too.
     *
     * @param string $content Raw content stream.
     * @param string $char    Mask character; only its first byte is used.
     *
     * @return string The masked content, same length as $content.
     */
    public function cleanContent($content, $char = 'X')
    {
        // An empty mask would shrink the content and invalidate all offsets,
        // so fall back to the default in that case.
        $char = (string) $char;
        $char = ($char === '') ? 'X' : $char[0];

        // Escaped backslashes and parentheses must not be seen as delimiters.
        $content = str_replace(array('\\\\', '\\)', '\\('), $char . $char, $content);

        // Remove image bloc with binary content
        preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, PREG_OFFSET_CAPTURE);
        $content = $this->maskParts($content, $matches[0], $char);

        // Clean content in square brackets [.....]
        preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, PREG_OFFSET_CAPTURE);
        $content = $this->maskParts($content, $matches[1], $char);

        // Clean content in round brackets (.....)
        preg_match_all('/\((.*?)\)/s', $content, $matches, PREG_OFFSET_CAPTURE);
        $content = $this->maskParts($content, $matches[1], $char);

        // Clean structure: mask everything nested inside <...>, delimiters included.
        if ($parts = preg_split('/(<|>)/s', $content, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE)) {
            $content = '';
            $level   = 0;
            foreach ($parts as $part) {
                if ($part == '<') {
                    $level++;
                }

                $content .= ($level == 0 ? $part : str_repeat($char, strlen($part)));

                if ($part == '>') {
                    $level--;
                }
            }
        }

        // Clean BDC and EMC markup. The mask is quoted with the pattern
        // delimiter so that a "/" mask cannot terminate the pattern early.
        preg_match_all(
            '/(\/[A-Za-z0-9\_]*\s*' . preg_quote($char, '/') . '*BDC)/s',
            $content,
            $matches,
            PREG_OFFSET_CAPTURE
        );
        $content = $this->maskParts($content, $matches[1], $char);

        preg_match_all('/\s(EMC)\s/s', $content, $matches, PREG_OFFSET_CAPTURE);
        $content = $this->maskParts($content, $matches[1], $char);

        return $content;
    }

    /**
     * Overwrites each captured part with a run of $char of the same length.
     *
     * @param string $content
     * @param array  $parts   Entries of the form array(matched text, byte offset),
     *                        as produced with PREG_OFFSET_CAPTURE.
     * @param string $char    Single-byte mask character.
     *
     * @return string
     */
    private function maskParts($content, $parts, $char)
    {
        foreach ($parts as $part) {
            $length  = strlen($part[0]);
            $content = substr_replace($content, str_repeat($char, $length), $part[1], $length);
        }

        return $content;
    }

    /**
     * Splits a content stream into the sections that matter for text
     * extraction: the bodies of BT ... ET blocks, then the "/Name Do" commands.
     *
     * Sections are located on the masked content (see cleanContent()) and cut
     * out of the original content at the same offsets.
     *
     * @param string $content
     *
     * @return array List of section strings.
     */
    public function getSectionsText($content)
    {
        $sections    = array();
        $content     = ' ' . $content . ' ';
        $textCleaned = $this->cleanContent($content, '_');

        // Extract text blocks.
        if (preg_match_all('/\s+BT[\s|\(|\[]+(.*?)\s*ET/s', $textCleaned, $matches, PREG_OFFSET_CAPTURE)) {
            foreach ($matches[1] as $part) {
                $text = $part[0];
                if ($text === '') {
                    continue;
                }
                $offset  = $part[1];
                $section = substr($content, $offset, strlen($text));

                // Removes BDC and EMC markup.
                $section = preg_replace('/(\/[A-Za-z0-9]+\s*<<.*?)(>>\s*BDC)(.*?)(EMC\s+)/s', '${3}', $section . ' ');

                $sections[] = $section;
            }
        }

        // Extract 'do' commands.
        if (preg_match_all('/(\/[A-Za-z0-9\.\-_]+\s+Do)\s/s', $textCleaned, $matches, PREG_OFFSET_CAPTURE)) {
            foreach ($matches[1] as $part) {
                $text    = $part[0];
                $offset  = $part[1];
                $section = substr($content, $offset, strlen($text));

                $sections[] = $section;
            }
        }

        return $sections;
    }

    /**
     * Extracts the text of this object as a single string.
     *
     * Operators not handled below (Tc, Ts, Tw, Da, rg, RG, re, co, cs, gs, en,
     * sc, SC, g, G, V, vo, Vo, ...) do not contribute to the output.
     *
     * @param Page $page Page used to resolve fonts (Tf) and XObjects (Do);
     *                   when null, Tf and Do commands are ignored.
     *
     * @return string The extracted text followed by one trailing space.
     * @throws \Exception
     */
    public function getText(Page $page = null)
    {
        $text         = '';
        $sections     = $this->getSectionsText($this->content);
        $current_font = null;

        // Start with the first font registered in the document, if any.
        foreach ($this->document->getObjects() as $obj) {
            if ($obj instanceof Font) {
                $current_font = $obj;
                break;
            }
        }

        if ($current_font === null) {
            $current_font = new Font($this->document);
        }

        $current_position_td = array('x' => false, 'y' => false);
        $current_position_tm = array('x' => false, 'y' => false);

        array_push(self::$recursionStack, $this->getUniqueId());

        try {
            foreach ($sections as $section) {
                $commands = $this->getCommandsText($section);

                foreach ($commands as $command) {
                    switch ($command[self::OPERATOR]) {
                        // move text current point
                        case 'Td':
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $y    = array_pop($args);
                            $x    = array_pop($args);
                            if ((floatval($x) <= 0) ||
                                ($current_position_td['y'] !== false
                                    && floatval($y) < floatval($current_position_td['y']))
                            ) {
                                // vertical offset
                                $text .= "\n";
                            } elseif ($current_position_td['x'] !== false
                                && floatval($x) > floatval($current_position_td['x'])
                            ) {
                                // horizontal offset
                                $text .= ' ';
                            }
                            $current_position_td = array('x' => $x, 'y' => $y);
                            break;

                        // move text current point and set leading
                        case 'TD':
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $y    = array_pop($args);
                            $x    = array_pop($args);
                            if (floatval($y) < 0) {
                                $text .= "\n";
                            } elseif (floatval($x) <= 0) {
                                $text .= ' ';
                            }
                            break;

                        // select font
                        case 'Tf':
                            list($id,) = preg_split('/\s/s', $command[self::COMMAND]);
                            $id        = trim($id, '/');
                            if (!is_null($page)) {
                                $current_font = $page->getFont($id);
                            }
                            break;

                        case "'":
                        case 'Tj':
                            // Wrap the single string so it has the same shape as
                            // a TJ array, then fall through on purpose.
                            $command[self::COMMAND] = array($command);
                            // no break
                        case 'TJ':
                            // Skip if not previously defined, should never happen.
                            if (is_null($current_font)) {
                                // Fallback
                                // TODO : Improve
                                $text .= $command[self::COMMAND][0][self::COMMAND];
                                break;
                            }

                            $text .= $current_font->decodeText($command[self::COMMAND]);
                            break;

                        // set leading
                        case 'TL':
                            $text .= ' ';
                            break;

                        case 'Tm':
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $y    = array_pop($args);
                            $x    = array_pop($args);
                            if ($current_position_tm['x'] !== false) {
                                $delta = abs(floatval($x) - floatval($current_position_tm['x']));
                                if ($delta > 10) {
                                    $text .= "\t";
                                }
                            }
                            if ($current_position_tm['y'] !== false) {
                                $delta = abs(floatval($y) - floatval($current_position_tm['y']));
                                if ($delta > 10) {
                                    $text .= "\n";
                                }
                            }
                            $current_position_tm = array('x' => $x, 'y' => $y);
                            break;

                        // set horizontal scaling
                        case 'Tz':
                        // move to start of next line
                        case 'T*':
                            $text .= "\n";
                            break;

                        case 'Do':
                            if (!is_null($page)) {
                                $args    = preg_split('/\s/s', $command[self::COMMAND]);
                                $id      = trim(array_pop($args), '/ ');
                                $xobject = $page->getXObject($id);

                                // $xobject may be something else than a PDFObject
                                // (e.g. an unresolved reference); only recurse into
                                // real objects that are not already being processed.
                                if ($xobject instanceof self
                                    && !in_array($xobject->getUniqueId(), self::$recursionStack, true)
                                ) {
                                    // Not a circular reference.
                                    $text .= $xobject->getText($page);
                                }
                            }
                            break;

                        default:
                    }
                }
            }
        } catch (\Exception $e) {
            // Keep the shared recursion stack balanced even when decoding fails.
            array_pop(self::$recursionStack);

            throw $e;
        }

        array_pop(self::$recursionStack);

        return $text . ' ';
    }

    /**
     * Extracts the text of this object as a list of decoded fragments, one per
     * text-showing command (Tj, ', TJ) or nested XObject (Do).
     *
     * Positioning operators are ignored here: no separators are inserted.
     *
     * @param Page $page Page used to resolve fonts (Tf) and XObjects (Do);
     *                   when null, Tf and Do commands are ignored.
     *
     * @return array
     * @throws \Exception
     */
    public function getTextArray(Page $page = null)
    {
        $text         = array();
        $sections     = $this->getSectionsText($this->content);
        $current_font = new Font($this->document);

        foreach ($sections as $section) {
            $commands = $this->getCommandsText($section);

            foreach ($commands as $command) {
                switch ($command[self::OPERATOR]) {
                    // select font
                    case 'Tf':
                        list($id,) = preg_split('/\s/s', $command[self::COMMAND]);
                        $id        = trim($id, '/');
                        // Without a page there is nothing to resolve the font
                        // against; keep the current one.
                        if (!is_null($page)) {
                            $current_font = $page->getFont($id);
                        }
                        break;

                    case "'":
                    case 'Tj':
                        // Wrap the single string so it has the same shape as a
                        // TJ array, then fall through on purpose.
                        $command[self::COMMAND] = array($command);
                        // no break
                    case 'TJ':
                        // Skip if not previously defined, should never happen.
                        if (is_null($current_font)) {
                            // Fallback
                            // TODO : Improve
                            $text[] = $command[self::COMMAND][0][self::COMMAND];
                            break;
                        }

                        $text[] = $current_font->decodeText($command[self::COMMAND]);
                        break;

                    case 'Do':
                        if (!is_null($page)) {
                            $args    = preg_split('/\s/s', $command[self::COMMAND]);
                            $id      = trim(array_pop($args), '/ ');
                            $xobject = $page->getXObject($id);

                            // Only real objects expose getText().
                            if ($xobject instanceof self) {
                                $text[] = $xobject->getText($page);
                            }
                        }
                        break;

                    default:
                }
            }
        }

        return $text;
    }

    /**
     * Parses a text section into a list of commands.
     *
     * Each command is an array with the keys self::TYPE (the delimiter that
     * introduced the operand, 'n' for a bare number, or ''), self::OPERATOR and
     * self::COMMAND (a string, or a nested command list for "[...]" arrays).
     * Parsing stops at the first token that cannot be interpreted.
     *
     * @param string $text_part
     * @param int    $offset    Position to start from; updated to the position
     *                          where parsing stopped.
     *
     * @return array
     */
    public function getCommandsText($text_part, &$offset = 0)
    {
        $commands = $matches = array();
        $length   = strlen($text_part);

        while ($offset < $length) {
            $offset += strspn($text_part, "\x00\x09\x0a\x0c\x0d\x20", $offset);
            if ($offset >= $length) {
                // Only trailing white-space was left.
                break;
            }
            $char = $text_part[$offset];

            $operator = '';
            $type     = '';
            $command  = false;

            switch ($char) {
                case '/':
                    $type = $char;
                    if (preg_match(
                        '/^\/([A-Z0-9\._,\+]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si',
                        substr($text_part, $offset),
                        $matches
                    )
                    ) {
                        // name followed by a number, e.g. "/F1 12 Tf"
                        $operator = $matches[2];
                        $command  = $matches[1];
                        $offset += strlen($matches[0]);
                    } elseif (preg_match(
                        '/^\/([A-Z0-9\._,\+]+)\s+([A-Z]+)\s*/si',
                        substr($text_part, $offset),
                        $matches
                    )
                    ) {
                        // name alone, e.g. "/Im1 Do"
                        $operator = $matches[2];
                        $command  = $matches[1];
                        $offset += strlen($matches[0]);
                    }
                    break;

                case '[':
                case ']':
                    // array object
                    $type = $char;
                    ++$offset;
                    if ($char == '[') {
                        // get elements
                        $command = $this->getCommandsText($text_part, $offset);

                        if (preg_match('/^\s*[A-Z]{1,2}\s*/si', substr($text_part, $offset), $matches)) {
                            $operator = trim($matches[0]);
                            $offset += strlen($matches[0]);
                        }
                    }
                    // A closing bracket leaves $command at false, which ends
                    // the (nested) parsing loop below.
                    break;

                case '<':
                case '>':
                    $type = $char;
                    ++$offset;
                    if ($char == '<') {
                        $strpos = strpos($text_part, '>', $offset);
                        if ($strpos === false) {
                            // Unterminated: take the remainder and stop, instead
                            // of letting "false" rewind the offset to 1.
                            $command = (string) substr($text_part, $offset);
                            $offset  = $length;
                        } else {
                            $command = substr($text_part, $offset, ($strpos - $offset));
                            $offset  = $strpos + 1;
                        }
                    }

                    if (preg_match('/^\s*[A-Z]{1,2}\s*/si', substr($text_part, $offset), $matches)) {
                        $operator = trim($matches[0]);
                        $offset += strlen($matches[0]);
                    }
                    break;

                case '(':
                case ')':
                    ++$offset;
                    $type   = $char;
                    $strpos = $offset;
                    if ($char == '(') {
                        // Find the matching closing parenthesis, honouring
                        // nesting and backslash escapes.
                        $open_bracket = 1;
                        while ($open_bracket > 0) {
                            if (!isset($text_part[$strpos])) {
                                break;
                            }
                            switch ($text_part[$strpos]) {
                                case '\\':
                                    // skip next character
                                    ++$strpos;
                                    break;

                                case '(':
                                    ++$open_bracket;
                                    break;

                                case ')':
                                    --$open_bracket;
                                    break;
                            }
                            ++$strpos;
                        }
                        $command = substr($text_part, $offset, ($strpos - $offset - 1));
                        $offset  = $strpos;

                        if (preg_match('/^\s*([A-Z\']{1,2})\s*/si', substr($text_part, $offset), $matches)) {
                            $operator = $matches[1];
                            $offset += strlen($matches[0]);
                        }
                    }
                    break;

                default:
                    if (substr($text_part, $offset, 2) == 'ET') {
                        break;
                    } elseif (preg_match(
                        '/^\s*(?P<data>([0-9\.\-]+\s*?)+)\s+(?P<id>[A-Z]{1,3})\s*/si',
                        substr($text_part, $offset),
                        $matches
                    )
                    ) {
                        // numbers followed by an operator, e.g. "1 0 0 1 50 700 Tm"
                        $operator = trim($matches['id']);
                        $command  = trim($matches['data']);
                        $offset += strlen($matches[0]);
                    } elseif (preg_match('/^\s*([0-9\.\-]+\s*?)+\s*/si', substr($text_part, $offset), $matches)) {
                        // numbers without operator (e.g. spacing inside a TJ array)
                        $type    = 'n';
                        $command = trim($matches[0]);
                        $offset += strlen($matches[0]);
                    } elseif (preg_match('/^\s*([A-Z\*]+)\s*/si', substr($text_part, $offset), $matches)) {
                        // operator without operand, e.g. "T*"
                        $type     = '';
                        $operator = $matches[1];
                        $command  = '';
                        $offset += strlen($matches[0]);
                    }
            }

            if ($command === false) {
                break;
            }

            $commands[] = array(
                self::TYPE     => $type,
                self::OPERATOR => $operator,
                self::COMMAND  => $command,
            );
        }

        return $commands;
    }

    /**
     * Builds the object class matching the header's Type / Subtype entries,
     * falling back to a plain PDFObject.
     *
     * @param Document $document
     * @param Header   $header
     * @param string   $content
     *
     * @return PDFObject
     */
    public static function factory(Document $document, Header $header, $content)
    {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        return new Image($document, $header, $content);

                    case 'Form':
                        return new Form($document, $header, $content);

                    default:
                        return new PDFObject($document, $header, $content);
                }

            case 'Pages':
                return new Pages($document, $header, $content);

            case 'Page':
                return new Page($document, $header, $content);

            case 'Encoding':
                return new Encoding($document, $header, $content);

            case 'Font':
                $subtype   = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font' . $subtype;

                // Use the dedicated font class when one exists for the subtype.
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content);
                }

                return new Font($document, $header, $content);

            default:
                return new PDFObject($document, $header, $content);
        }
    }

    /**
     * Returns unique id identifying the object.
     *
     * @return string
     */
    protected function getUniqueId()
    {
        return spl_object_hash($this);
    }
}
790