Test Failed
Pull Request — master (#614)
by
unknown
02:15
created

PDFObject::getTextArray()   D

Complexity

Conditions 35
Paths 85

Size

Total Lines 118
Code Lines 73

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 56
CRAP Score 52.6208

Importance

Changes 0
Metric Value
cc 35
eloc 73
c 0
b 0
f 0
nc 85
nop 1
dl 0
loc 118
ccs 56
cts 74
cp 0.7568
crap 52.6208
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\XObject\Form;
36
use Smalot\PdfParser\XObject\Image;
37
38
/**
39
 * Class PDFObject
40
 */
41
class PDFObject
42
{
43
    public const TYPE = 't';
44
45
    public const OPERATOR = 'o';
46
47
    public const COMMAND = 'c';
48
49
    /**
50
     * The recursion stack.
51
     *
52
     * @var array
53
     */
54
    public static $recursionStack = [];
55
56
    /**
57
     * @var Document
58
     */
59
    protected $document;
60
61
    /**
62
     * @var Header
63
     */
64
    protected $header;
65
66
    /**
67
     * @var string
68
     */
69
    protected $content;
70
71
    /**
72
     * @var Config
73
     */
74
    protected $config;
75
76 62
    /**
     * Creates a PDF object attached to the given document.
     *
     * @param Document    $document document stored on the object
     * @param Header|null $header   header of the object; when null a `new Header()` is stored instead
     * @param string|null $content  raw content of the object, stored as given
     * @param Config|null $config   parser configuration, stored as given (may stay null)
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        $this->document = $document;
        // The header is never left null, so get()/has()/getDetails() can delegate to it directly.
        $this->header = $header ?? new Header();
        $this->content = $content;
        $this->config = $config;
    }
87
88 49
    /**
     * Initialisation hook; the implementation in this class has an empty body.
     */
    public function init()
    {
    }
91
92 3
    /**
     * Returns the document that was passed to the constructor.
     */
    public function getDocument(): Document
    {
        return $this->document;
    }
96
97 49
    /**
     * Returns the header of this object.
     *
     * The constructor always stores a Header instance (a new one when none was supplied).
     */
    public function getHeader(): ?Header
    {
        return $this->header;
    }
101
102 3
    /**
     * Returns the configuration given at construction, or null when none was provided.
     */
    public function getConfig(): ?Config
    {
        return $this->config;
    }
106
107
    /**
     * Looks up a header entry by name; the call is forwarded to Header::get().
     *
     * @param string $name name of the header entry
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        return $this->header->get($name);
    }
114
115 47
    /**
     * Tells whether the header has an entry with the given name; forwarded to Header::has().
     *
     * @param string $name name of the header entry
     */
    public function has(string $name): bool
    {
        return $this->header->has($name);
    }
119
120 3
    /**
     * Returns the details of the header; forwarded to Header::getDetails().
     *
     * @param bool $deep passed through unchanged to Header::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        return $this->header->getDetails($deep);
    }
124
125 38
    /**
     * Returns the raw content given at construction, or null when there is none.
     */
    public function getContent(): ?string
    {
        return $this->content;
    }
129
130 32
    /**
     * Masks everything in a content stream that is not an operator, without moving any byte.
     *
     * Escaped backslashes/parentheses, inline image blocks, array and string contents,
     * `<...>` structures and BDC/EMC markup are overwritten with the mask character.
     * Every replacement has the same byte length as the text it replaces, so offsets
     * found in the returned string are valid in the original string too
     * (getSectionsText() relies on this).
     *
     * @param string $content raw content stream
     * @param string $char    mask character; only its first byte is used and an empty
     *                        string falls back to 'X'
     *
     * @return string masked content, same byte length as $content
     */
    public function cleanContent(string $content, string $char = 'X')
    {
        // An empty mask would shrink the content and invalidate every offset.
        $char = '' === $char ? 'X' : $char[0];
        $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

        // Remove image bloc with binary content (whole match, including surrounding whitespace)
        $content = $this->maskMatches('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', 0, $content, $char);

        // Clean content in square brackets [.....]
        $content = $this->maskMatches('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', 1, $content, $char);

        // Clean content in round brackets (.....)
        $content = $this->maskMatches('/\((.*?)\)/s', 1, $content, $char);

        // Clean structure: everything between '<' and the matching '>' is masked, delimiters included
        if ($parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE)) {
            $content = '';
            $level = 0;
            foreach ($parts as $part) {
                if ('<' == $part) {
                    ++$level;
                }

                $content .= (0 == $level ? $part : str_repeat($char, \strlen($part)));

                if ('>' == $part) {
                    --$level;
                }
            }
        }

        // Clean BDC and EMC markup. The mask character is quoted with the pattern
        // delimiter so that a '/' mask cannot terminate the expression early.
        $content = $this->maskMatches(
            '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
            1,
            $content,
            $char
        );

        return $this->maskMatches('/\s(EMC)\s/s', 1, $content, $char);
    }

    /**
     * Overwrites every capture of one group of $pattern with the mask character.
     *
     * @param string $pattern PCRE pattern to search for
     * @param int    $group   index of the capture group whose text is masked (0 = whole match)
     * @param string $content text to search and mask
     * @param string $char    single-byte mask character
     *
     * @return string $content with the captured spans masked, byte length unchanged
     */
    private function maskMatches(string $pattern, int $group, string $content, string $char): string
    {
        preg_match_all($pattern, $content, $matches, \PREG_OFFSET_CAPTURE);

        // Offsets stay valid across iterations because each replacement keeps the length.
        foreach ($matches[$group] ?? [] as $part) {
            $length = \strlen($part[0]);
            $content = substr_replace($content, str_repeat($char, $length), $part[1], $length);
        }

        return $content;
    }
188
189 31
    /**
     * Splits a content stream into the sections that matter for text extraction.
     *
     * Sections are located in a masked copy of the content (see cleanContent()) and then
     * cut out of the unmasked content at the same offsets. All BT...ET text blocks come
     * first, followed by all `/Name Do` commands.
     *
     * @param string|null $content raw content stream
     *
     * @return array list of section strings
     */
    public function getSectionsText(?string $content): array
    {
        // Pad with spaces so that patterns anchored on surrounding whitespace match at both ends.
        $content = ' '.$content.' ';
        $masked = $this->cleanContent($content, '_');
        $sections = [];

        // Extract text blocks.
        $found = preg_match_all('/(\sQ)?\s+BT[\s|\(|\[]+(.*?)\s*ET(\sq)?/s', $masked, $matches, \PREG_OFFSET_CAPTURE);
        if ($found) {
            foreach ($matches[2] as $index => [$maskedText, $start]) {
                if ('' === $maskedText) {
                    continue;
                }

                $raw = substr($content, $start, \strlen($maskedText));

                // Removes BDC and EMC markup.
                $raw = preg_replace('/(\/[A-Za-z0-9]+\s*<<.*?)(>>\s*BDC)(.*?)(EMC\s+)/s', '${3}', $raw.' ');

                // Add Q and q flags if detected around BT/ET.
                // @see: https://github.com/smalot/pdfparser/issues/387
                $prefix = empty($matches[1][$index][0]) ? '' : "Q\n";
                $suffix = empty($matches[3][$index][0]) ? '' : "\nq";

                $sections[] = trim($prefix.$raw).$suffix;
            }
        }

        // Extract 'do' commands.
        $found = preg_match_all('/(\/[A-Za-z0-9\.\-_]+\s+Do)\s/s', $masked, $matches, \PREG_OFFSET_CAPTURE);
        if ($found) {
            foreach ($matches[1] as [$maskedText, $start]) {
                $sections[] = substr($content, $start, \strlen($maskedText));
            }
        }

        return $sections;
    }
229
230 20
    /**
     * Picks the font used before any Tf command selects one.
     *
     * Candidates are the fonts of the page (when a page is given) followed by the first
     * font of the document; the first candidate wins. Without any candidate a new Font
     * without header and content is created.
     *
     * @param Page|null $page page whose fonts take precedence, if any
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $candidates = null !== $page ? $page->getFonts() : [];

        $documentFont = $this->document->getFirstFont();
        if (null !== $documentFont) {
            $candidates[] = $documentFont;
        }

        if (0 === \count($candidates)) {
            return new Font($this->document, null, null, $this->config);
        }

        return reset($candidates);
    }
248
249
    /**
     * Extracts the text of this object's content stream as one string.
     *
     * The content is split into sections (getSectionsText()), each section into commands
     * (getCommandsText()), and the text-showing commands are decoded with the font that
     * is current at that point. Positioning commands are turned into line breaks, tabs
     * or the configured horizontal offset.
     *
     * The object registers itself in self::$recursionStack while it is being processed,
     * so a `Do` command pointing back to an object that is currently being rendered is
     * ignored instead of recursing forever. The entry is removed again when the method
     * returns or throws.
     *
     * @param Page|null $page page used to resolve font ids (Tf) and XObjects (Do); when
     *                        null, Tf and Do commands have no effect
     *
     * @return string text of all sections, followed by one trailing space
     *
     * @throws \Exception
     */
    public function getText(?Page $page = null): string
    {
        $result = '';
        $sections = $this->getSectionsText($this->content);
        $current_font = $this->getDefaultFont($page);
        $clipped_font = $current_font;

        $current_position_td = ['x' => false, 'y' => false];
        $current_position_tm = ['x' => false, 'y' => false];

        self::$recursionStack[] = $this->getUniqueId();

        try {
            foreach ($sections as $section) {
                $commands = $this->getCommandsText($section);
                $reverse_text = false;
                $text = '';

                foreach ($commands as $command) {
                    switch ($command[self::OPERATOR]) {
                        case 'BMC':
                            if ('ReversedChars' === $command[self::COMMAND]) {
                                $reverse_text = true;
                            }
                            break;

                            // move text current point
                        case 'Td':
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $y = array_pop($args);
                            $x = array_pop($args);
                            $moved_down = false !== $current_position_td['y']
                                && (float) $y < (float) $current_position_td['y'];
                            $moved_right = false !== $current_position_td['x']
                                && (float) $x > (float) $current_position_td['x'];

                            if ((float) $x <= 0 || $moved_down) {
                                // vertical offset
                                $text .= "\n";
                            } elseif ($moved_right) {
                                // NOTE(review): $this->config may be null (constructor default) - confirm callers always pass one.
                                $text .= $this->config->getHorizontalOffset();
                            }
                            $current_position_td = ['x' => $x, 'y' => $y];
                            break;

                            // move text current point and set leading
                        case 'TD':
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $y = array_pop($args);
                            $x = array_pop($args);
                            if ((float) $y < 0) {
                                $text .= "\n";
                            } elseif ((float) $x <= 0) {
                                $text .= ' ';
                            }
                            break;

                        case 'Tf':
                            list($id) = preg_split('/\s/s', $command[self::COMMAND]);
                            $id = trim($id, '/');
                            if (null !== $page) {
                                $new_font = $page->getFont($id);
                                // If an invalid font ID is given, do not update the font.
                                // This should theoretically never happen, as the PDF spec states for the Tf operator:
                                // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
                                // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
                                // But we want to make sure that malformed PDFs do not simply crash.
                                if (null !== $new_font) {
                                    $current_font = $new_font;
                                }
                            }
                            break;

                        case 'Q':
                            // Use clip: restore font.
                            $current_font = $clipped_font;
                            break;

                        case 'q':
                            // Use clip: save font.
                            $clipped_font = $current_font;
                            break;

                        case "'":
                        case 'Tj':
                            // Wrap the single string so it can be decoded like a TJ array.
                            $command[self::COMMAND] = [$command];
                            // no break
                        case 'TJ':
                            // Check to see if $current_font can properly decode all this.
                            $use_font = $current_font;
                            $orig_text = $use_font->decodeText($command[self::COMMAND]);
                            $sub_text = $orig_text;

                            if (null !== $page) {
                                $font_ids = array_keys($page->getFonts());

                                // If the decoded text contains UTF-8 control characters
                                // then the font page being used is probably the wrong one.
                                // Loop through the rest of the fonts to see if we can get
                                // a good decode.
                                while (preg_match("/[\x00-\x1f\x7f]/u", $sub_text)) {
                                    // If we're out of font IDs, then give up and use the
                                    // original string
                                    if (!\count($font_ids)) {
                                        $sub_text = $orig_text;
                                        break;
                                    }

                                    // Try the next font ID
                                    $use_font = $page->getFont(array_pop($font_ids));
                                    $sub_text = $use_font->decodeText($command[self::COMMAND]);
                                }
                            }
                            $text .= $sub_text;
                            break;

                            // set leading
                        case 'TL':
                            $text .= ' ';
                            break;

                        case 'Tm':
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $y = array_pop($args);
                            $x = array_pop($args);
                            if (false !== $current_position_tm['x']) {
                                $delta = abs((float) $x - (float) $current_position_tm['x']);
                                if ($delta > 10) {
                                    $text .= "\t";
                                }
                            }
                            if (false !== $current_position_tm['y']) {
                                $delta = abs((float) $y - (float) $current_position_tm['y']);
                                if ($delta > 10) {
                                    $text .= "\n";
                                }
                            }
                            $current_position_tm = ['x' => $x, 'y' => $y];
                            break;

                            // set horizontal scaling
                        case 'Tz':
                            // move to start of next line
                        case 'T*':
                            $text .= "\n";
                            break;

                        case 'Do':
                            if (null !== $page) {
                                $args = preg_split('/\s/s', $command[self::COMMAND]);
                                $id = trim(array_pop($args), '/ ');
                                $xobject = $page->getXObject($id);

                                // @todo $xobject could be a ElementXRef object, which would then throw an error
                                if ($xobject instanceof self
                                    && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)
                                ) {
                                    // Not a circular reference.
                                    $text .= $xobject->getText($page);
                                }
                            }
                            break;

                            // Tc, Ts, Tw, Da, rg, RG, re, co, cs, gs, en, sc, SC, g, G, V, vo, Vo
                            // and any unknown operator do not contribute to the text.
                        default:
                            break;
                    }
                }

                // Fix Hebrew and other reverse text oriented languages.
                // @see: https://github.com/smalot/pdfparser/issues/398
                if ($reverse_text) {
                    $chars = mb_str_split($text, 1, mb_internal_encoding());
                    $text = implode('', array_reverse($chars));
                }

                $result .= $text;
            }
        } finally {
            // Leave the stack as it was found, so the same object can be rendered again
            // later (e.g. a form used on several pages) and the static array cannot grow.
            array_pop(self::$recursionStack);
        }

        return $result.' ';
    }
479 6
480
    /**
     * Extracts the text of this object's content stream as a list of fragments.
     *
     * Each text-showing command (', Tj, TJ) contributes one decoded string and each `Do`
     * command that resolves to a PDFObject contributes the text of that object. All
     * other operators are ignored, so no whitespace is added for positioning commands.
     *
     * @param Page|null $page page used to resolve font ids (Tf) and XObjects (Do); when
     *                        null, Tf and Do commands have no effect
     *
     * @return array list of decoded text fragments in content order
     *
     * @throws \Exception
     */
    public function getTextArray(?Page $page = null): array
    {
        $text = [];
        $sections = $this->getSectionsText($this->content);
        $current_font = new Font($this->document, null, null, $this->config);

        foreach ($sections as $section) {
            foreach ($this->getCommandsText($section) as $command) {
                switch ($command[self::OPERATOR]) {
                    case 'Tf':
                        if (null !== $page) {
                            list($id) = preg_split('/\s/s', $command[self::COMMAND]);
                            $new_font = $page->getFont(trim($id, '/'));
                            // Same policy as getText(): an unknown font id must not replace
                            // the current font with null, or the next decodeText() call fails.
                            if (null !== $new_font) {
                                $current_font = $new_font;
                            }
                        }
                        break;

                    case "'":
                    case 'Tj':
                        // Wrap the single string so it can be decoded like a TJ array.
                        $command[self::COMMAND] = [$command];
                        // no break
                    case 'TJ':
                        // Check to see if $current_font can properly decode all this.
                        $use_font = $current_font;
                        $orig_text = $use_font->decodeText($command[self::COMMAND]);
                        $sub_text = $orig_text;

                        if (null !== $page) {
                            $font_ids = array_keys($page->getFonts());

                            // If the decoded text contains UTF-8 control characters
                            // then the font page being used is probably the wrong one.
                            // Loop through the rest of the fonts to see if we can get
                            // a good decode.
                            while (preg_match("/[\x00-\x1f\x7f]/u", $sub_text)) {
                                // If we're out of font IDs, then give up and use the
                                // original string
                                if (!\count($font_ids)) {
                                    $sub_text = $orig_text;
                                    break;
                                }

                                // Try the next font ID
                                $use_font = $page->getFont(array_pop($font_ids));
                                $sub_text = $use_font->decodeText($command[self::COMMAND]);
                            }
                        }
                        $text[] = $sub_text;
                        break;

                    case 'Do':
                        if (null !== $page) {
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $id = trim(array_pop($args), '/ ');
                            $xobject = $page->getXObject($id);
                            // Only PDFObject instances provide getText(); anything else is skipped.
                            if ($xobject instanceof self) {
                                $text[] = $xobject->getText($page);
                            }
                        }
                        break;

                        // Tc, Td, TD, TL, Tm, Ts, Tw, Tz, T*, Da, rg, RG, re, co, cs, gs, en,
                        // sc, SC, g, G, V, vo, Vo and unknown operators produce no fragment.
                    default:
                        break;
                }
            }
        }

        return $text;
    }
626 25
627
    /**
     * Parses a section of a content stream into a list of commands.
     *
     * Parsing starts at $offset and advances it as input is consumed; the method calls
     * itself with the same offset variable to read the elements of a `[...]` array.
     * Parsing stops at the end of the text or at the first position where nothing can
     * be recognised (for instance a closing `]`, `>` or `)`, or the `ET` keyword).
     *
     * @param string $text_part section text to parse
     * @param int    $offset    byte position to start at; updated to the position reached
     *
     * @return array list of commands, each an array with the keys self::TYPE (first
     *               character of the operand, 'n' for bare numbers, or ''),
     *               self::OPERATOR (operator name, may be '') and self::COMMAND
     *               (operand string, or a nested command list for arrays)
     */
    public function getCommandsText(string $text_part, int &$offset = 0): array
    {
        $commands = $matches = [];
        $length = \strlen($text_part);

        while ($offset < $length) {
            $offset += strspn($text_part, "\x00\x09\x0a\x0c\x0d\x20", $offset);
            if ($offset >= $length) {
                // Only whitespace was left; there is no character to inspect.
                break;
            }

            $char = $text_part[$offset];

            $operator = '';
            $type = '';
            $command = false;

            switch ($char) {
                case '/':
                    $type = $char;
                    if (preg_match(
                        '/\G\/([A-Z0-9\._,\+-]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si',
                        $text_part,
                        $matches,
                        0,
                        $offset
                    )
                    ) {
                        // name followed by a number and an operator
                        $operator = $matches[2];
                        $command = $matches[1];
                        $offset += \strlen($matches[0]);
                    } elseif (preg_match(
                        '/\G\/([A-Z0-9\._,\+-]+)\s+([A-Z]+)\s*/si',
                        $text_part,
                        $matches,
                        0,
                        $offset
                    )
                    ) {
                        // name followed directly by an operator
                        $operator = $matches[2];
                        $command = $matches[1];
                        $offset += \strlen($matches[0]);
                    }
                    break;

                case '[':
                case ']':
                    // array object
                    $type = $char;
                    ++$offset;
                    if ('[' == $char) {
                        // get elements
                        $command = $this->getCommandsText($text_part, $offset);

                        if (preg_match(
                            '/\G\s*[A-Z]{1,2}\s*/si',
                            $text_part,
                            $matches,
                            0,
                            $offset
                        )
                        ) {
                            $operator = trim($matches[0]);
                            $offset += \strlen($matches[0]);
                        }
                    }
                    // For ']' $command stays false, which ends the (nested) parse below.
                    break;

                case '<':
                case '>':
                    // hexadecimal string object
                    $type = $char;
                    ++$offset;
                    if ('<' == $char) {
                        $closing = strpos($text_part, '>', $offset);
                        if (false === $closing) {
                            // Unterminated string: take the rest of the text. Using the
                            // false result in arithmetic would move the offset backwards
                            // and could make the loop run forever.
                            $closing = $length;
                        }
                        $command = (string) substr($text_part, $offset, $closing - $offset);
                        $offset = min($closing + 1, $length);
                    }

                    if (preg_match(
                        '/\G\s*[A-Z]{1,2}\s*/si',
                        $text_part,
                        $matches,
                        0,
                        $offset
                    )
                    ) {
                        $operator = trim($matches[0]);
                        $offset += \strlen($matches[0]);
                    }
                    break;

                case '(':
                case ')':
                    ++$offset;
                    $type = $char;
                    $strpos = $offset;
                    if ('(' == $char) {
                        // Scan to the matching parenthesis, honouring nesting and escapes.
                        $open_bracket = 1;
                        while ($open_bracket > 0) {
                            if (!isset($text_part[$strpos])) {
                                break;
                            }
                            $ch = $text_part[$strpos];
                            switch ($ch) {
                                case '\\':
                                    // REVERSE SOLIDUS (5Ch) (Backslash)
                                    // skip next character
                                    ++$strpos;
                                    break;

                                case '(':
                                    // LEFT PARENTHESIS (28h)
                                    ++$open_bracket;
                                    break;

                                case ')':
                                    // RIGHT PARENTHESIS (29h)
                                    --$open_bracket;
                                    break;
                            }
                            ++$strpos;
                        }
                        $command = substr($text_part, $offset, $strpos - $offset - 1);
                        $offset = $strpos;

                        if (preg_match(
                            '/\G\s*([A-Z\']{1,2})\s*/si',
                            $text_part,
                            $matches,
                            0,
                            $offset
                        )
                        ) {
                            $operator = $matches[1];
                            $offset += \strlen($matches[0]);
                        }
                    }
                    break;

                default:
                    if ('ET' == substr($text_part, $offset, 2)) {
                        break;
                    } elseif (preg_match(
                        '/\G\s*(?P<data>([0-9\.\-]+\s*?)+)\s+(?P<id>[A-Z]{1,3})\s*/si',
                        $text_part,
                        $matches,
                        0,
                        $offset
                    )
                    ) {
                        // numbers followed by an operator
                        $operator = trim($matches['id']);
                        $command = trim($matches['data']);
                        $offset += \strlen($matches[0]);
                    } elseif (preg_match(
                        '/\G\s*([0-9\.\-]+\s*?)+\s*/si',
                        $text_part,
                        $matches,
                        0,
                        $offset
                    )
                    ) {
                        // numbers without an operator
                        $type = 'n';
                        $command = trim($matches[0]);
                        $offset += \strlen($matches[0]);
                    } elseif (preg_match(
                        '/\G\s*([A-Z\*]+)\s*/si',
                        $text_part,
                        $matches,
                        0,
                        $offset
                    )
                    ) {
                        // operator without operands
                        $type = '';
                        $operator = $matches[1];
                        $command = '';
                        $offset += \strlen($matches[0]);
                    }
            }

            if (false === $command) {
                // Nothing recognised at this position: stop parsing.
                break;
            }

            $commands[] = [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $command,
            ];
        }

        return $commands;
    }
818
819
    /**
     * Builds the object class that matches the `Type` (and `Subtype`) of a header.
     *
     * XObject/Image gives an Image, XObject/Form a Form, Pages a Pages, Page a Page,
     * Encoding an Encoding. For Font the class `\Smalot\PdfParser\Font\Font<Subtype>` is
     * used when it exists, otherwise Font. Everything else becomes a plain PDFObject.
     *
     * @param Document    $document document the object belongs to
     * @param Header      $header   header whose Type/Subtype select the class
     * @param string|null $content  raw content of the object
     * @param Config|null $config   parser configuration; for images the content is dropped
     *                              when getRetainImageContent() is false
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is optional: without one the content is kept, as it is
                        // for every other object type, instead of calling a method on null.
                        $retain = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retain ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
860
861
    /**
     * Returns an id identifying this instance, as produced by spl_object_hash().
     *
     * Used as the entry in self::$recursionStack.
     */
    protected function getUniqueId(): string
    {
        return spl_object_hash($this);
    }
868
}
869