Test Failed
Pull Request — master (#634)
by
unknown
02:06
created

PDFObject::getCommandsText()   C

Complexity

Conditions 15
Paths 14

Size

Total Lines 77
Code Lines 47

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 240

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 15
eloc 47
c 1
b 0
f 0
nc 14
nop 1
dl 0
loc 77
ccs 0
cts 0
cp 0
crap 240
rs 5.9166

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\XObject\Form;
36
use Smalot\PdfParser\XObject\Image;
37
38
/**
39
 * Class PDFObject
40
 */
41
class PDFObject
42
{
43
    public const TYPE = 't';
44
45
    public const OPERATOR = 'o';
46
47
    public const COMMAND = 'c';
48
49
    /**
50
     * The recursion stack.
51
     *
52
     * @var array
53
     */
54
    public static $recursionStack = [];
55
56
    /**
57
     * @var Document
58
     */
59
    protected $document;
60
61
    /**
62
     * @var Header
63
     */
64
    protected $header;
65
66
    /**
67
     * @var string
68
     */
69
    protected $content;
70
71
    /**
72
     * @var Config
73
     */
74
    protected $config;
75
76 62
    /**
77
     * @var bool
78
     */
79
    protected $addPositionWhitespace = false;
80
81
    /**
     * Store the collaborators this object needs to interpret its content.
     *
     * The nullable parameters are declared explicitly nullable; relying on a
     * `= null` default to make a typed parameter nullable is deprecated as of
     * PHP 8.4. The accepted arguments are unchanged.
     *
     * @param Document    $document the document this object belongs to
     * @param Header|null $header   the object's header; a new Header() is used
     *                              when none is given
     * @param string|null $content  the content of the object, stored as given
     * @param Config|null $config   optional configuration, stored as given
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        $this->document = $document;
        $this->header = $header ?? new Header();
        $this->content = $content;
        $this->config = $config;
    }
92 3
93
    /**
     * Initialization hook; the base implementation does nothing.
     *
     * NOTE(review): presumably overridden by subclasses that need to prepare
     * state after construction - confirm against the subclasses.
     */
    public function init()
    {
    }
96
97 49
    /**
     * Return the document this object was constructed with.
     */
    public function getDocument(): Document
    {
        return $this->document;
    }
101
102 3
    /**
     * Return the header held by this object.
     */
    public function getHeader(): ?Header
    {
        return $this->header;
    }
106
107
    /**
     * Return the configuration passed to the constructor, or null when none
     * was given.
     */
    public function getConfig(): ?Config
    {
        return $this->config;
    }
111
112 50
    /**
     * Look up an entry of this object's header by name.
     *
     * The call is forwarded unchanged to Header::get().
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        return $this->header->get($name);
    }
119
120 3
    /**
     * Tell whether this object's header has an entry called $name.
     *
     * The call is forwarded unchanged to Header::has().
     */
    public function has(string $name): bool
    {
        return $this->header->has($name);
    }
124
125 38
    /**
     * Return the details of this object's header.
     *
     * @param bool $deep passed through to Header::getDetails()
     *                   (NOTE(review): presumably controls whether nested
     *                   values are resolved - confirm in Header)
     */
    public function getDetails(bool $deep = true): array
    {
        return $this->header->getDetails($deep);
    }
129
130 32
    /**
     * Return the content stored for this object, exactly as it was given to
     * the constructor (null when there is none).
     */
    public function getContent(): ?string
    {
        return $this->content;
    }
134
135
    /**
     * Normalize a content stream so that every PDF command ends up on its own
     * line, with lines separated by "\r\n".
     *
     * String literals "( ... )" and the "<< ... >>" dictionaries that precede
     * a BDC/BMC/DP/MP operator are swapped for unique placeholders first, so
     * that the whitespace and operator passes cannot alter them; they are
     * restored before returning.
     *
     * @param string|null $content the unformatted stream content
     *
     * @return string the cleaned stream; '' when $content is null or when no
     *                text encoding can be detected for it (e.g. image data)
     */
    public function cleanContent(?string $content): string
    {
        if (null === $content) {
            return '';
        }

        // Swap the first occurrence of $search for $replacement using plain
        // string functions. The previous implementation built a regular
        // expression out of the (preg_quote()d) text to replace; PCRE refuses
        // to compile very large patterns, so a single long PDF string made
        // preg_replace() return null and the entire stream was lost.
        $replaceFirst = static function (string $haystack, string $search, string $replacement): string {
            $position = strpos($haystack, $search);
            if (false === $position) {
                return $haystack;
            }

            return substr_replace($haystack, $replacement, $position, \strlen($search));
        };

        // Find all strings () and replace them so they aren't affected
        // by the next steps
        $pdfstrings = [];
        $attempt = '(';
        while (preg_match('/'.preg_quote($attempt, '/').'.*?(?<![^\\\\]\\\\)\)/s', $content, $text)) {
            // PDF strings can contain unescaped parentheses as long as
            // they're balanced, so check for balanced parentheses
            $left = preg_match_all('/(?<![^\\\\]\\\\)\(/', $text[0]);
            $right = preg_match_all('/(?<![^\\\\]\\\\)\)/', $text[0]);

            if ($left == $right) {
                // Replace the string with a unique placeholder
                $id = uniqid('STRING_', true);
                $pdfstrings[$id] = $text[0];
                $content = $replaceFirst($content, $text[0], '@@@'.$id.'@@@');

                // Reset to search for the next string
                $attempt = '(';
            } else {
                // We had unbalanced parentheses, so use the current
                // match as a base to find a longer string
                $attempt = $text[0];
            }
        }

        // Remove all carriage returns and line-feeds from the document stream
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Find all dictionary << >> commands and replace them so they
        // aren't affected by the next steps
        $dictstore = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $dicttext)) {
            $dictid = uniqid('DICT_', true);
            $dictstore[$dictid] = $dicttext[1];
            // The operator ($dicttext[2]) stays visible so the operator pass
            // below still puts a line break after it.
            $content = $replaceFirst($content, $dicttext[0], ' ###'.$dictid.'###'.$dicttext[2]);
        }

        // Now that all strings and dictionaries are hidden, the only
        // PDF commands left should all be plain text.
        // Detect text encoding of the current string to prevent reading
        // content streams that are images, etc. This prevents PHP
        // error messages when JPEG content is sent to this function
        // by the sample file '12249.pdf' from:
        // https://github.com/smalot/pdfparser/issues/458
        if (false === mb_detect_encoding($content, null, true)) {
            return '';
        }

        // Normalize white-space in the document stream
        $content = preg_replace('/\s{2,}/', ' ', $content);

        // Find all valid PDF operators and add \r\n after each; this
        // ensures there is just one command on every line
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
        //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions
        //       appear here in the list for completeness.
        $operators = [
          'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
          'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
          'g', 'G',  'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
          'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
          'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
          'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($operators as $operator) {
            // The look-arounds keep an operator from matching inside a longer
            // word or a /Name.
            $content = preg_replace(
                '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/',
                $operator."\r\n",
                $content
            );
        }

        // Restore the original content of the dictionary << >> commands
        $dictstore = array_reverse($dictstore, true);
        foreach ($dictstore as $id => $dict) {
            $content = str_replace('###'.$id.'###', $dict, $content);
        }

        // Restore the original string content
        $pdfstrings = array_reverse($pdfstrings, true);
        foreach ($pdfstrings as $id => $text) {
            // Strings may contain escaped newlines, or literal newlines
            // and we should clean these up before replacing the string
            // back into the content stream; this ensures no strings are
            // split between two lines (every command must be on one line)
            $text = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $text
            );

            $content = str_replace('@@@'.$id.'@@@', $text, $content);
        }

        // Collapse runs of line breaks and drop leading spaces on each line
        $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));

        return $content;
    }
251
252 20
    /**
     * Clean an entire, unformatted document stream and filter out the
     * commands that aren't needed for text positioning/extraction.
     *
     * Every line from a line ending in 'BT' up to the next 'ET' line is kept.
     * Outside of such a text block only the following lines are kept: those
     * ending in 'q' or 'Q' (save/restore graphics state), and those ending in
     * one of the operators BDC, BMC, DP, MP, EMC, cm, Tf or Do.
     *
     * @return array unprocessed PDF commands, one command per element
     */
    public function getSectionsText(?string $content): array
    {
        // A cleaned stream has one command per line, so splitting on line
        // breaks yields one command per element.
        $lines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->cleanContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        // Operators worth keeping outside of a BT ... ET block. Each pattern
        // must match *only* its own operator (e.g. 'c' must not match 'sc').
        // Add more commands to save here as you find them in weird PDFs!
        $keptOutsideTextBlock = [
            '/(?<!\w)B[DM]C$/', // Begin marked content sequence
            '/(?<!\w)[DM]P$/',  // Marked content point
            '/(?<!\w)EMC$/',    // End marked content sequence
            '/(?<!\w)cm$/',     // Graphics position change commands
            '/(?<!\w)Tf$/',     // Font change commands
            '/(?<!\w)Do$/',     // Invoke named XObject command
        ];

        $kept = [];
        $insideTextBlock = false;
        foreach ($lines as $rawLine) {
            $command = trim($rawLine);

            // Skip empty lines
            if ('' === $command) {
                continue;
            }

            // 'BT' opens a text block
            if (preg_match('/BT$/', $command)) {
                $insideTextBlock = true;
                $kept[] = $command;
                continue;
            }

            // 'ET' closes it
            if ('ET' == $command) {
                $insideTextBlock = false;
                $kept[] = $command;
                continue;
            }

            // Inside a text block every line is kept
            if ($insideTextBlock) {
                $kept[] = $command;
                continue;
            }

            // Save and restore graphics state commands
            $lastCharacter = $command[-1];
            if ('q' == $lastCharacter || 'Q' == $lastCharacter) {
                $kept[] = $command;
                continue;
            }

            foreach ($keptOutsideTextBlock as $pattern) {
                if (preg_match($pattern, $command)) {
                    $kept[] = $command;
                    break;
                }
            }
        }

        return $kept;
    }
328
329 5
    /**
     * Pick a font to start with.
     *
     * Candidates are, in order: the fonts of $page (when a page is given),
     * followed by the document's first font. The first candidate is returned;
     * when there is no candidate at all, a new Font without header or content
     * is created for this document.
     *
     * The parameter is declared explicitly nullable because implicit
     * nullability through a `= null` default is deprecated as of PHP 8.4.
     *
     * @param Page|null $page page whose fonts take precedence, if any
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $fonts = [];
        if (null !== $page) {
            $fonts = $page->getFonts();
        }

        // Appended after the page fonts, so it is only used when the page
        // contributes none.
        $firstFont = $this->document->getFirstFont();
        if (null !== $firstFont) {
            $fonts[] = $firstFont;
        }

        if (\count($fonts) > 0) {
            return reset($fonts);
        }

        return new Font($this->document, null, null, $this->config);
    }
347 15
348 1
    /**
     * Decode a '[]TJ' command and attempt to use alternate fonts if
     * the current font results in output that contains Unicode control
     * characters. See Font::decodeText for a full description of
     * $textMatrix
     *
     * Without a page there are no alternate fonts to try, so the text decoded
     * with $font is returned as is. When every font of the page has been
     * tried without producing clean output, the text decoded with $font is
     * returned as well.
     *
     * @param array<int,array<string,string|bool>> $command
     * @param array<string,float>                  $textMatrix
     * @param Page|null                            $page       page providing the alternate fonts
     */
    private function getTJUsingFontFallback(
        Font $font,
        array $command,
        array $textMatrix = ['a' => 1, 'b' => 0, 'i' => 0, 'j' => 1],
        ?Page $page = null
    ): string {
        $orig_text = $font->decodeText($command, $textMatrix);
        $text = $orig_text;

        // If we make this a Config option, we can add a check if it's
        // enabled here.
        if (null !== $page) {
            $font_ids = array_keys($page->getFonts());

            // If the decoded text contains UTF-8 control characters
            // then the font page being used is probably the wrong one.
            // Loop through the rest of the fonts to see if we can get
            // a good decode. Allow x09 to x0d which are whitespace.
            //
            // The NUL test is done on the raw bytes because the /u pattern
            // does not match at all when $text is not valid UTF-8. It used to
            // search the bin2hex() output for '00', which also matched across
            // byte boundaries (a space followed by a line feed is "200a") and
            // so flagged perfectly clean text.
            while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
                // If we're out of font IDs, then give up and use the
                // original string
                if (0 == \count($font_ids)) {
                    return $orig_text;
                }

                // Try the next font ID; skip IDs the page cannot resolve
                // instead of calling decodeText() on null.
                $fallback = $page->getFont(array_shift($font_ids));
                if (null === $fallback) {
                    continue;
                }
                $text = $fallback->decodeText($command, $textMatrix);
            }
        }

        return $text;
    }
390
391 11
    /**
     * Parse the textual form of a PDF dictionary ("<< ... >>") into an array.
     *
     * A /Name followed by a plain value becomes a 'Name' => 'value' entry.
     * Nested "<< >>" dictionaries and "[ ]" arrays become nested arrays; an
     * array that holds a single string not starting with '/' is turned back
     * into the string '[...]'. Values are not interpreted any further.
     *
     * @throws \Exception if $dictionary does not start with '<<'
     */
    public function parseDictionary(string $dictionary): array
    {
        // Normalize whitespace
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

        if (0 !== strpos($dictionary, '<<')) {
            throw new \Exception('Not a valid dictionary object.');
        }

        $parsed = [];
        $stack = [];
        $currentName = '';
        $arrayTypeNumeric = false;

        // Remove outer layer of dictionary, and split on tokens
        $inner = trim(preg_replace('/^<<|>>$/', '', $dictionary));
        $tokens = preg_split(
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
            $inner,
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        foreach ($tokens as $rawToken) {
            $token = trim($rawToken);
            switch ($token) {
                case '':
                    break;

                case '[':  // Open numeric array
                case '<<': // Open hashed array
                    $parsed[$currentName] = [];
                    $arrayTypeNumeric = ('[' === $token);

                    // Move up one level in the stack. The slot index must be
                    // count($stack): closing a level unset()s the top slot,
                    // and the lookup there relies on contiguous keys.
                    $stack[\count($stack)] = &$parsed;
                    $parsed = &$parsed[$currentName];
                    $currentName = '';
                    break;

                case ']': // Close numeric array
                    // Revert string type arrays back to a single element
                    $holdsSingleString = \is_array($parsed)
                        && 1 === \count($parsed)
                        && isset($parsed[0])
                        && \is_string($parsed[0])
                        && '' !== $parsed[0]
                        && '/' !== $parsed[0][0];
                    if ($holdsSingleString) {
                        $parsed = '['.$parsed[0].']';
                    }
                    // Closing the level is shared with '>>'
                    // no break
                case '>>': // Close hashed array
                    $arrayTypeNumeric = false;

                    // Move down one level in the stack
                    $parsed = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                    break;

                default:
                    if ('/' === $token[0]) {
                        // A leading slash marks a name: inside a numeric
                        // array it is an element, otherwise it is the key
                        // for the value that follows.
                        $currentName = substr($token, 1);
                        if ($arrayTypeNumeric) {
                            $parsed[] = $currentName;
                            $currentName = '';
                        }
                    } elseif ('' !== $currentName) {
                        // Value for the pending name
                        if (!$arrayTypeNumeric) {
                            $parsed[$currentName] = $token;
                        }
                        $currentName = '';
                    } else {
                        // Value without a name
                        $parsed[] = $token;
                    }
            }
        }

        return $parsed;
    }
484 6
485 6
    /**
     * getText() leverages getTextArray() to get the content of the
     * document, setting the addPositionWhitespace flag to true so
     * whitespace is inserted in a logical way for reading by humans.
     *
     * The flag is reset in a finally block: getTextArray() may throw, and the
     * flag must not stay enabled for later getTextArray() calls on the same
     * object.
     *
     * @param Page|null $page passed on to getTextArray()
     *
     * @return string the text pieces joined together, followed by one space
     *
     * @throws \Exception
     */
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;
        try {
            $result = $this->getTextArray($page);
        } finally {
            $this->addPositionWhitespace = false;
        }

        return implode('', $result).' ';
    }
498
499
    /**
500 5
     * getTextArray() returns the text objects of a document in an
501 4
     * array. By default no positioning whitespace is added to the
502
     * output unless the addPositionWhitespace flag is set to true.
503 5
     *
504 4
     * @throws \Exception
505
     */
506
    public function getTextArray(Page $page = null): array
507 5
    {
508
        $result = [];
509
        $text = [];
510
511 5
        $marked_stack = [];
512 2
        $last_written_position = false;
513
514
        $sections = $this->getSectionsText($this->content);
515 5
        $current_font = $this->getDefaultFont($page);
516
517
        $current_position = ['x' => false, 'y' => false];
518
        $current_position_tm = [
519
            'a' => 1, 'b' => 0, 'c' => 0,
520 5
            'i' => 0, 'j' => 1, 'k' => 0,
521
            'x' => false, 'y' => false, 'z' => 1,
522 4
        ];
523
        $current_position_td = ['x' => 0, 'y' => 0];
524 4
        $current_position_cm = [
525
            'a' => 1, 'b' => 0, 'c' => 0,
526
            'i' => 0, 'j' => 1, 'k' => 0,
527 4
            'x' => 0, 'y' => 0, 'z' => 1,
528
        ];
529
530
        $clipped_font = [];
531
        $clipped_position_cm = [];
532
533
        self::$recursionStack[] = $this->getUniqueId();
534
535
        foreach ($sections as $section) {
536
            $commands = $this->getCommandsText($section);
537 4
            foreach ($commands as $command) {
538 4
                switch ($command[self::OPERATOR]) {
539 2
                    case 'BT':
540
                        // Reset text positioning matrices
541 4
                        $current_position_tm = [
542
                            'a' => 1, 'b' => 0, 'c' => 0,
543
                            'i' => 0, 'j' => 1, 'k' => 0,
544 4
                            'x' => false, 'y' => false, 'z' => 1,
545
                        ];
546
                        $current_position_td = ['x' => 0, 'y' => 0];
547 4
                        break;
548
549
                    case 'ET':
550 4
                        break;
551 1
552
                        // set character spacing
553 4
                    case 'Tc':
554
                        break;
555
556 4
                        // move text current point and set leading
557 4
                    case 'TD':
558
                    case 'Td':
559
                        // move text current point
560 4
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
561 4
                        $y = (float) array_pop($args);
562 2
                        $x = (float) array_pop($args);
563
564 2
                        $current_position_td = [
565
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
566
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
567 2
                        ];
568 2
                        break;
569
570
                    case 'Tf':
571
                        list($id) = preg_split('/\s/s', $command[self::COMMAND]);
572
                        $id = trim($id, '/');
573
                        if (null !== $page) {
574
                            $new_font = $page->getFont($id);
575
                            // If an invalid font ID is given, do not update the font.
576 6
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
577
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
578
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
579 29
                            // But we want to make sure that malformed PDFs do not simply crash.
580
                            if (null !== $new_font) {
581 29
                                $current_font = $new_font;
582
                            }
583 29
                        }
584 29
                        break;
585 29
586
                        // Store current selected font and graphics matrix
587 29
                    case 'q':
588 29
                        $clipped_font[] = $current_font;
589 29
                        $clipped_position_cm[] = $current_position_cm;
590
                        break;
591 29
592 29
                        // Restore previous selected font and graphics matrix
593 29
                    case 'Q':
594 29
                        $current_font = array_pop($clipped_font);
595 29
                        $current_position_cm = array_pop($clipped_position_cm);
596 29
                        break;
597
598
                    case 'DP':
599
                    case 'MP':
600 29
                        break;
601 29
602 29
                        // Begin marked content sequence with property list
603 11
                    case 'BDC':
604 11
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
605 11
                            $dict = $this->parseDictionary($match[1]);
606
607
                            // Check for ActualText block
608
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
609 11
                                if ('[' == $dict['ActualText'][0]) {
610 11
                                    // Simulate a 'TJ' command on the stack
611 11
                                    $marked_stack[] = [
612
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
613 29
                                    ];
614
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
615 29
                                    // Simulate a 'Tj' command on the stack
616 29
                                    $marked_stack[] = [
617
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
618 25
                                    ];
619 25
                                }
620 25
                            }
621
                        }
622 25
                        break;
623
624 25
                        // Begin marked content sequence
625 25
                    case 'BMC':
626 25
                        if ('ReversedChars' == $command[self::COMMAND]) {
627
                            // Upon encountering a ReversedChars command,
628
                            // add the characters we've built up so far to
629 25
                            // the result array
630 25
                            $result = array_merge($result, $text);
631
632 25
                            // Start a fresh $text array that will contain
633
                            // reversed characters
634 29
                            $text = [];
635 29
636
                            // Add the reversed text flag to the stack
637 14
                            $marked_stack[] = ['ReversedChars' => true];
638 14
                        }
639 14
                        break;
640 14
641 14
                        // End marked content sequence
642 14
                    case 'EMC':
643
                        $data = false;
644
                        if (\count($marked_stack)) {
645 14
                            $marked = array_pop($marked_stack);
646 9
                            $action = key($marked);
647 9
                            $data = $marked[$action];
648
649 14
                            switch ($action) {
650
                                // If we are in ReversedChars mode...
651 29
                                case 'ReversedChars':
652 29
                                    // Reverse the characters we've built up so far
653 22
                                    foreach ($text as $key => $t) {
654 22
                                        $text[$key] = implode('', array_reverse(
655 22
                                            mb_str_split($t, 1, mb_internal_encoding())
656 22
                                        ));
657 22
                                    }
658 22
659 22
                                    // Add these characters to the result array
660
                                    $result = array_merge($result, $text);
661
662 22
                                    // Start a fresh $text array that will contain
663 22
                                    // non-reversed characters
664 22
                                    $text = [];
665
                                    break;
666
667 16
                                case 'ActualText':
668 16
                                    // Use the content of the ActualText as a command
669
                                    $command = $data;
670 22
                                    break;
671
                            }
672
                        }
673
674
                        // If this EMC command has been transformed into a 'Tj'
675 22
                        // or 'TJ' command because of being ActualText, then bypass
676
                        // the break to proceed to the writing section below.
677 22
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
678 22
                            break;
679
                        }
680 22
681
                        // no break
682 22
                    case "'":
683 22
                    case '"':
684
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
685 22
                            // Move to next line and write text
686 18
                            $current_position['x'] = 0;
687 18
                            $current_position_td['x'] = 0;
688
                            $current_position_td['y'] += 10;
689
                        }
690 22
                        // no break
691
                    case 'Tj':
692
                        $command[self::COMMAND] = [$command];
693 29
                        // no break
694 1
                    case 'TJ':
695 29
                        // Check the marked content stack for flags
696 29
                        $actual_text = false;
697 29
                        $reverse_text = false;
698
                        foreach ($marked_stack as $marked) {
699
                            if (isset($marked['ActualText'])) {
700
                                $actual_text = true;
701 29
                            }
702 29
                            if (isset($marked['ReversedChars'])) {
703 29
                                $reverse_text = true;
704 24
                            }
705 22
                        }
706 22
707 22
                        // Account for text position ONLY just before we write text
708 17
                        if (false === $actual_text && \is_array($last_written_position)) {
709 17
                            // If $last_written_position is an array, that
710 17
                            // means we have stored text position coordinates
711 17
                            // for placing an ActualText
712 17
                            $currentX = $last_written_position[0];
713
                            $currentY = $last_written_position[1];
714
                            $last_written_position = false;
715
                        } else {
716 29
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
717 29
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
718 29
                        }
719 29
                        $whiteSpace = '';
720 29
721
                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
722
                            if (abs($currentY - $current_position['y']) > 9) {
723 25
                                $whiteSpace = "\n";
724
                            } else {
725
                                $curX = $currentX - $current_position['x'];
726
                                $factorX = 10 * $current_position_tm['a'] + 10 * $current_position_tm['i'];
727 29
                                if (true === $reverse_text) {
728
                                    if ($curX < -abs($factorX * 8)) {
729
                                        $whiteSpace = "\t";
730 42
                                    } elseif ($curX < -abs($factorX)) {
731
                                        $whiteSpace = ' ';
732
                                    }
733
                                } else {
734
                                    if ($curX > ($factorX * 8)) {
735
                                        $whiteSpace = "\t";
736 42
                                    } elseif ($curX > $factorX) {
737 42
                                        $whiteSpace = ' ';
738 8
                                    }
739 8
                                }
740 3
                            }
741
                        }
742 6
743 6
                        $newtext = $this->getTJUsingFontFallback(
744
                            $current_font,
745
                            $command[self::COMMAND],
746
                            $current_position_tm,
747
                            $page
748 42
                        );
749 41
750
                        // If there is no ActualText pending then write
751 42
                        if (false === $actual_text) {
752 41
                            if (false !== $reverse_text) {
753
                                // If we are in ReversedChars mode, add the whitespace last
754 42
                                $text[] = str_replace(["\r", "\n"], '', $newtext).$whiteSpace;
755 6
                            } else {
756
                                // Otherwise add the whitespace first
757 42
                                $text[] = $whiteSpace.str_replace(["\r", "\n"], '', $newtext);
758 41
                            }
759 41
760
                            // Record the position of this inserted text for comparison
761 41
                            // with the next text block.
762 41
                            // Provide a 'fudge' factor guess on how wide this text block
763
                            // is based on the number of characters. This helps limit the
764
                            // number of tabs inserted, but isn't perfect.
765
                            $factor = 6;
766
                            if (true === $reverse_text) {
767
                                $factor = -$factor;
768 42
                            }
769
                            $current_position = [
770
                                'x' => $currentX + mb_strlen($newtext) * $factor,
771
                                'y' => $currentY,
772
                            ];
773
                        } elseif (false === $last_written_position) {
774
                            // If there is an ActualText in the pipeline
775 20
                            // store the position this undisplayed text
776
                            // *would* have been written to, so the
777 20
                            // ActualText is displayed in the right spot
778
                            $last_written_position = [$currentX, $currentY];
779
                        }
780
                        break;
781
782
                        // set leading
783
                    case 'TL':
784
                        break;
785
786
                        // set graphics position matrix
787
                    case 'cm':
788
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
789
                        $current_position_cm = [
790
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
791
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
792
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
793
                        ];
794
                        break;
795
796
                        // set text position matrix
797
                    case 'Tm':
798
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
799
                        $current_position_tm = [
800
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
801
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
802
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
803
                        ];
804
                        break;
805
806
                        // set super/subscripting text rise
807
                    case 'Ts':
808
                        break;
809
810
                        // set word spacing
811
                    case 'Tw':
812
                        break;
813
814
                        // set horizontal scaling
815
                    case 'Tz':
816
                        break;
817
818
                        // move to start of next line
819
                    case 'T*':
820
                        $current_position['x'] = 0;
821
                        $current_position_td['x'] = 0;
822
                        $current_position_td['y'] += 10;
823
                        break;
824
825
                    case 'Da':
826
                        break;
827
828
                    case 'Do':
829
                        if (null !== $page) {
830
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
831
                            $id = trim(array_pop($args), '/ ');
832
                            $xobject = $page->getXObject($id);
833
834
                            // @todo $xobject could be a ElementXRef object, which would then throw an error
835
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack)) {
836
                                // Not a circular reference.
837
                                $text[] = $xobject->getText($page);
838
                            }
839
                        }
840
                        break;
841
842
                    case 'rg':
843
                    case 'RG':
844
                        break;
845
846
                    case 're':
847
                        break;
848
849
                    case 'co':
850
                        break;
851
852
                    case 'cs':
853
                        break;
854
855
                    case 'gs':
856
                        break;
857
858
                    case 'en':
859
                        break;
860
861
                    case 'sc':
862
                    case 'SC':
863
                        break;
864
865
                    case 'g':
866
                    case 'G':
867
                        break;
868
869
                    case 'V':
870
                        break;
871
872
                    case 'vo':
873
                    case 'Vo':
874
                        break;
875
876
                    default:
877
                }
878
            }
879
        }
880
881
        $result = array_merge($result, $text);
882
883
        return $result;
884
    }
885
886
    /**
     * Splits one content-stream command into its operand type, operator
     * and operand.
     *
     * getCommandsText() expects the content of $text_part to be an
     * already formatted, single-line command from a document stream.
     * The companion function getSectionsText() returns a document
     * stream as an array of single commands for just this purpose.
     *
     * A better name for this function would be getCommandText()
     * since it now always works on just one command.
     *
     * @param string $text_part a single command: an optional operand followed by its operator
     *
     * @return array an empty array when $text_part does not end in an
     *               operator; otherwise a list with exactly one entry keyed
     *               by self::TYPE (leading delimiter of the operand, or ''),
     *               self::OPERATOR and self::COMMAND. For the TJ operator
     *               self::COMMAND is itself a list of entries of the same
     *               shape (string elements, each optionally followed by a
     *               numeric entry of type 'n'); for every other operator it
     *               is a string.
     */
    public function getCommandsText(string $text_part): array
    {
        $matches = [];

        // Group 1 is the whole operand, group 2 its optional leading
        // delimiter (one of / [ ( <) and group 3 the trailing operator.
        $found = preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $matches);

        // If no valid command is detected there is nothing to describe;
        // bail out instead of reading undefined match groups.
        if (1 !== $found) {
            return [];
        }

        $type = $matches[2];
        $operator = $matches[3];
        $command = trim($matches[1]);

        if ('TJ' === $operator) {
            // A TJ operand is a sequence of string elements, each one
            // optionally followed by a number. Both string notations are
            // tried in turn, always at the start of the remaining operand.
            $elementPatterns = [
                // parentheses string () format
                '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
                // hexadecimal <> format
                '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
            ];

            $subcommand = [];
            $command = trim($command, '[]');
            do {
                $oldCommand = $command;

                foreach ($elementPatterns as $elementType => $pattern) {
                    if (!preg_match($pattern, $command, $tjmatch)) {
                        continue;
                    }

                    // Whitespace inside a hexadecimal string is not data.
                    $element = '<' === $elementType
                        ? preg_replace('/\s/', '', $tjmatch[1])
                        : $tjmatch[1];

                    $subcommand[] = [
                        self::TYPE => $elementType,
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $element,
                    ];

                    // The number following the string, when present and
                    // not zero, becomes its own entry of type 'n'.
                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
                        $subcommand[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $tjmatch[2],
                        ];
                    }

                    // Consume the element that was just parsed.
                    $command = substr($command, \strlen($tjmatch[0]));
                }
                // Stop as soon as a full pass consumed nothing.
            } while ($command !== $oldCommand);

            $command = $subcommand;
        } elseif ('Tj' === $operator || "'" === $operator || '"' === $operator) {
            // Depending on the string type, trim the data of the
            // appropriate delimiters
            if ('(' === $type) {
                // Don't use trim() here since a () string may end with
                // a balanced or escaped right parentheses, and trim()
                // will delete both. Both strings below are valid:
                //   eg. (String())
                //   eg. (String\))
                $command = preg_replace('/^\(|\)$/', '', $command);
            } elseif ('<' === $type) {
                $command = trim($command, '<>');
            }
        } elseif ('/' === $type) {
            // Drop the leading slash of a name operand.
            $command = substr($command, 1);
        }

        return [
            [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $command,
            ],
        ];
    }
973
974
    /**
     * Instantiates the class matching the 'Type' entry of $header (and,
     * for XObject and Font, its 'Subtype' entry).
     *
     * XObject/Image yields an Image, XObject/Form a Form, Pages a Pages,
     * Page a Page, Encoding an Encoding. For Font, the class
     * \Smalot\PdfParser\Font\Font<Subtype> is used when it exists and Font
     * otherwise. Anything else yields an instance of this class.
     *
     * @param Document    $document passed through to the created object
     * @param Header      $header   read for 'Type' / 'Subtype' and passed through
     * @param string|null $content  passed through; for images it is replaced
     *                              by null when the config asks not to retain
     *                              image content
     * @param Config|null $config   passed through to the created object
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is optional, so it must not be dereferenced
                        // blindly. NOTE(review): without a config the image
                        // content is kept, on the assumption that retaining
                        // is the non-destructive default - confirm against
                        // Config's own default.
                        $retainContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                // Any other XObject subtype is handled as a generic object.
                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                // Prefer the subtype-specific font class when one exists.
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
1015
1016
    /**
     * Produces the identifier of this instance, as given by
     * spl_object_hash().
     *
     * @return string the object hash of the current instance
     */
    protected function getUniqueId(): string
    {
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
1023
}
1024