Test Failed
Pull Request — master (#634)
by
unknown
02:07
created

PDFObject::factory()   B

Complexity

Conditions 10
Paths 9

Size

Total Lines 39
Code Lines 22

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 110

Importance

Changes 0
Metric Value
cc 10
eloc 22
c 0
b 0
f 0
nc 9
nop 4
dl 0
loc 39
ccs 0
cts 0
cp 0
crap 110
rs 7.6666

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\XObject\Form;
36
use Smalot\PdfParser\XObject\Image;
37
38
/**
39
 * Class PDFObject
40
 */
41
class PDFObject
42
{
43
    public const TYPE = 't';
44
45
    public const OPERATOR = 'o';
46
47
    public const COMMAND = 'c';
48
49
    /**
50
     * The recursion stack.
51
     *
52
     * @var array
53
     */
54
    public static $recursionStack = [];
55
56
    /**
57
     * @var Document
58
     */
59
    protected $document;
60
61
    /**
62
     * @var Header
63
     */
64
    protected $header;
65
66
    /**
67
     * @var string
68
     */
69
    protected $content;
70
71
    /**
72
     * @var Config
73
     */
74
    protected $config;
75
76 62
    /**
77
     * @var bool
78
     */
79
    protected $addPositionWhitespace = false;
80
81
    /**
     * Stores the collaborators this object is built from.
     *
     * The optional parameters are declared with explicit nullable types
     * because implicit nullability through a `= null` default is deprecated
     * as of PHP 8.4.
     *
     * @param Document    $document document stored on this object
     * @param Header|null $header   header to use; a new empty Header is created when null
     * @param string|null $content  content, stored as given (may be null)
     * @param Config|null $config   configuration, stored as given (may be null)
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        $this->document = $document;
        // Guarantee that $this->header is always usable by get()/has()/getDetails()
        $this->header = $header ?? new Header();
        $this->content = $content;
        $this->config = $config;
    }
92 3
93
    /**
     * Intentionally empty in this class.
     *
     * NOTE(review): presumably an initialization hook meant to be overridden
     * by subclasses - confirm against the rest of the library.
     */
    public function init()
    {
        // Nothing to do here.
    }
96
97 49
    /**
     * Returns the document that was passed to the constructor.
     */
    public function getDocument(): Document
    {
        return $this->document;
    }
101
102 3
    /**
     * Returns the header stored on this object.
     *
     * The constructor replaces a null header with a new Header instance.
     */
    public function getHeader(): ?Header
    {
        return $this->header;
    }
106
107
    /**
     * Returns the configuration given to the constructor, or null when none
     * was provided.
     */
    public function getConfig(): ?Config
    {
        return $this->config;
    }
111
112 50
    /**
     * Looks up an entry of this object's header by name.
     *
     * The call is forwarded unchanged to Header::get().
     *
     * @param string $name name of the header entry
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        return $this->header->get($name);
    }
119
120 3
    /**
     * Tells whether this object's header has an entry with the given name.
     *
     * The call is forwarded unchanged to Header::has().
     *
     * @param string $name name of the header entry
     */
    public function has(string $name): bool
    {
        return $this->header->has($name);
    }
124
125 38
    /**
     * Returns the details of this object's header.
     *
     * The call is forwarded unchanged to Header::getDetails().
     *
     * @param bool $deep passed through to Header::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        return $this->header->getDetails($deep);
    }
129
130 32
    /**
     * Returns the content given to the constructor, exactly as stored
     * (null when no content was provided).
     */
    public function getContent(): ?string
    {
        return $this->content;
    }
134
135
    /**
     * Reformats a content stream so that every command ends up on its own
     * line, separated by "\r\n".
     *
     * Literal strings "( ... )" and dictionaries "<< ... >>" that precede a
     * BDC, BMC, DP or MP operator are swapped for placeholders while the
     * stream is reformatted and put back afterwards, so their content is
     * not split by the operator detection.
     *
     * @param string|null $content raw content stream
     *
     * @return string the cleaned stream; an empty string when $content is
     *                null or when finfo does not report it as text/plain
     */
    public function cleanContent(?string $content): string
    {
        if (null === $content) {
            return '';
        }

        // Step 1: hide every literal string behind a unique placeholder so
        // the following steps cannot touch its content.
        $hiddenStrings = [];
        $prefix = '(';
        while (preg_match('/'.preg_quote($prefix, '/').'.*?(?<![^\\\\]\\\\)\)/s', $content, $found)) {
            $candidate = $found[0];

            // Unescaped parentheses are allowed inside a PDF string as long
            // as they are balanced, so count both kinds.
            $opened = preg_match_all('/(?<![^\\\\]\\\\)\(/', $candidate);
            $closed = preg_match_all('/(?<![^\\\\]\\\\)\)/', $candidate);

            if ($opened != $closed) {
                // Unbalanced so far: search again for a longer string that
                // starts with the current match.
                $prefix = $candidate;
                continue;
            }

            // Balanced: replace the first occurrence with a placeholder and
            // start over looking for the next string.
            $placeholderId = uniqid('STRING_', true);
            $hiddenStrings[$placeholderId] = $candidate;
            $content = preg_replace(
                '/'.preg_quote($candidate, '/').'/',
                '@@@'.$placeholderId.'@@@',
                $content,
                1
            );
            $prefix = '(';
        }

        // Step 2: turn every carriage return and line feed into a space
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Step 3: hide dictionaries that belong to marked-content operators
        $hiddenDicts = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $found)) {
            $placeholderId = uniqid('DICT_', true);
            $hiddenDicts[$placeholderId] = $found[1];
            $content = preg_replace(
                '/'.preg_quote($found[0], '/').'/',
                ' ###'.$placeholderId.'###'.$found[2],
                $content,
                1
            );
        }

        // With strings and dictionaries hidden, what is left should be plain
        // text. Refuse anything finfo does not identify as text/plain; this
        // keeps binary streams (e.g. JPEG data, as in sample '12249.pdf'
        // from https://github.com/smalot/pdfparser/issues/458) out of the
        // steps below.
        $mimeDetector = new \finfo(\FILEINFO_MIME);
        if (false === strpos($mimeDetector->buffer($content), 'text/plain')) {
            return '';
        }

        // Step 4: collapse runs of white-space
        $content = preg_replace('/\s{2,}/', ' ', $content);

        // Step 5: append \r\n to every valid PDF operator so that each
        // command sits on its own line.
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
        //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both
        //       spellings are kept in the list for completeness.
        $operators = [
          'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
          'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
          'g', 'G',  'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
          'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
          'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
          'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($operators as $operator) {
            $pattern = '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/';
            $content = preg_replace($pattern, $operator."\r\n", $content);
        }

        // Step 6: put the dictionaries back, most recently hidden first
        foreach (array_reverse($hiddenDicts, true) as $placeholderId => $dictionary) {
            $content = str_replace('###'.$placeholderId.'###', $dictionary, $content);
        }

        // Step 7: put the strings back, most recently hidden first
        foreach (array_reverse($hiddenStrings, true) as $placeholderId => $string) {
            // A string may hold escaped or literal line breaks. Drop the
            // escaped ones and spell the literal ones as the two-character
            // sequences \r and \n, so that no string spans two lines
            // (every command must stay on one line).
            $string = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $string
            );

            $content = str_replace('@@@'.$placeholderId.'@@@', $string, $content);
        }

        // Collapse repeated line breaks and strip spaces that follow one
        return trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));
    }
252 20
253
    /**
     * Cleans an entire, unformatted document stream and filters out the
     * commands that are not needed for text positioning/extraction.
     *
     * Everything between a line ending in 'BT' and an 'ET' line is kept.
     * Outside of such a block only graphics-state save/restore (q, Q),
     * marked-content (BDC, BMC, DP, MP, EMC), cm, Tf and Do commands are
     * kept.
     *
     * @param string|null $content raw document stream, handed to cleanContent()
     *
     * @return array unprocessed PDF commands, one command per element
     */
    public function getSectionsText(?string $content): array
    {
        // cleanContent() leaves one command per line, so splitting on line
        // breaks yields one command per array element
        $lines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->cleanContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        // Commands that are kept outside of a text block. Each pattern is
        // anchored so it only matches that very command (a bare 'c' would
        // otherwise also match 'sc'). Add more commands here as they turn
        // up in unusual PDFs.
        $keptOutsideText = [
            '/(?<!\w)B[DM]C$/', // begin marked content sequence
            '/(?<!\w)[DM]P$/',  // marked content point
            '/(?<!\w)EMC$/',    // end marked content sequence
            '/(?<!\w)cm$/',     // graphics position change
            '/(?<!\w)Tf$/',     // font change
            '/(?<!\w)Do$/',     // invoke named XObject
        ];

        $sections = [];
        $insideTextBlock = false;
        foreach ($lines as $rawLine) {
            $line = trim($rawLine);

            // Lines made of white-space only carry no command
            if ('' === $line) {
                continue;
            }

            // 'BT' opens a text block
            if (preg_match('/BT$/', $line)) {
                $insideTextBlock = true;
                $sections[] = $line;
                continue;
            }

            // 'ET' closes it
            if ('ET' == $line) {
                $insideTextBlock = false;
                $sections[] = $line;
                continue;
            }

            // Inside BT ... ET every line is kept
            if ($insideTextBlock) {
                $sections[] = $line;
                continue;
            }

            // Save and restore graphics state commands
            $lastChar = $line[-1];
            if ('q' == $lastChar || 'Q' == $lastChar) {
                $sections[] = $line;
                continue;
            }

            foreach ($keptOutsideText as $pattern) {
                if (preg_match($pattern, $line)) {
                    $sections[] = $line;
                    break;
                }
            }
        }

        return $sections;
    }
329 5
330 5
    /**
     * Picks the font to start with before any Tf command has been seen.
     *
     * Candidates are the fonts of $page (when a page is given) followed by
     * the document's first font; the first candidate wins. When there is no
     * candidate at all, a new Font built from this object's document and
     * config is returned.
     *
     * The parameter is declared with an explicit nullable type because
     * implicit nullability through a `= null` default is deprecated as of
     * PHP 8.4.
     *
     * @param Page|null $page page whose fonts take precedence, if any
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $fonts = [];
        if (null !== $page) {
            $fonts = $page->getFonts();
        }

        // The document-wide first font is only a fallback, so it goes last
        $firstFont = $this->document->getFirstFont();
        if (null !== $firstFont) {
            $fonts[] = $firstFont;
        }

        if (\count($fonts) > 0) {
            return reset($fonts);
        }

        return new Font($this->document, null, null, $this->config);
    }
348 1
349 1
    /**
     * Decodes a text-showing command with the given font and, when a page
     * is available, retries with the fonts of that page for as long as the
     * decoded text looks wrong.
     *
     * "Looks wrong" means the text contains a control character
     * (x00-x08, x0e-x1f or x7f; x09-x0d are allowed as whitespace) or a NUL
     * byte. The NUL check is done on the raw bytes: searching the output of
     * bin2hex() for '00' also matches across byte boundaries (" \n" encodes
     * as "200a"), which reported clean text as broken.
     *
     * @param Font                                 $font       font tried first
     * @param array<int,array<string,string|bool>> $command
     * @param array<string,float>                  $textMatrix
     * @param Page|null                            $page       page providing the fallback fonts; no fallback when null
     *
     * @return string the first decode without control characters, or the
     *                decode made with $font when no font of the page gives one
     */
    private function getTJUsingFontFallback(
        Font $font,
        array $command,
        array $textMatrix = ['a' => 1, 'b' => 0, 'i' => 0, 'j' => 1],
        ?Page $page = null
    ): string {
        $orig_text = $font->decodeText($command, $textMatrix);
        $text = $orig_text;

        // If we make this a Config option, we can add a check if it's
        // enabled here.
        if (null !== $page) {
            $font_ids = array_keys($page->getFonts());

            // If the decoded text contains control characters then the font
            // being used is probably the wrong one. Loop through the fonts
            // of the page to see if one of them gives a good decode. The
            // byte-level NUL test also covers text that is not valid UTF-8,
            // for which the /u pattern cannot match.
            while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
                // If we're out of font IDs, then give up and use the
                // original string
                if (0 == \count($font_ids)) {
                    return $orig_text;
                }

                // Try the next font ID; getFont() may return null, in which
                // case that ID is skipped instead of crashing
                $candidate = $page->getFont(array_shift($font_ids));
                if (null === $candidate) {
                    continue;
                }

                $text = $candidate->decodeText($command, $textMatrix);
            }
        }

        return $text;
    }
386 4
387
    /**
     * Parses the text of a dictionary "<< ... >>" into a nested PHP array.
     *
     * Names (tokens starting with '/') become keys and the token that
     * follows a name becomes its value. "<< ... >>" opens a nested keyed
     * array and "[ ... ]" opens a list; a list holding one single string
     * that does not start with '/' is turned back into the text "[...]".
     *
     * @param string $dictionary dictionary text, must start with '<<' once trimmed
     *
     * @return array the parsed dictionary
     *
     * @throws \Exception when $dictionary does not start with '<<'
     */
    public function parseDictionary(string $dictionary): array
    {
        // Turn line breaks and runs of white-space into single spaces
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

        if ('<<' != substr($dictionary, 0, 2)) {
            throw new \Exception('Not a valid dictionary object.');
        }

        // $parsed always refers to the array being filled at the current
        // nesting level; $stack holds references to the enclosing levels.
        $parsed = [];
        $stack = [];
        $currentName = '';
        $arrayTypeNumeric = false;

        // Strip the outer << >> and split into tokens, keeping delimiters
        $tokens = preg_split(
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
            trim(preg_replace('/^<<|>>$/', '', $dictionary)),
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        foreach ($tokens as $rawToken) {
            $token = trim($rawToken);
            switch ($token) {
                case '':
                    break;

                // Open a list ('[') or a keyed array ('<<')
                case '[':
                case '<<':
                    $parsed[$currentName] = [];
                    $arrayTypeNumeric = ('[' == $token);

                    // Remember the enclosing level, then step into the new
                    // one. The index is computed from count() on purpose:
                    // closing a level unsets the top entry, and $stack[]
                    // would not reuse that index afterwards.
                    $stack[\count($stack)] = &$parsed;
                    $parsed = &$parsed[$currentName];
                    $currentName = '';
                    break;

                // Close a list
                case ']':
                    // A list made of one plain string goes back to its
                    // textual form
                    if (\is_array($parsed) && 1 == \count($parsed)
                        && isset($parsed[0]) && \is_string($parsed[0])
                        && '' !== $parsed[0] && '/' != $parsed[0][0]) {
                        $parsed = '['.$parsed[0].']';
                    }
                    // Closing a list then continues like closing a keyed array
                    // no break
                case '>>':
                    $arrayTypeNumeric = false;

                    // Step back to the enclosing level
                    $parsed = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                    break;

                default:
                    // A leading slash marks a name
                    if ('/' == substr($token, 0, 1)) {
                        $currentName = substr($token, 1);
                        if (true == $arrayTypeNumeric) {
                            // Inside a list a name is just another element
                            $parsed[] = $currentName;
                            $currentName = '';
                        }
                        break;
                    }

                    // A value with no pending name is appended
                    if ('' == $currentName) {
                        $parsed[] = $token;
                        break;
                    }

                    // Otherwise it is the value of the pending name
                    if (false == $arrayTypeNumeric) {
                        $parsed[$currentName] = $token;
                    }
                    $currentName = '';
            }
        }

        return $parsed;
    }
480
481
    /**
     * getText() leverages getTextArray() to get the content of the
     * document, setting the addPositionWhitespace flag to true so
     * whitespace is inserted in a logical way for reading by humans.
     *
     * The flag is reset in a finally block: getTextArray() may throw, and
     * the flag must not stay enabled for later getTextArray() calls on the
     * same object.
     *
     * The parameter is declared with an explicit nullable type because
     * implicit nullability through a `= null` default is deprecated as of
     * PHP 8.4.
     *
     * @param Page|null $page passed through to getTextArray()
     *
     * @return string the text fragments joined together, followed by one space
     *
     * @throws \Exception whatever getTextArray() throws
     */
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;
        try {
            $result = $this->getTextArray($page);
        } finally {
            $this->addPositionWhitespace = false;
        }

        return implode('', $result).' ';
    }
494 6
495 6
    /**
496 6
     * getTextArray() returns the text objects of a document in an
497 6
     * array. By default no positioning whitespace is added to the
498
     * output unless the addPositionWhitespace flag is set to true.
499
     *
500 5
     * @throws \Exception
501 4
     */
502
    public function getTextArray(Page $page = null): array
503 5
    {
504 4
        $result = [];
505
        $text = [];
506
507 5
        $marked_stack = [];
508
        $last_written_position = false;
509
510
        $sections = $this->getSectionsText($this->content);
511 5
        $current_font = $this->getDefaultFont($page);
512 2
513
        $current_position = ['x' => false, 'y' => false];
514
        $current_position_tm = [
515 5
            'a' => 1, 'b' => 0, 'c' => 0,
516
            'i' => 0, 'j' => 1, 'k' => 0,
517
            'x' => false, 'y' => false, 'z' => 1,
518
        ];
519
        $current_position_td = ['x' => 0, 'y' => 0];
520 5
        $current_position_cm = [
521
            'a' => 1, 'b' => 0, 'c' => 0,
522 4
            'i' => 0, 'j' => 1, 'k' => 0,
523
            'x' => 0, 'y' => 0, 'z' => 1,
524 4
        ];
525
526
        $clipped_font = [];
527 4
        $clipped_position_cm = [];
528
529
        self::$recursionStack[] = $this->getUniqueId();
530
531
        foreach ($sections as $section) {
532
            $commands = $this->getCommandsText($section);
533
            foreach ($commands as $command) {
534
                switch ($command[self::OPERATOR]) {
535
                    case 'BT':
536
                        // Reset text positioning matrices
537 4
                        $current_position_tm = [
538 4
                            'a' => 1, 'b' => 0, 'c' => 0,
539 2
                            'i' => 0, 'j' => 1, 'k' => 0,
540
                            'x' => false, 'y' => false, 'z' => 1,
541 4
                        ];
542
                        $current_position_td = ['x' => 0, 'y' => 0];
543
                        break;
544 4
545
                    case 'ET':
546
                        break;
547 4
548
                        // set character spacing
549
                    case 'Tc':
550 4
                        break;
551 1
552
                        // move text current point and set leading
553 4
                    case 'TD':
554
                    case 'Td':
555
                        // move text current point
556 4
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
557 4
                        $y = (float) array_pop($args);
558
                        $x = (float) array_pop($args);
559
560 4
                        $current_position_td = [
561 4
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
562 2
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
563
                        ];
564 2
                        break;
565
566
                    case 'Tf':
567 2
                        list($id) = preg_split('/\s/s', $command[self::COMMAND]);
568 2
                        $id = trim($id, '/');
569
                        if (null !== $page) {
570
                            $new_font = $page->getFont($id);
571
                            // If an invalid font ID is given, do not update the font.
572
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
573
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
574
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
575
                            // But we want to make sure that malformed PDFs do not simply crash.
576 6
                            if (null !== $new_font) {
577
                                $current_font = $new_font;
578
                            }
579 29
                        }
580
                        break;
581 29
582
                        // Store current selected font and graphics matrix
583 29
                    case 'q':
584 29
                        $clipped_font[] = $current_font;
585 29
                        $clipped_position_cm[] = $current_position_cm;
586
                        break;
587 29
588 29
                        // Restore previous selected font and graphics matrix
589 29
                    case 'Q':
590
                        $current_font = array_pop($clipped_font);
591 29
                        $current_position_cm = array_pop($clipped_position_cm);
592 29
                        break;
593 29
594 29
                    case 'DP':
595 29
                    case 'MP':
596 29
                        break;
597
598
                        // Begin marked content sequence with property list
599
                    case 'BDC':
600 29
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
601 29
                            $dict = $this->parseDictionary($match[1]);
602 29
603 11
                            // Check for ActualText block
604 11
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
605 11
                                if ('[' == $dict['ActualText'][0]) {
606
                                    // Simulate a 'TJ' command on the stack
607
                                    $marked_stack[] = [
608
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
609 11
                                    ];
610 11
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
611 11
                                    // Simulate a 'Tj' command on the stack
612
                                    $marked_stack[] = [
613 29
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
614
                                    ];
615 29
                                }
616 29
                            }
617
                        }
618 25
                        break;
619 25
620 25
                        // Begin marked content sequence
621
                    case 'BMC':
622 25
                        if ('ReversedChars' == $command[self::COMMAND]) {
623
                            // Upon encountering a ReversedChars command,
624 25
                            // add the characters we've built up so far to
625 25
                            // the result array
626 25
                            $result = array_merge($result, $text);
627
628
                            // Start a fresh $text array that will contain
629 25
                            // reversed characters
630 25
                            $text = [];
631
632 25
                            // Add the reversed text flag to the stack
633
                            $marked_stack[] = ['ReversedChars' => true];
634 29
                        }
635 29
                        break;
636
637 14
                        // End marked content sequence
638 14
                    case 'EMC':
639 14
                        $data = false;
640 14
                        if (\count($marked_stack)) {
641 14
                            $marked = array_pop($marked_stack);
642 14
                            $action = key($marked);
643
                            $data = $marked[$action];
644
645 14
                            switch ($action) {
646 9
                                // If we are in ReversedChars mode...
647 9
                                case 'ReversedChars':
648
                                    // Reverse the characters we've built up so far
649 14
                                    foreach ($text as $key => $t) {
650
                                        $text[$key] = implode('', array_reverse(
651 29
                                            mb_str_split($t, 1, mb_internal_encoding())
652 29
                                        ));
653 22
                                    }
654 22
655 22
                                    // Add these characters to the result array
656 22
                                    $result = array_merge($result, $text);
657 22
658 22
                                    // Start a fresh $text array that will contain
659 22
                                    // non-reversed characters
660
                                    $text = [];
661
                                    break;
662 22
663 22
                                case 'ActualText':
664 22
                                    // Use the content of the ActualText as a command
665
                                    $command = $data;
666
                                    break;
667 16
                            }
668 16
                        }
669
670 22
                        // If this EMC command has been transformed into a 'Tj'
671
                        // or 'TJ' command because of being ActualText, then bypass
672
                        // the break to proceed to the writing section below.
673
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
674
                            break;
675 22
                        }
676
677 22
                        // no break
678 22
                    case "'":
679
                    case '"':
680 22
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
681
                            // Move to next line and write text
682 22
                            $current_position['x'] = 0;
683 22
                            $current_position_td['x'] = 0;
684
                            $current_position_td['y'] += 10;
685 22
                        }
686 18
                        // no break
687 18
                    case 'Tj':
688
                        $command[self::COMMAND] = [$command];
689
                        // no break
690 22
                    case 'TJ':
691
                        // Check the marked content stack for flags
692
                        $actual_text = false;
693 29
                        $reverse_text = false;
694 1
                        foreach ($marked_stack as $marked) {
695 29
                            if (isset($marked['ActualText'])) {
696 29
                                $actual_text = true;
697 29
                            }
698
                            if (isset($marked['ReversedChars'])) {
699
                                $reverse_text = true;
700
                            }
701 29
                        }
702 29
703 29
                        // Account for text position ONLY just before we write text
704 24
                        if (false === $actual_text && \is_array($last_written_position)) {
705 22
                            // If $last_written_position is an array, that
706 22
                            // means we have stored text position coordinates
707 22
                            // for placing an ActualText
708 17
                            $currentX = $last_written_position[0];
709 17
                            $currentY = $last_written_position[1];
710 17
                            $last_written_position = false;
711 17
                        } else {
712 17
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
713
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
714
                        }
715
                        $whiteSpace = '';
716 29
717 29
                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
718 29
                            if (abs($currentY - $current_position['y']) > 9) {
719 29
                                $whiteSpace = "\n";
720 29
                            } else {
721
                                $curX = $currentX - $current_position['x'];
722
                                $factorX = 10 * $current_position_tm['a'] + 10 * $current_position_tm['b'];
723 25
                                if (true === $reverse_text) {
724
                                    if ($curX < -abs($factorX * 8)) {
725
                                        $whiteSpace = "\t";
726
                                    } elseif ($curX < -abs($factorX)) {
727 29
                                        $whiteSpace = ' ';
728
                                    }
729
                                } else {
730 42
                                    if ($curX > ($factorX * 8)) {
731
                                        $whiteSpace = "\t";
732
                                    } elseif ($curX > $factorX) {
733
                                        $whiteSpace = ' ';
734
                                    }
735
                                }
736 42
                            }
737 42
                        }
738 8
739 8
                        $newtext = $this->getTJUsingFontFallback(
740 3
                            $current_font,
741
                            $command[self::COMMAND],
742 6
                            $current_position_tm,
743 6
                            $page
744
                        );
745
746
                        // If there is no ActualText pending then write
747
                        if (false === $actual_text) {
748 42
                            if (false !== $reverse_text) {
749 41
                                // If we are in ReversedChars mode, add the whitespace last
750
                                $text[] = str_replace(["\r", "\n"], '', $newtext).$whiteSpace;
751 42
                            } else {
752 41
                                // Otherwise add the whitespace first
753
                                $text[] = $whiteSpace.str_replace(["\r", "\n"], '', $newtext);
754 42
                            }
755 6
756
                            // Record the position of this inserted text for comparison
757 42
                            // with the next text block.
758 41
                            // Provide a 'fudge' factor guess on how wide this text block
759 41
                            // is based on the number of characters. This helps limit the
760
                            // number of tabs inserted, but isn't perfect.
761 41
                            $factor = 6;
762 41
                            if (true === $reverse_text) {
763
                                $factor = -$factor;
764
                            }
765
                            $current_position = [
766
                                'x' => $currentX + mb_strlen($newtext) * $factor,
767
                                'y' => $currentY,
768 42
                            ];
769
                        } elseif (false === $last_written_position) {
770
                            // If there is an ActualText in the pipeline
771
                            // store the position this undisplayed text
772
                            // *would* have been written to, so the
773
                            // ActualText is displayed in the right spot
774
                            $last_written_position = [$currentX, $currentY];
775 20
                        }
776
                        break;
777 20
778
                        // set leading
779
                    case 'TL':
780
                        break;
781
782
                        // set graphics position matrix
783
                    case 'cm':
784
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
785
                        $current_position_cm = [
786
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
787
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
788
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
789
                        ];
790
                        break;
791
792
                        // set text position matrix
793
                    case 'Tm':
794
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
795
                        $current_position_tm = [
796
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
797
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
798
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
799
                        ];
800
                        break;
801
802
                        // set super/subscripting text rise
803
                    case 'Ts':
804
                        break;
805
806
                        // set word spacing
807
                    case 'Tw':
808
                        break;
809
810
                        // set horizontal scaling
811
                    case 'Tz':
812
                        break;
813
814
                        // move to start of next line
815
                    case 'T*':
816
                        $current_position['x'] = 0;
817
                        $current_position_td['x'] = 0;
818
                        $current_position_td['y'] += 10;
819
                        break;
820
821
                    case 'Da':
822
                        break;
823
824
                    case 'Do':
825
                        if (null !== $page) {
826
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
827
                            $id = trim(array_pop($args), '/ ');
828
                            $xobject = $page->getXObject($id);
829
830
                            // @todo $xobject could be a ElementXRef object, which would then throw an error
831
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack)) {
832
                                // Not a circular reference.
833
                                $text[] = $xobject->getText($page);
834
                            }
835
                        }
836
                        break;
837
838
                    case 'rg':
839
                    case 'RG':
840
                        break;
841
842
                    case 're':
843
                        break;
844
845
                    case 'co':
846
                        break;
847
848
                    case 'cs':
849
                        break;
850
851
                    case 'gs':
852
                        break;
853
854
                    case 'en':
855
                        break;
856
857
                    case 'sc':
858
                    case 'SC':
859
                        break;
860
861
                    case 'g':
862
                    case 'G':
863
                        break;
864
865
                    case 'V':
866
                        break;
867
868
                    case 'vo':
869
                    case 'Vo':
870
                        break;
871
872
                    default:
873
                }
874
            }
875
        }
876
877
        $result = array_merge($result, $text);
878
879
        return $result;
880
    }
881
882
    /**
     * Splits one already formatted, single-line stream command into its
     * operand type, operator and operand.
     *
     * The companion function getSectionsText() returns a document
     * stream as an array of single commands for just this purpose.
     *
     * A better name for this function would be getCommandText()
     * since it now always works on just one command.
     *
     * @param string $text_part a single command, operator last
     *
     * @return array a list with at most one entry, keyed by self::TYPE
     *               (leading delimiter of the operand, or ''),
     *               self::OPERATOR and self::COMMAND; the list is empty
     *               when $text_part does not end in an operator
     */
    public function getCommandsText(string $text_part): array
    {
        $matches = [];

        // Group 1: the operand (everything in front of the operator),
        // group 2: the optional leading delimiter of the operand,
        // group 3: the operator itself.
        $found = preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $matches);

        // No operator detected (or the regex engine failed): report
        // nothing instead of reading undefined offsets of $matches.
        if (1 !== $found) {
            return [];
        }

        $type = $matches[2];
        $operator = $matches[3];
        $command = trim($matches[1]);

        if ('TJ' === $operator) {
            // A TJ operand is an array mixing strings and numbers. The
            // strings come in two flavours, tried in this order on every
            // pass: literal () strings, then hexadecimal <> strings.
            $patterns = [
                '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
                '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
            ];

            $subcommand = [];
            $command = trim($command, '[]');
            do {
                $oldCommand = $command;

                foreach ($patterns as $delimiter => $pattern) {
                    if (!preg_match($pattern, $command, $tjmatch)) {
                        continue;
                    }

                    // Whitespace inside a hexadecimal string carries no
                    // data, so it is removed before storing the string
                    $string = '<' === $delimiter ? preg_replace('/\s/', '', $tjmatch[1]) : $tjmatch[1];

                    $subcommand[] = [
                        self::TYPE => $delimiter,
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $string,
                    ];

                    // Optional number following the string
                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
                        $subcommand[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $tjmatch[2],
                        ];
                    }

                    // Drop the consumed part and carry on with the rest
                    $command = substr($command, \strlen($tjmatch[0]));
                }

                // Stop as soon as a full pass consumed nothing. The
                // comparison is strict so that two different numeric
                // looking remainders are never treated as equal.
            } while ($command !== $oldCommand);

            $command = $subcommand;
        } elseif (\in_array($operator, ['Tj', "'", '"'], true)) {
            // Depending on the string type, trim the data of the
            // appropriate delimiters
            if ('(' === $type) {
                // Don't use trim() here since a () string may end with
                // a balanced or escaped right parentheses, and trim()
                // will delete both. Both strings below are valid:
                //   eg. (String())
                //   eg. (String\))
                $command = preg_replace('/^\(|\)$/', '', $command);
            } elseif ('<' === $type) {
                $command = trim($command, '<>');
            }
        } elseif ('/' === $type) {
            // Strip the leading slash of a name operand
            $command = substr($command, 1);
        }

        return [
            [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $command,
            ],
        ];
    }
969
970
    /**
     * Creates the object class matching the 'Type' (and, where relevant,
     * 'Subtype') entry of the given header.
     *
     * XObject headers become Image or Form depending on their subtype,
     * 'Pages', 'Page' and 'Encoding' map to the classes of the same name,
     * and 'Font' headers use \Smalot\PdfParser\Font\Font<Subtype> when such
     * a class exists, Font otherwise. Every other header yields a plain
     * instance of this class.
     *
     * @param Document    $document passed through to the created object
     * @param Header      $header   header whose Type/Subtype select the class
     * @param string|null $content  raw content handed to the created object
     * @param Config|null $config   optional configuration; for images its
     *                              getRetainImageContent() decides whether
     *                              $content is kept or replaced by null
     *
     * @return self the created object
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is optional: without one nothing asks
                        // for the image content to be dropped, so it is
                        // kept rather than calling a method on null.
                        $retainContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                // Any other XObject subtype is handled as a generic object
                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                // Prefer the subtype specific font class when there is one
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
1011
1012
    /**
     * Gives back the identifier of this object instance, as computed by
     * spl_object_hash().
     */
    protected function getUniqueId(): string
    {
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
1019
}
1020