Test Failed
Pull Request — master (#634)
by
unknown
02:02
created

PDFObject::getText()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 1

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 1
eloc 4
c 1
b 1
f 0
nc 1
nop 1
dl 0
loc 7
ccs 6
cts 6
cp 1
crap 1
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\XObject\Form;
36
use Smalot\PdfParser\XObject\Image;
37
38
/**
39
 * Class PDFObject
40
 */
41
class PDFObject
42
{
43
    public const TYPE = 't';
44
45
    public const OPERATOR = 'o';
46
47
    public const COMMAND = 'c';
48
49
    /**
50
     * The recursion stack.
51
     *
52
     * @var array
53
     */
54
    public static $recursionStack = [];
55
56
    /**
57
     * @var Document
58
     */
59
    protected $document;
60
61
    /**
62
     * @var Header
63
     */
64
    protected $header;
65
66
    /**
67
     * @var string
68
     */
69
    protected $content;
70
71
    /**
72
     * @var Config
73
     */
74
    protected $config;
75
76 62
    /**
77
     * @var bool
78
     */
79
    protected $addPositionWhitespace = false;
80
81
    /**
     * Stores the collaborators and raw content this object works with.
     *
     * When no header is supplied, a Header constructed without arguments
     * is used instead, so $this->header is never null after construction.
     *
     * @param Document    $document the document stored on this object
     * @param Header|null $header   header to use, or null for a default Header
     * @param string|null $content  raw content; stored as given (may be null)
     * @param Config|null $config   configuration; stored as given (may be null)
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        $this->document = $document;
        $this->header = $header ?? new Header();
        $this->content = $content;
        $this->config = $config;
    }
92 3
93
    /**
     * Performs no work in this class; the body is intentionally empty.
     *
     * NOTE(review): presumably an extension point for subclasses - confirm.
     */
    public function init()
    {
        // Intentionally left blank.
    }
96
97 49
    /**
     * Returns the Document instance that was passed to the constructor.
     */
    public function getDocument(): Document
    {
        return $this->document;
    }
101
102 3
    /**
     * Returns the header currently stored on this object.
     */
    public function getHeader(): ?Header
    {
        return $this->header;
    }
106
107
    /**
     * Returns the configuration stored on this object, or null when
     * none was supplied to the constructor.
     */
    public function getConfig(): ?Config
    {
        return $this->config;
    }
111
112 50
    /**
     * Looks up an entry by name; the call is forwarded to the header.
     *
     * @param string $name name passed through to Header::get()
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        return $this->header->get($name);
    }
119
120 3
    /**
     * Reports whether the header has an entry for the given name; the
     * call is forwarded to Header::has().
     */
    public function has(string $name): bool
    {
        return $this->header->has($name);
    }
124
125 38
    /**
     * Returns the header details; the call is forwarded to
     * Header::getDetails() together with the $deep flag.
     *
     * @param bool $deep passed through unchanged (defaults to true)
     */
    public function getDetails(bool $deep = true): array
    {
        return $this->header->getDetails($deep);
    }
129
130 32
    /**
     * Returns the raw content stored on this object, exactly as it was
     * given to the constructor (null when none was supplied).
     */
    public function getContent(): ?string
    {
        return $this->content;
    }
134
135
    /**
     * Reformats a raw content stream so that every PDF command sits on
     * its own "\r\n"-terminated line.
     *
     * Steps, in order:
     *  1. literal strings "( ... )" are swapped for unique placeholders;
     *  2. CR/LF characters are replaced by spaces;
     *  3. "<< ... >>" dictionaries that precede BDC/BMC/DP/MP are swapped
     *     for placeholders;
     *  4. the remaining text is MIME-sniffed and rejected unless it is
     *     reported as text/plain;
     *  5. whitespace runs are collapsed and "\r\n" is appended after
     *     every recognised operator;
     *  6. dictionaries and strings are put back.
     *
     * @param string|null $content raw stream content
     *
     * @return string the cleaned stream, or '' when $content is null or
     *                the stream is not detected as plain text
     */
    public function cleanContent(?string $content): string
    {
        if (null === $content) {
            return '';
        }

        // Look-behind shared by the three parenthesis patterns below: the
        // parenthesis must not be preceded by an escaping backslash.
        $notEscaped = '(?<![^\\\\]\\\\)';

        // Step 1: hide every literal string behind a placeholder so the
        // following steps cannot alter its content.
        $hiddenStrings = [];
        $prefix = '(';
        while (preg_match('/'.preg_quote($prefix, '/').'.*?'.$notEscaped.'\)/s', $content, $found)) {
            $candidate = $found[0];

            // Unescaped parentheses are allowed inside a PDF string as
            // long as they are balanced, so count both kinds.
            $opening = preg_match_all('/'.$notEscaped.'\(/', $candidate);
            $closing = preg_match_all('/'.$notEscaped.'\)/', $candidate);

            if ($opening != $closing) {
                // Unbalanced: retry with the current match as the prefix
                // so the next match extends to a later ')'.
                $prefix = $candidate;
                continue;
            }

            // Balanced: replace the first occurrence with a placeholder.
            $placeholder = uniqid('STRING_', true);
            $hiddenStrings[$placeholder] = $candidate;
            $content = preg_replace(
                '/'.preg_quote($candidate, '/').'/',
                '@@@'.$placeholder.'@@@',
                $content,
                1
            );

            // Start over to look for the next string.
            $prefix = '(';
        }

        // Step 2: carriage returns and line feeds become plain spaces.
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Step 3: hide dictionaries that belong to marked-content
        // operators, keeping the operator itself in the stream.
        $hiddenDicts = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $dictMatch)) {
            $placeholder = uniqid('DICT_', true);
            $hiddenDicts[$placeholder] = $dictMatch[1];
            $content = preg_replace(
                '/'.preg_quote($dictMatch[0], '/').'/',
                ' ###'.$placeholder.'###'.$dictMatch[2],
                $content,
                1
            );
        }

        // Step 4: with strings and dictionaries hidden, only plain-text
        // commands should remain. Streams that are not sniffed as
        // text/plain (images, etc.) are dropped; this avoids PHP errors
        // for JPEG content such as sample file '12249.pdf' from
        // https://github.com/smalot/pdfparser/issues/458
        $mime = (new \finfo(\FILEINFO_MIME))->buffer($content);
        if (false === strpos($mime, 'text/plain')) {
            return '';
        }

        // Step 5a: collapse runs of whitespace.
        $content = preg_replace('/\s{2,}/', ' ', $content);

        // Step 5b: terminate every operator with \r\n.
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 lists 'I' and 'rI' while PDF 32000:2008
        //       lists 'i' and 'ri'; both spellings are included.
        $operators = [
            'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
            'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
            'g', 'G',  'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
            'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
            'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
            'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($operators as $op) {
            $pattern = '/(?<!\w|\/)'.preg_quote($op, '/').'(?![\w10\*])/';
            $content = preg_replace($pattern, $op."\r\n", $content);
        }

        // Step 6a: put the dictionaries back, most recently hidden first.
        foreach (array_reverse($hiddenDicts, true) as $placeholder => $original) {
            $content = str_replace('###'.$placeholder.'###', $original, $content);
        }

        // Step 6b: put the strings back, most recently hidden first.
        foreach (array_reverse($hiddenStrings, true) as $placeholder => $original) {
            // Escaped line breaks are removed and literal ones are turned
            // into the two-character sequences \r and \n, so a restored
            // string can never span two lines.
            $singleLine = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $original
            );

            $content = str_replace('@@@'.$placeholder.'@@@', $singleLine, $content);
        }

        // Squash repeated line breaks and strip spaces that start a line.
        return trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));
    }
252 20
253
    /**
     * Cleans an unformatted content stream and keeps only the commands
     * needed for text positioning and extraction.
     *
     * Everything from a line ending in 'BT' up to the 'ET' line is kept.
     * Outside such a block only these lines are kept: those ending in
     * 'q' or 'Q', and those ending in the operators BDC, BMC, DP, MP,
     * EMC, cm, Tf or Do.
     *
     * @param string|null $content raw stream content, passed to cleanContent()
     *
     * @return array unprocessed PDF commands, one command per element
     */
    public function getSectionsText(?string $content): array
    {
        // cleanContent() puts one command on every line, so splitting on
        // line breaks yields one command per entry.
        $lines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->cleanContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        $kept = [];
        $insideTextBlock = false;

        foreach ($lines as $rawLine) {
            $line = trim($rawLine);

            // Nothing to do for blank lines.
            if ('' === $line) {
                continue;
            }

            // 'BT' opens a text block.
            if (preg_match('/BT$/', $line)) {
                $insideTextBlock = true;
                $kept[] = $line;
                continue;
            }

            // 'ET' closes it.
            if ('ET' === $line) {
                $insideTextBlock = false;
                $kept[] = $line;
                continue;
            }

            // Every line between BT and ET is relevant.
            if ($insideTextBlock) {
                $kept[] = $line;
                continue;
            }

            // Outside a text block only specific commands are kept. Each
            // pattern is anchored so that it matches *only* its own
            // operator (a bare 'c' test would also hit 'sc', for
            // instance). Extend this list when odd PDFs need more.
            $lastChar = $line[-1];
            $isNeeded = 'q' === $lastChar || 'Q' === $lastChar    // save / restore graphics state
                || preg_match('/(?<!\w)B[DM]C$/', $line)          // begin marked content sequence
                || preg_match('/(?<!\w)[DM]P$/', $line)           // marked content point
                || preg_match('/(?<!\w)EMC$/', $line)             // end marked content sequence
                || preg_match('/(?<!\w)cm$/', $line)              // graphics position change
                || preg_match('/(?<!\w)Tf$/', $line)              // font change
                || preg_match('/(?<!\w)Do$/', $line);             // invoke named XObject

            if ($isNeeded) {
                $kept[] = $line;
            }
        }

        return $kept;
    }
329 5
330 5
    /**
     * Picks the font to start text extraction with.
     *
     * Candidates are the fonts of $page (when a page is given) followed
     * by the document's first font (when it has one); the first
     * candidate wins. If there is no candidate at all, a new Font built
     * from this object's document and config is returned.
     *
     * @param Page|null $page page whose fonts take precedence, if any
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $candidates = null !== $page ? $page->getFonts() : [];

        $documentFont = $this->document->getFirstFont();
        if (null !== $documentFont) {
            $candidates[] = $documentFont;
        }

        if (\count($candidates) > 0) {
            // reset() returns the first element regardless of its key.
            return reset($candidates);
        }

        return new Font($this->document, null, null, $this->config);
    }
348 1
349 1
    /**
     * Decode a '[]TJ' command and attempt to use alternate fonts if
     * the current font results in output that contains Unicode control
     * characters. See Font::decodeText for a full description of
     * $textMatrix.
     *
     * Without a page there are no alternate fonts, so the text decoded
     * with $font is returned as is. If every font of the page has been
     * tried without producing clean output, the text decoded with the
     * original $font is returned.
     *
     * @param Font                                 $font       font tried first
     * @param array<int,array<string,string|bool>> $command
     * @param array<string,float>                  $textMatrix
     * @param Page|null                            $page       page whose fonts serve as fallbacks
     */
    private function getTJUsingFontFallback(
        Font $font,
        array $command,
        array $textMatrix = ['a' => 1, 'b' => 0, 'i' => 0, 'j' => 1],
        ?Page $page = null
    ): string {
        $orig_text = $font->decodeText($command, $textMatrix);
        $text = $orig_text;

        // If we make this a Config option, we can add a check if it's
        // enabled here.
        if (null === $page) {
            return $text;
        }

        $font_ids = array_keys($page->getFonts());

        // If the decoded text contains control characters (or NUL
        // bytes) then the font being used is probably the wrong one.
        // Loop through the page's fonts to see if we can get a good
        // decode. x09 to x0d are whitespace and therefore allowed.
        while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos(bin2hex($text), '00')) {
            // Out of font IDs: give up and use the original string.
            if (0 === \count($font_ids)) {
                return $orig_text;
            }

            // Try the next font ID. Page::getFont() can return null (the
            // Tf handling in getTextArray() guards against that too), so
            // skip such an entry instead of calling a method on null.
            $candidate = $page->getFont(array_shift($font_ids));
            if (null === $candidate) {
                continue;
            }

            $text = $candidate->decodeText($command, $textMatrix);
        }

        return $text;
    }
391 11
392 4
    /**
     * Parses the textual form of a PDF dictionary ("<< ... >>") into a
     * nested PHP array.
     *
     * Names ("/Name") become array keys and the token that follows a name
     * becomes its value. "[ ... ]" and nested "<< ... >>" become
     * sub-arrays, except that an array holding exactly one non-name
     * string is collapsed back into the string "[...]".
     *
     * @param string $dictionary dictionary source; must start with "<<"
     *                           once surrounding whitespace is removed
     *
     * @return array the parsed structure
     *
     * @throws \Exception when $dictionary does not start with "<<"
     */
    public function parseDictionary(string $dictionary): array
    {
        // Line breaks and whitespace runs all become a single space.
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

        if (0 !== strpos($dictionary, '<<')) {
            throw new \Exception('Not a valid dictionary object.');
        }

        $parsed = [];
        $stack = [];
        $pendingName = '';
        $inNumericArray = false;

        // Drop the outermost << >> pair, then split on the structural
        // tokens while keeping those tokens in the result.
        $tokens = preg_split(
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
            trim(preg_replace('/^<<|>>$/', '', $dictionary)),
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        foreach ($tokens as $rawToken) {
            $token = trim($rawToken);
            switch ($token) {
                case '':
                    break;

                    // Open a numeric array ('[') or a hashed array ('<<')
                case '[':
                case '<<':
                    $parsed[$pendingName] = [];
                    $inNumericArray = '[' === $token;

                    // Remember the current level, then descend into the
                    // container that was just created. The stack is
                    // indexed by its size on purpose: entries are removed
                    // with unset(), which does not rewind auto-increment
                    // keys.
                    $stack[\count($stack)] = &$parsed;
                    $parsed = &$parsed[$pendingName];
                    $pendingName = '';
                    break;

                    // Close a numeric array
                case ']':
                    // An array made of one plain string goes back to
                    // being that string, brackets included.
                    if (\is_array($parsed) && 1 == \count($parsed)
                        && isset($parsed[0]) && \is_string($parsed[0])
                        && '' !== $parsed[0] && '/' != $parsed[0][0]) {
                        $parsed = '['.$parsed[0].']';
                    }
                    // Closing a hashed array shares the remaining steps
                    // no break
                case '>>':
                    $inNumericArray = false;

                    // Return to the enclosing level.
                    $parsed = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                    break;

                default:
                    if ('/' === $token[0]) {
                        // A name: inside a numeric array it is a value,
                        // otherwise it is the key awaiting its value.
                        $pendingName = substr($token, 1);
                        if ($inNumericArray) {
                            $parsed[] = $pendingName;
                            $pendingName = '';
                        }
                    } elseif ('' != $pendingName) {
                        // A value for the pending key.
                        if (!$inNumericArray) {
                            $parsed[$pendingName] = $token;
                        }
                        $pendingName = '';
                    } else {
                        // A value with no key: append it.
                        $parsed[] = $token;
                    }
            }
        }

        return $parsed;
    }
485 6
486 6
    /**
     * getText() leverages getTextArray() to get the content of the
     * document, setting the addPositionWhitespace flag to true so
     * whitespace is inserted in a logical way for reading by humans.
     *
     * The flag is switched back to false afterwards, even when
     * getTextArray() throws, so a failed call cannot leave this object
     * in whitespace-inserting mode for later getTextArray() calls.
     *
     * @param Page|null $page page passed on to getTextArray()
     *
     * @return string the text fragments joined together, followed by a
     *                single trailing space
     *
     * @throws \Exception when getTextArray() throws
     */
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;
        try {
            $fragments = $this->getTextArray($page);
        } finally {
            $this->addPositionWhitespace = false;
        }

        return implode('', $fragments).' ';
    }
499
500 5
    /**
501 4
     * getTextArray() returns the text objects of a document in an
502
     * array. By default no positioning whitespace is added to the
503 5
     * output unless the addPositionWhitespace flag is set to true.
504 4
     *
505
     * @throws \Exception
506
     */
507 5
    public function getTextArray(Page $page = null): array
508
    {
509
        $result = [];
510
        $text = [];
511 5
512 2
        $marked_stack = [];
513
        $last_written_position = false;
514
515 5
        $sections = $this->getSectionsText($this->content);
516
        $current_font = $this->getDefaultFont($page);
517
518
        $current_position = ['x' => false, 'y' => false];
519
        $current_position_tm = [
520 5
            'a' => 1, 'b' => 0, 'c' => 0,
521
            'i' => 0, 'j' => 1, 'k' => 0,
522 4
            'x' => false, 'y' => false, 'z' => 1,
523
        ];
524 4
        $current_position_td = ['x' => 0, 'y' => 0];
525
        $current_position_cm = [
526
            'a' => 1, 'b' => 0, 'c' => 0,
527 4
            'i' => 0, 'j' => 1, 'k' => 0,
528
            'x' => 0, 'y' => 0, 'z' => 1,
529
        ];
530
531
        $clipped_font = [];
532
        $clipped_position_cm = [];
533
534
        self::$recursionStack[] = $this->getUniqueId();
535
536
        foreach ($sections as $section) {
537 4
            $commands = $this->getCommandsText($section);
538 4
            foreach ($commands as $command) {
539 2
                switch ($command[self::OPERATOR]) {
540
                    case 'BT':
541 4
                        // Reset text positioning matrices
542
                        $current_position_tm = [
543
                            'a' => 1, 'b' => 0, 'c' => 0,
544 4
                            'i' => 0, 'j' => 1, 'k' => 0,
545
                            'x' => false, 'y' => false, 'z' => 1,
546
                        ];
547 4
                        $current_position_td = ['x' => 0, 'y' => 0];
548
                        break;
549
550 4
                    case 'ET':
551 1
                        break;
552
553 4
                        // set character spacing
554
                    case 'Tc':
555
                        break;
556 4
557 4
                        // move text current point and set leading
558
                    case 'TD':
559
                    case 'Td':
560 4
                        // move text current point
561 4
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
562 2
                        $y = (float) array_pop($args);
563
                        $x = (float) array_pop($args);
564 2
565
                        $current_position_td = [
566
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
567 2
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
568 2
                        ];
569
                        break;
570
571
                    case 'Tf':
572
                        list($id) = preg_split('/\s/s', $command[self::COMMAND]);
573
                        $id = trim($id, '/');
574
                        if (null !== $page) {
575
                            $new_font = $page->getFont($id);
576 6
                            // If an invalid font ID is given, do not update the font.
577
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
578
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
579 29
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
580
                            // But we want to make sure that malformed PDFs do not simply crash.
581 29
                            if (null !== $new_font) {
582
                                $current_font = $new_font;
583 29
                            }
584 29
                        }
585 29
                        break;
586
587 29
                        // Store current selected font and graphics matrix
588 29
                    case 'q':
589 29
                        $clipped_font[] = $current_font;
590
                        $clipped_position_cm[] = $current_position_cm;
591 29
                        break;
592 29
593 29
                        // Restore previous selected font and graphics matrix
594 29
                    case 'Q':
595 29
                        $current_font = array_pop($clipped_font);
596 29
                        $current_position_cm = array_pop($clipped_position_cm);
597
                        break;
598
599
                    case 'DP':
600 29
                    case 'MP':
601 29
                        break;
602 29
603 11
                        // Begin marked content sequence with property list
604 11
                    case 'BDC':
605 11
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
606
                            $dict = $this->parseDictionary($match[1]);
607
608
                            // Check for ActualText block
609 11
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
610 11
                                if ('[' == $dict['ActualText'][0]) {
611 11
                                    // Simulate a 'TJ' command on the stack
612
                                    $marked_stack[] = [
613 29
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
614
                                    ];
615 29
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
616 29
                                    // Simulate a 'Tj' command on the stack
617
                                    $marked_stack[] = [
618 25
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
619 25
                                    ];
620 25
                                }
621
                            }
622 25
                        }
623
                        break;
624 25
625 25
                        // Begin marked content sequence
626 25
                    case 'BMC':
627
                        if ('ReversedChars' == $command[self::COMMAND]) {
628
                            // Upon encountering a ReversedChars command,
629 25
                            // add the characters we've built up so far to
630 25
                            // the result array
631
                            $result = array_merge($result, $text);
632 25
633
                            // Start a fresh $text array that will contain
634 29
                            // reversed characters
635 29
                            $text = [];
636
637 14
                            // Add the reversed text flag to the stack
638 14
                            $marked_stack[] = ['ReversedChars' => true];
639 14
                        }
640 14
                        break;
641 14
642 14
                        // End marked content sequence
643
                    case 'EMC':
644
                        $data = false;
645 14
                        if (\count($marked_stack)) {
646 9
                            $marked = array_pop($marked_stack);
647 9
                            $action = key($marked);
648
                            $data = $marked[$action];
649 14
650
                            switch ($action) {
651 29
                                // If we are in ReversedChars mode...
652 29
                                case 'ReversedChars':
653 22
                                    // Reverse the characters we've built up so far
654 22
                                    foreach ($text as $key => $t) {
655 22
                                        $text[$key] = implode('', array_reverse(
656 22
                                            mb_str_split($t, 1, mb_internal_encoding())
657 22
                                        ));
658 22
                                    }
659 22
660
                                    // Add these characters to the result array
661
                                    $result = array_merge($result, $text);
662 22
663 22
                                    // Start a fresh $text array that will contain
664 22
                                    // non-reversed characters
665
                                    $text = [];
666
                                    break;
667 16
668 16
                                case 'ActualText':
669
                                    // Use the content of the ActualText as a command
670 22
                                    $command = $data;
671
                                    break;
672
                            }
673
                        }
674
675 22
                        // If this EMC command has been transformed into a 'Tj'
676
                        // or 'TJ' command because of being ActualText, then bypass
677 22
                        // the break to proceed to the writing section below.
678 22
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
679
                            break;
680 22
                        }
681
682 22
                        // no break
683 22
                    case "'":
684
                    case '"':
685 22
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
686 18
                            // Move to next line and write text
687 18
                            $current_position['x'] = 0;
688
                            $current_position_td['x'] = 0;
689
                            $current_position_td['y'] += 10;
690 22
                        }
691
                        // no break
692
                    case 'Tj':
693 29
                        $command[self::COMMAND] = [$command];
694 1
                        // no break
695 29
                    case 'TJ':
696 29
                        // Check the marked content stack for flags
697 29
                        $actual_text = false;
698
                        $reverse_text = false;
699
                        foreach ($marked_stack as $marked) {
700
                            if (isset($marked['ActualText'])) {
701 29
                                $actual_text = true;
702 29
                            }
703 29
                            if (isset($marked['ReversedChars'])) {
704 24
                                $reverse_text = true;
705 22
                            }
706 22
                        }
707 22
708 17
                        // Account for text position ONLY just before we write text
709 17
                        if (false === $actual_text && \is_array($last_written_position)) {
710 17
                            // If $last_written_position is an array, that
711 17
                            // means we have stored text position coordinates
712 17
                            // for placing an ActualText
713
                            $currentX = $last_written_position[0];
714
                            $currentY = $last_written_position[1];
715
                            $last_written_position = false;
716 29
                        } else {
717 29
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
718 29
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
719 29
                        }
720 29
                        $whiteSpace = '';
721
722
                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
723 25
                            if (abs($currentY - $current_position['y']) > 9) {
724
                                $whiteSpace = "\n";
725
                            } else {
726
                                $curX = $currentX - $current_position['x'];
727 29
                                $factorX = 10 * $current_position_tm['a'] + 10 * $current_position_tm['i'];
728
                                if (true === $reverse_text) {
729
                                    if ($curX < -abs($factorX * 8)) {
730 42
                                        $whiteSpace = "\t";
731
                                    } elseif ($curX < -abs($factorX)) {
732
                                        $whiteSpace = ' ';
733
                                    }
734
                                } else {
735
                                    if ($curX > ($factorX * 8)) {
736 42
                                        $whiteSpace = "\t";
737 42
                                    } elseif ($curX > $factorX) {
738 8
                                        $whiteSpace = ' ';
739 8
                                    }
740 3
                                }
741
                            }
742 6
                        }
743 6
744
                        $newtext = $this->getTJUsingFontFallback(
745
                            $current_font,
746
                            $command[self::COMMAND],
747
                            $current_position_tm,
748 42
                            $page
749 41
                        );
750
751 42
                        // If there is no ActualText pending then write
752 41
                        if (false === $actual_text) {
753
                            if (false !== $reverse_text) {
754 42
                                // If we are in ReversedChars mode, add the whitespace last
755 6
                                $text[] = str_replace(["\r", "\n"], '', $newtext).$whiteSpace;
756
                            } else {
757 42
                                // Otherwise add the whitespace first
758 41
                                $text[] = $whiteSpace.str_replace(["\r", "\n"], '', $newtext);
759 41
                            }
760
761 41
                            // Record the position of this inserted text for comparison
762 41
                            // with the next text block.
763
                            // Provide a 'fudge' factor guess on how wide this text block
764
                            // is based on the number of characters. This helps limit the
765
                            // number of tabs inserted, but isn't perfect.
766
                            $factor = 6;
767
                            if (true === $reverse_text) {
768 42
                                $factor = -$factor;
769
                            }
770
                            $current_position = [
771
                                'x' => $currentX + mb_strlen($newtext) * $factor,
772
                                'y' => $currentY,
773
                            ];
774
                        } elseif (false === $last_written_position) {
775 20
                            // If there is an ActualText in the pipeline
776
                            // store the position this undisplayed text
777 20
                            // *would* have been written to, so the
778
                            // ActualText is displayed in the right spot
779
                            $last_written_position = [$currentX, $currentY];
780
                        }
781
                        break;
782
783
                        // set leading
784
                    case 'TL':
785
                        break;
786
787
                        // set graphics position matrix
788
                    case 'cm':
789
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
790
                        $current_position_cm = [
791
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
792
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
793
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
794
                        ];
795
                        break;
796
797
                        // set text position matrix
798
                    case 'Tm':
799
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
800
                        $current_position_tm = [
801
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
802
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
803
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
804
                        ];
805
                        break;
806
807
                        // set super/subscripting text rise
808
                    case 'Ts':
809
                        break;
810
811
                        // set word spacing
812
                    case 'Tw':
813
                        break;
814
815
                        // set horizontal scaling
816
                    case 'Tz':
817
                        break;
818
819
                        // move to start of next line
820
                    case 'T*':
821
                        $current_position['x'] = 0;
822
                        $current_position_td['x'] = 0;
823
                        $current_position_td['y'] += 10;
824
                        break;
825
826
                    case 'Da':
827
                        break;
828
829
                    case 'Do':
830
                        if (null !== $page) {
831
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
832
                            $id = trim(array_pop($args), '/ ');
833
                            $xobject = $page->getXObject($id);
834
835
                            // @todo $xobject could be a ElementXRef object, which would then throw an error
836
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack)) {
837
                                // Not a circular reference.
838
                                $text[] = $xobject->getText($page);
839
                            }
840
                        }
841
                        break;
842
843
                    case 'rg':
844
                    case 'RG':
845
                        break;
846
847
                    case 're':
848
                        break;
849
850
                    case 'co':
851
                        break;
852
853
                    case 'cs':
854
                        break;
855
856
                    case 'gs':
857
                        break;
858
859
                    case 'en':
860
                        break;
861
862
                    case 'sc':
863
                    case 'SC':
864
                        break;
865
866
                    case 'g':
867
                    case 'G':
868
                        break;
869
870
                    case 'V':
871
                        break;
872
873
                    case 'vo':
874
                    case 'Vo':
875
                        break;
876
877
                    default:
878
                }
879
            }
880
        }
881
882
        $result = array_merge($result, $text);
883
884
        return $result;
885
    }
886
887
    /**
     * Splits one content-stream command into its type, operator and operand.
     *
     * getCommandsText() expects the content of $text_part to be an
     * already formatted, single-line command from a document stream.
     * The companion function getSectionsText() returns a document
     * stream as an array of single commands for just this purpose.
     *
     * A better name for this function would be getCommandText()
     * since it now always works on just one command.
     *
     * @param string $text_part a single command: operand(s) first, operator last
     *
     * @return array a list with at most one entry, keyed by self::TYPE (the
     *               opening delimiter of the operand: '/', '[', '(' or '<',
     *               or '' when there is none), self::OPERATOR (the trailing
     *               operator) and self::COMMAND (the operand). For the TJ
     *               operator self::COMMAND is itself a list of entries of
     *               the same shape, one per string ('(' or '<') or numeric
     *               offset ('n') found in the array. The list is empty when
     *               $text_part does not end with a recognisable operator.
     */
    public function getCommandsText(string $text_part): array
    {
        $matches = [];

        // Group 1: everything before the operator (operand and delimiters),
        // group 2: the optional opening delimiter of the operand,
        // group 3: the trailing operator, which must not directly follow a
        // word character.
        $found = preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $matches);

        // Without a match $matches stays empty; reading its keys would raise
        // "undefined array key" warnings and yield a meaningless entry.
        if (1 !== $found) {
            return [];
        }

        $type = $matches[2];
        $operator = $matches[3];
        $command = trim($matches[1]);

        if ('TJ' === $operator) {
            // Patterns are tried in this order on every pass: first a
            // parentheses string (), then a hexadecimal string <>. Each may
            // be followed by a numeric offset.
            $patterns = [
                '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
                '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
            ];

            $subcommand = [];
            $command = trim($command, '[]');
            do {
                $oldCommand = $command;

                foreach ($patterns as $delimiter => $pattern) {
                    if (!preg_match($pattern, $command, $tjmatch)) {
                        continue;
                    }

                    // Whitespace inside a hexadecimal string is not significant
                    $string = '<' === $delimiter ? preg_replace('/\s/', '', $tjmatch[1]) : $tjmatch[1];

                    $subcommand[] = [
                        self::TYPE => $delimiter,
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $string,
                    ];

                    // A missing or zero offset is not recorded
                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
                        $subcommand[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $tjmatch[2],
                        ];
                    }

                    // Consume what was just parsed
                    $command = substr($command, \strlen($tjmatch[0]));
                }

                // Stop as soon as a full pass consumed nothing
            } while ($command !== $oldCommand);

            $command = $subcommand;
        } elseif (\in_array($operator, ['Tj', "'", '"'], true)) {
            // Depending on the string type, trim the data of the
            // appropriate delimiters
            if ('(' === $type) {
                // Don't use trim() here since a () string may end with
                // a balanced or escaped right parentheses, and trim()
                // will delete both. Both strings below are valid:
                //   eg. (String())
                //   eg. (String\))
                $command = preg_replace('/^\(|\)$/', '', $command);
            } elseif ('<' === $type) {
                $command = trim($command, '<>');
            }
        } elseif ('/' === $type) {
            // Drop the leading slash of a name operand
            $command = substr($command, 1);
        }

        return [
            [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $command,
            ],
        ];
    }
974
975
    /**
     * Creates the object class that matches the "Type" entry (and, for
     * XObject and Font, the "Subtype" entry) of the given header.
     *
     * - XObject: Image or Form depending on the subtype, otherwise a plain
     *   PDFObject
     * - Pages, Page, Encoding: the class of the same name
     * - Font: \Smalot\PdfParser\Font\Font<Subtype> if such a class exists,
     *   otherwise Font
     * - anything else: a plain PDFObject
     *
     * @param Document    $document passed through to the created object
     * @param Header      $header   header whose Type/Subtype select the class
     * @param string|null $content  passed through to the created object; for
     *                              an Image it is replaced by null when the
     *                              config asks not to retain image content
     * @param Config|null $config   passed through to the created object
     *
     * @return self the created object
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is optional, so it must not be dereferenced
                        // blindly. Without a config nothing asked for image
                        // data to be dropped, hence it is kept.
                        // NOTE(review): confirm this matches the default of
                        // Config::getRetainImageContent().
                        $retainContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                // Any other XObject subtype
                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                // Use the subtype specific font class when there is one
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
1016
1017
    /**
     * Identifies this object instance by its object hash.
     *
     * @return string the spl_object_hash() of this instance
     */
    protected function getUniqueId(): string
    {
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
1024
}
1025