Test Failed
Pull Request — master (#634)
by
unknown
07:22
created

PDFObject::cleanContent()   B

Complexity

Conditions 9
Paths 55

Size

Total Lines 116
Code Lines 54

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 56
CRAP Score 9.0033

Importance

Changes 0
Metric Value
cc 9
eloc 54
c 0
b 0
f 0
nc 55
nop 1
dl 0
loc 116
ccs 56
cts 58
cp 0.9655
crap 9.0033
rs 7.448

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

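Commonly applied refactorings include Extract Method: moving a cohesive, commented part of the body into its own well-named method, as described above.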

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\XObject\Form;
36
use Smalot\PdfParser\XObject\Image;
37
38
/**
39
 * Class PDFObject
40
 */
41
class PDFObject
42
{
43
    public const TYPE = 't';
44
45
    public const OPERATOR = 'o';
46
47
    public const COMMAND = 'c';
48
49
    /**
50
     * The recursion stack.
51
     *
52
     * @var array
53
     */
54
    public static $recursionStack = [];
55
56
    /**
57
     * @var Document
58
     */
59
    protected $document;
60
61
    /**
62
     * @var Header
63
     */
64
    protected $header;
65
66
    /**
67
     * @var string
68
     */
69
    protected $content;
70
71
    /**
72
     * @var Config
73
     */
74
    protected $config;
75
76 62
    /**
77
     * @var bool
78
     */
79
    protected $addPositionWhitespace = false;
80
81
    /**
     * Stores the collaborators of this object.
     *
     * The nullable parameters are declared with an explicit "?" because
     * relying on a "= null" default to make a typed parameter nullable is
     * deprecated as of PHP 8.4. The accepted arguments are unchanged.
     *
     * @param Document    $document returned later by getDocument()
     * @param Header|null $header   when null, a new Header() is created instead
     * @param string|null $content  stored as-is; returned later by getContent()
     * @param Config|null $config   stored as-is; returned later by getConfig()
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        $this->document = $document;
        $this->header = $header ?? new Header();
        $this->content = $content;
        $this->config = $config;
    }
92 3
93
    /**
     * Does nothing in this class; the body is intentionally empty.
     *
     * NOTE(review): presumably an initialisation hook for subclasses —
     * confirm against the classes that extend PDFObject.
     */
    public function init()
    {
    }
96
97 49
    /**
     * Returns the Document that was passed to the constructor.
     */
    public function getDocument(): Document
    {
        return $this->document;
    }
101
102 3
    /**
     * Returns the header held by this object.
     *
     * The constructor always assigns a Header (creating one when none is
     * given), although the declared return type also permits null.
     */
    public function getHeader(): ?Header
    {
        return $this->header;
    }
106
107
    /**
     * Returns the Config passed to the constructor, or null when none was given.
     */
    public function getConfig(): ?Config
    {
        return $this->config;
    }
111
112 50
    /**
     * Looks up an entry by name in this object's header.
     *
     * This is a thin wrapper: the call is forwarded to Header::get().
     *
     * @param string $name name of the header entry
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        return $this->header->get($name);
    }
119
120 3
    /**
     * Reports whether this object's header has an entry with the given name.
     *
     * The call is forwarded to Header::has().
     *
     * @param string $name name of the header entry
     */
    public function has(string $name): bool
    {
        return $this->header->has($name);
    }
124
125 38
    /**
     * Returns the details array produced by this object's header.
     *
     * @param bool $deep forwarded unchanged to Header::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        return $this->header->getDetails($deep);
    }
129
130 32
    /**
     * Returns the content string passed to the constructor, or null when
     * none was given.
     */
    public function getContent(): ?string
    {
        return $this->content;
    }
134
135
    /**
     * Normalises a raw content stream so that each PDF command ends its own
     * line, with lines separated by "\r\n".
     *
     * Literal strings "( )" and the "<< >>" dictionaries that precede a
     * BDC/BMC/DP/MP operator are first swapped for unique placeholders so the
     * whitespace and operator passes cannot alter them; both are put back
     * before returning. Content that fileinfo does not report as text/plain
     * (for example image data) produces an empty string.
     *
     * @param string|null $content raw content stream
     *
     * @return string the cleaned stream, or '' for null, empty or non-text input
     */
    public function cleanContent(?string $content): string
    {
        if (null == $content) {
            return '';
        }

        // Pass 1: hide every literal string behind an @@@STRING_...@@@ marker.
        $hiddenStrings = [];
        $prefix = '(';
        while (preg_match('/'.preg_quote($prefix, '/').'.*?(?<![^\\\\]\\\\)\)/s', $content, $candidate)) {
            // Unescaped parentheses are allowed inside a PDF string as long as
            // they are balanced, so count both kinds in the candidate.
            $opening = preg_match_all('/(?<![^\\\\]\\\\)\(/', $candidate[0]);
            $closing = preg_match_all('/(?<![^\\\\]\\\\)\)/', $candidate[0]);

            if ($opening != $closing) {
                // Unbalanced: retry with the current match as the prefix so the
                // next match extends to a later closing parenthesis.
                $prefix = $candidate[0];
                continue;
            }

            $stringId = uniqid('STRING_', true);
            $hiddenStrings[$stringId] = $candidate[0];
            $content = preg_replace(
                '/'.preg_quote($candidate[0], '/').'/',
                '@@@'.$stringId.'@@@',
                $content,
                1
            );

            // Start over for the next string
            $prefix = '(';
        }

        // Pass 2: turn carriage returns and line feeds into spaces.
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Pass 3: hide the dictionaries of marked-content operators behind
        // ###DICT_...### markers.
        $hiddenDicts = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $dictMatch)) {
            $dictId = uniqid('DICT_', true);
            $hiddenDicts[$dictId] = $dictMatch[1];
            $content = preg_replace(
                '/'.preg_quote($dictMatch[0], '/').'/',
                ' ###'.$dictId.'###'.$dictMatch[2],
                $content,
                1
            );
        }

        // With strings and dictionaries hidden, what remains should be plain
        // text. Refuse anything else; this avoids PHP errors when e.g. JPEG
        // data reaches this method (sample file '12249.pdf', see
        // https://github.com/smalot/pdfparser/issues/458).
        $mimeProbe = new \finfo(\FILEINFO_MIME);
        if (false === strpos($mimeProbe->buffer($content), 'text/plain')) {
            return '';
        }

        // Pass 4: collapse runs of white-space into a single space.
        $content = preg_replace('/\s{2,}/', ' ', $content);

        // Pass 5: append \r\n to every valid PDF operator so that each command
        // ends a line.
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 names two operators 'I' and 'rI' while
        //       PDF 32000:2008 names them 'i' and 'ri'; both spellings are listed.
        $knownOperators = [
            'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
            'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
            'g', 'G', 'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
            'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
            'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
            'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($knownOperators as $op) {
            $pattern = '/(?<!\w|\/)'.preg_quote($op, '/').'(?![\w10\*])/';
            $content = preg_replace($pattern, $op."\r\n", $content);
        }

        // Pass 6: put the dictionaries back, most recently hidden first.
        foreach (array_reverse($hiddenDicts, true) as $dictId => $dictBody) {
            $content = str_replace('###'.$dictId.'###', $dictBody, $content);
        }

        // Pass 7: put the strings back, most recently hidden first. Escaped
        // line breaks are removed and literal ones are written as the two
        // characters \r or \n, so that no string spans two lines.
        foreach (array_reverse($hiddenStrings, true) as $stringId => $stringBody) {
            $singleLine = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $stringBody
            );

            $content = str_replace('@@@'.$stringId.'@@@', $singleLine, $content);
        }

        // Drop empty lines and leading spaces on each line.
        return trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));
    }
252 20
253
    /**
     * Cleans an unformatted content stream and returns only the commands
     * needed for text extraction and positioning, one command per element.
     *
     * Everything from a line ending in 'BT' up to and including the 'ET'
     * line is kept. Outside such a block only lines ending in one of the
     * following are kept: q, Q, BDC, BMC, DP, MP, EMC, cm, Tf, Do.
     *
     * @param string|null $content raw content stream, handed to cleanContent()
     *
     * @return string[] trimmed command lines in their original order
     */
    public function getSectionsText(?string $content): array
    {
        // cleanContent() leaves one command per line, so split on line breaks
        $commandLines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->cleanContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        // Commands that are kept even outside of a BT ... ET block. Each
        // pattern rejects a preceding word character so that a short operator
        // cannot match the tail of a longer one. Extend this list when
        // unusual PDFs need further commands.
        $keptOutsideText = [
            '/(?<!\w)B[DM]C$/', // begin marked content sequence
            '/(?<!\w)[DM]P$/',  // marked content point
            '/(?<!\w)EMC$/',    // end marked content sequence
            '/(?<!\w)cm$/',     // graphics position change
            '/(?<!\w)Tf$/',     // font change
            '/(?<!\w)Do$/',     // invoke named XObject
        ];

        $kept = [];
        $insideTextBlock = false;
        foreach ($commandLines as $rawLine) {
            $command = trim($rawLine);
            if ('' === $command) {
                continue;
            }

            // 'BT' opens a text block
            if (preg_match('/BT$/', $command)) {
                $insideTextBlock = true;
                $kept[] = $command;
                continue;
            }

            // 'ET' closes it
            if ('ET' == $command) {
                $insideTextBlock = false;
                $kept[] = $command;
                continue;
            }

            // Inside a text block every line is kept
            if ($insideTextBlock) {
                $kept[] = $command;
                continue;
            }

            // Save / restore graphics state
            if ('q' == $command[-1] || 'Q' == $command[-1]) {
                $kept[] = $command;
                continue;
            }

            foreach ($keptOutsideText as $pattern) {
                if (preg_match($pattern, $command)) {
                    $kept[] = $command;
                    break;
                }
            }
        }

        return $kept;
    }
329 5
330 5
    /**
     * Picks the font to start text decoding with.
     *
     * Candidates are the fonts of the given page (if any) followed by the
     * document's first font (if any); the first candidate wins. When there is
     * no candidate at all, a new Font built from this object's document and
     * config is returned.
     *
     * The parameter is declared "?Page" because implicit nullability through
     * a "= null" default is deprecated as of PHP 8.4.
     *
     * @param Page|null $page page whose fonts take precedence, if given
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $fonts = [];
        if (null !== $page) {
            $fonts = $page->getFonts();
        }

        $firstFont = $this->document->getFirstFont();
        if (null !== $firstFont) {
            $fonts[] = $firstFont;
        }

        if (\count($fonts) > 0) {
            return reset($fonts);
        }

        return new Font($this->document, null, null, $this->config);
    }
348 1
349 1
    /**
     * Decodes a text command with the given font and, when the result looks
     * wrongly decoded, retries with the other fonts of the page.
     *
     * A result is treated as suspicious when it contains a control character
     * (x09 to x0d are allowed as whitespace) or a NUL byte. The page's fonts
     * are then tried one after another until a clean result is found. If
     * none is found, or no page is given, the text decoded with the original
     * font is returned.
     *
     * @param array<int,array<string,string|bool>> $command
     * @param array<string,float> $textMatrix
     * @param Page|null $page page whose fonts serve as fallbacks; declared
     *                        "?Page" because implicit nullability via
     *                        "= null" is deprecated as of PHP 8.4
     */
    private function getTJUsingFontFallback(
        Font $font,
        array $command,
        array $textMatrix = ['a' => 1, 'b' => 0, 'i' => 0, 'j' => 1],
        ?Page $page = null
    ): string
    {
        $orig_text = $font->decodeText($command, $textMatrix);
        $text = $orig_text;

        // If we make this a Config option, we can add a check if it's
        // enabled here.
        if (null !== $page) {
            $font_ids = array_keys($page->getFonts());

            // If the decoded text contains control characters then the font
            // being used is probably the wrong one. Loop through the rest of
            // the fonts to see if we can get a good decode.
            //
            // The NUL test works on the raw bytes. Searching the hex dump for
            // "00" also matched across byte boundaries (e.g. "P\n" is 50 0a),
            // which flagged perfectly clean text. The byte test still catches
            // NULs when the regex cannot run because $text is not valid UTF-8.
            while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
                // If we're out of font IDs, then give up and use the
                // original string
                if (0 == \count($font_ids)) {
                    return $orig_text;
                }

                // Try the next font ID
                $font = $page->getFont(array_shift($font_ids));
                $text = $font->decodeText($command, $textMatrix);
            }
        }

        return $text;
    }
387
388 11
    /**
     * Parses the textual form of a PDF dictionary ("<< ... >>") into a
     * nested PHP array.
     *
     * A name followed by a plain value becomes a key/value pair, a nested
     * "<< >>" becomes a nested array, and "[ ]" becomes a list. A list that
     * holds a single non-name string is collapsed back into the string
     * "[...]". Plain values are kept as strings.
     *
     * Closing tokens (">>" or "]") without a matching opening token are
     * ignored. Previously such a token popped an empty stack, which left
     * the result null and ended in a TypeError instead of a usable array.
     *
     * @param string $dictionary dictionary source, must start with "<<"
     *
     * @return array the parsed entries
     *
     * @throws \Exception when $dictionary does not start with "<<"
     */
    public function parseDictionary(string $dictionary): array
    {
        // Normalize whitespace
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

        if ('<<' != substr($dictionary, 0, 2)) {
            throw new \Exception('Not a valid dictionary object.');
        }

        $parsed = [];
        $stack = [];
        $currentName = '';
        $arrayTypeNumeric = false;

        // Remove outer layer of dictionary, and split on tokens
        $split = preg_split(
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
            trim(preg_replace('/^<<|>>$/', '', $dictionary)),
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        foreach ($split as $token) {
            $token = trim($token);
            switch ($token) {
                case '':
                    break;

                    // Open numeric array
                case '[':
                    $parsed[$currentName] = [];
                    $arrayTypeNumeric = true;

                    // Move up one level in the stack; $parsed now refers to
                    // the newly created child array
                    $stack[\count($stack)] = &$parsed;
                    $parsed = &$parsed[$currentName];
                    $currentName = '';
                    break;

                    // Open hashed array
                case '<<':
                    $parsed[$currentName] = [];
                    $arrayTypeNumeric = false;

                    // Move up one level in the stack
                    $stack[\count($stack)] = &$parsed;
                    $parsed = &$parsed[$currentName];
                    $currentName = '';
                    break;

                    // Close numeric or hashed array
                case ']':
                case '>>':
                    // Nothing is open: this closer has no partner. Skip it so
                    // the top-level array is neither collapsed into a string
                    // nor replaced by a non-existent stack entry.
                    if (0 === \count($stack)) {
                        break;
                    }

                    // Revert string type arrays back to a single element
                    if (']' === $token &&
                        is_array($parsed) && 1 == \count($parsed) &&
                        isset($parsed[0]) && is_string($parsed[0]) &&
                        0 < strlen($parsed[0]) && '/' != $parsed[0][0]) {
                        $parsed = '['.$parsed[0].']';
                    }

                    $arrayTypeNumeric = false;

                    // Move down one level in the stack
                    $parsed = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                    break;

                default:
                    // If value begins with a slash, then this is a name
                    // Add it to the appropriate array
                    if ('/' == substr($token, 0, 1)) {
                        $currentName = substr($token, 1);
                        if (true == $arrayTypeNumeric) {
                            $parsed[] = $currentName;
                            $currentName = '';
                        }
                    } elseif ('' != $currentName) {
                        // A value for the pending name; inside a numeric
                        // array it is dropped
                        if (false == $arrayTypeNumeric) {
                            $parsed[$currentName] = $token;
                        }
                        $currentName = '';
                    } else {
                        // A value without a pending name is appended
                        $parsed[] = $token;
                    }
            }
        }

        return $parsed;
    }
482 6
483 6
    /**
     * Returns the text of this object as one string, with positioning
     * whitespace inserted so the result reads naturally.
     *
     * The addPositionWhitespace flag is switched on for the duration of the
     * getTextArray() call. It is switched off again in a "finally" block, so
     * an exception thrown while extracting cannot leave the flag enabled for
     * later getTextArray() calls on the same object.
     *
     * @param Page|null $page forwarded to getTextArray(); declared "?Page"
     *                        because implicit nullability via "= null" is
     *                        deprecated as of PHP 8.4
     *
     * @return string the text pieces joined together, followed by one space
     *
     * @throws \Exception propagated from getTextArray()
     */
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;
        try {
            $result = $this->getTextArray($page);
        } finally {
            $this->addPositionWhitespace = false;
        }

        return implode('', $result).' ';
    }
496 6
497 6
    /**
498
     * getTextArray() returns the text objects of a document in an
499
     * array. By default no positioning whitespace is added to the
500 5
     * output unless the addPositionWhitespace flag is set to true.
501 4
     *
502
     * @throws \Exception
503 5
     */
504 4
    public function getTextArray(Page $page = null): array
505
    {
506
        $result = [];
507 5
        $text = [];
508
509
        $marked_stack = [];
510
        $last_written_position = false;
511 5
512 2
        $sections = $this->getSectionsText($this->content);
513
        $current_font = $this->getDefaultFont($page);
514
515 5
        $current_position = ['x' => false, 'y' => false];
516
        $current_position_tm = [
517
            'a' => 1, 'b' => 0, 'c' => 0,
518
            'i' => 0, 'j' => 1, 'k' => 0,
519
            'x' => false, 'y' => false, 'z' => 1,
520 5
        ];
521
        $current_position_td = ['x' => 0, 'y' => 0];
522 4
        $current_position_cm = [
523
            'a' => 1, 'b' => 0, 'c' => 0,
524 4
            'i' => 0, 'j' => 1, 'k' => 0,
525
            'x' => 0, 'y' => 0, 'z' => 1,
526
        ];
527 4
528
        $clipped_font = [];
529
        $clipped_position_cm = [];
530
531
        self::$recursionStack[] = $this->getUniqueId();
532
533
        foreach ($sections as $section) {
534
            $commands = $this->getCommandsText($section);
535
            foreach ($commands as $command) {
536
                switch ($command[self::OPERATOR]) {
537 4
                    case 'BT':
538 4
                        // Reset text positioning matrices
539 2
                        $current_position_tm = [
540
                            'a' => 1, 'b' => 0, 'c' => 0,
541 4
                            'i' => 0, 'j' => 1, 'k' => 0,
542
                            'x' => false, 'y' => false, 'z' => 1,
543
                        ];
544 4
                        $current_position_td = ['x' => 0, 'y' => 0];
545
                        break;
546
547 4
                    case 'ET':
548
                        break;
549
550 4
                        // set character spacing
551 1
                    case 'Tc':
552
                        break;
553 4
554
                        // move text current point and set leading
555
                    case 'TD':
556 4
                    case 'Td':
557 4
                        // move text current point
558
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
559
                        $y = (float) array_pop($args);
560 4
                        $x = (float) array_pop($args);
561 4
562 2
                        $current_position_td = [
563
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
564 2
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
565
                        ];
566
                        break;
567 2
568 2
                    case 'Tf':
569
                        list($id) = preg_split('/\s/s', $command[self::COMMAND]);
570
                        $id = trim($id, '/');
571
                        if (null !== $page) {
572
                            $new_font = $page->getFont($id);
573
                            // If an invalid font ID is given, do not update the font.
574
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
575
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
576 6
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
577
                            // But we want to make sure that malformed PDFs do not simply crash.
578
                            if (null !== $new_font) {
579 29
                                $current_font = $new_font;
580
                            }
581 29
                        }
582
                        break;
583 29
584 29
                        // Store current selected font and graphics matrix
585 29
                    case 'q':
586
                        $clipped_font[] = $current_font;
587 29
                        $clipped_position_cm[] = $current_position_cm;
588 29
                        break;
589 29
590
                        // Restore previous selected font and graphics matrix
591 29
                    case 'Q':
592 29
                        $current_font = array_pop($clipped_font);
593 29
                        $current_position_cm = array_pop($clipped_position_cm);
594 29
                        break;
595 29
596 29
                    case 'DP':
597
                    case 'MP':
598
                        break;
599
600 29
                        // Begin marked content sequence with property list
601 29
                    case 'BDC':
602 29
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
603 11
                            $dict = $this->parseDictionary($match[1]);
604 11
605 11
                            // Check for ActualText block
606
                            if (isset($dict['ActualText']) && is_string($dict['ActualText']) && 0 < strlen($dict['ActualText'])) {
607
                                if ('[' == $dict['ActualText'][0]) {
608
                                    // Simulate a 'TJ' command on the stack
609 11
                                    $marked_stack[] = [
610 11
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0]
611 11
                                    ];
612
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
613 29
                                    // Simulate a 'Tj' command on the stack
614
                                    $marked_stack[] = [
615 29
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0]
616 29
                                    ];
617
                                }
618 25
                            }
619 25
                        }
620 25
                        break;
621
622 25
                        // Begin marked content sequence
623
                    case 'BMC':
624 25
                        if ('ReversedChars' == $command[self::COMMAND]) {
625 25
                            // Upon encountering a ReversedChars command,
626 25
                            // add the characters we've built up so far to
627
                            // the result array
628
                            $result = array_merge($result, $text);
629 25
630 25
                            // Start a fresh $text array that will contain
631
                            // reversed characters
632 25
                            $text = [];
633
634 29
                            // Add the reversed text flag to the stack
635 29
                            $marked_stack[] = [ 'ReversedChars' => true ];
636
                        }
637 14
                        break;
638 14
639 14
                        // End marked content sequence
640 14
                    case 'EMC':
641 14
                        $data = false;
642 14
                        if (\count($marked_stack)) {
643
                            $marked = array_pop($marked_stack);
644
                            $action = key($marked);
645 14
                            $data = $marked[$action];
646 9
647 9
                            switch ($action) {
648
                                // If we are in ReversedChars mode...
649 14
                                 case 'ReversedChars':
650
                                    // Reverse the characters we've built up so far
651 29
                                    foreach ($text as $key => $t) {
652 29
                                        $text[$key] = implode('', array_reverse(
653 22
                                            mb_str_split($t, 1, mb_internal_encoding())
654 22
                                        ));
655 22
                                    }
656 22
657 22
                                    // Add these characters to the result array
658 22
                                    $result = array_merge($result, $text);
659 22
660
                                    // Start a fresh $text array that will contain
661
                                    // non-reversed characters
662 22
                                    $text = [];
663 22
                                    break;
664 22
665
                                case 'ActualText':
666
                                    // Use the content of the ActualText as a command
667 16
                                    $command = $data;
668 16
                                    break;
669
670 22
                            }
671
                        }
672
673
                        // If this EMC command has been transformed into a 'Tj'
674
                        // or 'TJ' command because of being ActualText, then bypass
675 22
                        // the break to proceed to the writing section below.
676
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
677 22
                            break;
678 22
                        }
679
680 22
                    case "'":
681
                    case '"':
682 22
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
683 22
                            // Move to next line and write text
684
                            $current_position['x'] = 0;
685 22
                            $current_position_td['x'] = 0;
686 18
                            $current_position_td['y'] += 10;
687 18
                        }
688
                        // no break
689
                    case 'Tj':
690 22
                        $command[self::COMMAND] = [$command];
691
                        // no break
692
                    case 'TJ':
693 29
                        // Check the marked content stack for flags
694 1
                        $actual_text = false;
695 29
                        $reverse_text = false;
696 29
                        foreach ($marked_stack as $marked) {
697 29
                            if (isset($marked['ActualText'])) {
698
                                $actual_text = true;
699
                            }
700
                            if (isset($marked['ReversedChars'])) {
701 29
                                $reverse_text = true;
702 29
                            }
703 29
                        }
704 24
705 22
                        // Account for text position ONLY just before we write text
706 22
                        if (false === $actual_text && is_array($last_written_position)) {
707 22
                            // If $last_written_position is an array, that
708 17
                            // means we have stored text position coordinates
709 17
                            // for placing an ActualText
710 17
                            $currentX = $last_written_position[0];
711 17
                            $currentY = $last_written_position[1];
712 17
                            $last_written_position = false;
713
                        } else {
714
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
715
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
716 29
                        }
717 29
                        $whiteSpace = '';
718 29
719 29
                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
720 29
                            if (abs($currentY - $current_position['y']) > 9) {
721
                                $whiteSpace = "\n";
722
                            } else {
723 25
                                $curX = $currentX - $current_position['x'];
724
                                $factorX = 10 * $current_position_tm['a'] + 10 * $current_position_tm['b'];
725
                                if (true === $reverse_text) {
726
                                    if ($curX < -abs($factorX * 8)) {
727 29
                                        $whiteSpace = "\t";
728
                                    } elseif ($curX < -abs($factorX)) {
729
                                        $whiteSpace = ' ';
730 42
                                    }
731
                                } else {
732
                                    if ($curX > ($factorX * 8)) {
733
                                        $whiteSpace = "\t";
734
                                    } elseif ($curX > $factorX) {
735
                                        $whiteSpace = ' ';
736 42
                                    }
737 42
                                }
738 8
                            }
739 8
                        }
740 3
741
                        $newtext = $this->getTJUsingFontFallback(
742 6
                            $current_font,
743 6
                            $command[self::COMMAND],
744
                            $current_position_tm,
745
                            $page
746
                        );
747
748 42
                        // If there is no ActualText pending then write
749 41
                        if (false === $actual_text) {
750
                            if (false !== $reverse_text) {
751 42
                                // If we are in ReversedChars mode, add the whitespace last
752 41
                                $text[] = str_replace(["\r", "\n"], '', $newtext).$whiteSpace;
753
                            } else {
754 42
                                // Otherwise add the whitespace first
755 6
                                $text[] = $whiteSpace.str_replace(["\r", "\n"], '', $newtext);
756
                            }
757 42
758 41
                            // Record the position of this inserted text for comparison
759 41
                            // with the next text block.
760
                            // Provide a 'fudge' factor guess on how wide this text block
761 41
                            // is based on the number of characters. This helps limit the
762 41
                            // number of tabs inserted, but isn't perfect.
763
                            $factor = 6;
764
                            if (true === $reverse_text) {
765
                                $factor = -$factor;
766
                            }
767
                            $current_position = [
768 42
                                'x' => $currentX + mb_strlen($newtext) * $factor,
769
                                'y' => $currentY,
770
                            ];
771
                        } else if (false === $last_written_position) {
772
                            // If there is an ActualText in the pipeline
773
                            // store the position this undisplayed text
774
                            // *would* have been written to, so the
775 20
                            // ActualText is displayed in the right spot
776
                            $last_written_position = [$currentX, $currentY];
777 20
                        }
778
                        break;
779
780
                        // set leading
781
                    case 'TL':
782
                        break;
783
784
                        // set graphics position matrix
785
                    case 'cm':
786
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
787
                        $current_position_cm = [
788
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
789
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
790
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
791
                        ];
792
                        break;
793
794
                        // set text position matrix
795
                    case 'Tm':
796
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
797
                        $current_position_tm = [
798
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
799
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
800
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
801
                        ];
802
                        break;
803
804
                        // set super/subscripting text rise
805
                    case 'Ts':
806
                        break;
807
808
                        // set word spacing
809
                    case 'Tw':
810
                        break;
811
812
                        // set horizontal scaling
813
                    case 'Tz':
814
                        break;
815
816
                        // move to start of next line
817
                    case 'T*':
818
                        $current_position['x'] = 0;
819
                        $current_position_td['x'] = 0;
820
                        $current_position_td['y'] += 10;
821
                        break;
822
823
                    case 'Da':
824
                        break;
825
826
                    case 'Do':
827
                        if (null !== $page) {
828
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
829
                            $id = trim(array_pop($args), '/ ');
830
                            $xobject = $page->getXObject($id);
831
832
                            // @todo $xobject could be a ElementXRef object, which would then throw an error
833
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack)) {
834
                                // Not a circular reference.
835
                                $text[] = $xobject->getText($page);
836
                            }
837
                        }
838
                        break;
839
840
                    case 'rg':
841
                    case 'RG':
842
                        break;
843
844
                    case 're':
845
                        break;
846
847
                    case 'co':
848
                        break;
849
850
                    case 'cs':
851
                        break;
852
853
                    case 'gs':
854
                        break;
855
856
                    case 'en':
857
                        break;
858
859
                    case 'sc':
860
                    case 'SC':
861
                        break;
862
863
                    case 'g':
864
                    case 'G':
865
                        break;
866
867
                    case 'V':
868
                        break;
869
870
                    case 'vo':
871
                    case 'Vo':
872
                        break;
873
874
                    default:
875
                }
876
            }
877
        }
878
879
        $result = array_merge($result, $text);
880
881
        return $result;
882
    }
883
884
    /**
     * Splits one command of a document stream into its type, operator
     * and operand parts.
     *
     * getCommandsText() expects the content of $text_part to be an
     * already formatted, single-line command from a document stream.
     * The companion function getSectionsText() returns a document
     * stream as an array of single commands for just this purpose.
     *
     * A better name for this function would be getCommandText()
     * since it now always works on just one command.
     *
     * For a TJ operator the operand is itself split into a list of
     * sub-commands: one entry per () or <> string, each optionally
     * followed by an entry of type 'n' holding the numeric adjustment
     * that came after the string.
     *
     * @param string $text_part a single command: operands followed by the operator
     *
     * @return array a list with one entry keyed by self::TYPE, self::OPERATOR
     *               and self::COMMAND, or an empty list when $text_part does
     *               not end in something that looks like an operator
     */
    public function getCommandsText(string $text_part): array
    {
        $matches = [];

        // Group 1: the operands, group 2: the optional opening delimiter of
        // the operands (one of / [ ( <), group 3: the trailing operator.
        $found = preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $matches);

        // No match (0) or a PCRE error (false) leaves $matches without the
        // groups read below, so there is no command to report.
        if (1 !== $found) {
            return [];
        }

        $type = $matches[2];
        $operator = $matches[3];
        $command = trim($matches[1]);

        if ('TJ' === $operator) {
            // Patterns for the two kinds of string a TJ array may contain,
            // keyed by the type recorded for the resulting sub-command.
            // Both capture the string content (1) and an optional number (2).
            $stringPatterns = [
                // Parentheses string () format; the closing parenthesis
                // must not be an escaped one
                '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
                // Hexadecimal <> format
                '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
            ];

            $subcommand = [];
            $command = trim($command, '[]');

            // Consume strings from the front of the operand until a full
            // pass removes nothing more.
            do {
                $oldCommand = $command;

                foreach ($stringPatterns as $stringType => $pattern) {
                    if (!preg_match($pattern, $command, $tjmatch)) {
                        continue;
                    }

                    // Whitespace inside a hexadecimal string carries no data
                    $string = '<' === $stringType
                        ? preg_replace('/\s/', '', $tjmatch[1])
                        : $tjmatch[1];

                    $subcommand[] = [
                        self::TYPE => $stringType,
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $string,
                    ];

                    // The number following the string, if any, becomes its
                    // own sub-command
                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
                        $subcommand[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $tjmatch[2],
                        ];
                    }

                    $command = substr($command, \strlen($tjmatch[0]));
                }
            } while ($command !== $oldCommand);

            $command = $subcommand;
        } elseif ('Tj' === $operator || "'" === $operator || '"' === $operator) {
            // Depending on the string type, trim the data of the
            // appropriate delimiters
            if ('(' === $type) {
                // Don't use trim() here since a () string may end with
                // a balanced or escaped right parentheses, and trim()
                // will delete both. Both strings below are valid:
                //   eg. (String())
                //   eg. (String\))
                $command = preg_replace('/^\(|\)$/', '', $command);
            } elseif ('<' === $type) {
                $command = trim($command, '<>');
            }
        } elseif ('/' === $type) {
            // Drop the leading slash of a name operand
            $command = substr($command, 1);
        }

        return [
            [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $command,
            ],
        ];
    }
971
972
    /**
     * Creates the object class that matches the header's Type entry
     * (and, for XObject and Font, its Subtype entry).
     *
     * - XObject: Image or Form depending on the Subtype, otherwise a
     *   plain instance of this class.
     * - Pages, Page, Encoding: the class of the same name.
     * - Font: the class \Smalot\PdfParser\Font\Font<Subtype> when it
     *   exists, otherwise Font.
     * - Anything else: a plain instance of this class.
     *
     * @param Document    $document passed through to the created object
     * @param Header      $header   header whose Type/Subtype select the class
     * @param string|null $content  passed through to the created object; for an
     *                              Image it is replaced by null when the config
     *                              asks not to retain image content
     * @param Config|null $config   passed through to the created object; may be
     *                              omitted
     *
     * @return self the created object
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is optional: without one there is no
                        // instruction to discard the image data, so keep it
                        // instead of calling a method on null.
                        $retainContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                // XObject with any other subtype
                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                // Use the dedicated font class when there is one for
                // this subtype
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
1013
1014
    /**
     * Provides an identifier for this object instance.
     *
     * @return string the value spl_object_hash() reports for this instance
     */
    protected function getUniqueId(): string
    {
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
1021
}
1022