Test Failed
Pull Request — master (#634)
by
unknown
02:03
created

PDFObject::getSectionsText()   C

Complexity

Conditions 14
Paths 13

Size

Total Lines 69
Code Lines 34

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 42
CRAP Score 14.0184

Importance

Changes 0
Metric Value
cc 14
eloc 34
c 0
b 0
f 0
nc 13
nop 1
dl 0
loc 69
ccs 42
cts 44
cp 0.9545
crap 14.0184
rs 6.2666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\XObject\Form;
36
use Smalot\PdfParser\XObject\Image;
37
38
/**
39
 * Class PDFObject
40
 */
41
class PDFObject
42
{
43
    public const TYPE = 't';
44
45
    public const OPERATOR = 'o';
46
47
    public const COMMAND = 'c';
48
49
    /**
50
     * The recursion stack.
51
     *
52
     * @var array
53
     */
54
    public static $recursionStack = [];
55
56
    /**
57
     * @var Document
58
     */
59
    protected $document;
60
61
    /**
62
     * @var Header
63
     */
64
    protected $header;
65
66
    /**
67
     * @var string
68
     */
69
    protected $content;
70
71
    /**
72
     * @var Config
73
     */
74
    protected $config;
75
76 62
    /**
77
     * @var bool
78
     */
79
    protected $addPositionWhitespace = false;
80
81
    /**
     * Stores the owning document together with the optional header, raw
     * content and configuration of this object.
     *
     * The nullable parameters are declared explicitly (`?Type`) because
     * relying on a `= null` default to make a type nullable is deprecated
     * as of PHP 8.4.
     *
     * @param Document    $document document this object is attached to
     * @param Header|null $header   object header; when null a `new Header()` is used instead
     * @param string|null $content  raw content of the object, stored as given
     * @param Config|null $config   configuration, stored as given
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        $this->document = $document;
        // The header property is never left null: fall back to a new Header.
        $this->header = $header ?? new Header();
        $this->content = $content;
        $this->config = $config;
    }
92 3
93
    /**
     * Does nothing in this class; the body is intentionally empty.
     */
    public function init()
    {
        // Intentionally left blank.
    }
96
97 49
    /**
     * Returns the document that was passed to the constructor.
     */
    public function getDocument(): Document
    {
        return $this->document;
    }
101
102 3
    /**
     * Returns the header stored on this object.
     */
    public function getHeader(): ?Header
    {
        return $this->header;
    }
106
107
    /**
     * Returns the configuration passed to the constructor, or null when
     * none was given.
     */
    public function getConfig(): ?Config
    {
        return $this->config;
    }
111
112 50
    /**
113
     * @return Element|PDFObject|Header
114
     */
115 47
    public function get(string $name)
    {
        // The lookup is delegated entirely to the header object.
        return $this->header->get($name);
    }
119
120 3
    /**
     * Tells whether the header reports an entry called $name.
     *
     * The check is delegated to Header::has().
     */
    public function has(string $name): bool
    {
        return $this->header->has($name);
    }
124
125 38
    /**
     * Returns the details array produced by the header.
     *
     * @param bool $deep forwarded unchanged to Header::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        return $this->header->getDetails($deep);
    }
129
130 32
    /**
     * Returns the raw content passed to the constructor, or null when none
     * was given.
     */
    public function getContent(): ?string
    {
        return $this->content;
    }
134
135
    /**
     * Normalises a raw content stream so that every PDF command sits on its
     * own line, with lines separated by "\r\n".
     *
     * Literal strings "( ... )" and the dictionaries "<< ... >>" that precede
     * BDC/BMC/DP/MP are temporarily swapped for unique placeholders so the
     * whitespace and operator passes cannot alter them; they are put back
     * before returning.
     *
     * @param string|null $content raw content stream
     *
     * @return string the cleaned stream; an empty string when $content is
     *                null or when no text encoding can be detected for it
     */
    public function cleanContent(?string $content): string
    {
        if (null === $content) {
            return '';
        }

        // Step 1: hide every literal string behind a placeholder.
        $hiddenStrings = [];
        $needle = '(';
        while (preg_match('/'.preg_quote($needle, '/').'.*?(?<![^\\\\]\\\\)\)/s', $content, $found)) {
            $candidate = $found[0];

            // PDF strings may contain unescaped parentheses as long as they
            // are balanced, so count both kinds inside the candidate.
            $opening = preg_match_all('/(?<![^\\\\]\\\\)\(/', $candidate);
            $closing = preg_match_all('/(?<![^\\\\]\\\\)\)/', $candidate);

            if ($opening != $closing) {
                // Unbalanced: use the current match as the prefix of a
                // longer candidate on the next pass.
                $needle = $candidate;
                continue;
            }

            // Balanced: swap the first occurrence for a unique placeholder
            // and start looking for the next string from scratch.
            $stringId = uniqid('STRING_', true);
            $hiddenStrings[$stringId] = $candidate;
            $content = preg_replace(
                '/'.preg_quote($candidate, '/').'/',
                '@@@'.$stringId.'@@@',
                $content,
                1
            );
            $needle = '(';
        }

        // Step 2: carriage returns and line-feeds become plain spaces.
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Step 3: hide the dictionaries that precede BDC/BMC/DP/MP.
        $hiddenDicts = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $found)) {
            $dictId = uniqid('DICT_', true);
            $hiddenDicts[$dictId] = $found[1];
            $content = preg_replace(
                '/'.preg_quote($found[0], '/').'/',
                ' ###'.$dictId.'###'.$found[2],
                $content,
                1
            );
        }

        // With strings and dictionaries hidden, only plain-text commands
        // should remain. If no encoding can be detected the stream is not
        // text (an image, for instance), so give up early. See
        // https://github.com/smalot/pdfparser/issues/458 (sample '12249.pdf').
        if (false === mb_detect_encoding($content, null, true)) {
            return '';
        }

        // Step 4: collapse runs of white-space.
        $content = preg_replace('/\s{2,}/', ' ', $content);

        // Step 5: append \r\n to every valid PDF operator so each command
        // ends its own line.
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
        //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both
        //       spellings are listed for completeness.
        $operators = [
            'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
            'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
            'g', 'G', 'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
            'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
            'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
            'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($operators as $operator) {
            $operatorPattern = '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/';
            $content = preg_replace($operatorPattern, $operator."\r\n", $content);
        }

        // Step 6: put the dictionaries back, most recently hidden first.
        foreach (array_reverse($hiddenDicts, true) as $dictId => $dictText) {
            $content = str_replace('###'.$dictId.'###', $dictText, $content);
        }

        // Step 7: put the strings back, most recently hidden first. Escaped
        // line breaks are dropped and literal ones are turned into the
        // two-character sequences \r and \n, so no string is split across
        // two lines (every command must stay on one line).
        foreach (array_reverse($hiddenStrings, true) as $stringId => $stringText) {
            $singleLine = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $stringText
            );

            $content = str_replace('@@@'.$stringId.'@@@', $singleLine, $content);
        }

        // Final tidy-up: merge repeated line breaks and strip the spaces
        // that follow a line break.
        return trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));
    }
251
252 20
    /**
253
     * getSectionsText() now takes an entire, unformatted document
254 20
     * stream as a string, cleans it, then filters out commands that
255 20
     * aren't needed for text positioning/extraction. It returns an
256 20
     * array of unprocessed PDF commands, one command per element.
257 20
     */
258
    public function getSectionsText(?string $content): array
    {
        // Commands that are kept even when they appear outside of a
        // BT ... ET text block. Each pattern is anchored so it *only*
        // matches the intended command (a bare search for 'c' would also
        // match 'sc', for instance). Add more patterns here as further
        // commands turn out to be needed.
        $keptOutsideTextBlock = [
            '/(?<!\w)B[DM]C$/', // begin marked content sequence
            '/(?<!\w)[DM]P$/',  // marked content point
            '/(?<!\w)EMC$/',    // end marked content sequence
            '/(?<!\w)cm$/',     // graphics position change
            '/(?<!\w)Tf$/',     // font change
            '/(?<!\w)Do$/',     // invoke named XObject
        ];

        // A cleaned stream carries one command per line, so splitting on
        // line breaks yields one command per element.
        $lines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->cleanContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        $kept = [];
        $insideTextBlock = false;
        foreach ($lines as $rawLine) {
            $command = trim($rawLine);

            // Nothing left after trimming: ignore the line.
            if ('' === $command) {
                continue;
            }

            // 'BT' opens a text block.
            if (preg_match('/BT$/', $command)) {
                $insideTextBlock = true;
                $kept[] = $command;
                continue;
            }

            // 'ET' closes it again.
            if ('ET' === $command) {
                $insideTextBlock = false;
                $kept[] = $command;
                continue;
            }

            // Everything between BT and ET is kept as is.
            if ($insideTextBlock) {
                $kept[] = $command;
                continue;
            }

            // Outside of a text block only a handful of commands matter:
            // save/restore graphics state ('q' / 'Q') plus the table above.
            $lastChar = $command[-1];
            $wanted = 'q' === $lastChar || 'Q' === $lastChar;
            foreach ($keptOutsideTextBlock as $pattern) {
                if ($wanted) {
                    break;
                }
                $wanted = (bool) preg_match($pattern, $command);
            }

            if ($wanted) {
                $kept[] = $command;
            }
        }

        return $kept;
    }
328
329 5
    /**
     * Picks the font to start text decoding with.
     *
     * Candidates are the fonts of $page (when a page is given) followed by
     * the document's first font (when there is one). The first candidate
     * is returned; if there are no candidates at all a new Font built from
     * the document and the config is returned instead.
     *
     * The parameter is declared as `?Page` because implicit nullability via
     * a `= null` default is deprecated as of PHP 8.4.
     *
     * @param Page|null $page page whose fonts take precedence, if any
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $fonts = null !== $page ? $page->getFonts() : [];

        $firstFont = $this->document->getFirstFont();
        if (null !== $firstFont) {
            $fonts[] = $firstFont;
        }

        if (\count($fonts) > 0) {
            // reset() yields the first element whatever its key is.
            return reset($fonts);
        }

        return new Font($this->document, null, null, $this->config);
    }
347 15
348 1
    /**
349 1
     * Decode a '[]TJ' command and attempt to use alternate fonts if
350
     * the current font results in output that contains Unicode control
351 15
     * characters. See Font::decodeText for a full description of
352 14
     * $textMatrix
353 14
     *
354 14
     * @param array<int,array<string,string|bool>> $command
355 14
     * @param array<string,float>                  $textMatrix
356 14
     */
357 14
    private function getTJUsingFontFallback(
        Font $font,
        array $command,
        array $textMatrix = ['a' => 1, 'b' => 0, 'i' => 0, 'j' => 1],
        ?Page $page = null
    ): string {
        $orig_text = $font->decodeText($command, $textMatrix);
        $text = $orig_text;

        // If this is ever made a Config option, check for it here.
        if (null !== $page) {
            $font_ids = array_keys($page->getFonts());

            // Control characters in the decoded text suggest the wrong font
            // was used, so try the page's other fonts until the text comes
            // out clean. x09 to x0d are allowed because they are whitespace.
            //
            // The NUL check looks at the raw bytes. Searching the bin2hex()
            // output for '00' would also match across two neighbouring
            // bytes (" \n" is hex "200a"), which wrongly flags clean text.
            // The raw-byte check is still needed in addition to the regex
            // because preg_match() with the /u modifier does not match at
            // all on invalid UTF-8.
            while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
                // Out of font IDs: give up and use the original decode.
                if ([] === $font_ids) {
                    return $orig_text;
                }

                // Try the next font ID.
                $font = $page->getFont(array_shift($font_ids));
                $text = $font->decodeText($command, $textMatrix);
            }
        }

        return $text;
    }
390
391 11
    /**
392 4
     * @throws \Exception
393 4
     */
394 4
    public function parseDictionary(string $dictionary): array
    {
        // Normalize whitespace
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

        if ('<<' !== substr($dictionary, 0, 2)) {
            throw new \Exception('Not a valid dictionary object.');
        }

        // $parsed always refers to the array currently being filled;
        // $stack holds references to the enclosing arrays so that a closing
        // token can step back out to the parent.
        $parsed = [];
        $stack = [];
        $currentName = '';
        $arrayTypeNumeric = false;

        // Remove outer layer of dictionary, and split on tokens
        $split = preg_split(
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
            trim(preg_replace('/^<<|>>$/', '', $dictionary)),
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        foreach ($split as $token) {
            $token = trim($token);
            switch ($token) {
                case '':
                    break;

                    // Open numeric array
                case '[':
                    $parsed[$currentName] = [];
                    $arrayTypeNumeric = true;

                    // Move up one level in the stack
                    $stack[\count($stack)] = &$parsed;
                    $parsed = &$parsed[$currentName];
                    $currentName = '';
                    break;

                    // Open hashed array
                case '<<':
                    $parsed[$currentName] = [];
                    $arrayTypeNumeric = false;

                    // Move up one level in the stack
                    $stack[\count($stack)] = &$parsed;
                    $parsed = &$parsed[$currentName];
                    $currentName = '';
                    break;

                    // Close numeric array
                case ']':
                    // Revert string type arrays back to a single element
                    if (\is_array($parsed) && 1 === \count($parsed)
                        && isset($parsed[0]) && \is_string($parsed[0])
                        && '' !== $parsed[0] && '/' !== $parsed[0][0]) {
                        $parsed = '['.$parsed[0].']';
                    }
                    // Close hashed array
                    // no break
                case '>>':
                    $arrayTypeNumeric = false;

                    // Move down one level in the stack
                    $parsed = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                    break;

                default:
                    // The token is never empty here (the '' case above
                    // handles that), so reading its first byte is safe.
                    if ('/' === $token[0]) {
                        // A leading slash marks a name. The cast keeps
                        // $currentName a string on PHP 7, where substr()
                        // returns false for a bare '/'.
                        $currentName = (string) substr($token, 1);
                        if ($arrayTypeNumeric) {
                            // Inside a numeric array a name is a value.
                            $parsed[] = $currentName;
                            $currentName = '';
                        }
                    } elseif ('' !== $currentName) {
                        // A value for the name seen just before it.
                        if (!$arrayTypeNumeric) {
                            $parsed[$currentName] = $token;
                        }
                        $currentName = '';
                    } else {
                        // A value with no pending name: append it.
                        $parsed[] = $token;
                    }
            }
        }

        return $parsed;
    }
484 6
485 6
    /**
486 6
     * getText() leverages getTextArray() to get the content of the
487
     * document, setting the addPositionWhitespace flag to true so
488 6
     * whitespace is inserted in a logical way for reading by humans.
489
     */
490 6
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;
        try {
            $result = $this->getTextArray($page);
        } finally {
            // getTextArray() may throw; reset the flag either way so a
            // later getTextArray() call on this object does not keep
            // inserting positioning whitespace.
            $this->addPositionWhitespace = false;
        }

        // The pieces are joined without a separator and a single trailing
        // space is appended.
        return implode('', $result).' ';
    }
498
499
    /**
500 5
     * getTextArray() returns the text objects of a document in an
501 4
     * array. By default no positioning whitespace is added to the
502
     * output unless the addPositionWhitespace flag is set to true.
503 5
     *
504 4
     * @throws \Exception
505
     */
506
    public function getTextArray(Page $page = null): array
507 5
    {
508
        $result = [];
509
        $text = [];
510
511 5
        $marked_stack = [];
512 2
        $last_written_position = false;
513
514
        $sections = $this->getSectionsText($this->content);
515 5
        $current_font = $this->getDefaultFont($page);
516
517
        $current_position = ['x' => false, 'y' => false];
518
        $current_position_tm = [
519
            'a' => 1, 'b' => 0, 'c' => 0,
520 5
            'i' => 0, 'j' => 1, 'k' => 0,
521
            'x' => false, 'y' => false, 'z' => 1,
522 4
        ];
523
        $current_position_td = ['x' => 0, 'y' => 0];
524 4
        $current_position_cm = [
525
            'a' => 1, 'b' => 0, 'c' => 0,
526
            'i' => 0, 'j' => 1, 'k' => 0,
527 4
            'x' => 0, 'y' => 0, 'z' => 1,
528
        ];
529
530
        $clipped_font = [];
531
        $clipped_position_cm = [];
532
533
        self::$recursionStack[] = $this->getUniqueId();
534
535
        foreach ($sections as $section) {
536
            $commands = $this->getCommandsText($section);
537 4
            foreach ($commands as $command) {
538 4
                switch ($command[self::OPERATOR]) {
539 2
                    // Begin text object
540
                    case 'BT':
541 4
                        // Reset text positioning matrices
542
                        $current_position_tm = [
543
                            'a' => 1, 'b' => 0, 'c' => 0,
544 4
                            'i' => 0, 'j' => 1, 'k' => 0,
545
                            'x' => false, 'y' => false, 'z' => 1,
546
                        ];
547 4
                        $current_position_td = ['x' => 0, 'y' => 0];
548
                        break;
549
550 4
                        // Begin marked content sequence with property list
551 1
                    case 'BDC':
552
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
0 ignored issues
show
Bug introduced by
It seems like $command[self::COMMAND] can also be of type array and array<mixed,array<string,mixed|string>>; however, parameter $subject of preg_match() only seems to accept string. Maybe add an additional type check? (Ignorable by annotation)

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

552
                        if (preg_match('/(<<.*>>)$/', /** @scrutinizer ignore-type */ $command[self::COMMAND], $match)) {
Loading history...
553 4
                            $dict = $this->parseDictionary($match[1]);
554
555
                            // Check for ActualText block
556 4
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
557 4
                                if ('[' == $dict['ActualText'][0]) {
558
                                    // Simulate a 'TJ' command on the stack
559
                                    $marked_stack[] = [
560 4
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
561 4
                                    ];
562 2
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
563
                                    // Simulate a 'Tj' command on the stack
564 2
                                    $marked_stack[] = [
565
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
566
                                    ];
567 2
                                }
568 2
                            }
569
                        }
570
                        break;
571
572
                        // Begin marked content sequence
573
                    case 'BMC':
574
                        if ('ReversedChars' == $command[self::COMMAND]) {
575
                            // Upon encountering a ReversedChars command,
576 6
                            // add the characters we've built up so far to
577
                            // the result array
578
                            $result = array_merge($result, $text);
579 29
580
                            // Start a fresh $text array that will contain
581 29
                            // reversed characters
582
                            $text = [];
583 29
584 29
                            // Add the reversed text flag to the stack
585 29
                            $marked_stack[] = ['ReversedChars' => true];
586
                        }
587 29
                        break;
588 29
589 29
                        // set graphics position matrix
590
                    case 'cm':
591 29
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
0 ignored issues
show
Bug introduced by
It seems like $command[self::COMMAND] can also be of type array and array<mixed,array<string,mixed|string>>; however, parameter $subject of preg_split() only seems to accept string. Maybe add an additional type check? (Ignorable by annotation)

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

591
                        $args = preg_split('/\s+/s', /** @scrutinizer ignore-type */ $command[self::COMMAND]);
Loading history...
592 29
                        $current_position_cm = [
593 29
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
594 29
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
595 29
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
596 29
                        ];
597
                        break;
598
599
                    case 'Do':
600 29
                        if (null !== $page) {
601 29
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
602 29
                            $id = trim(array_pop($args), '/ ');
603 11
                            $xobject = $page->getXObject($id);
604 11
605 11
                            // @todo $xobject could be a ElementXRef object, which would then throw an error
606
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack)) {
607
                                // Not a circular reference.
608
                                $text[] = $xobject->getText($page);
609 11
                            }
610 11
                        }
611 11
                        break;
612
613 29
                        // Marked content point with (DP) & without (MP) property list
614
                    case 'DP':
615 29
                    case 'MP':
616 29
                        break;
617
618 25
                        // End text object
619 25
                    case 'ET':
620 25
                        break;
621
622 25
                        // Store current selected font and graphics matrix
623
                    case 'q':
624 25
                        $clipped_font[] = $current_font;
625 25
                        $clipped_position_cm[] = $current_position_cm;
626 25
                        break;
627
628
                        // Restore previous selected font and graphics matrix
629 25
                    case 'Q':
630 25
                        $current_font = array_pop($clipped_font);
631
                        $current_position_cm = array_pop($clipped_position_cm);
632 25
                        break;
633
634 29
                        // End marked content sequence
635 29
                    case 'EMC':
636
                        $data = false;
637 14
                        if (\count($marked_stack)) {
638 14
                            $marked = array_pop($marked_stack);
639 14
                            $action = key($marked);
640 14
                            $data = $marked[$action];
641 14
642 14
                            switch ($action) {
643
                                // If we are in ReversedChars mode...
644
                                case 'ReversedChars':
645 14
                                    // Reverse the characters we've built up so far
646 9
                                    foreach ($text as $key => $t) {
647 9
                                        $text[$key] = implode('', array_reverse(
648
                                            mb_str_split($t, 1, mb_internal_encoding())
0 ignored issues
show
Bug introduced by
It seems like mb_internal_encoding() can also be of type true; however, parameter $encoding of mb_str_split() only seems to accept null|string. Maybe add an additional type check? (Ignorable by annotation)

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

648
                                            mb_str_split($t, 1, /** @scrutinizer ignore-type */ mb_internal_encoding())
Loading history...
649 14
                                        ));
650
                                    }
651 29
652 29
                                    // Add these characters to the result array
653 22
                                    $result = array_merge($result, $text);
654 22
655 22
                                    // Start a fresh $text array that will contain
656 22
                                    // non-reversed characters
657 22
                                    $text = [];
658 22
                                    break;
659 22
660
                                case 'ActualText':
661
                                    // Use the content of the ActualText as a command
662 22
                                    $command = $data;
663 22
                                    break;
664 22
                            }
665
                        }
666
667 16
                        // If this EMC command has been transformed into a 'Tj'
668 16
                        // or 'TJ' command because of being ActualText, then bypass
669
                        // the break to proceed to the writing section below.
670 22
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
671
                            break;
672
                        }
673
674
                        // no break
675 22
                    case "'":
676
                    case '"':
677 22
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
678 22
                            // Move to next line and write text
679
                            $current_position['x'] = 0;
680 22
                            $current_position_td['x'] = 0;
681
                            $current_position_td['y'] += 10;
682 22
                        }
683 22
                        // no break
684
                    case 'Tj':
685 22
                        $command[self::COMMAND] = [$command];
686 18
                        // no break
687 18
                    case 'TJ':
688
                        // Check the marked content stack for flags
689
                        $actual_text = false;
690 22
                        $reverse_text = false;
691
                        foreach ($marked_stack as $marked) {
692
                            if (isset($marked['ActualText'])) {
693 29
                                $actual_text = true;
694 1
                            }
695 29
                            if (isset($marked['ReversedChars'])) {
696 29
                                $reverse_text = true;
697 29
                            }
698
                        }
699
700
                        // Account for text position ONLY just before we write text
701 29
                        if (false === $actual_text && \is_array($last_written_position)) {
702 29
                            // If $last_written_position is an array, that
703 29
                            // means we have stored text position coordinates
704 24
                            // for placing an ActualText
705 22
                            $currentX = $last_written_position[0];
706 22
                            $currentY = $last_written_position[1];
707 22
                            $last_written_position = false;
708 17
                        } else {
709 17
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
710 17
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
711 17
                        }
712 17
                        $whiteSpace = '';
713
714
                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
715
                            if (abs($currentY - $current_position['y']) > 9) {
716 29
                                $whiteSpace = "\n";
717 29
                            } else {
718 29
                                $curX = $currentX - $current_position['x'];
719 29
                                $factorX = 10 * $current_position_tm['a'] + 10 * $current_position_tm['i'];
720 29
                                if (true === $reverse_text) {
721
                                    if ($curX < -abs($factorX * 8)) {
722
                                        $whiteSpace = "\t";
723 25
                                    } elseif ($curX < -abs($factorX)) {
724
                                        $whiteSpace = ' ';
725
                                    }
726
                                } else {
727 29
                                    if ($curX > ($factorX * 8)) {
728
                                        $whiteSpace = "\t";
729
                                    } elseif ($curX > $factorX) {
730 42
                                        $whiteSpace = ' ';
731
                                    }
732
                                }
733
                            }
734
                        }
735
736 42
                        $newtext = $this->getTJUsingFontFallback(
737 42
                            $current_font,
738 8
                            $command[self::COMMAND],
739 8
                            $current_position_tm,
740 3
                            $page
741
                        );
742 6
743 6
                        // If there is no ActualText pending then write
744
                        if (false === $actual_text) {
745
                            if (false !== $reverse_text) {
746
                                // If we are in ReversedChars mode, add the whitespace last
747
                                $text[] = str_replace(["\r", "\n"], '', $newtext).$whiteSpace;
748 42
                            } else {
749 41
                                // Otherwise add the whitespace first
750
                                $text[] = $whiteSpace.str_replace(["\r", "\n"], '', $newtext);
751 42
                            }
752 41
753
                            // Record the position of this inserted text for comparison
754 42
                            // with the next text block.
755 6
                            // Provide a 'fudge' factor guess on how wide this text block
756
                            // is based on the number of characters. This helps limit the
757 42
                            // number of tabs inserted, but isn't perfect.
758 41
                            $factor = 6;
759 41
                            if (true === $reverse_text) {
760
                                $factor = -$factor;
761 41
                            }
762 41
                            $current_position = [
763
                                'x' => $currentX + mb_strlen($newtext) * $factor,
764
                                'y' => $currentY,
765
                            ];
766
                        } elseif (false === $last_written_position) {
767
                            // If there is an ActualText in the pipeline
768 42
                            // store the position this undisplayed text
769
                            // *would* have been written to, so the
770
                            // ActualText is displayed in the right spot
771
                            $last_written_position = [$currentX, $currentY];
772
                        }
773
                        break;
774
775 20
                        // move to start of next line
776
                    case 'T*':
777 20
                        $current_position['x'] = 0;
778
                        $current_position_td['x'] = 0;
779
                        $current_position_td['y'] += 10;
780
                        break;
781
782
                        // set character spacing
783
                    case 'Tc':
784
                        break;
785
786
                        // move text current point and set leading
787
                    case 'Td':
788
                    case 'TD':
789
                        // move text current point
790
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
791
                        $y = (float) array_pop($args);
792
                        $x = (float) array_pop($args);
793
794
                        $current_position_td = [
795
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
796
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
797
                        ];
798
                        break;
799
800
                    case 'Tf':
801
                        list($id) = preg_split('/\s/s', $command[self::COMMAND]);
802
                        $id = trim($id, '/');
803
                        if (null !== $page) {
804
                            $new_font = $page->getFont($id);
805
                            // If an invalid font ID is given, do not update the font.
806
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
807
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
808
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
809
                            // But we want to make sure that malformed PDFs do not simply crash.
810
                            if (null !== $new_font) {
811
                                $current_font = $new_font;
812
                            }
813
                        }
814
                        break;
815
816
                        // set leading
817
                    case 'TL':
818
                        break;
819
820
                        // set text position matrix
821
                    case 'Tm':
822
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
823
                        $current_position_tm = [
824
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
825
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
826
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
827
                        ];
828
                        break;
829
830
                        // set text rendering mode
831
                    case 'Ts':
832
                        break;
833
834
                        // set super/subscripting text rise
835
                    case 'Ts':
836
                        break;
837
838
                        // set word spacing
839
                    case 'Tw':
840
                        break;
841
842
                        // set horizontal scaling
843
                    case 'Tz':
844
                        break;
845
846
                    default:
847
                }
848
            }
849
        }
850
851
        $result = array_merge($result, $text);
852
853
        return $result;
854
    }
855
856
    /**
     * Splits one content-stream command into its type, operator and operand.
     *
     * getCommandsText() expects the content of $text_part to be an
     * already formatted, single-line command from a document stream.
     * The companion function getSectionsText() returns a document
     * stream as an array of single commands for just this purpose.
     *
     * A better name for this function would be getCommandText()
     * since it now always works on just one command.
     *
     * @param string $text_part a single command: optional operands followed by the operator
     *
     * @return array a list with at most one entry keyed by self::TYPE,
     *               self::OPERATOR and self::COMMAND. The list is empty when
     *               $text_part does not end in a recognisable operator. For
     *               the 'TJ' operator the self::COMMAND value is itself a
     *               list of string entries (type '(' or '<') and numeric
     *               entries (type 'n').
     */
    public function getCommandsText(string $text_part): array
    {
        $matches = [];

        // Group 1: everything in front of the operator (the operands).
        // Group 2: the leading delimiter of the operands ('/', '[', '('
        //          or '<') when there is one, otherwise an empty string.
        // Group 3: the trailing operator.
        $found = preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $matches);

        // When nothing matches (or the regex engine fails) $matches stays
        // empty, so reading groups 1-3 would hit undefined offsets and
        // produce a bogus entry. Report "no command" instead.
        if (1 !== $found) {
            return [];
        }

        $type = $matches[2];
        $operator = $matches[3];
        $command = trim($matches[1]);

        if ('TJ' === $operator) {
            // Tried in this order on every pass, each one consuming a
            // leading string operand plus an optional trailing number.
            $operandPatterns = [
                // Parentheses string () format. The lookbehind refuses a
                // ')' that directly follows a lone backslash.
                '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
                // Hexadecimal <> format
                '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
            ];

            $subcommand = [];
            $command = trim($command, '[]');
            do {
                $oldCommand = $command;

                foreach ($operandPatterns as $delimiter => $pattern) {
                    if (!preg_match($pattern, $command, $tjmatch)) {
                        continue;
                    }

                    $string = $tjmatch[1];
                    if ('<' === $delimiter) {
                        // Whitespace inside a hexadecimal string is not data
                        $string = preg_replace('/\s/', '', $string);
                    }

                    $subcommand[] = [
                        self::TYPE => $delimiter,
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $string,
                    ];

                    // The number following the string, if any. The
                    // truthiness test means a bare "0" is not recorded.
                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
                        $subcommand[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $tjmatch[2],
                        ];
                    }

                    // Drop what was just consumed from the front
                    $command = substr($command, \strlen($tjmatch[0]));
                }

                // Stop once a full pass consumed nothing
            } while ($command !== $oldCommand);

            $command = $subcommand;
        } elseif ('Tj' === $operator || "'" === $operator || '"' === $operator) {
            // Depending on the string type, trim the data of the
            // appropriate delimiters
            if ('(' === $type) {
                // Don't use trim() here since a () string may end with
                // a balanced or escaped right parentheses, and trim()
                // will delete both. Both strings below are valid:
                //   eg. (String())
                //   eg. (String\))
                $command = preg_replace('/^\(|\)$/', '', $command);
            } elseif ('<' === $type) {
                $command = trim($command, '<>');
            }
        } elseif ('/' === $type) {
            // Strip the leading slash from the operand
            $command = substr($command, 1);
        }

        return [
            [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $command,
            ],
        ];
    }
943
944
    /**
     * Creates the object class that corresponds to the header's 'Type'
     * entry (and, for some types, its 'Subtype' entry).
     *
     * - 'XObject' with subtype 'Image' or 'Form' yields an Image or Form;
     *   any other XObject subtype yields a plain instance of this class.
     * - 'Pages', 'Page' and 'Encoding' yield the class of the same name.
     * - 'Font' yields \Smalot\PdfParser\Font\Font<Subtype> when such a
     *   class exists, otherwise the generic Font.
     * - Every other type yields a plain instance of this class.
     *
     * @param Document    $document passed through to the created object
     * @param Header      $header   header whose 'Type'/'Subtype' select the class
     * @param string|null $content  raw content handed to the created object
     * @param Config|null $config   optional configuration; when it is given
     *                              and getRetainImageContent() is falsy, an
     *                              Image is created with null content
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is optional, so it must not be
                        // dereferenced blindly: calling a method on null
                        // is a fatal error. Without a config nothing asked
                        // for the image data to be dropped, so keep it.
                        $retainContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                // Prefer a subtype-specific font class when one exists
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
985
986
    /**
     * Provides an identifier for this object instance.
     *
     * The value is the hash that spl_object_hash() reports for $this.
     */
    protected function getUniqueId(): string
    {
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
993
}
994