Test Failed
Pull Request — master (#634)
by
unknown
08:25
created

PDFObject::getUniqueId()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
eloc 1
c 0
b 0
f 0
nc 1
nop 0
dl 0
loc 3
ccs 0
cts 0
cp 0
crap 2
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\XObject\Form;
36
use Smalot\PdfParser\XObject\Image;
37
38
/**
39
 * Class PDFObject
40
 */
41
class PDFObject
42
{
43
    public const TYPE = 't';
44
45
    public const OPERATOR = 'o';
46
47
    public const COMMAND = 'c';
48
49
    /**
50
     * The recursion stack.
51
     *
52
     * @var array
53
     */
54
    public static $recursionStack = [];
55
56
    /**
57
     * @var Document
58
     */
59
    protected $document;
60
61
    /**
62
     * @var Header
63
     */
64
    protected $header;
65
66
    /**
67
     * @var string
68
     */
69
    protected $content;
70
71
    /**
72
     * @var Config
73
     */
74
    protected $config;
75
76 62
    /**
77
     * @var bool
78
     */
79
    protected $addPositionWhitespace = false;
80
81
    /**
     * Stores the owning document together with the object's header, raw
     * content and parser configuration.
     *
     * @param Document    $document document this object belongs to
     * @param Header|null $header   object header; a new Header is created when null
     * @param string|null $content  raw content of the object, stored as given
     * @param Config|null $config   parser configuration, stored as given
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        $this->document = $document;
        // Always keep a Header instance: get(), has() and getDetails()
        // call methods on it without a null check.
        $this->header = $header ?? new Header();
        $this->content = $content;
        $this->config = $config;
    }
92 3
93
    /**
     * Intentionally does nothing in this class.
     *
     * NOTE(review): presumably an initialization hook for subclasses --
     * confirm against the classes extending PDFObject.
     */
    public function init()
    {
        // No initialization work at this level.
    }
96
97 49
    /**
     * Returns the document that was passed to the constructor.
     */
    public function getDocument(): Document
    {
        $owner = $this->document;

        return $owner;
    }
101
102 3
    /**
     * Returns the header held by this object.
     *
     * The constructor substitutes a new Header when none is supplied.
     */
    public function getHeader(): ?Header
    {
        $header = $this->header;

        return $header;
    }
106
107
    /**
     * Returns the configuration passed to the constructor, or null when
     * none was supplied.
     */
    public function getConfig(): ?Config
    {
        $config = $this->config;

        return $config;
    }
111
112 50
    /**
     * Looks up a header entry by name; the lookup is delegated to the
     * header's own get().
     *
     * @param string $name name of the header entry
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        $header = $this->header;

        return $header->get($name);
    }
119
120 3
    /**
     * Tells whether the header has an entry called $name; the check is
     * delegated to the header's own has().
     */
    public function has(string $name): bool
    {
        $header = $this->header;

        return $header->has($name);
    }
124
125 38
    /**
     * Returns the header's details array.
     *
     * @param bool $deep forwarded unchanged to the header's getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        $header = $this->header;

        return $header->getDetails($deep);
    }
129
130 32
    /**
     * Returns the raw content passed to the constructor, or null when
     * none was supplied.
     */
    public function getContent(): ?string
    {
        $raw = $this->content;

        return $raw;
    }
134
135
    /**
     * This function is no longer used, and could be deleted in a
     * future version of PDFParser.
     *
     * @internal Creates a duplicate of the document stream with
     * strings and other items replaced by $char. Formerly
     * getSectionsText() used this output to more easily gather offset
     * values to extract text from the *actual* document stream.
     *
     * Every replacement overwrites a span with the same number of
     * placeholder bytes, so offsets in the result line up with offsets
     * in $content.
     *
     * @param string $content document stream to mask
     * @param string $char    placeholder; only its first byte is used, and
     *                        an empty string falls back to 'X'
     *
     * @return string the masked copy of $content
     */
    public function cleanContent(string $content, string $char = 'X')
    {
        // An empty placeholder would delete the masked spans instead of
        // overwriting them, which shifts every later offset.
        $char = '' === $char ? 'X' : $char[0];

        // Overwrites each captured span (a [text, offset] pair produced by
        // PREG_OFFSET_CAPTURE) with the placeholder, keeping the length.
        $mask = static function (string $subject, array $spans) use ($char): string {
            foreach ($spans as [$text, $offset]) {
                $length = \strlen($text);
                $subject = substr_replace($subject, str_repeat($char, $length), $offset, $length);
            }

            return $subject;
        };

        // Neutralize escaped backslashes and escaped parentheses first so
        // they cannot be mistaken for string delimiters below.
        $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

        // Remove image bloc with binary content
        preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[0] ?? []);

        // Clean content in square brackets [.....]
        preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1] ?? []);

        // Clean content in round brackets (.....)
        preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1] ?? []);

        // Clean structure: everything between '<' and the matching '>'
        // (delimiters included, nesting tracked by $level) is masked.
        $parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        if ($parts) {
            $content = '';
            $level = 0;
            foreach ($parts as $part) {
                if ('<' === $part) {
                    ++$level;
                }

                $content .= 0 === $level ? $part : str_repeat($char, \strlen($part));

                if ('>' === $part) {
                    --$level;
                }
            }
        }

        // Clean BDC and EMC markup. The delimiter is passed to preg_quote()
        // so that a '/' placeholder cannot terminate the pattern early.
        preg_match_all(
            '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
            $content,
            $matches,
            \PREG_OFFSET_CAPTURE
        );
        $content = $mask($content, $matches[1] ?? []);

        preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1] ?? []);

        return $content;
    }
202 29
203 29
    /**
     * @internal Takes a string of PDF document stream text and formats
     * it into a multi-line string with one PDF command on each line,
     * separated by \r\n. If the given string is null, or binary data
     * is detected instead of a document stream then return an empty
     * string.
     *
     * @param string|null $content raw document stream, or null
     *
     * @return string the formatted stream; '' when $content is null or
     *                its encoding cannot be detected
     */
    public function formatContent(?string $content): string
    {
        if (null === $content) {
            return '';
        }

        // Find all strings () and replace them so they aren't affected
        // by the next steps
        $pdfstrings = [];
        $attempt = '(';
        while (preg_match('/'.preg_quote($attempt, '/').'.*?(?<![^\\\\]\\\\)\)/s', $content, $text)) {
            // PDF strings can contain unescaped parentheses as long as
            // they're balanced, so check for balanced parentheses
            $left = preg_match_all('/(?<![^\\\\]\\\\)\(/', $text[0]);
            $right = preg_match_all('/(?<![^\\\\]\\\\)\)/', $text[0]);

            if ($left == $right) {
                // Replace the first occurrence of the string with a unique
                // placeholder. This is done with plain string functions:
                // compiling the whole string as a regex fails for very long
                // strings and would null the content.
                $position = strpos($content, $text[0]);
                if (false === $position) {
                    // Defensive only: $text[0] was just matched in $content.
                    break;
                }
                $id = uniqid('STRING_', true);
                $pdfstrings[$id] = $text[0];
                $content = substr_replace($content, '@@@'.$id.'@@@', $position, \strlen($text[0]));

                // Reset to search for the next string
                $attempt = '(';
            } else {
                // We had unbalanced parentheses, so use the current
                // match as a base to find a longer string
                $attempt = $text[0];
            }
        }

        // Remove all carriage returns and line-feeds from the document stream
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Find all dictionary << >> commands and replace them so they
        // aren't affected by the next steps
        $dictstore = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $dicttext)) {
            $position = strpos($content, $dicttext[0]);
            if (false === $position) {
                // Defensive only: $dicttext[0] was just matched in $content.
                break;
            }
            $dictid = uniqid('DICT_', true);
            $dictstore[$dictid] = $dicttext[1];
            // The operator ($dicttext[2]) stays visible after the placeholder
            // so it still gets its own line break below.
            $content = substr_replace(
                $content,
                ' ###'.$dictid.'###'.$dicttext[2],
                $position,
                \strlen($dicttext[0])
            );
        }

        // Now that all strings and dictionaries are hidden, the only
        // PDF commands left should all be plain text.
        // Detect text encoding of the current string to prevent reading
        // content streams that are images, etc. This prevents PHP
        // error messages when JPEG content is sent to this function
        // by the sample file '12249.pdf' from:
        // https://github.com/smalot/pdfparser/issues/458
        if (false === mb_detect_encoding($content, null, true)) {
            return '';
        }

        // Normalize white-space in the document stream. preg_replace()
        // returns null on a PCRE error; keep the previous content then.
        $content = preg_replace('/\s{2,}/', ' ', $content) ?? $content;

        // Find all valid PDF operators and add \r\n after each; this
        // ensures there is just one command on every line
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
        //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions
        //       appear here in the list for completeness.
        $operators = [
            'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
            'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
            'g', 'G',  'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
            'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
            'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
            'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($operators as $operator) {
            // The look-arounds keep an operator from matching inside a
            // longer word or directly after a '/'.
            $content = preg_replace(
                '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/',
                $operator."\r\n",
                $content
            ) ?? $content;
        }

        // Restore the original content of the dictionary << >> commands
        foreach (array_reverse($dictstore, true) as $id => $dict) {
            $content = str_replace('###'.$id.'###', $dict, $content);
        }

        // Restore the original string content
        foreach (array_reverse($pdfstrings, true) as $id => $text) {
            // Strings may contain escaped newlines, or literal newlines
            // and we should clean these up before replacing the string
            // back into the content stream; this ensures no strings are
            // split between two lines (every command must be on one line)
            $text = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $text
            );

            $content = str_replace('@@@'.$id.'@@@', $text, $content);
        }

        // Collapse repeated line breaks and strip indentation after a break.
        $content = preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content) ?? $content;

        return trim($content);
    }
326
327 18
    /**
     * @internal Formats the raw document stream with formatContent(),
     * splits it into one command per line and keeps only the commands
     * needed for text positioning/extraction.
     *
     * Inside a BT ... ET text block every line is kept. Outside of one,
     * only lines ending in q/Q or in one of the operators listed in
     * $keepOutsideText are kept.
     *
     * @param string|null $content entire, unformatted document stream
     *
     * @return array unprocessed PDF commands, one command per element
     */
    public function getSectionsText(?string $content): array
    {
        // Operators kept outside of text blocks. Each pattern is anchored
        // so it matches that operator only (e.g. 'cm' but not 'scm').
        // Add more commands here as you find them in weird PDFs!
        $keepOutsideText = [
            '/(?<!\w)B[DM]C$/', // begin marked content sequence
            '/(?<!\w)[DM]P$/',  // marked content point
            '/(?<!\w)EMC$/',    // end marked content sequence
            '/(?<!\w)cm$/',     // graphics position change
            '/(?<!\w)Tf$/',     // font change
            '/(?<!\w)Do$/',     // invoke named XObject
        ];

        // A formatted stream has one command on every line
        $lines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->formatContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        $sections = [];
        $insideTextBlock = false;
        foreach ($lines as $rawLine) {
            $line = trim($rawLine);
            if ('' === $line) {
                continue;
            }

            // 'BT' opens a text block and is itself kept
            if (preg_match('/BT$/', $line)) {
                $insideTextBlock = true;
                $sections[] = $line;
                continue;
            }

            // 'ET' closes the text block and is itself kept
            if ('ET' === $line) {
                $insideTextBlock = false;
                $sections[] = $line;
                continue;
            }

            if ($insideTextBlock) {
                $sections[] = $line;
                continue;
            }

            // Save/restore graphics state commands end in q or Q
            $lastChar = $line[-1];
            $keep = 'q' === $lastChar || 'Q' === $lastChar;
            if (!$keep) {
                foreach ($keepOutsideText as $pattern) {
                    if (preg_match($pattern, $line)) {
                        $keep = true;
                        break;
                    }
                }
            }

            if ($keep) {
                $sections[] = $line;
            }
        }

        return $sections;
    }
404
405 9
    /**
     * Picks the font to start decoding with.
     *
     * Candidates are the fonts of $page (when a page is given) followed by
     * the document's first font; the first candidate is returned. When
     * there is no candidate, a new Font is built from this object's
     * document and config.
     *
     * @param Page|null $page page whose fonts take precedence, if any
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $fonts = null !== $page ? $page->getFonts() : [];

        $firstFont = $this->document->getFirstFont();
        if (null !== $firstFont) {
            $fonts[] = $firstFont;
        }

        if (\count($fonts) > 0) {
            return reset($fonts);
        }

        return new Font($this->document, null, null, $this->config);
    }
423
424 7
    /**
     * @internal decode a '[]TJ' command and attempt to use alternate
     * fonts if the current font results in output that contains
     * Unicode control characters
     *
     * Without a page there are no alternate fonts to try, so the result
     * of $font is returned as is. When every font of the page has been
     * tried without a clean result, the text decoded by $font is returned.
     *
     * @param array<int,array<string,string|bool>> $command
     * @param Page|null                            $page       page providing the alternate fonts
     * @param float                                $fontFactor forwarded to Font::decodeText()
     */
    private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string
    {
        $orig_text = $font->decodeText($command, $fontFactor);

        // If we make this a Config option, we can add a check if it's
        // enabled here.
        if (null === $page) {
            return $orig_text;
        }

        $text = $orig_text;
        $font_ids = array_keys($page->getFonts());

        // If the decoded text contains control characters then the font
        // being used is probably the wrong one. Loop through the page's
        // fonts to see if we can get a good decode. Allow x09 to x0d
        // which are whitespace.
        // preg_match() with /u does not match on invalid UTF-8, so NUL is
        // also looked for as a raw byte. Searching the bin2hex() output for
        // '00' would match across byte boundaries too (" \n" is "200a").
        while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\0")) {
            // If we're out of font IDs, then give up and use the
            // original string
            if ([] === $font_ids) {
                return $orig_text;
            }

            // Try the next font ID
            $font = $page->getFont(array_shift($font_ids));
            $text = $font->decodeText($command, $fontFactor);
        }

        return $text;
    }
460
461 6
    /**
     * @internal expects a string that is a full PDF dictionary object,
     * including the outer enclosing << >> angle brackets
     *
     * Names become array keys (without the leading slash) and the token
     * following a name becomes its value. Nested << >> dictionaries and
     * [ ] arrays become nested arrays; an array holding a single string
     * that does not start with '/' is collapsed back into the string
     * '[...]'. A closing ']' or '>>' without a matching opener is ignored.
     *
     * @param string $dictionary dictionary source text
     *
     * @return array the parsed dictionary
     *
     * @throws \Exception when $dictionary does not start with '<<'
     */
    public function parseDictionary(string $dictionary): array
    {
        // Normalize whitespace
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

        if ('<<' !== substr($dictionary, 0, 2)) {
            throw new \Exception('Not a valid dictionary object.');
        }

        $parsed = [];
        $stack = [];
        $currentName = '';
        $arrayTypeNumeric = false;

        // Remove outer layer of dictionary, and split on tokens
        $split = preg_split(
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
            trim(preg_replace('/^<<|>>$/', '', $dictionary)),
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        ) ?: [];

        foreach ($split as $token) {
            $token = trim($token);
            switch ($token) {
                case '':
                    break;

                    // Open numeric array / open hashed array
                case '[':
                case '<<':
                    $parsed[$currentName] = [];
                    $arrayTypeNumeric = '[' === $token;

                    // Move up one level in the stack: remember the parent
                    // by reference and make $parsed point at the new child
                    $stack[\count($stack)] = &$parsed;
                    $parsed = &$parsed[$currentName];
                    $currentName = '';
                    break;

                    // Close numeric array / close hashed array
                case ']':
                case '>>':
                    $arrayTypeNumeric = false;

                    // A closer without a matching opener has no parent to
                    // return to. Ignore it instead of rebinding $parsed to
                    // a null stack slot.
                    if ([] === $stack) {
                        break;
                    }

                    // Revert string type arrays back to a single element
                    if (']' === $token
                        && \is_array($parsed) && 1 === \count($parsed)
                        && isset($parsed[0]) && \is_string($parsed[0])
                        && '' !== $parsed[0] && '/' !== $parsed[0][0]) {
                        $parsed = '['.$parsed[0].']';
                    }

                    // Move down one level in the stack
                    $parsed = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                    break;

                default:
                    if ('/' === substr($token, 0, 1)) {
                        // A name: inside a numeric array it is a value,
                        // otherwise it is the key for the next token
                        $currentName = substr($token, 1);
                        if ($arrayTypeNumeric) {
                            $parsed[] = $currentName;
                            $currentName = '';
                        }
                    } elseif ('' !== $currentName) {
                        // A value for the pending name
                        if (!$arrayTypeNumeric) {
                            $parsed[$currentName] = $token;
                        }
                        $currentName = '';
                    } else {
                        // A value with no pending name is appended as is
                        $parsed[] = $token;
                    }
            }
        }

        return $parsed;
    }
557 4
558
    /**
     * Returns the text content of a PDF as a string. Attempts to add
     * whitespace for spacing and line-breaks where appropriate.
     *
     * @internal getText() leverages getTextArray() to get the content
     * of the document, setting the addPositionWhitespace flag to true
     * so whitespace is inserted in a logical way for reading by
     * humans
     *
     * @param Page|null $page forwarded unchanged to getTextArray()
     *
     * @return string the elements of getTextArray() joined together,
     *                followed by a single space
     */
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;
        try {
            $result = $this->getTextArray($page);
        } finally {
            // getTextArray() is documented to throw; reset the flag either
            // way so it does not stay enabled for later calls.
            $this->addPositionWhitespace = false;
        }

        return implode('', $result).' ';
    }
575
576 6
    /**
577
     * Returns the text content of a PDF as an array of strings. No
578
     * extra whitespace is inserted besides what is actually encoded in
579 29
     * the PDF text.
580
     *
581 29
     * @throws \Exception
582
     */
583 29
    public function getTextArray(Page $page = null): array
584 29
    {
585 29
        $result = [];
586
        $text = [];
587 29
588 29
        $marked_stack = [];
589 29
        $last_written_position = false;
590
591 29
        $sections = $this->getSectionsText($this->content);
592 29
        $current_font = $this->getDefaultFont($page);
593 29
        $current_font_size = 1;
594 29
        $current_text_leading = 0;
595 29
596 29
        $current_position = ['x' => false, 'y' => false];
597
        $current_position_tm = [
598
            'a' => 1, 'b' => 0, 'c' => 0,
599
            'i' => 0, 'j' => 1, 'k' => 0,
600 29
            'x' => 0, 'y' => 0, 'z' => 1,
601 29
        ];
602 29
        $current_position_td = ['x' => 0, 'y' => 0];
603 11
        $current_position_cm = [
604 11
            'a' => 1, 'b' => 0, 'c' => 0,
605 11
            'i' => 0, 'j' => 1, 'k' => 0,
606
            'x' => 0, 'y' => 0, 'z' => 1,
607
        ];
608
609 11
        $clipped_font = [];
610 11
        $clipped_position_cm = [];
611 11
612
        self::$recursionStack[] = $this->getUniqueId();
613 29
614
        foreach ($sections as $section) {
615 29
            $commands = $this->getCommandsText($section);
616 29
            foreach ($commands as $command) {
617
                switch ($command[self::OPERATOR]) {
618 25
                    // Begin text object
619 25
                    case 'BT':
620 25
                        // Reset text positioning matrices
621
                        $current_position_tm = [
622 25
                            'a' => 1, 'b' => 0, 'c' => 0,
623
                            'i' => 0, 'j' => 1, 'k' => 0,
624 25
                            'x' => 0, 'y' => 0, 'z' => 1,
625 25
                        ];
626 25
                        $current_position_td = ['x' => 0, 'y' => 0];
627
                        $current_text_leading = 0;
628
                        break;
629 25
630 25
                        // Begin marked content sequence with property list
631
                    case 'BDC':
632 25
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
0 ignored issues
show
Bug introduced by
It seems like $command[self::COMMAND] can also be of type array and array<mixed,array<string,mixed|string>>; however, parameter $subject of preg_match() does only seem to accept string, maybe add an additional type check? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

632
                        if (preg_match('/(<<.*>>)$/', /** @scrutinizer ignore-type */ $command[self::COMMAND], $match)) {
Loading history...
633
                            $dict = $this->parseDictionary($match[1]);
634 29
635 29
                            // Check for ActualText block
636
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
637 14
                                if ('[' == $dict['ActualText'][0]) {
638 14
                                    // Simulate a 'TJ' command on the stack
639 14
                                    $marked_stack[] = [
640 14
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
641 14
                                    ];
642 14
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
643
                                    // Simulate a 'Tj' command on the stack
644
                                    $marked_stack[] = [
645 14
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
646 9
                                    ];
647 9
                                }
648
                            }
649 14
                        }
650
                        break;
651 29
652 29
                        // Begin marked content sequence
653 22
                    case 'BMC':
654 22
                        if ('ReversedChars' == $command[self::COMMAND]) {
655 22
                            // Upon encountering a ReversedChars command,
656 22
                            // add the characters we've built up so far to
657 22
                            // the result array
658 22
                            $result = array_merge($result, $text);
659 22
660
                            // Start a fresh $text array that will contain
661
                            // reversed characters
662 22
                            $text = [];
663 22
664 22
                            // Add the reversed text flag to the stack
665
                            $marked_stack[] = ['ReversedChars' => true];
666
                        }
667 16
                        break;
668 16
669
                        // set graphics position matrix
670 22
                    case 'cm':
671
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
0 ignored issues
show
Bug introduced by
It seems like $command[self::COMMAND] can also be of type array and array<mixed,array<string,mixed|string>>; however, parameter $subject of preg_split() does only seem to accept string, maybe add an additional type check? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

671
                        $args = preg_split('/\s+/s', /** @scrutinizer ignore-type */ $command[self::COMMAND]);
Loading history...
672
                        $current_position_cm = [
673
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
674
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
675 22
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
676
                        ];
677 22
                        break;
678 22
679
                    case 'Do':
680 22
                        if (null !== $page) {
681
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
682 22
                            $id = trim(array_pop($args), '/ ');
683 22
                            $xobject = $page->getXObject($id);
684
685 22
                            // @todo $xobject could be a ElementXRef object, which would then throw an error
686 18
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack)) {
687 18
                                // Not a circular reference.
688
                                $text[] = $xobject->getText($page);
689
                            }
690 22
                        }
691
                        break;
692
693 29
                        // Marked content point with (DP) & without (MP) property list
694 1
                    case 'DP':
695 29
                    case 'MP':
696 29
                        break;
697 29
698
                        // End text object
699
                    case 'ET':
700
                        break;
701 29
702 29
                        // Store current selected font and graphics matrix
703 29
                    case 'q':
704 24
                        $clipped_font[] = [$current_font, $current_font_size];
705 22
                        $clipped_position_cm[] = $current_position_cm;
706 22
                        break;
707 22
708 17
                        // Restore previous selected font and graphics matrix
709 17
                    case 'Q':
710 17
                        list($current_font, $current_font_size) = array_pop($clipped_font);
711 17
                        $current_position_cm = array_pop($clipped_position_cm);
712 17
                        break;
713
714
                        // End marked content sequence
715
                    case 'EMC':
716 29
                        $data = false;
717 29
                        if (\count($marked_stack)) {
718 29
                            $marked = array_pop($marked_stack);
719 29
                            $action = key($marked);
720 29
                            $data = $marked[$action];
721
722
                            switch ($action) {
723 25
                                // If we are in ReversedChars mode...
724
                                case 'ReversedChars':
725
                                    // Reverse the characters we've built up so far
726
                                    foreach ($text as $key => $t) {
727 29
                                        $text[$key] = implode('', array_reverse(
728
                                            mb_str_split($t, 1, mb_internal_encoding())
0 ignored issues
show
Bug introduced by
It seems like mb_internal_encoding() can also be of type true; however, parameter $encoding of mb_str_split() does only seem to accept null|string, maybe add an additional type check? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

728
                                            mb_str_split($t, 1, /** @scrutinizer ignore-type */ mb_internal_encoding())
Loading history...
729
                                        ));
730 42
                                    }
731
732
                                    // Add these characters to the result array
733
                                    $result = array_merge($result, $text);
734
735
                                    // Start a fresh $text array that will contain
736 42
                                    // non-reversed characters
737 42
                                    $text = [];
738 8
                                    break;
739 8
740 3
                                case 'ActualText':
741
                                    // Use the content of the ActualText as a command
742 6
                                    $command = $data;
743 6
                                    break;
744
                            }
745
                        }
746
747
                        // If this EMC command has been transformed into a 'Tj'
748 42
                        // or 'TJ' command because of being ActualText, then bypass
749 41
                        // the break to proceed to the writing section below.
750
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
751 42
                            break;
752 41
                        }
753
754 42
                        // no break
755 6
                    case "'":
756
                    case '"':
757 42
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
758 41
                            // Move to next line and write text
759 41
                            $current_position['x'] = 0;
760
                            $current_position_td['x'] = 0;
761 41
                            $current_position_td['y'] += $current_text_leading;
762 41
                        }
763
                        // no break
764
                    case 'Tj':
765
                        $command[self::COMMAND] = [$command];
766
                        // no break
767
                    case 'TJ':
768 42
                        // Check the marked content stack for flags
769
                        $actual_text = false;
770
                        $reverse_text = false;
771
                        foreach ($marked_stack as $marked) {
772
                            if (isset($marked['ActualText'])) {
773
                                $actual_text = true;
774
                            }
775 20
                            if (isset($marked['ReversedChars'])) {
776
                                $reverse_text = true;
777 20
                            }
778
                        }
779
780
                        // Account for text position ONLY just before we write text
781
                        if (false === $actual_text && \is_array($last_written_position)) {
782
                            // If $last_written_position is an array, that
783
                            // means we have stored text position coordinates
784
                            // for placing an ActualText
785
                            $currentX = $last_written_position[0];
786
                            $currentY = $last_written_position[1];
787
                            $last_written_position = false;
788
                        } else {
789
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
790
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
791
                        }
792
                        $whiteSpace = '';
793
794
                        $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i'];
795
                        $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j'];
796
797
                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
798
                            $curY = $currentY - $current_position['y'];
799
                            if (abs($curY) >= abs($factorY) / 4) {
800
                                $whiteSpace = "\n";
801
                            } else {
802
                                if (true === $reverse_text) {
803
                                    $curX = $current_position['x'] - $currentX;
804
                                } else {
805
                                    $curX = $currentX - $current_position['x'];
806
                                }
807
808
                                // In abs($factorX * 7) below, the 7 is chosen arbitrarily
809
                                // as the number of apparent "spaces" in a document we
810
                                // would need before considering them a "tab". In the
811
                                // future, we might offer this value to users as a config
812
                                // option.
813
                                if ($curX >= abs($factorX * 7)) {
814
                                    $whiteSpace = "\t";
815
                                } elseif ($curX >= abs($factorX * 2)) {
816
                                    $whiteSpace = ' ';
817
                                }
818
                            }
819
                        }
820
821
                        $newtext = $this->getTJUsingFontFallback(
822
                            $current_font,
823
                            $command[self::COMMAND],
824
                            $page,
825
                            $factorX
826
                        );
827
828
                        // If there is no ActualText pending then write
829
                        if (false === $actual_text) {
830
                            $newtext = str_replace(["\r", "\n"], '', $newtext);
831
                            if (false !== $reverse_text) {
832
                                // If we are in ReversedChars mode, add the whitespace last
833
                                $text[] = preg_replace('/  $/', ' ', $newtext.$whiteSpace);
834
                            } else {
835
                                // Otherwise add the whitespace first
836
                                if (' ' === $whiteSpace && isset($text[\count($text) - 1])) {
837
                                    $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]);
838
                                }
839
                                $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext);
840
                            }
841
842
                            // Record the position of this inserted text for comparison
843
                            // with the next text block.
844
                            // Provide a 'fudge' factor guess on how wide this text block
845
                            // is based on the number of characters. This helps limit the
846
                            // number of tabs inserted, but isn't perfect.
847
                            $factor = $factorX / 2;
848
                            $current_position = [
849
                                'x' => $currentX - mb_strlen($newtext) * $factor,
850
                                'y' => $currentY,
851
                            ];
852
                        } elseif (false === $last_written_position) {
853
                            // If there is an ActualText in the pipeline
854
                            // store the position this undisplayed text
855
                            // *would* have been written to, so the
856
                            // ActualText is displayed in the right spot
857
                            $last_written_position = [$currentX, $currentY];
858
                            $current_position['x'] = $currentX;
859
                        }
860
                        break;
861
862
                        // move to start of next line
863
                    case 'T*':
864
                        $current_position['x'] = 0;
865
                        $current_position_td['x'] = 0;
866
                        $current_position_td['y'] += $current_text_leading;
867
                        break;
868
869
                        // set character spacing
870
                    case 'Tc':
871
                        break;
872
873
                        // move text current point and set leading
874
                    case 'Td':
875
                    case 'TD':
876
                        // move text current point
877
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
878
                        $y = (float) array_pop($args);
879
                        $x = (float) array_pop($args);
880
881
                        if ('TD' == $command[self::OPERATOR]) {
882
                            $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j'];
883
                        }
884
885
                        $current_position_td = [
886
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
887
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
888
                        ];
889
                        break;
890
891
                    case 'Tf':
892
                        $args = preg_split('/\s/s', $command[self::COMMAND]);
893
                        $size = (float) array_pop($args);
894
                        $id = trim(array_pop($args), '/');
895
                        if (null !== $page) {
896
                            $new_font = $page->getFont($id);
897
                            // If an invalid font ID is given, do not update the font.
898
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
899
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
900
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
901
                            // But we want to make sure that malformed PDFs do not simply crash.
902
                            if (null !== $new_font) {
903
                                $current_font = $new_font;
904
                                $current_font_size = $size;
905
                            }
906
                        }
907
                        break;
908
909
                        // set leading
910
                    case 'TL':
911
                        $y = (float) $command[self::COMMAND];
912
                        $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j'];
913
                        break;
914
915
                        // set text position matrix
916
                    case 'Tm':
917
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
918
                        $current_position_tm = [
919
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
920
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
921
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
922
                        ];
923
                        break;
924
925
                        // set text rendering mode
926
                    case 'Ts':
927
                        break;
928
929
                        // set super/subscripting text rise
930
                    case 'Ts':
931
                        break;
932
933
                        // set word spacing
934
                    case 'Tw':
935
                        break;
936
937
                        // set horizontal scaling
938
                    case 'Tz':
939
                        break;
940
941
                    default:
942
                }
943
            }
944
        }
945
946
        $result = array_merge($result, $text);
947
948
        return $result;
949
    }
950
951
    /**
     * Parses one already formatted, single-line command from a document
     * stream into its type, operator and operand parts.
     *
     * The companion function getSectionsText() returns a document stream
     * as an array of single commands for just this purpose. Because of
     * this, the argument $offset is no longer used, and may be removed in
     * a future PdfParser release.
     *
     * A better name for this function would be getCommandText() since it
     * now always works on just one command.
     *
     * @param string $text_part a single command line, operands first and
     *                          the operator last (e.g. "(Hello) Tj")
     * @param int    $offset    unused; kept for backward compatibility
     *
     * @return array a list holding at most one entry with the keys
     *               self::TYPE (leading delimiter of the operands, or ''),
     *               self::OPERATOR and self::COMMAND. For the 'TJ'
     *               operator, self::COMMAND is itself a list of entries of
     *               the same shape: '(' and '<' entries for strings and
     *               'n' entries for the numbers following them. The list
     *               is empty when $text_part does not end in an operator.
     */
    public function getCommandsText(string $text_part, int &$offset = 0): array
    {
        $matches = [];

        // Group 1: the operands, group 2: their optional leading delimiter
        // ('/', '[', '(' or '<'), group 3: the trailing operator.
        $found = preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $matches);

        // preg_match() leaves $matches empty when nothing matched (and
        // returns false on error). Bail out instead of reading undefined
        // offsets and emitting a meaningless command.
        if (1 !== $found) {
            return [];
        }

        $type = $matches[2];
        $operator = $matches[3];
        $command = trim($matches[1]);

        if ('TJ' === $operator) {
            $subcommand = [];
            $command = trim($command, '[]');

            // Consume the array content from the left, one string (plus
            // its optional trailing number) at a time, until a pass
            // removes nothing.
            do {
                $oldCommand = $command;

                // Search for parentheses string () format
                if (preg_match('/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/', $command, $tjmatch)) {
                    $subcommand[] = [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $tjmatch[1],
                    ];
                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
                        $subcommand[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $tjmatch[2],
                        ];
                    }
                    $command = substr($command, \strlen($tjmatch[0]));
                }

                // Search for hexadecimal <> format
                if (preg_match('/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i', $command, $tjmatch)) {
                    // Whitespace inside a hexadecimal string is removed
                    // before the string is stored
                    $tjmatch[1] = preg_replace('/\s/', '', $tjmatch[1]);
                    $subcommand[] = [
                        self::TYPE => '<',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $tjmatch[1],
                    ];
                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
                        $subcommand[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $tjmatch[2],
                        ];
                    }
                    $command = substr($command, \strlen($tjmatch[0]));
                }
            } while ($command !== $oldCommand);

            $command = $subcommand;
        } elseif ('Tj' === $operator || "'" === $operator || '"' === $operator) {
            // Depending on the string type, trim the data of the
            // appropriate delimiters
            if ('(' === $type) {
                // Don't use trim() here since a () string may end with
                // a balanced or escaped right parentheses, and trim()
                // will delete both. Both strings below are valid:
                //   eg. (String())
                //   eg. (String\))
                $command = preg_replace('/^\(|\)$/', '', $command);
            } elseif ('<' === $type) {
                $command = trim($command, '<>');
            }
        } elseif ('/' === $type) {
            // Drop the leading slash of a name operand
            $command = substr($command, 1);
        }

        return [
            [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $command,
            ],
        ];
    }
1040
1041
    /**
     * Creates the object matching the "Type" (and, where relevant, the
     * "Subtype") entry of the given header.
     *
     * - XObject: Image or Form depending on the subtype, a plain
     *   PDFObject for any other subtype
     * - Pages, Page, Encoding: the class of the same name
     * - Font: the class \Smalot\PdfParser\Font\Font<Subtype> when it
     *   exists, the generic Font class otherwise
     * - anything else: a plain PDFObject
     *
     * @param Document    $document passed through to the created object
     * @param Header      $header   header whose Type/Subtype select the class
     * @param string|null $content  raw content of the object
     * @param Config|null $config   optional configuration; when given, its
     *                              getRetainImageContent() decides whether
     *                              an Image keeps its content
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is optional: calling a method on it
                        // unconditionally is a fatal error when it is
                        // null. Without a config nothing asks for the
                        // image data to be dropped, so it is kept.
                        $retainContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                // XObject with an unhandled subtype
                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                // Prefer the subtype specific font class when there is one
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
1082
1083
    /**
     * Gives the identifier used to recognise this very instance, e.g. when
     * looking it up in self::$recursionStack.
     *
     * @return string the spl_object_hash() of this object
     */
    protected function getUniqueId(): string
    {
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
1090
}
1091