PDFObject::getText()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 1

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 1
eloc 4
c 1
b 1
f 0
nc 1
nop 1
dl 0
loc 7
ccs 5
cts 5
cp 1
crap 1
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Exception\InvalidDictionaryObjectException;
36
use Smalot\PdfParser\XObject\Form;
37
use Smalot\PdfParser\XObject\Image;
38
39
/**
40
 * Class PDFObject
41
 */
42
class PDFObject
43
{
44
    public const TYPE = 't';
45
46
    public const OPERATOR = 'o';
47
48
    public const COMMAND = 'c';
49
50
    /**
51
     * The recursion stack.
52
     *
53
     * @var array
54
     */
55
    public static $recursionStack = [];
56
57
    /**
58
     * @var Document|null
59
     */
60
    protected $document;
61
62
    /**
63
     * @var Header
64
     */
65
    protected $header;
66
67
    /**
68
     * @var string|null
69
     */
70
    protected $content;
71
72
    /**
73
     * @var Config|null
74
     */
75
    protected $config;
76
77
    /**
78
     * @var bool
79
     */
80
    protected $addPositionWhitespace = false;
81
82 98
    /**
     * Stores the owning document and the optional header, content and
     * configuration on the new object.
     *
     * @param Document    $document the document stored on this object
     * @param Header|null $header   header to use; when null, a new Header() is created
     * @param string|null $content  content to store, may be null
     * @param Config|null $config   configuration to store, may be null
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        $this->config = $config;
        $this->content = $content;
        $this->header = $header ?? new Header();
        $this->document = $document;
    }
93
94 74
    /**
     * Does nothing in this class.
     *
     * NOTE(review): presumably an initialization hook meant to be
     * overridden by subclasses - confirm against the callers.
     */
    public function init()
    {
    }
97
98 4
    /**
     * Returns the document that was passed to the constructor.
     */
    public function getDocument(): Document
    {
        return $this->document;
    }
102
103 74
    /**
     * Returns the header held by this object.
     */
    public function getHeader(): ?Header
    {
        return $this->header;
    }
107
108 4
    /**
     * Returns the configuration held by this object, or null when none
     * was given to the constructor.
     */
    public function getConfig(): ?Config
    {
        return $this->config;
    }
112
113
    /**
     * Looks up $name in this object's header and returns whatever the
     * header's get() returns for it.
     *
     * @param string $name name of the header entry to fetch
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        return $this->header->get($name);
    }
120
121 76
    /**
     * Tells whether this object's header reports an entry named $name.
     *
     * @param string $name name of the header entry to test
     */
    public function has(string $name): bool
    {
        return $this->header->has($name);
    }
125
126 4
    /**
     * Returns the details array produced by this object's header.
     *
     * @param bool $deep forwarded unchanged to the header's getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        return $this->header->getDetails($deep);
    }
130
131 60
    /**
     * Returns the content given to the constructor, or null when there
     * was none.
     */
    public function getContent(): ?string
    {
        return $this->content;
    }
135
136
    /**
     * Creates a duplicate of the document stream with
     * strings and other items replaced by $char. Formerly
     * getSectionsText() used this output to more easily gather offset
     * values to extract text from the *actual* document stream.
     *
     * Every replacement performed here is length-preserving, so byte
     * offsets in the returned string line up with offsets in $content.
     *
     * @deprecated function is no longer used and will be removed in a future release
     *
     * @internal
     *
     * @param string $content the document stream to mask
     * @param string $char    masking character; only its first byte is
     *                        used, and an empty string falls back to 'X'
     *
     * @return string the masked copy of $content
     */
    public function cleanContent(string $content, string $char = 'X')
    {
        // The mask must be exactly one byte wide, otherwise the masked
        // text would change length and the captured offsets below would
        // no longer be valid. An empty $char cannot mask anything, so
        // the default is used instead of reading an undefined offset.
        $char = '' === $char ? 'X' : $char[0];

        // Overwrites each captured [text, offset] pair with $char,
        // keeping the subject's length unchanged.
        $mask = static function (string $subject, array $captures) use ($char): string {
            foreach ($captures as $capture) {
                $length = \strlen($capture[0]);
                $subject = substr_replace($subject, str_repeat($char, $length), $capture[1], $length);
            }

            return $subject;
        };

        // Neutralize escaped backslashes and parentheses first
        $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

        // Remove image bloc with binary content
        preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[0]);

        // Clean content in square brackets [.....]
        preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1]);

        // Clean content in round brackets (.....)
        preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1]);

        // Clean structure: everything from an opening '<' up to and
        // including its matching '>' is masked, nesting included
        $parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        if ($parts) {
            $content = '';
            $level = 0;
            foreach ($parts as $part) {
                if ('<' == $part) {
                    ++$level;
                }

                $content .= (0 == $level ? $part : str_repeat($char, \strlen($part)));

                if ('>' == $part) {
                    --$level;
                }
            }
        }

        // Clean BDC and EMC markup. The '/' delimiter is passed to
        // preg_quote() so a '/' mask character cannot terminate the
        // pattern early.
        preg_match_all(
            '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
            $content,
            $matches,
            \PREG_OFFSET_CAPTURE
        );
        $content = $mask($content, $matches[1]);

        preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1]);

        return $content;
    }
204
205
    /**
     * Takes a string of PDF document stream text and formats
     * it into a multi-line string with one PDF command on each line,
     * separated by \r\n. If the given string is null, or binary data
     * is detected instead of a document stream then return an empty
     * string.
     *
     * Inline images, (strings) and << >> dictionaries are swapped for
     * unique placeholders while the operators are split onto separate
     * lines, and restored afterwards. The placeholders are inserted by
     * byte offset rather than by building a regular expression from the
     * matched text, so arbitrarily long strings or image data cannot
     * exceed PCRE's compiled pattern size limit.
     *
     * @param string|null $content raw document stream, or null
     *
     * @return string the formatted stream, or '' for null/binary input
     */
    private function formatContent(?string $content): string
    {
        if (null === $content) {
            return '';
        }

        // Outside of (String) and inline image content in PDF document
        // streams, all text should conform to UTF-8. Test for binary
        // content by deleting everything after the first open-
        // parenthesis ( which indicates the beginning of a string, or
        // the first ID command which indicates the beginning of binary
        // inline image content. Then test what remains for valid
        // UTF-8. If it's not UTF-8, return an empty string as this
        // $content is most likely binary. Unfortunately, using
        // mb_check_encoding(..., 'UTF-8') is not strict enough, so the
        // following regexp, adapted from the W3, is used. See:
        // https://www.w3.org/International/questions/qa-forms-utf-8.en
        // We use preg_replace() instead of preg_match() to avoid "JIT
        // stack limit exhausted" errors on larger files.
        $utf8Filter = preg_replace('/(
            [\x09\x0A\x0D\x20-\x7E] |            # ASCII
            [\xC2-\xDF][\x80-\xBF] |             # non-overlong 2-byte
            \xE0[\xA0-\xBF][\x80-\xBF] |         # excluding overlongs
            [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} |  # straight 3-byte
            \xED[\x80-\x9F][\x80-\xBF] |         # excluding surrogates
            \xF0[\x90-\xBF][\x80-\xBF]{2} |      # planes 1-3
            [\xF1-\xF3][\x80-\xBF]{3} |          # planes 4-15
            \xF4[\x80-\x8F][\x80-\xBF]{2}        # plane 16
        )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content));

        if ('' !== $utf8Filter) {
            return '';
        }

        // Find all inline image content and replace them so they aren't
        // affected by the next steps
        $pdfInlineImages = [];
        $offsetBI = 0;
        while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) {
            // Attempt to determine if this instance of the 'BI' command
            // actually occurred within a (string) using the following
            // steps:

            // Step 1: Remove any escaped slashes and parentheses from
            // the alleged image characteristics data
            $para = str_replace(['\\\\', '\\(', '\\)'], '', $text[1][0]);

            // Step 2: Remove all correctly ordered and balanced
            // parentheses from (strings)
            do {
                $paraTest = $para;
                $para = preg_replace('/\(([^()]*)\)/', '$1', $paraTest);
            } while ($para != $paraTest);

            $paraOpen = strpos($para, '(');
            $paraClose = strpos($para, ')');

            // Check: If the remaining text contains a close parenthesis
            // ')' AND it occurs before any open parenthesis, then we
            // are almost certain to be inside a (string)
            if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) {
                // Bump the search offset forward and match again
                $offsetBI = (int) $text[1][1];
                continue;
            }

            // Step 3: Double check that this is actually inline image
            // data by parsing the alleged image characteristics as a
            // dictionary
            $dict = $this->parseDictionary('<<'.$text[1][0].'>>');

            // Check if an image Width and Height are set in the dict
            if ((isset($dict['W']) || isset($dict['Width']))
                && (isset($dict['H']) || isset($dict['Height']))) {
                $id = uniqid('IMAGE_', true);
                $pdfInlineImages[$id] = [
                    preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]),
                    preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]),
                ];

                // Swap exactly the span that was matched, using the
                // captured byte offset; the image data is never turned
                // into a regular expression
                $content = substr_replace(
                    $content,
                    '^^^'.$id.'^^^',
                    (int) $text[0][1],
                    \strlen($text[0][0])
                );
            } else {
                // If there was no valid dictionary, or a height and width
                // weren't specified, then we don't know what this is, so
                // just leave it alone; bump the search offset forward and
                // match again
                $offsetBI = (int) $text[1][1];
            }
        }

        // Find all strings () and replace them so they aren't affected
        // by the next steps
        $pdfstrings = [];
        $searchFrom = 0;
        while (false !== ($open = strpos($content, '(', $searchFrom))) {
            $replaced = false;
            $close = $open;

            // Grow the candidate one close-parenthesis at a time until
            // it forms a complete string
            while (false !== ($close = strpos($content, ')', $close + 1))) {
                $candidate = substr($content, $open, $close - $open + 1);

                // Remove all escaped slashes and parentheses from the target text
                $para = str_replace(['\\\\', '\\(', '\\)'], '', $candidate);

                // PDF strings can contain unescaped parentheses as long as
                // they're balanced, so check for balanced parentheses. The
                // last character must still be ')' after unescaping,
                // otherwise the candidate ended on an escaped '\)'.
                if (')' == $para[-1] && substr_count($para, '(') == substr_count($para, ')')) {
                    // Replace the string with a unique placeholder
                    $id = uniqid('STRING_', true);
                    $pdfstrings[$id] = $candidate;
                    $content = substr_replace($content, '@@@'.$id.'@@@', $open, \strlen($candidate));
                    $replaced = true;
                    break;
                }
            }

            if (!$replaced) {
                // No later ')' completes the string that starts at
                // $open, so there is nothing more to extract
                break;
            }

            // Everything before $open is free of '(' and the placeholder
            // contains none, so the next search can resume from here
            $searchFrom = $open;
        }

        // Remove all carriage returns and line-feeds from the document stream
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Find all dictionary << >> commands and replace them so they
        // aren't affected by the next steps
        $dictstore = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext, \PREG_OFFSET_CAPTURE)) {
            $dictid = uniqid('DICT_', true);
            $dictstore[$dictid] = $dicttext[1][0];
            $content = substr_replace(
                $content,
                ' ###'.$dictid.'###'.$dicttext[2][0],
                (int) $dicttext[0][1],
                \strlen($dicttext[0][0])
            );
        }

        // Normalize white-space in the document stream
        $content = preg_replace('/\s{2,}/', ' ', $content);

        // Find all valid PDF operators and add \r\n after each; this
        // ensures there is just one command on every line
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
        //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions
        //       appear here in the list for completeness.
        $operators = [
            'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
            'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
            'g', 'G',  'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
            'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
            'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
            'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($operators as $operator) {
            $content = preg_replace(
                '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/',
                $operator."\r\n",
                $content
            );
        }

        // Restore the original content of the dictionary << >> commands
        $dictstore = array_reverse($dictstore, true);
        foreach ($dictstore as $id => $dict) {
            $content = str_replace('###'.$id.'###', $dict, $content);
        }

        // Restore the original string content
        $pdfstrings = array_reverse($pdfstrings, true);
        foreach ($pdfstrings as $id => $text) {
            // Strings may contain escaped newlines, or literal newlines
            // and we should clean these up before replacing the string
            // back into the content stream; this ensures no strings are
            // split between two lines (every command must be on one line)
            $text = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $text
            );

            $content = str_replace('@@@'.$id.'@@@', $text, $content);
        }

        // Restore the original content of any inline images
        $pdfInlineImages = array_reverse($pdfInlineImages, true);
        foreach ($pdfInlineImages as $id => $image) {
            $content = str_replace(
                '^^^'.$id.'^^^',
                "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." EI\r\n",
                $content
            );
        }

        // Collapse blank lines and strip leading spaces left over from
        // the splitting above
        $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));

        return $content;
    }
418
419
    /**
     * Cleans an entire, unformatted document stream through
     * formatContent() and keeps only the commands needed for text
     * positioning/extraction. The result holds one unprocessed PDF
     * command per element, in stream order.
     *
     * @internal
     *
     * @param string|null $content the raw document stream
     *
     * @return array the retained command lines
     */
    public function getSectionsText(?string $content): array
    {
        // The formatted stream carries one command per line, so splitting
        // on line breaks yields one command per array element
        $commandLines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->formatContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        // Commands retained outside of a BT ... ET block. Each pattern
        // is anchored so it *only* matches that command; for instance a
        // plain search for 'c' would also match 'sc'. Add more commands
        // here as they turn up in unusual PDFs.
        $keptOutsideText = [
            '/(?<!\w)B[DM]C$/', // Begin marked content sequence
            '/(?<!\w)[DM]P$/',  // Marked content point
            '/(?<!\w)EMC$/',    // End marked content sequence
            '/(?<!\w)cm$/',     // Graphics position change
            '/(?<!\w)Tf$/',     // Font change
            '/(?<!\w)Do$/',     // Invoke named XObject
        ];

        $kept = [];
        $insideTextBlock = false;

        foreach ($commandLines as $rawLine) {
            $command = trim($rawLine);

            // Nothing to keep on a blank line
            if ('' === $command) {
                continue;
            }

            // 'BT' opens a text block
            if (preg_match('/BT$/', $command)) {
                $insideTextBlock = true;
                $kept[] = $command;
                continue;
            }

            // 'ET' closes it again
            if ('ET' === $command) {
                $insideTextBlock = false;
                $kept[] = $command;
                continue;
            }

            // Every line between BT and ET is retained
            if ($insideTextBlock) {
                $kept[] = $command;
                continue;
            }

            // Save (q) and restore (Q) graphics state commands
            $lastChar = $command[-1];
            if ('q' === $lastChar || 'Q' === $lastChar) {
                $kept[] = $command;
                continue;
            }

            foreach ($keptOutsideText as $pattern) {
                if (preg_match($pattern, $command)) {
                    $kept[] = $command;
                    break;
                }
            }
        }

        return $kept;
    }
498
499 48
    /**
     * Picks the font to start text extraction with: the first font of
     * $page when it has any, otherwise the document's first font, and
     * as a last resort a newly constructed Font.
     *
     * @param Page|null $page page whose fonts take precedence, if given
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $candidates = null === $page ? [] : $page->getFonts();

        // The document's first font goes last, so it is only picked
        // when the page contributed nothing
        $documentFont = $this->document->getFirstFont();
        if (null !== $documentFont) {
            $candidates[] = $documentFont;
        }

        if (0 === \count($candidates)) {
            return new Font($this->document, null, null, $this->config);
        }

        return reset($candidates);
    }
517
518
    /**
     * Decode a '[]TJ' command and attempt to use alternate
     * fonts if the current font results in output that contains
     * Unicode control characters or NUL bytes.
     *
     * @internal
     *
     * @param Font                                 $font       font tried first
     * @param array<int,array<string,string|bool>> $command    the command to decode
     * @param Page|null                            $page       page whose fonts serve as fallbacks; no fallback is attempted when null
     * @param float                                $fontFactor forwarded unchanged to Font::decodeText()
     *
     * @return string the first decode free of control characters, or the
     *                decode from $font when every fallback also fails
     */
    private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string
    {
        $orig_text = $font->decodeText($command, $fontFactor);
        $text = $orig_text;

        // If we make this a Config option, we can add a check if it's
        // enabled here.
        if (null !== $page) {
            $font_ids = array_keys($page->getFonts());

            // If the decoded text contains UTF-8 control characters
            // then the font page being used is probably the wrong one.
            // Loop through the rest of the fonts to see if we can get
            // a good decode. Allow x09 to x0d which are whitespace.
            //
            // The NUL test is done on the raw bytes. Searching the
            // bin2hex() output for '00' also matches across two
            // adjacent bytes (" \n" is "200a"), which wrongly flagged
            // clean text as badly decoded.
            while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
                // If we're out of font IDs, then give up and use the
                // original string
                if (0 == \count($font_ids)) {
                    return $orig_text;
                }

                // Try the next font ID
                $font = $page->getFont(array_shift($font_ids));
                $text = $font->decodeText($command, $fontFactor);
            }
        }

        return $text;
    }
556
557
    /**
     * Parses the textual form of a full PDF dictionary object,
     * including the outer enclosing << >> angle brackets, into a PHP
     * array. Names (without their leading slash) become keys; nested
     * << >> dictionaries and [ ] arrays become nested arrays, except
     * that an array holding a single non-name string is kept as the
     * literal '[...]' text.
     *
     * @internal
     *
     * @param string $dictionary the dictionary text, starting with '<<'
     *
     * @throws InvalidDictionaryObjectException when the text does not start with '<<'
     */
    public function parseDictionary(string $dictionary): array
    {
        // Collapse line breaks and runs of whitespace into single spaces
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

        if ('<<' != substr($dictionary, 0, 2)) {
            throw new InvalidDictionaryObjectException('Not a valid dictionary object.');
        }

        // Strip the outer << >> and cut the rest into delimiter and
        // value tokens
        $tokens = preg_split(
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
            trim(preg_replace('/^<<|>>$/', '', $dictionary)),
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        // $parsed always refers to the container currently being
        // filled; $stack holds references to its enclosing containers
        $parsed = [];
        $stack = [];
        $currentName = '';
        $arrayTypeNumeric = false;

        foreach ($tokens as $token) {
            $token = trim($token);

            // Whitespace-only fragments carry no information
            if ('' === $token) {
                continue;
            }

            // Open a numeric array '[' or a hashed array '<<'
            if ('[' === $token || '<<' === $token) {
                $parsed[$currentName] = [];
                $arrayTypeNumeric = ('[' === $token);

                // Move up one level in the stack
                $stack[\count($stack)] = &$parsed;
                $parsed = &$parsed[$currentName];
                $currentName = '';
                continue;
            }

            // Close a numeric array ']' or a hashed array '>>'
            if (']' === $token || '>>' === $token) {
                // Revert string type arrays back to a single element
                if (']' === $token
                    && \is_array($parsed) && 1 === \count($parsed)
                    && isset($parsed[0]) && \is_string($parsed[0])
                    && '' !== $parsed[0] && '/' != $parsed[0][0]) {
                    $parsed = '['.$parsed[0].']';
                }

                $arrayTypeNumeric = false;

                // Move down one level in the stack
                $parsed = &$stack[\count($stack) - 1];
                unset($stack[\count($stack) - 1]);
                continue;
            }

            if ('/' === $token[0]) {
                // A leading slash marks a name: inside a numeric array
                // it is stored as a value, otherwise it is remembered
                // as the key for the value that follows
                $currentName = substr($token, 1);
                if ($arrayTypeNumeric) {
                    $parsed[] = $currentName;
                    $currentName = '';
                }
            } elseif ('' != $currentName) {
                // A value that follows a pending name
                if (!$arrayTypeNumeric) {
                    $parsed[$currentName] = $token;
                }
                $currentName = '';
            } else {
                // A value with no pending name is appended as-is
                $parsed[] = $token;
            }
        }

        return $parsed;
    }
655
656
    /**
     * Returns the text content of a PDF as a string. Attempts to add
     * whitespace for spacing and line-breaks where appropriate.
     *
     * getText() leverages getTextArray() to get the content
     * of the document, setting the addPositionWhitespace flag to true
     * so whitespace is inserted in a logical way for reading by
     * humans. The flag is switched back off even when getTextArray()
     * throws, so a failed call cannot leave this object stuck in
     * whitespace mode.
     *
     * @param Page|null $page forwarded unchanged to getTextArray()
     *
     * @return string the extracted pieces joined together, followed by
     *                one trailing space
     */
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;

        try {
            $result = $this->getTextArray($page);
        } finally {
            $this->addPositionWhitespace = false;
        }

        return implode('', $result).' ';
    }
673
674
    /**
     * Returns the text content of a PDF as an array of strings.
     *
     * The content stream is split into sections and single commands, and
     * the text-showing operators (Tj, TJ, ' and ") are decoded with the
     * currently selected font. When $this->addPositionWhitespace is true
     * (as set by getText()), a newline, tab or space is prepended to a
     * fragment depending on how far its position is from the previously
     * written fragment; otherwise no extra whitespace is inserted besides
     * what is actually encoded in the PDF text.
     *
     * While this object is being processed its unique id is kept on
     * self::$recursionStack so that a 'Do' operator referring back to an
     * object that is currently being rendered is skipped instead of
     * recursing forever. The id is removed again before returning (also
     * when an exception is thrown), so the same XObject can be rendered
     * again later on.
     *
     * @param Page|null $page page used to resolve fonts ('Tf') and
     *                        XObjects ('Do'); both are left unresolved when null
     *
     * @return array the extracted text fragments in stream order
     *
     * @throws \Exception
     */
    public function getTextArray(?Page $page = null): array
    {
        $result = [];
        $text = [];

        $marked_stack = [];
        $last_written_position = false;

        $sections = $this->getSectionsText($this->content);
        $current_font = $this->getDefaultFont($page);
        $current_font_size = 1;
        $current_text_leading = 0;

        $current_position = ['x' => false, 'y' => false];
        $current_position_tm = [
            'a' => 1, 'b' => 0, 'c' => 0,
            'i' => 0, 'j' => 1, 'k' => 0,
            'x' => 0, 'y' => 0, 'z' => 1,
        ];
        $current_position_td = ['x' => 0, 'y' => 0];
        $current_position_cm = [
            'a' => 1, 'b' => 0, 'c' => 0,
            'i' => 0, 'j' => 1, 'k' => 0,
            'x' => 0, 'y' => 0, 'z' => 1,
        ];

        $clipped_font = [];
        $clipped_position_cm = [];

        // Mark this object as "being rendered" for circular reference detection.
        self::$recursionStack[] = $this->getUniqueId();

        try {
            foreach ($sections as $section) {
                $commands = $this->getCommandsText($section);
                foreach ($commands as $command) {
                    switch ($command[self::OPERATOR]) {
                        // Begin text object
                        case 'BT':
                            // Reset text positioning matrices
                            $current_position_tm = [
                                'a' => 1, 'b' => 0, 'c' => 0,
                                'i' => 0, 'j' => 1, 'k' => 0,
                                'x' => 0, 'y' => 0, 'z' => 1,
                            ];
                            $current_position_td = ['x' => 0, 'y' => 0];
                            $current_text_leading = 0;
                            break;

                            // Begin marked content sequence with property list
                        case 'BDC':
                            if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
                                $dict = $this->parseDictionary($match[1]);

                                // Check for ActualText block
                                if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
                                    if ('[' == $dict['ActualText'][0]) {
                                        // Simulate a 'TJ' command on the stack
                                        $marked_stack[] = [
                                            'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
                                        ];
                                    } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
                                        // Simulate a 'Tj' command on the stack
                                        $marked_stack[] = [
                                            'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
                                        ];
                                    }
                                }
                            }
                            break;

                            // Begin marked content sequence
                        case 'BMC':
                            if ('ReversedChars' == $command[self::COMMAND]) {
                                // Upon encountering a ReversedChars command,
                                // add the characters we've built up so far to
                                // the result array
                                $result = array_merge($result, $text);

                                // Start a fresh $text array that will contain
                                // reversed characters
                                $text = [];

                                // Add the reversed text flag to the stack
                                $marked_stack[] = ['ReversedChars' => true];
                            }
                            break;

                            // set graphics position matrix
                        case 'cm':
                            $args = preg_split('/\s+/s', $command[self::COMMAND]);
                            $current_position_cm = [
                                'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
                                'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
                                'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
                            ];
                            break;

                            // Paint an external object (XObject)
                        case 'Do':
                            if (null !== $page) {
                                $args = preg_split('/\s/s', $command[self::COMMAND]);
                                $id = trim(array_pop($args), '/ ');
                                $xobject = $page->getXObject($id);

                                // @todo $xobject could be a ElementXRef object, which would then throw an error
                                if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
                                    // Not a circular reference.
                                    $text[] = $xobject->getText($page);
                                }
                            }
                            break;

                            // Marked content point with (DP) & without (MP) property list
                        case 'DP':
                        case 'MP':
                            break;

                            // End text object
                        case 'ET':
                            break;

                            // Store current selected font and graphics matrix
                        case 'q':
                            $clipped_font[] = [$current_font, $current_font_size];
                            $clipped_position_cm[] = $current_position_cm;
                            break;

                            // Restore previous selected font and graphics matrix
                        case 'Q':
                            // Both stacks are pushed together by 'q', so checking one
                            // is enough. An unbalanced 'Q' (malformed stream) is
                            // ignored rather than replacing the current font and
                            // matrix with null.
                            if ([] !== $clipped_font) {
                                list($current_font, $current_font_size) = array_pop($clipped_font);
                                $current_position_cm = array_pop($clipped_position_cm);
                            }
                            break;

                            // End marked content sequence
                        case 'EMC':
                            $data = false;
                            if (\count($marked_stack)) {
                                $marked = array_pop($marked_stack);
                                $action = key($marked);
                                $data = $marked[$action];

                                switch ($action) {
                                    // If we are in ReversedChars mode...
                                    case 'ReversedChars':
                                        // Reverse the characters we've built up so far
                                        foreach ($text as $key => $t) {
                                            $text[$key] = implode('', array_reverse(
                                                mb_str_split($t, 1, mb_internal_encoding())
                                            ));
                                        }

                                        // Add these characters to the result array
                                        $result = array_merge($result, $text);

                                        // Start a fresh $text array that will contain
                                        // non-reversed characters
                                        $text = [];
                                        break;

                                    case 'ActualText':
                                        // Use the content of the ActualText as a command
                                        $command = $data;
                                        break;
                                }
                            }

                            // If this EMC command has been transformed into a 'Tj'
                            // or 'TJ' command because of being ActualText, then bypass
                            // the break to proceed to the writing section below.
                            if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
                                break;
                            }

                            // no break
                        case "'":
                        case '"':
                            if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
                                // Move to next line and write text
                                $current_position['x'] = 0;
                                $current_position_td['x'] = 0;
                                $current_position_td['y'] += $current_text_leading;
                            }
                            // no break
                        case 'Tj':
                            // Wrap the single string so it has the same shape as a TJ array
                            $command[self::COMMAND] = [$command];
                            // no break
                        case 'TJ':
                            // Check the marked content stack for flags
                            $actual_text = false;
                            $reverse_text = false;
                            foreach ($marked_stack as $marked) {
                                if (isset($marked['ActualText'])) {
                                    $actual_text = true;
                                }
                                if (isset($marked['ReversedChars'])) {
                                    $reverse_text = true;
                                }
                            }

                            // Account for text position ONLY just before we write text
                            if (false === $actual_text && \is_array($last_written_position)) {
                                // If $last_written_position is an array, that
                                // means we have stored text position coordinates
                                // for placing an ActualText
                                $currentX = $last_written_position[0];
                                $currentY = $last_written_position[1];
                                $last_written_position = false;
                            } else {
                                $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
                                $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
                            }
                            $whiteSpace = '';

                            $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i'];
                            $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j'];

                            if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
                                $curY = $currentY - $current_position['y'];
                                if (abs($curY) >= abs($factorY) / 4) {
                                    $whiteSpace = "\n";
                                } else {
                                    if (true === $reverse_text) {
                                        $curX = $current_position['x'] - $currentX;
                                    } else {
                                        $curX = $currentX - $current_position['x'];
                                    }

                                    // In abs($factorX * 7) below, the 7 is chosen arbitrarily
                                    // as the number of apparent "spaces" in a document we
                                    // would need before considering them a "tab". In the
                                    // future, we might offer this value to users as a config
                                    // option.
                                    if ($curX >= abs($factorX * 7)) {
                                        $whiteSpace = "\t";
                                    } elseif ($curX >= abs($factorX * 2)) {
                                        $whiteSpace = ' ';
                                    }
                                }
                            }

                            $newtext = $this->getTJUsingFontFallback(
                                $current_font,
                                $command[self::COMMAND],
                                $page,
                                $factorX
                            );

                            // If there is no ActualText pending then write
                            if (false === $actual_text) {
                                $newtext = str_replace(["\r", "\n"], '', $newtext);
                                if (false !== $reverse_text) {
                                    // If we are in ReversedChars mode, add the whitespace last
                                    $text[] = preg_replace('/  $/', ' ', $newtext.$whiteSpace);
                                } else {
                                    // Otherwise add the whitespace first
                                    if (' ' === $whiteSpace && isset($text[\count($text) - 1])) {
                                        $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]);
                                    }
                                    $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext);
                                }

                                // Record the position of this inserted text for comparison
                                // with the next text block.
                                // Provide a 'fudge' factor guess on how wide this text block
                                // is based on the number of characters. This helps limit the
                                // number of tabs inserted, but isn't perfect.
                                $factor = $factorX / 2;
                                $current_position = [
                                    'x' => $currentX - mb_strlen($newtext) * $factor,
                                    'y' => $currentY,
                                ];
                            } elseif (false === $last_written_position) {
                                // If there is an ActualText in the pipeline
                                // store the position this undisplayed text
                                // *would* have been written to, so the
                                // ActualText is displayed in the right spot
                                $last_written_position = [$currentX, $currentY];
                                $current_position['x'] = $currentX;
                            }
                            break;

                            // move to start of next line
                        case 'T*':
                            $current_position['x'] = 0;
                            $current_position_td['x'] = 0;
                            $current_position_td['y'] += $current_text_leading;
                            break;

                            // set character spacing
                        case 'Tc':
                            break;

                            // move text current point and set leading
                        case 'Td':
                        case 'TD':
                            // move text current point
                            $args = preg_split('/\s+/s', $command[self::COMMAND]);
                            $y = (float) array_pop($args);
                            $x = (float) array_pop($args);

                            if ('TD' == $command[self::OPERATOR]) {
                                $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j'];
                            }

                            $current_position_td = [
                                'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
                                'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
                            ];
                            break;

                            // select font and font size
                        case 'Tf':
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $size = (float) array_pop($args);
                            $id = trim(array_pop($args), '/');
                            if (null !== $page) {
                                $new_font = $page->getFont($id);
                                // If an invalid font ID is given, do not update the font.
                                // This should theoretically never happen, as the PDF spec states for the Tf operator:
                                // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
                                // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
                                // But we want to make sure that malformed PDFs do not simply crash.
                                if (null !== $new_font) {
                                    $current_font = $new_font;
                                    $current_font_size = $size;
                                }
                            }
                            break;

                            // set leading
                        case 'TL':
                            $y = (float) $command[self::COMMAND];
                            $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j'];
                            break;

                            // set text position matrix
                        case 'Tm':
                            $args = preg_split('/\s+/s', $command[self::COMMAND]);
                            $current_position_tm = [
                                'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
                                'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
                                'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
                            ];
                            break;

                            // set text rendering mode
                        case 'Tr':
                            break;

                            // set super/subscripting text rise
                        case 'Ts':
                            break;

                            // set word spacing
                        case 'Tw':
                            break;

                            // set horizontal scaling
                        case 'Tz':
                            break;

                        default:
                    }
                }
            }
        } finally {
            // This object is no longer being rendered: drop its id again so the
            // static stack does not grow and later 'Do' references to this
            // object are not mistaken for circular references.
            array_pop(self::$recursionStack);
        }

        // Append whatever text is still pending after the last command.
        $result = array_merge($result, $text);

        return $result;
    }
1048
1049
    /**
     * Parses one already formatted, single-line command taken from a
     * document stream, as produced by the companion function
     * getSectionsText().
     *
     * Since the input is always a single command, the $offset argument
     * is no longer used; it is kept for backward compatibility and may
     * be removed in a future PdfParser release. For the same reason a
     * better name for this function would be getCommandText().
     *
     * @param string $text_part the single command to parse
     * @param int    $offset    unused
     *
     * @return array an empty array when no operator is recognised,
     *               otherwise a list holding exactly one entry with the
     *               keys self::TYPE, self::OPERATOR and self::COMMAND
     */
    public function getCommandsText(string $text_part, int &$offset = 0): array
    {
        $found = [];
        preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $found);

        // Without operands, type marker and operator there is nothing to return
        if (!isset($found[1], $found[2], $found[3])) {
            return [];
        }

        $type = $found[2];
        $operator = $found[3];
        $payload = trim($found[1]);

        if ('TJ' === $operator) {
            // A TJ array mixes strings with optional numeric adjustments;
            // peel them off the front one by one until nothing matches.
            $patterns = [
                '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
                '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
            ];
            $parts = [];
            $remaining = trim($payload, '[]');

            do {
                $before = $remaining;

                foreach ($patterns as $kind => $pattern) {
                    if (!preg_match($pattern, $remaining, $piece)) {
                        continue;
                    }

                    // Hexadecimal strings may contain whitespace, which carries no data
                    $string = '<' === $kind ? preg_replace('/\s/', '', $piece[1]) : $piece[1];

                    $parts[] = [
                        self::TYPE => $kind,
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $string,
                    ];

                    // Numeric adjustment following the string, if any
                    if (isset($piece[2]) && trim($piece[2])) {
                        $parts[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $piece[2],
                        ];
                    }

                    $remaining = substr($remaining, \strlen($piece[0]));
                }
            } while ($remaining !== $before);

            $payload = $parts;
        } elseif (\in_array($operator, ['Tj', "'", '"'], true)) {
            // Strip the delimiters that belong to the string type
            if ('(' === $type) {
                // trim() is not an option: a () string may end with a
                // balanced or escaped right parenthesis and trim() would
                // remove both, e.g. (String()) or (String\))
                $payload = preg_replace('/^\(|\)$/', '', $payload);
            } elseif ('<' === $type) {
                $payload = trim($payload, '<>');
            }
        } elseif ('/' === $type) {
            // Drop the leading slash of a name operand
            $payload = substr($payload, 1);
        }

        return [
            [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $payload,
            ],
        ];
    }
1143
1144 67
    /**
     * Creates the object class matching the 'Type' (and, for XObjects
     * and fonts, the 'Subtype') entry of the given header.
     *
     * Unknown types and unknown XObject subtypes fall back to a plain
     * PDFObject; unknown font subtypes fall back to Font.
     *
     * @param Document    $document document handed to the created object
     * @param Header      $header   header whose 'Type' / 'Subtype' select the class
     * @param string|null $content  raw content handed to the created object
     * @param Config|null $config   optional configuration; for images, the content
     *                              is dropped only when a config is given and its
     *                              getRetainImageContent() is falsy
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is optional: without one there is no setting to
                        // consult, so keep the content like every other branch does
                        // instead of calling a method on null.
                        $retainContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                // Use the subtype-specific font class when one exists
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
1185
1186
    /**
     * Returns an id identifying this object instance.
     *
     * The id is the value of spl_object_hash() for this instance.
     */
    protected function getUniqueId(): string
    {
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
1193
}
1194