PDFObject::__construct()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 10
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 4
c 0
b 0
f 0
nc 1
nop 4
dl 0
loc 10
rs 10
ccs 5
cts 5
cp 1
crap 1
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Exception\InvalidDictionaryObjectException;
36
use Smalot\PdfParser\XObject\Form;
37
use Smalot\PdfParser\XObject\Image;
38
39
/**
40
 * Class PDFObject
41
 */
42
class PDFObject
43
{
44
    public const TYPE = 't';
45
46
    public const OPERATOR = 'o';
47
48
    public const COMMAND = 'c';
49
50
    /**
51
     * The recursion stack.
52
     *
53
     * @var array
54
     */
55
    public static $recursionStack = [];
56
57
    /**
58
     * @var Document|null
59
     */
60
    protected $document;
61
62
    /**
63
     * @var Header
64
     */
65
    protected $header;
66
67
    /**
68
     * @var string
69
     */
70
    protected $content;
71
72
    /**
73
     * @var Config|null
74
     */
75
    protected $config;
76
77
    /**
78
     * @var bool
79
     */
80
    protected $addPositionWhitespace = false;
81
82 98
    /**
     * Stores the owning document together with the optional header,
     * raw content stream and parser configuration.
     *
     * When no header is supplied a default-constructed Header is used,
     * so $this->header is never null once the constructor has run.
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        if (null === $header) {
            $header = new Header();
        }

        $this->config = $config;
        $this->content = $content;
        $this->header = $header;
        $this->document = $document;
    }
93
94 74
    /**
     * Intentionally empty in this class.
     *
     * NOTE(review): presumably an initialisation hook for subclasses to
     * override - confirm against the subclasses.
     */
    public function init()
    {
        // Nothing to initialise here.
    }
97
98 4
    /**
     * Returns the Document stored on this object.
     */
    public function getDocument(): Document
    {
        $document = $this->document;

        return $document;
    }
102
103 74
    /**
     * Returns the Header stored on this object.
     */
    public function getHeader(): ?Header
    {
        $header = $this->header;

        return $header;
    }
107
108 4
    /**
     * Returns the Config stored on this object, or null when none was given.
     */
    public function getConfig(): ?Config
    {
        $config = $this->config;

        return $config;
    }
112
113
    /**
     * Looks up $name in this object's header and returns whatever the
     * header's own get() yields.
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        $header = $this->header;

        return $header->get($name);
    }
120
121 76
    /**
     * Tells whether this object's header reports an entry called $name.
     */
    public function has(string $name): bool
    {
        $header = $this->header;

        return $header->has($name);
    }
125
126 4
    /**
     * Returns the header's details array; $deep is forwarded unchanged
     * to the header's getDetails().
     */
    public function getDetails(bool $deep = true): array
    {
        $header = $this->header;

        return $header->getDetails($deep);
    }
130
131 60
    /**
     * Returns the content string stored on this object, or null when
     * none was given.
     */
    public function getContent(): ?string
    {
        $content = $this->content;

        return $content;
    }
135
136
    /**
     * Creates a duplicate of the document stream with
     * strings and other items replaced by $char. Formerly
     * getSectionsText() used this output to more easily gather offset
     * values to extract text from the *actual* document stream.
     *
     * Every replacement has the same byte length as the text it
     * replaces, so offsets into the result line up with the input.
     *
     * @deprecated function is no longer used and will be removed in a future release
     *
     * @internal
     *
     * @param string $content document stream to mask
     * @param string $char    masking character; only its first byte is used
     *
     * @return string the masked copy of $content
     */
    public function cleanContent(string $content, string $char = 'X')
    {
        $char = $char[0];

        // Overwrites each captured [text, offset] pair with an equally
        // long run of $char.
        $mask = static function (string $subject, array $captures) use ($char): string {
            foreach ($captures as [$matched, $offset]) {
                $length = \strlen($matched);
                $subject = substr_replace($subject, str_repeat($char, $length), $offset, $length);
            }

            return $subject;
        };

        // Neutralise escaped backslashes and parentheses first so they
        // cannot confuse the bracket matching below
        $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

        // Remove image bloc with binary content
        preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[0]);

        // Clean content in square brackets [.....]
        preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1]);

        // Clean content in round brackets (.....)
        preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1]);

        // Clean structure: everything between '<' and '>' (brackets
        // included) is masked, tracking the nesting depth
        $parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        if ($parts) {
            $content = '';
            $level = 0;
            foreach ($parts as $part) {
                if ('<' === $part) {
                    ++$level;
                }

                $content .= (0 === $level ? $part : str_repeat($char, \strlen($part)));

                if ('>' === $part) {
                    --$level;
                }
            }
        }

        // Clean BDC and EMC markup. The '/' delimiter is passed to
        // preg_quote() so that a '/' masking character cannot terminate
        // the pattern early.
        preg_match_all(
            '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
            $content,
            $matches,
            \PREG_OFFSET_CAPTURE
        );
        $content = $mask($content, $matches[1]);

        preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1]);

        return $content;
    }
204
205
    /**
     * Takes a string of PDF document stream text and formats
     * it into a multi-line string with one PDF command on each line,
     * separated by \r\n. If the given string is null, or binary data
     * is detected instead of a document stream then return an empty
     * string.
     *
     * Inline images, (strings) and marked-content dictionaries are
     * swapped for unique placeholders before the operators are split
     * onto separate lines, and are restored afterwards.
     */
    private function formatContent(?string $content): string
    {
        if (null === $content) {
            return '';
        }

        // Outside of (String) and inline image content in PDF document
        // streams, all text should conform to UTF-8. Test for binary
        // content by deleting everything after the first open-
        // parenthesis ( which indicates the beginning of a string, or
        // the first ID command which indicates the beginning of binary
        // inline image content. Then test what remains for valid
        // UTF-8. If it's not UTF-8, return an empty string as this
        // $content is most likely binary. Unfortunately, using
        // mb_check_encoding(..., 'UTF-8') is not strict enough, so the
        // following regexp, adapted from the W3, is used. See:
        // https://www.w3.org/International/questions/qa-forms-utf-8.en
        // We use preg_replace() instead of preg_match() to avoid "JIT
        // stack limit exhausted" errors on larger files.
        $utf8Filter = preg_replace('/(
            [\x09\x0A\x0D\x20-\x7E] |            # ASCII
            [\xC2-\xDF][\x80-\xBF] |             # non-overlong 2-byte
            \xE0[\xA0-\xBF][\x80-\xBF] |         # excluding overlongs
            [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} |  # straight 3-byte
            \xED[\x80-\x9F][\x80-\xBF] |         # excluding surrogates
            \xF0[\x90-\xBF][\x80-\xBF]{2} |      # planes 1-3
            [\xF1-\xF3][\x80-\xBF]{3} |          # planes 4-15
            \xF4[\x80-\x8F][\x80-\xBF]{2}        # plane 16
        )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content));

        // Anything left over after removing valid UTF-8 means binary data
        if ('' !== $utf8Filter) {
            return '';
        }

        // Find all inline image content and replace them so they aren't
        // affected by the next steps
        $pdfInlineImages = [];
        $offsetBI = 0;
        while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) {
            // Attempt to determine if this instance of the 'BI' command
            // actually occurred within a (string) using the following
            // steps:

            // Step 1: Remove any escaped slashes and parentheses from
            // the alleged image characteristics data
            $para = str_replace(['\\\\', '\\(', '\\)'], '', $text[1][0]);

            // Step 2: Remove all correctly ordered and balanced
            // parentheses from (strings)
            do {
                $paraTest = $para;
                $para = preg_replace('/\(([^()]*)\)/', '$1', $paraTest);
            } while ($para != $paraTest);

            $paraOpen = strpos($para, '(');
            $paraClose = strpos($para, ')');

            // Check: If the remaining text contains a close parenthesis
            // ')' AND it occurs before any open parenthesis, then we
            // are almost certain to be inside a (string)
            if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) {
                // Bump the search offset forward and match again
                $offsetBI = (int) $text[1][1];
                continue;
            }

            // Step 3: Double check that this is actually inline image
            // data by parsing the alleged image characteristics as a
            // dictionary
            $dict = $this->parseDictionary('<<'.$text[1][0].'>>');

            // Check if an image Width and Height are set in the dict
            if ((isset($dict['W']) || isset($dict['Width']))
                && (isset($dict['H']) || isset($dict['Height']))) {
                $id = uniqid('IMAGE_', true);
                $pdfInlineImages[$id] = [
                    preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]),
                    preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]),
                ];

                // Swap in the placeholder at the exact span that was
                // matched. The search above honours $offsetBI, so the
                // first textual occurrence is not guaranteed to be the
                // matched one; this also avoids compiling a second
                // regex out of raw image bytes.
                $content = substr_replace(
                    $content,
                    '^^^'.$id.'^^^',
                    (int) $text[0][1],
                    \strlen($text[0][0])
                );
            } else {
                // If there was no valid dictionary, or a height and width
                // weren't specified, then we don't know what this is, so
                // just leave it alone; bump the search offset forward and
                // match again
                $offsetBI = (int) $text[1][1];
            }
        }

        // Find all strings () and replace them so they aren't affected
        // by the next steps
        $pdfstrings = [];
        $attempt = '(';
        while (preg_match('/'.preg_quote($attempt, '/').'.*?\)/s', $content, $text)) {
            // Remove all escaped slashes and parentheses from the target text
            $para = str_replace(['\\\\', '\\(', '\\)'], '', $text[0]);

            // PDF strings can contain unescaped parentheses as long as
            // they're balanced, so check for balanced parentheses
            $left = preg_match_all('/\(/', $para);
            $right = preg_match_all('/\)/', $para);

            if (')' == $para[-1] && $left == $right) {
                // Replace the string with a unique placeholder
                $id = uniqid('STRING_', true);
                $pdfstrings[$id] = $text[0];
                $content = preg_replace(
                    '/'.preg_quote($text[0], '/').'/',
                    '@@@'.$id.'@@@',
                    $content,
                    1
                );

                // Reset to search for the next string
                $attempt = '(';
            } else {
                // We had unbalanced parentheses, so use the current
                // match as a base to find a longer string
                $attempt = $text[0];
            }
        }

        // Remove all carriage returns and line-feeds from the document stream
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Find all dictionary << >> commands and replace them so they
        // aren't affected by the next steps
        $dictstore = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext)) {
            $dictid = uniqid('DICT_', true);
            $dictstore[$dictid] = $dicttext[1];
            $content = preg_replace(
                '/'.preg_quote($dicttext[0], '/').'/',
                ' ###'.$dictid.'###'.$dicttext[2],
                $content,
                1
            );
        }

        // Normalize white-space in the document stream
        $content = preg_replace('/\s{2,}/', ' ', $content);

        // Find all valid PDF operators and add \r\n after each; this
        // ensures there is just one command on every line
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
        //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions
        //       appear here in the list for completeness.
        $operators = [
            'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
            'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
            'g', 'G',  'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
            'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
            'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
            'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($operators as $operator) {
            // The look-arounds keep an operator from matching inside a
            // longer word or directly after a '/' (i.e. inside a name)
            $content = preg_replace(
                '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/',
                $operator."\r\n",
                $content
            );
        }

        // Restore the original content of the dictionary << >> commands
        $dictstore = array_reverse($dictstore, true);
        foreach ($dictstore as $id => $dict) {
            $content = str_replace('###'.$id.'###', $dict, $content);
        }

        // Restore the original string content
        $pdfstrings = array_reverse($pdfstrings, true);
        foreach ($pdfstrings as $id => $text) {
            // Strings may contain escaped newlines, or literal newlines
            // and we should clean these up before replacing the string
            // back into the content stream; this ensures no strings are
            // split between two lines (every command must be on one line)
            $text = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $text
            );

            $content = str_replace('@@@'.$id.'@@@', $text, $content);
        }

        // Restore the original content of any inline images
        $pdfInlineImages = array_reverse($pdfInlineImages, true);
        foreach ($pdfInlineImages as $id => $image) {
            $content = str_replace(
                '^^^'.$id.'^^^',
                "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." EI\r\n",
                $content
            );
        }

        // Collapse repeated line breaks and strip leading spaces on each line
        $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));

        return $content;
    }
418
419
    /**
     * Takes an entire, unformatted document stream as a string, runs
     * it through formatContent() and keeps only the commands needed
     * for text positioning/extraction. The result is an array of
     * unprocessed PDF commands, one command per element.
     *
     * Everything between BT and ET is kept; outside of a text block
     * only a fixed set of commands is kept (see below).
     *
     * @internal
     */
    public function getSectionsText(?string $content): array
    {
        // Commands kept when found outside of a BT ... ET text block.
        // Each pattern must match *only* its own command: a plain
        // search for 'c', for instance, would also hit 'sc'. Add more
        // commands here as they turn up in unusual PDFs.
        $keptOutsideTextBlock = [
            '/(?<!\w)B[DM]C$/', // Begin marked content sequence
            '/(?<!\w)[DM]P$/',  // Marked content point
            '/(?<!\w)EMC$/',    // End marked content sequence
            '/(?<!\w)cm$/',     // Graphics position change commands
            '/(?<!\w)Tf$/',     // Font change commands
            '/(?<!\w)Do$/',     // Invoke named XObject command
        ];

        // The formatted stream has one command per line, so splitting
        // on line breaks yields one command per array element
        $commandLines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->formatContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        $sections = [];
        $insideTextBlock = false;

        foreach ($commandLines as $rawLine) {
            $line = trim($rawLine);

            // Skip empty lines
            if ('' === $line) {
                continue;
            }

            // 'BT' opens a text block
            if (preg_match('/BT$/', $line)) {
                $insideTextBlock = true;
                $sections[] = $line;
                continue;
            }

            // 'ET' closes it again
            if ('ET' === $line) {
                $insideTextBlock = false;
                $sections[] = $line;
                continue;
            }

            // Inside a text block every line is kept
            if ($insideTextBlock) {
                $sections[] = $line;
                continue;
            }

            // Save and restore graphics state commands
            if (\in_array($line[-1], ['q', 'Q'], true)) {
                $sections[] = $line;
                continue;
            }

            foreach ($keptOutsideTextBlock as $pattern) {
                if (preg_match($pattern, $line)) {
                    $sections[] = $line;
                    break;
                }
            }
        }

        return $sections;
    }
498
499 48
    /**
     * Chooses a starting font.
     *
     * Candidates are the fonts of $page (when a page is given) followed
     * by the document's first font (when it has one); the first
     * candidate wins. With no candidates at all a new Font is created
     * from the document and this object's config.
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $candidates = null === $page ? [] : $page->getFonts();

        $documentFont = $this->document->getFirstFont();
        if (null !== $documentFont) {
            $candidates[] = $documentFont;
        }

        if (0 === \count($candidates)) {
            return new Font($this->document, null, null, $this->config);
        }

        return reset($candidates);
    }
517
518
    /**
     * Decode a '[]TJ' command and attempt to use alternate
     * fonts if the current font results in output that contains
     * Unicode control characters.
     *
     * The page's fonts are tried in the order returned by
     * $page->getFonts(); if none of them yields clean output the text
     * decoded with the original $font is returned.
     *
     * @internal
     *
     * @param array<int,array<string,string|bool>> $command
     */
    private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string
    {
        $orig_text = $font->decodeText($command, $fontFactor);
        $text = $orig_text;

        // If we make this a Config option, we can add a check if it's
        // enabled here.
        if (null !== $page) {
            $font_ids = array_keys($page->getFonts());

            // If the decoded text contains UTF-8 control characters
            // then the font page being used is probably the wrong one.
            // Loop through the rest of the fonts to see if we can get
            // a good decode. Allow x09 to x0d which are whitespace.
            //
            // The second test looks for a raw NUL byte directly.
            // Searching the hex dump for '00' would also match across
            // byte boundaries (e.g. "P\n" is 500a) and wrongly trigger
            // the fallback on clean text.
            while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
                // If we're out of font IDs, then give up and use the
                // original string
                if (0 === \count($font_ids)) {
                    return $orig_text;
                }

                // Try the next font ID
                $font = $page->getFont(array_shift($font_ids));
                $text = $font->decodeText($command, $fontFactor);
            }
        }

        return $text;
    }
556
557
    /**
     * Expects a string that is a full PDF dictionary object,
     * including the outer enclosing << >> angle brackets
     *
     * Names become array keys (without the leading slash) and values
     * are kept as strings. Nested << >> dictionaries become nested
     * arrays. A [ ] array holding a single non-name string is collapsed
     * back to the string '[...]'. Closing tokens without a matching
     * opener are ignored.
     *
     * @internal
     *
     * @throws InvalidDictionaryObjectException when the string does not start with '<<'
     */
    public function parseDictionary(string $dictionary): array
    {
        // Normalize whitespace
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

        if ('<<' != substr($dictionary, 0, 2)) {
            throw new InvalidDictionaryObjectException('Not a valid dictionary object.');
        }

        $parsed = [];
        $stack = [];
        $currentName = '';
        $arrayTypeNumeric = false;

        // Remove outer layer of dictionary, and split on tokens
        $split = preg_split(
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
            trim(preg_replace('/^<<|>>$/', '', $dictionary)),
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        foreach ($split as $token) {
            $token = trim($token);
            switch ($token) {
                case '':
                    break;

                    // Open numeric array / open hashed array
                case '[':
                case '<<':
                    $parsed[$currentName] = [];
                    $arrayTypeNumeric = '[' === $token;

                    // Move up one level in the stack: remember the
                    // parent by reference and continue inside the child
                    $stack[\count($stack)] = &$parsed;
                    $parsed = &$parsed[$currentName];
                    $currentName = '';
                    break;

                    // Close numeric array
                case ']':
                    // Revert string type arrays back to a single element.
                    // Skipped for a stray ']' at the top level, which
                    // would otherwise turn the result into a string.
                    if (0 !== \count($stack)
                        && \is_array($parsed) && 1 == \count($parsed)
                        && isset($parsed[0]) && \is_string($parsed[0])
                        && '' !== $parsed[0] && '/' != $parsed[0][0]) {
                        $parsed = '['.$parsed[0].']';
                    }
                    // Close hashed array
                    // no break
                case '>>':
                    // A closing token with nothing open is malformed
                    // input; ignore it instead of rebinding $parsed to
                    // a non-existent stack entry (which would discard
                    // everything parsed so far)
                    if (0 === \count($stack)) {
                        break;
                    }

                    $arrayTypeNumeric = false;

                    // Move down one level in the stack
                    $parsed = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                    break;

                default:
                    // If value begins with a slash, then this is a name
                    // Add it to the appropriate array
                    if ('/' == substr($token, 0, 1)) {
                        $currentName = substr($token, 1);
                        if ($arrayTypeNumeric) {
                            // Inside [ ] a name is just another element
                            $parsed[] = $currentName;
                            $currentName = '';
                        }
                    } elseif ('' != $currentName) {
                        // Value for the pending name
                        if (!$arrayTypeNumeric) {
                            $parsed[$currentName] = $token;
                        }
                        $currentName = '';
                    } else {
                        // Value without a pending name: append it
                        $parsed[] = $token;
                    }
            }
        }

        return $parsed;
    }
655
656
    /**
     * Returns the text content of a PDF as a string. Attempts to add
     * whitespace for spacing and line-breaks where appropriate.
     *
     * getText() leverages getTextArray() to get the content
     * of the document, setting the addPositionWhitespace flag to true
     * so whitespace is inserted in a logical way for reading by
     * humans. The flag is reset to false afterwards, even when
     * getTextArray() throws.
     *
     * @return string the concatenated text followed by a single space
     */
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;

        try {
            $result = $this->getTextArray($page);
        } finally {
            // Never leave the flag switched on: a later getTextArray()
            // call on this object must not inherit it after a failure
            $this->addPositionWhitespace = false;
        }

        return implode('', $result).' ';
    }
673
674
    /**
     * Returns the text content of a PDF as an array of strings. No
     * extra whitespace is inserted besides what is actually encoded in
     * the PDF text (unless the addPositionWhitespace flag is set, which
     * is what getText() does).
     *
     * The object's id is pushed on the static recursion stack while its
     * content is being processed and removed again afterwards, so the
     * stack only ever lists the objects that are currently being
     * rendered.
     *
     * @param Page|null $page page used to resolve fonts ('Tf') and
     *                        XObjects ('Do'); 'Do' is ignored without it
     *
     * @return array list of text fragments in the order they were written
     *
     * @throws \Exception
     */
    public function getTextArray(?Page $page = null): array
    {
        $sections = $this->getSectionsText($this->content);
        $default_font = $this->getDefaultFont($page);

        self::$recursionStack[] = $this->getUniqueId();

        try {
            return $this->extractTextFromSections($sections, $default_font, $page);
        } finally {
            // Leave the recursion stack as we found it. Without this the
            // id stays on the static stack forever and every later 'Do'
            // referencing this object is mistaken for a circular reference.
            array_pop(self::$recursionStack);
        }
    }

    /**
     * Walks the given content-stream sections command by command and collects the text they write.
     *
     * @param array     $sections     single commands as returned by getSectionsText()
     * @param mixed     $current_font font to start with (the value returned by getDefaultFont())
     * @param Page|null $page         page used to resolve fonts and XObjects
     *
     * @return array list of text fragments
     *
     * @throws \Exception
     */
    private function extractTextFromSections(array $sections, $current_font, ?Page $page): array
    {
        $result = [];
        $text = [];

        $marked_stack = [];
        $last_written_position = false;

        $current_font_size = 1;
        $current_text_leading = 0;

        $current_position = ['x' => false, 'y' => false];
        $current_position_tm = [
            'a' => 1, 'b' => 0, 'c' => 0,
            'i' => 0, 'j' => 1, 'k' => 0,
            'x' => 0, 'y' => 0, 'z' => 1,
        ];
        $current_position_td = ['x' => 0, 'y' => 0];
        $current_position_cm = [
            'a' => 1, 'b' => 0, 'c' => 0,
            'i' => 0, 'j' => 1, 'k' => 0,
            'x' => 0, 'y' => 0, 'z' => 1,
        ];

        $clipped_font = [];
        $clipped_position_cm = [];

        foreach ($sections as $section) {
            $commands = $this->getCommandsText($section);
            foreach ($commands as $command) {
                switch ($command[self::OPERATOR]) {
                    // Begin text object
                    case 'BT':
                        // Reset text positioning matrices
                        $current_position_tm = [
                            'a' => 1, 'b' => 0, 'c' => 0,
                            'i' => 0, 'j' => 1, 'k' => 0,
                            'x' => 0, 'y' => 0, 'z' => 1,
                        ];
                        $current_position_td = ['x' => 0, 'y' => 0];
                        $current_text_leading = 0;
                        break;

                        // Begin marked content sequence with property list
                    case 'BDC':
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
                            $dict = $this->parseDictionary($match[1]);

                            // Check for ActualText block
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
                                if ('[' === $dict['ActualText'][0]) {
                                    // Simulate a 'TJ' command on the stack
                                    $marked_stack[] = [
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0] ?? null,
                                    ];
                                } elseif ('<' === $dict['ActualText'][0] || '(' === $dict['ActualText'][0]) {
                                    // Simulate a 'Tj' command on the stack
                                    $marked_stack[] = [
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0] ?? null,
                                    ];
                                }
                            }
                        }
                        break;

                        // Begin marked content sequence
                    case 'BMC':
                        if ('ReversedChars' === $command[self::COMMAND]) {
                            // Upon encountering a ReversedChars command,
                            // add the characters we've built up so far to
                            // the result array
                            $result = array_merge($result, $text);

                            // Start a fresh $text array that will contain
                            // reversed characters
                            $text = [];

                            // Add the reversed text flag to the stack
                            $marked_stack[] = ['ReversedChars' => true];
                        }
                        break;

                        // set graphics position matrix
                    case 'cm':
                        // Missing operands count as 0 (what the float cast of
                        // an absent value yields) without reading undefined offsets.
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
                        $current_position_cm = [
                            'a' => (float) ($args[0] ?? 0), 'b' => (float) ($args[1] ?? 0), 'c' => 0,
                            'i' => (float) ($args[2] ?? 0), 'j' => (float) ($args[3] ?? 0), 'k' => 0,
                            'x' => (float) ($args[4] ?? 0), 'y' => (float) ($args[5] ?? 0), 'z' => 1,
                        ];
                        break;

                    case 'Do':
                        // XObjects can only be resolved through a page
                        if (null === $page) {
                            break;
                        }

                        $args = preg_split('/\s/s', $command[self::COMMAND]);
                        $id = trim((string) array_pop($args), '/ ');
                        $xobject = $page->getXObject($id);

                        // Check we got a PDFObject back.
                        if (!$xobject instanceof self) {
                            break;
                        }

                        // If the PDFObject is an image, do nothing, as images aren't text.
                        if ($xobject instanceof Image) {
                            break;
                        }

                        // Check this is not a circular reference.
                        if (!\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
                            $text[] = $xobject->getText($page);
                        }
                        break;

                        // Marked content point with (DP) & without (MP) property list
                    case 'DP':
                    case 'MP':
                        break;

                        // End text object
                    case 'ET':
                        break;

                        // Store current selected font and graphics matrix
                    case 'q':
                        $clipped_font[] = [$current_font, $current_font_size];
                        $clipped_position_cm[] = $current_position_cm;
                        break;

                        // Restore previous selected font and graphics matrix
                    case 'Q':
                        // An unbalanced 'Q' (nothing saved by 'q') must not
                        // wipe the current font and matrix with null.
                        if ([] !== $clipped_font) {
                            [$current_font, $current_font_size] = array_pop($clipped_font);
                            $current_position_cm = array_pop($clipped_position_cm);
                        }
                        break;

                        // End marked content sequence
                    case 'EMC':
                        if (\count($marked_stack)) {
                            $marked = array_pop($marked_stack);
                            $action = key($marked);
                            $data = $marked[$action];

                            switch ($action) {
                                // If we are in ReversedChars mode...
                                case 'ReversedChars':
                                    // Reverse the characters we've built up so far
                                    foreach ($text as $key => $t) {
                                        $text[$key] = implode('', array_reverse(
                                            mb_str_split($t, 1, mb_internal_encoding())
                                        ));
                                    }

                                    // Add these characters to the result array
                                    $result = array_merge($result, $text);

                                    // Start a fresh $text array that will contain
                                    // non-reversed characters
                                    $text = [];
                                    break;

                                case 'ActualText':
                                    // Use the content of the ActualText as a command,
                                    // provided it could be parsed into one
                                    if (\is_array($data)) {
                                        $command = $data;
                                    }
                                    break;
                            }
                        }

                        // If this EMC command has been transformed into a 'Tj'
                        // or 'TJ' command because of being ActualText, then bypass
                        // the break to proceed to the writing section below.
                        if ('Tj' !== $command[self::OPERATOR] && 'TJ' !== $command[self::OPERATOR]) {
                            break;
                        }

                        // no break
                    case "'":
                    case '"':
                        if ("'" === $command[self::OPERATOR] || '"' === $command[self::OPERATOR]) {
                            // Move to next line and write text
                            $current_position['x'] = 0;
                            $current_position_td['x'] = 0;
                            $current_position_td['y'] += $current_text_leading;
                        }
                        // no break
                    case 'Tj':
                        // Wrap the single string command into a one-element
                        // command list. A 'TJ' that arrived here by falling
                        // through from EMC already carries a list of
                        // sub-commands and must be left as it is.
                        if ('TJ' !== $command[self::OPERATOR]) {
                            $command[self::COMMAND] = [$command];
                        }
                        // no break
                    case 'TJ':
                        // Check the marked content stack for flags
                        $actual_text = false;
                        $reverse_text = false;
                        foreach ($marked_stack as $marked) {
                            if (isset($marked['ActualText'])) {
                                $actual_text = true;
                            }
                            if (isset($marked['ReversedChars'])) {
                                $reverse_text = true;
                            }
                        }

                        // Account for text position ONLY just before we write text
                        if (false === $actual_text && \is_array($last_written_position)) {
                            // If $last_written_position is an array, that
                            // means we have stored text position coordinates
                            // for placing an ActualText
                            $currentX = $last_written_position[0];
                            $currentY = $last_written_position[1];
                            $last_written_position = false;
                        } else {
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
                        }
                        $whiteSpace = '';

                        $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i'];
                        $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j'];

                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
                            $curY = $currentY - $current_position['y'];
                            if (abs($curY) >= abs($factorY) / 4) {
                                $whiteSpace = "\n";
                            } else {
                                if (true === $reverse_text) {
                                    $curX = $current_position['x'] - $currentX;
                                } else {
                                    $curX = $currentX - $current_position['x'];
                                }

                                // In abs($factorX * 7) below, the 7 is chosen arbitrarily
                                // as the number of apparent "spaces" in a document we
                                // would need before considering them a "tab". In the
                                // future, we might offer this value to users as a config
                                // option.
                                if ($curX >= abs($factorX * 7)) {
                                    $whiteSpace = "\t";
                                } elseif ($curX >= abs($factorX * 2)) {
                                    $whiteSpace = ' ';
                                }
                            }
                        }

                        $newtext = $this->getTJUsingFontFallback(
                            $current_font,
                            $command[self::COMMAND],
                            $page,
                            $factorX
                        );

                        // If there is no ActualText pending then write
                        if (false === $actual_text) {
                            $newtext = str_replace(["\r", "\n"], '', $newtext);
                            if (false !== $reverse_text) {
                                // If we are in ReversedChars mode, add the whitespace last
                                $text[] = preg_replace('/  $/', ' ', $newtext.$whiteSpace);
                            } else {
                                // Otherwise add the whitespace first
                                if (' ' === $whiteSpace && isset($text[\count($text) - 1])) {
                                    $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]);
                                }
                                $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext);
                            }

                            // Record the position of this inserted text for comparison
                            // with the next text block.
                            // Provide a 'fudge' factor guess on how wide this text block
                            // is based on the number of characters. This helps limit the
                            // number of tabs inserted, but isn't perfect.
                            $factor = $factorX / 2;
                            $current_position = [
                                'x' => $currentX - mb_strlen($newtext) * $factor,
                                'y' => $currentY,
                            ];
                        } elseif (false === $last_written_position) {
                            // If there is an ActualText in the pipeline
                            // store the position this undisplayed text
                            // *would* have been written to, so the
                            // ActualText is displayed in the right spot
                            $last_written_position = [$currentX, $currentY];
                            $current_position['x'] = $currentX;
                        }
                        break;

                        // move to start of next line
                    case 'T*':
                        $current_position['x'] = 0;
                        $current_position_td['x'] = 0;
                        $current_position_td['y'] += $current_text_leading;
                        break;

                        // set character spacing
                    case 'Tc':
                        break;

                        // move text current point and set leading
                    case 'Td':
                    case 'TD':
                        // move text current point
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
                        $y = (float) array_pop($args);
                        $x = (float) array_pop($args);

                        if ('TD' === $command[self::OPERATOR]) {
                            $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j'];
                        }

                        $current_position_td = [
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
                        ];
                        break;

                    case 'Tf':
                        $args = preg_split('/\s/s', $command[self::COMMAND]);
                        $size = (float) array_pop($args);
                        $id = trim((string) array_pop($args), '/');
                        if (null !== $page) {
                            $new_font = $page->getFont($id);
                            // If an invalid font ID is given, do not update the font.
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
                            // But we want to make sure that malformed PDFs do not simply crash.
                            if (null !== $new_font) {
                                $current_font = $new_font;
                                $current_font_size = $size;
                            }
                        }
                        break;

                        // set leading
                    case 'TL':
                        $y = (float) $command[self::COMMAND];
                        $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j'];
                        break;

                        // set text position matrix
                    case 'Tm':
                        // Missing operands count as 0, see 'cm' above
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
                        $current_position_tm = [
                            'a' => (float) ($args[0] ?? 0), 'b' => (float) ($args[1] ?? 0), 'c' => 0,
                            'i' => (float) ($args[2] ?? 0), 'j' => (float) ($args[3] ?? 0), 'k' => 0,
                            'x' => (float) ($args[4] ?? 0), 'y' => (float) ($args[5] ?? 0), 'z' => 1,
                        ];
                        break;

                        // set text rendering mode
                    case 'Tr':
                        break;

                        // set super/subscripting text rise
                    case 'Ts':
                        break;

                        // set word spacing
                    case 'Tw':
                        break;

                        // set horizontal scaling
                    case 'Tz':
                        break;

                    default:
                }
            }
        }

        return array_merge($result, $text);
    }
1059
1060 52
    /**
     * Splits one already formatted, single-line command of a document
     * stream into its type, operator and operand part.
     *
     * The companion function getSectionsText() returns a document
     * stream as an array of single commands for just this purpose.
     * Because of this, the argument $offset is no longer used, and
     * may be removed in a future PdfParser release.
     *
     * A better name for this function would be getCommandText()
     * since it now always works on just one command.
     *
     * @param string $text_part a single command line
     * @param int    $offset    unused, kept for backward compatibility
     *
     * @return array an empty array when no valid command is detected,
     *               otherwise a list with exactly one entry keyed by
     *               self::TYPE, self::OPERATOR and self::COMMAND; for
     *               'TJ' the self::COMMAND value is itself a list of
     *               such entries
     */
    public function getCommandsText(string $text_part, int &$offset = 0): array
    {
        $found = [];
        preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $found);

        // All three capture groups are needed to form a command
        if (!isset($found[1], $found[2], $found[3])) {
            return [];
        }

        [, $operand, $type, $operator] = $found;
        $operand = trim($operand);

        if ('TJ' === $operator) {
            // String formats that may appear inside a TJ array, tried in
            // this order on every pass: parentheses () first, then
            // hexadecimal <>. Each may be followed by a number.
            $formats = [
                '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
                '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
            ];

            $pieces = [];
            $operand = trim($operand, '[]');

            // Keep consuming from the front until a pass removes nothing
            do {
                $before = $operand;

                foreach ($formats as $delimiter => $pattern) {
                    if (!preg_match($pattern, $operand, $tjmatch)) {
                        continue;
                    }

                    // Hexadecimal data is stored without embedded whitespace
                    $payload = $tjmatch[1];
                    if ('<' === $delimiter) {
                        $payload = preg_replace('/\s/', '', $payload);
                    }

                    $pieces[] = [
                        self::TYPE => $delimiter,
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $payload,
                    ];

                    // Optional number following the string
                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
                        $pieces[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $tjmatch[2],
                        ];
                    }

                    $operand = substr($operand, \strlen($tjmatch[0]));
                }
            } while ($operand !== $before);

            $operand = $pieces;
        } elseif (\in_array($operator, ['Tj', "'", '"'], true)) {
            // Strip the delimiters that belong to the string type
            if ('(' === $type) {
                // trim() is not usable here: a () string may end with a
                // balanced or escaped right parenthesis and trim() would
                // delete both. Both strings below are valid:
                //   eg. (String())
                //   eg. (String\))
                $operand = preg_replace('/^\(|\)$/', '', $operand);
            } elseif ('<' === $type) {
                $operand = trim($operand, '<>');
            }
        } elseif ('/' === $type) {
            // Drop the leading slash
            $operand = substr($operand, 1);
        }

        return [
            [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $operand,
            ],
        ];
    }
1154 13
1155
    /**
     * Creates the object class matching the "Type" entry (and, for
     * XObject and Font, the "Subtype" entry) of the given header.
     *
     * - XObject/Image yields an Image, XObject/Form a Form, any other
     *   XObject a plain PDFObject.
     * - Pages, Page and Encoding yield the class of the same name.
     * - Font yields \Smalot\PdfParser\Font\Font<Subtype> when such a
     *   class exists, otherwise the generic Font.
     * - Everything else yields a plain PDFObject.
     *
     * @param Document    $document document passed on to the created object
     * @param Header      $header   header whose Type/Subtype select the class
     * @param string|null $content  raw content passed on to the created object
     * @param Config|null $config   optional configuration; for images it decides
     *                              whether the content is kept or replaced by null
     *
     * @return self the created object
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is optional, so it must not be dereferenced
                        // blindly. Without a Config nothing asks for the image
                        // content to be discarded, so it is kept.
                        // NOTE(review): presumably this matches Config's own
                        // default for retainImageContent - confirm.
                        $retainContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                // Prefer the subtype-specific font class when there is one
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
1196
1197
    /**
     * Gives back the identifier of this object instance, as reported
     * by spl_object_hash().
     *
     * @return string the object hash of $this
     */
    protected function getUniqueId(): string
    {
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
1204
}
1205