PDFObject::getText()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 1

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 1
eloc 4
c 1
b 1
f 0
nc 1
nop 1
dl 0
loc 7
ccs 5
cts 5
cp 1
crap 1
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\XObject\Form;
36
use Smalot\PdfParser\XObject\Image;
37
38
/**
39
 * Class PDFObject
40
 */
41
class PDFObject
42
{
43
    public const TYPE = 't';
44
45
    public const OPERATOR = 'o';
46
47
    public const COMMAND = 'c';
48
49
    /**
50
     * The recursion stack.
51
     *
52
     * @var array
53
     */
54
    public static $recursionStack = [];
55
56
    /**
57
     * @var Document|null
58
     */
59
    protected $document;
60
61
    /**
62
     * @var Header
63
     */
64
    protected $header;
65
66
    /**
67
     * @var string
68
     */
69
    protected $content;
70
71
    /**
72
     * @var Config|null
73
     */
74
    protected $config;
75
76
    /**
77
     * @var bool
78
     */
79
    protected $addPositionWhitespace = false;
80
81 93
    /**
     * Stores the owning document together with the optional header,
     * raw content and configuration of this object.
     *
     * When no header is given, a fresh empty Header is created so that
     * the header property is always an object after construction.
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        $this->document = $document;
        $this->content = $content;
        $this->config = $config;

        // Fall back to an empty header when none was supplied
        $this->header = null === $header ? new Header() : $header;
    }
92
93 72
    /**
     * Initialization hook. This implementation intentionally does
     * nothing.
     *
     * NOTE(review): presumably meant to be overridden by subclasses
     * that need post-construction setup - confirm against callers.
     */
    public function init()
    {
        // Nothing to initialize here.
    }
96
97 4
    /**
     * Returns the document this object belongs to.
     *
     * The property is annotated as Document|null while the return type
     * is non-nullable, so PHP raises a TypeError here if the property
     * is ever null. The constructor always assigns a Document.
     */
    public function getDocument(): Document
    {
        $document = $this->document;

        return $document;
    }
101
102 72
    /**
     * Returns the header stored on this object.
     */
    public function getHeader(): ?Header
    {
        $header = $this->header;

        return $header;
    }
106
107 4
    /**
     * Returns the configuration passed to the constructor, or null
     * when none was provided.
     */
    public function getConfig(): ?Config
    {
        $config = $this->config;

        return $config;
    }
111
112
    /**
     * Looks up the header entry called $name by delegating to
     * Header::get().
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        $entry = $this->header->get($name);

        return $entry;
    }
119
120 73
    /**
     * Reports whether the header has an entry called $name by
     * delegating to Header::has().
     */
    public function has(string $name): bool
    {
        $exists = $this->header->has($name);

        return $exists;
    }
124
125 4
    /**
     * Returns the header details by delegating to Header::getDetails().
     *
     * @param bool $deep forwarded unchanged to Header::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        $details = $this->header->getDetails($deep);

        return $details;
    }
129
130 59
    /**
     * Returns the raw content string stored on this object, or null
     * when no content was set.
     */
    public function getContent(): ?string
    {
        $content = $this->content;

        return $content;
    }
134
135
    /**
     * Creates a duplicate of the document stream with strings and
     * other items replaced by $char. Every replacement has the same
     * length as the text it covers, so byte offsets in the result match
     * the offsets in $content. Formerly getSectionsText() used this
     * output to more easily gather offset values to extract text from
     * the *actual* document stream.
     *
     * @deprecated function is no longer used and will be removed in a future release
     *
     * @internal
     *
     * @param string $content the raw document stream
     * @param string $char    mask character; only its first byte is
     *                        used, and an empty string falls back to 'X'
     *
     * @return string the masked copy of $content
     */
    public function cleanContent(string $content, string $char = 'X')
    {
        // Exactly one byte is needed as the mask so that the masked copy
        // keeps the same length; an empty $char would otherwise hit an
        // uninitialized string offset.
        $char = '' === $char ? 'X' : $char[0];

        // Overwrites each captured [text, offset] pair with a run of
        // $char of identical length. Because lengths never change, the
        // offsets of later captures stay valid.
        $mask = static function (string $subject, array $captures) use ($char): string {
            foreach ($captures as $capture) {
                $length = \strlen($capture[0]);
                $subject = substr_replace($subject, str_repeat($char, $length), $capture[1], $length);
            }

            return $subject;
        };

        // Neutralize escaped backslashes and parentheses first so they
        // cannot confuse the bracket matching below
        $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

        // Remove image bloc with binary content
        preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[0] ?? []);

        // Clean content in square brackets [.....]
        preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1] ?? []);

        // Clean content in round brackets (.....)
        preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1] ?? []);

        // Clean structure: everything from an opening '<' up to its
        // matching '>' (nesting included) is masked
        $parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        if ($parts) {
            $content = '';
            $level = 0;
            foreach ($parts as $part) {
                if ('<' === $part) {
                    ++$level;
                }

                $content .= 0 === $level ? $part : str_repeat($char, \strlen($part));

                if ('>' === $part) {
                    --$level;
                }
            }
        }

        // Clean BDC and EMC markup. The delimiter is passed to
        // preg_quote() so that a '/' mask character cannot terminate
        // the pattern early.
        preg_match_all(
            '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
            $content,
            $matches,
            \PREG_OFFSET_CAPTURE
        );
        $content = $mask($content, $matches[1] ?? []);

        preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1] ?? []);

        return $content;
    }
203
204
    /**
     * Reformats a PDF document stream so that every PDF command sits
     * on its own line, with lines separated by \r\n.
     *
     * An empty string is returned when $content is null, or when the
     * part of the stream preceding the first string / inline image is
     * not valid UTF-8 (which is taken as a sign of binary data).
     */
    private function formatContent(?string $content): string
    {
        if (null === $content) {
            return '';
        }

        // Outside of (String) and inline image content in PDF document
        // streams, all text should conform to UTF-8. To test for binary
        // content, first cut the stream at the first open-parenthesis
        // (start of a string) or the first ID command (start of binary
        // inline image data), then strip every valid UTF-8 sequence
        // from what is left. Anything that survives is not UTF-8, so
        // the stream is most likely binary. mb_check_encoding(...,
        // 'UTF-8') is not strict enough, hence this regexp adapted
        // from the W3, see:
        // https://www.w3.org/International/questions/qa-forms-utf-8.en
        // preg_replace() is used instead of preg_match() to avoid "JIT
        // stack limit exhausted" errors on larger files.
        $beforeFirstString = preg_replace('/(\(|ID\s).*$/s', '', $content);
        $utf8Filter = preg_replace('/(
            [\x09\x0A\x0D\x20-\x7E] |            # ASCII
            [\xC2-\xDF][\x80-\xBF] |             # non-overlong 2-byte
            \xE0[\xA0-\xBF][\x80-\xBF] |         # excluding overlongs
            [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} |  # straight 3-byte
            \xED[\x80-\x9F][\x80-\xBF] |         # excluding surrogates
            \xF0[\x90-\xBF][\x80-\xBF]{2} |      # planes 1-3
            [\xF1-\xF3][\x80-\xBF]{3} |          # planes 4-15
            \xF4[\x80-\x8F][\x80-\xBF]{2}        # plane 16
        )/xs', '', $beforeFirstString);

        if ('' !== $utf8Filter) {
            return '';
        }

        // Swap every (string) for a unique placeholder so the following
        // steps cannot alter string contents
        $stringStore = [];
        $needle = '(';
        while (preg_match('/'.preg_quote($needle, '/').'.*?(?<![^\\\\]\\\\)\)/s', $content, $found)) {
            $candidate = $found[0];

            // PDF strings may contain unescaped parentheses as long as
            // they are balanced, so count both kinds
            $opening = preg_match_all('/(?<![^\\\\]\\\\)\(/', $candidate);
            $closing = preg_match_all('/(?<![^\\\\]\\\\)\)/', $candidate);

            if ($opening != $closing) {
                // Unbalanced: use the current match as the base for
                // finding a longer string on the next pass
                $needle = $candidate;

                continue;
            }

            // Balanced: store the string under a unique placeholder id
            $placeholderId = uniqid('STRING_', true);
            $stringStore[$placeholderId] = $candidate;
            $content = preg_replace(
                '/'.preg_quote($candidate, '/').'/',
                '@@@'.$placeholderId.'@@@',
                $content,
                1
            );

            // Start over to look for the next string
            $needle = '(';
        }

        // Turn all carriage returns and line-feeds into spaces
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Swap every dictionary << >> that precedes a marked-content
        // operator for a placeholder as well
        $dictStore = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $dictMatch)) {
            $dictId = uniqid('DICT_', true);
            $dictStore[$dictId] = $dictMatch[1];
            $content = preg_replace(
                '/'.preg_quote($dictMatch[0], '/').'/',
                ' ###'.$dictId.'###'.$dictMatch[2],
                $content,
                1
            );
        }

        // Collapse runs of white-space into a single space
        $content = preg_replace('/\s{2,}/', ' ', $content);

        // Append \r\n after every valid PDF operator so that each
        // command ends up on its own line
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
        //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions
        //       appear here in the list for completeness.
        $operators = [
            'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
            'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
            'g', 'G',  'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
            'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
            'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
            'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($operators as $operator) {
            $pattern = '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/';
            $content = preg_replace($pattern, $operator."\r\n", $content);
        }

        // Put the dictionaries back, most recently stored first
        foreach (array_reverse($dictStore, true) as $dictId => $dictBody) {
            $content = str_replace('###'.$dictId.'###', $dictBody, $content);
        }

        // Put the strings back, most recently stored first
        foreach (array_reverse($stringStore, true) as $placeholderId => $stringBody) {
            // Strings may carry escaped or literal newlines; rewrite
            // them so no string gets split across two lines (every
            // command must stay on one line)
            $stringBody = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $stringBody
            );

            $content = str_replace('@@@'.$placeholderId.'@@@', $stringBody, $content);
        }

        // Drop blank lines and the spaces that follow a line break
        return trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));
    }
344
345
    /**
     * Takes an entire, unformatted document stream, formats it via
     * formatContent() and keeps only the commands needed for text
     * positioning/extraction. The result is a list of unprocessed PDF
     * commands, one command per element.
     *
     * @internal
     */
    public function getSectionsText(?string $content): array
    {
        // The formatted stream holds one command per line, so splitting
        // on line breaks yields one command per element
        $lines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->formatContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        $sections = [];
        $insideText = false;

        foreach ($lines as $rawLine) {
            $line = trim($rawLine);

            // Nothing left after trimming: ignore
            if ('' === $line) {
                continue;
            }

            // 'BT' opens a text block
            if (preg_match('/BT$/', $line)) {
                $insideText = true;
                $sections[] = $line;

                continue;
            }

            // 'ET' closes the text block
            if ('ET' === $line) {
                $insideText = false;
                $sections[] = $line;

                continue;
            }

            // Between BT and ET every line is kept
            if ($insideText) {
                $sections[] = $line;

                continue;
            }

            // Outside of a text block only specific, necessary commands
            // are kept. Each check must match *only* the intended
            // command (a bare search for 'c' would also hit 'sc', see
            // the operator list in formatContent()). Add more commands
            // here as they turn up in unusual PDFs:
            //   q / Q       save and restore graphics state
            //   BDC / BMC   begin marked content sequence
            //   DP / MP     marked content point
            //   EMC         end marked content sequence
            //   cm          graphics position change
            //   Tf          font change
            //   Do          invoke named XObject
            $lastChar = substr($line, -1);
            if ('q' === $lastChar
                || 'Q' === $lastChar
                || preg_match('/(?<!\w)(?:B[DM]C|[DM]P|EMC|cm|Tf|Do)$/', $line)
            ) {
                $sections[] = $line;
            }
        }

        return $sections;
    }
424
425 46
    /**
     * Picks the font to start text decoding with.
     *
     * Candidates are the fonts of $page (when a page is given)
     * followed by the document's first font (when it has one); the
     * first candidate wins. With no candidate at all a new, empty Font
     * bound to this document and config is returned.
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $candidates = null === $page ? [] : $page->getFonts();

        $documentFont = $this->document->getFirstFont();
        if (null !== $documentFont) {
            $candidates[] = $documentFont;
        }

        if (0 === \count($candidates)) {
            return new Font($this->document, null, null, $this->config);
        }

        return reset($candidates);
    }
443
444
    /**
     * Decodes a '[]TJ' command with $font and, when the result
     * contains control characters or NUL bytes, retries with the other
     * fonts of $page until one produces clean output.
     *
     * If no page is given no fallback is attempted. If every font of
     * the page has been tried without success, the text decoded with
     * the original $font is returned.
     *
     * @internal
     *
     * @param array<int,array<string,string|bool>> $command
     */
    private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string
    {
        $originalText = $font->decodeText($command, $fontFactor);

        // If we make this a Config option, we can add a check if it's
        // enabled here.
        if (null === $page) {
            return $originalText;
        }

        $text = $originalText;
        $fontIds = array_keys($page->getFonts());

        // Control characters in the decoded text suggest that the wrong
        // font was used. x09 to x0d are whitespace and therefore
        // allowed. The second test looks for a raw NUL byte directly: it
        // still applies when the /u match fails on invalid UTF-8, and -
        // unlike searching the hex dump for "00" - it cannot be
        // triggered by two neighbouring bytes such as "\x20\x0A".
        while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
            // Out of fonts to try: give up and use the original string
            if (0 === \count($fontIds)) {
                return $originalText;
            }

            // Try the next font ID
            $candidate = $page->getFont(array_shift($fontIds));

            // NOTE(review): guards against Page::getFont() handing back
            // something other than a Font (e.g. null) - confirm against
            // its signature.
            if (!$candidate instanceof Font) {
                continue;
            }

            $text = $candidate->decodeText($command, $fontFactor);
        }

        return $text;
    }
482
483
    /**
     * Parses a complete PDF dictionary object, given as a string that
     * includes the outer enclosing << >> angle brackets, into a nested
     * PHP array.
     *
     * Nested << >> dictionaries become nested arrays. A [ ] array
     * holding exactly one non-name string is stored as the string
     * '[...]' instead of a one-element array.
     *
     * @internal
     *
     * @throws \Exception when the string does not start with '<<'
     */
    public function parseDictionary(string $dictionary): array
    {
        // Collapse line breaks and runs of whitespace into single spaces
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

        if ('<<' !== substr($dictionary, 0, 2)) {
            throw new \Exception('Not a valid dictionary object.');
        }

        // $node is the container currently being filled; $parents holds
        // references to the enclosing containers
        $node = [];
        $parents = [];
        $pendingKey = '';
        $inList = false;

        // Strip the outer << >> and split what remains into tokens
        $inner = trim(preg_replace('/^<<|>>$/', '', $dictionary));
        $tokens = preg_split(
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
            $inner,
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        foreach ($tokens as $rawToken) {
            $token = trim($rawToken);

            switch ($token) {
                case '':
                    break;

                // Open numeric array
                case '[':
                    $node[$pendingKey] = [];
                    $inList = true;

                    // Descend: remember the parent, then point $node at
                    // the new child container
                    $parents[\count($parents)] = &$node;
                    $node = &$node[$pendingKey];
                    $pendingKey = '';
                    break;

                // Open hashed array
                case '<<':
                    $node[$pendingKey] = [];
                    $inList = false;

                    // Descend: remember the parent, then point $node at
                    // the new child container
                    $parents[\count($parents)] = &$node;
                    $node = &$node[$pendingKey];
                    $pendingKey = '';
                    break;

                // Close numeric array
                case ']':
                    // An array with a single non-name string is turned
                    // back into one bracketed string
                    if (\is_array($node) && 1 === \count($node)
                        && isset($node[0]) && \is_string($node[0])
                        && '' !== $node[0] && '/' !== $node[0][0]) {
                        $node = '['.$node[0].']';
                    }
                    // Continue with the shared closing logic
                    // no break
                case '>>':
                    $inList = false;

                    // Ascend: point $node back at the parent container
                    $node = &$parents[\count($parents) - 1];
                    unset($parents[\count($parents) - 1]);
                    break;

                default:
                    if ('/' === substr($token, 0, 1)) {
                        // A leading slash marks a name
                        $pendingKey = substr($token, 1);
                        if ($inList) {
                            // Inside [ ] a name is a plain list element
                            $node[] = $pendingKey;
                            $pendingKey = '';
                        }
                    } elseif ('' !== $pendingKey) {
                        // A value that follows a name
                        if (!$inList) {
                            $node[$pendingKey] = $token;
                        }
                        $pendingKey = '';
                    } else {
                        // A value without a preceding name
                        $node[] = $token;
                    }
            }
        }

        return $node;
    }
581
582
    /**
     * Returns the text content of a PDF as a string. Attempts to add
     * whitespace for spacing and line-breaks where appropriate.
     *
     * getText() delegates to getTextArray() with the
     * addPositionWhitespace flag switched on, so whitespace is inserted
     * in a logical way for reading by humans. The pieces are
     * concatenated and a single trailing space is appended.
     *
     * The flag is switched off again even when getTextArray() throws,
     * so a failed call cannot change the output of later
     * getTextArray() calls on this object.
     */
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;

        try {
            $pieces = $this->getTextArray($page);
        } finally {
            $this->addPositionWhitespace = false;
        }

        return implode('', $pieces).' ';
    }
599
600
    /**
601
     * Returns the text content of a PDF as an array of strings. No
602
     * extra whitespace is inserted besides what is actually encoded in
603
     * the PDF text.
604
     *
605
     * @throws \Exception
606
     */
607 46
    public function getTextArray(?Page $page = null): array
608
    {
609 46
        $result = [];
610 46
        $text = [];
611
612 46
        $marked_stack = [];
613 46
        $last_written_position = false;
614
615 46
        $sections = $this->getSectionsText($this->content);
616 46
        $current_font = $this->getDefaultFont($page);
617 46
        $current_font_size = 1;
618 46
        $current_text_leading = 0;
619
620 46
        $current_position = ['x' => false, 'y' => false];
621 46
        $current_position_tm = [
622 46
            'a' => 1, 'b' => 0, 'c' => 0,
623 46
            'i' => 0, 'j' => 1, 'k' => 0,
624 46
            'x' => 0, 'y' => 0, 'z' => 1,
625 46
        ];
626 46
        $current_position_td = ['x' => 0, 'y' => 0];
627 46
        $current_position_cm = [
628 46
            'a' => 1, 'b' => 0, 'c' => 0,
629 46
            'i' => 0, 'j' => 1, 'k' => 0,
630 46
            'x' => 0, 'y' => 0, 'z' => 1,
631 46
        ];
632
633 46
        $clipped_font = [];
634 46
        $clipped_position_cm = [];
635
636 46
        self::$recursionStack[] = $this->getUniqueId();
637
638 46
        foreach ($sections as $section) {
639 43
            $commands = $this->getCommandsText($section);
640 43
            foreach ($commands as $command) {
641 43
                switch ($command[self::OPERATOR]) {
642
                    // Begin text object
643 43
                    case 'BT':
644
                        // Reset text positioning matrices
645 43
                        $current_position_tm = [
646 43
                            'a' => 1, 'b' => 0, 'c' => 0,
647 43
                            'i' => 0, 'j' => 1, 'k' => 0,
648 43
                            'x' => 0, 'y' => 0, 'z' => 1,
649 43
                        ];
650 43
                        $current_position_td = ['x' => 0, 'y' => 0];
651 43
                        $current_text_leading = 0;
652 43
                        break;
653
654
                        // Begin marked content sequence with property list
655 43
                    case 'BDC':
656 16
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
657 16
                            $dict = $this->parseDictionary($match[1]);
658
659
                            // Check for ActualText block
660 16
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
661 4
                                if ('[' == $dict['ActualText'][0]) {
662
                                    // Simulate a 'TJ' command on the stack
663
                                    $marked_stack[] = [
664
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
665
                                    ];
666 4
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
667
                                    // Simulate a 'Tj' command on the stack
668 4
                                    $marked_stack[] = [
669 4
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
670 4
                                    ];
671
                                }
672
                            }
673
                        }
674 16
                        break;
675
676
                        // Begin marked content sequence
677 43
                    case 'BMC':
678 2
                        if ('ReversedChars' == $command[self::COMMAND]) {
679
                            // Upon encountering a ReversedChars command,
680
                            // add the characters we've built up so far to
681
                            // the result array
682 1
                            $result = array_merge($result, $text);
683
684
                            // Start a fresh $text array that will contain
685
                            // reversed characters
686 1
                            $text = [];
687
688
                            // Add the reversed text flag to the stack
689 1
                            $marked_stack[] = ['ReversedChars' => true];
690
                        }
691 2
                        break;
692
693
                        // set graphics position matrix
694 43
                    case 'cm':
695 29
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
696 29
                        $current_position_cm = [
697 29
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
698 29
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
699 29
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
700 29
                        ];
701 29
                        break;
702
703 43
                    case 'Do':
704 15
                        if (null !== $page) {
705 15
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
706 15
                            $id = trim(array_pop($args), '/ ');
707 15
                            $xobject = $page->getXObject($id);
708
709
                            // @todo $xobject could be a ElementXRef object, which would then throw an error
710 15
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
711
                                // Not a circular reference.
712 15
                                $text[] = $xobject->getText($page);
713
                            }
714
                        }
715 15
                        break;
716
717
                        // Marked content point with (DP) & without (MP) property list
718 43
                    case 'DP':
719 43
                    case 'MP':
720 1
                        break;
721
722
                        // End text object
723 43
                    case 'ET':
724 43
                        break;
725
726
                        // Store current selected font and graphics matrix
727 43
                    case 'q':
728 37
                        $clipped_font[] = [$current_font, $current_font_size];
729 37
                        $clipped_position_cm[] = $current_position_cm;
730 37
                        break;
731
732
                        // Restore previous selected font and graphics matrix
733 43
                    case 'Q':
734 37
                        list($current_font, $current_font_size) = array_pop($clipped_font);
735 37
                        $current_position_cm = array_pop($clipped_position_cm);
736 37
                        break;
737
738
                        // End marked content sequence
739 43
                    case 'EMC':
740 17
                        $data = false;
741 17
                        if (\count($marked_stack)) {
742 5
                            $marked = array_pop($marked_stack);
743 5
                            $action = key($marked);
744 5
                            $data = $marked[$action];
745
746
                            switch ($action) {
747
                                // If we are in ReversedChars mode...
748 5
                                case 'ReversedChars':
749
                                    // Reverse the characters we've built up so far
750 1
                                    foreach ($text as $key => $t) {
751 1
                                        $text[$key] = implode('', array_reverse(
752 1
                                            mb_str_split($t, 1, mb_internal_encoding())
0 ignored issues
show
Bug introduced by
It seems like mb_internal_encoding() can also be of type true; however, parameter $encoding of mb_str_split() does only seem to accept null|string, maybe add an additional type check? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

752
                                            mb_str_split($t, 1, /** @scrutinizer ignore-type */ mb_internal_encoding())
Loading history...
753 1
                                        ));
754
                                    }
755
756
                                    // Add these characters to the result array
757 1
                                    $result = array_merge($result, $text);
758
759
                                    // Start a fresh $text array that will contain
760
                                    // non-reversed characters
761 1
                                    $text = [];
762 1
                                    break;
763
764 4
                                case 'ActualText':
765
                                    // Use the content of the ActualText as a command
766 4
                                    $command = $data;
767 4
                                    break;
768
                            }
769
                        }
770
771
                        // If this EMC command has been transformed into a 'Tj'
772
                        // or 'TJ' command because of being ActualText, then bypass
773
                        // the break to proceed to the writing section below.
774 17
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
775 17
                            break;
776
                        }
777
778
                        // no break
779 43
                    case "'":
780 43
                    case '"':
781 4
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
782
                            // Move to next line and write text
783
                            $current_position['x'] = 0;
784
                            $current_position_td['x'] = 0;
785
                            $current_position_td['y'] += $current_text_leading;
786
                        }
787
                        // no break
788 43
                    case 'Tj':
789 35
                        $command[self::COMMAND] = [$command];
790
                        // no break
791 43
                    case 'TJ':
792
                        // Check the marked content stack for flags
793 43
                        $actual_text = false;
794 43
                        $reverse_text = false;
795 43
                        foreach ($marked_stack as $marked) {
796 5
                            if (isset($marked['ActualText'])) {
797 4
                                $actual_text = true;
798
                            }
799 5
                            if (isset($marked['ReversedChars'])) {
800 1
                                $reverse_text = true;
801
                            }
802
                        }
803
804
                        // Account for text position ONLY just before we write text
805 43
                        if (false === $actual_text && \is_array($last_written_position)) {
806
                            // If $last_written_position is an array, that
807
                            // means we have stored text position coordinates
808
                            // for placing an ActualText
809 4
                            $currentX = $last_written_position[0];
810 4
                            $currentY = $last_written_position[1];
811 4
                            $last_written_position = false;
812
                        } else {
813 43
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
814 43
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
815
                        }
816 43
                        $whiteSpace = '';
817
818 43
                        $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i'];
819 43
                        $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j'];
820
821 43
                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
822 31
                            $curY = $currentY - $current_position['y'];
823 31
                            if (abs($curY) >= abs($factorY) / 4) {
824 30
                                $whiteSpace = "\n";
825
                            } else {
826 30
                                if (true === $reverse_text) {
827 1
                                    $curX = $current_position['x'] - $currentX;
828
                                } else {
829 30
                                    $curX = $currentX - $current_position['x'];
830
                                }
831
832
                                // In abs($factorX * 7) below, the 7 is chosen arbitrarily
833
                                // as the number of apparent "spaces" in a document we
834
                                // would need before considering them a "tab". In the
835
                                // future, we might offer this value to users as a config
836
                                // option.
837 30
                                if ($curX >= abs($factorX * 7)) {
838 20
                                    $whiteSpace = "\t";
839 29
                                } elseif ($curX >= abs($factorX * 2)) {
840 17
                                    $whiteSpace = ' ';
841
                                }
842
                            }
843
                        }
844
845 43
                        $newtext = $this->getTJUsingFontFallback(
846 43
                            $current_font,
847 43
                            $command[self::COMMAND],
848 43
                            $page,
849 43
                            $factorX
850 43
                        );
851
852
                        // If there is no ActualText pending then write
853 43
                        if (false === $actual_text) {
854 43
                            $newtext = str_replace(["\r", "\n"], '', $newtext);
855 43
                            if (false !== $reverse_text) {
856
                                // If we are in ReversedChars mode, add the whitespace last
857 1
                                $text[] = preg_replace('/  $/', ' ', $newtext.$whiteSpace);
858
                            } else {
859
                                // Otherwise add the whitespace first
860 43
                                if (' ' === $whiteSpace && isset($text[\count($text) - 1])) {
861 16
                                    $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]);
862
                                }
863 43
                                $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext);
864
                            }
865
866
                            // Record the position of this inserted text for comparison
867
                            // with the next text block.
868
                            // Provide a 'fudge' factor guess on how wide this text block
869
                            // is based on the number of characters. This helps limit the
870
                            // number of tabs inserted, but isn't perfect.
871 43
                            $factor = $factorX / 2;
872 43
                            $current_position = [
873 43
                                'x' => $currentX - mb_strlen($newtext) * $factor,
874 43
                                'y' => $currentY,
875 43
                            ];
876 4
                        } elseif (false === $last_written_position) {
877
                            // If there is an ActualText in the pipeline
878
                            // store the position this undisplayed text
879
                            // *would* have been written to, so the
880
                            // ActualText is displayed in the right spot
881 4
                            $last_written_position = [$currentX, $currentY];
882 4
                            $current_position['x'] = $currentX;
883
                        }
884 43
                        break;
885
886
                        // move to start of next line
887 43
                    case 'T*':
888 13
                        $current_position['x'] = 0;
889 13
                        $current_position_td['x'] = 0;
890 13
                        $current_position_td['y'] += $current_text_leading;
891 13
                        break;
892
893
                        // set character spacing
894 43
                    case 'Tc':
895 13
                        break;
896
897
                        // move text current point and set leading
898 43
                    case 'Td':
899 43
                    case 'TD':
900
                        // move text current point
901 32
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
902 32
                        $y = (float) array_pop($args);
903 32
                        $x = (float) array_pop($args);
904
905 32
                        if ('TD' == $command[self::OPERATOR]) {
906 7
                            $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j'];
907
                        }
908
909 32
                        $current_position_td = [
910 32
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
911 32
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
912 32
                        ];
913 32
                        break;
914
915 43
                    case 'Tf':
916 43
                        $args = preg_split('/\s/s', $command[self::COMMAND]);
917 43
                        $size = (float) array_pop($args);
918 43
                        $id = trim(array_pop($args), '/');
919 43
                        if (null !== $page) {
920 43
                            $new_font = $page->getFont($id);
921
                            // If an invalid font ID is given, do not update the font.
922
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
923
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
924
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
925
                            // But we want to make sure that malformed PDFs do not simply crash.
926 43
                            if (null !== $new_font) {
927 39
                                $current_font = $new_font;
928 39
                                $current_font_size = $size;
929
                            }
930
                        }
931 43
                        break;
932
933
                        // set leading
934 37
                    case 'TL':
935 6
                        $y = (float) $command[self::COMMAND];
936 6
                        $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j'];
937 6
                        break;
938
939
                        // set text position matrix
940 37
                    case 'Tm':
941 34
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
942 34
                        $current_position_tm = [
943 34
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
944 34
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
945 34
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
946 34
                        ];
947 34
                        break;
948
949
                        // set text rendering mode
950 22
                    case 'Ts':
951
                        break;
952
953
                        // set super/subscripting text rise
954 22
                    case 'Ts':
955
                        break;
956
957
                        // set word spacing
958 22
                    case 'Tw':
959 9
                        break;
960
961
                        // set horizontal scaling
962 22
                    case 'Tz':
963
                        break;
964
965
                    default:
966
                }
967
            }
968
        }
969
970 46
        $result = array_merge($result, $text);
971
972 46
        return $result;
973
    }
974
975
    /**
     * getCommandsText() expects the content of $text_part to be an
     * already formatted, single-line command from a document stream.
     * The companion function getSectionsText() returns a document
     * stream as an array of single commands for just this purpose.
     * Because of this, the argument $offset is no longer used, and
     * may be removed in a future PdfParser release.
     *
     * A better name for this function would be getCommandText()
     * since it now always works on just one command.
     */
986 50
    public function getCommandsText(string $text_part, int &$offset = 0): array
    {
        // $offset is never read or written here; it only remains in the
        // signature so existing callers keep working.
        $parts = [];
        preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $parts);

        // When the pattern did not match, no valid command was detected.
        if (!isset($parts[1], $parts[2], $parts[3])) {
            return [];
        }

        // Group 2: optional leading delimiter ('/', '[', '(' or '<').
        // Group 3: the trailing operator. Group 1: everything before it.
        $leadChar = $parts[2];
        $operator = $parts[3];
        $payload = trim($parts[1]);

        switch ($operator) {
            case 'TJ':
                // A TJ operand is an array of strings, each optionally
                // followed by a number. Consume them from the front of the
                // payload until a full pass removes nothing more.
                $stringPatterns = [
                    // Parentheses string () format
                    '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
                    // Hexadecimal <> format
                    '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
                ];

                $pieces = [];
                $payload = trim($payload, '[]');

                do {
                    $before = $payload;

                    foreach ($stringPatterns as $stringKind => $pattern) {
                        if (!preg_match($pattern, $payload, $hit)) {
                            continue;
                        }

                        // Whitespace inside a hexadecimal string is dropped;
                        // a parentheses string is kept exactly as matched.
                        $stringBody = '<' === $stringKind
                            ? preg_replace('/\s/', '', $hit[1])
                            : $hit[1];

                        $pieces[] = [
                            self::TYPE => $stringKind,
                            self::OPERATOR => 'TJ',
                            self::COMMAND => $stringBody,
                        ];

                        // The number following the string, when present and
                        // truthy, becomes its own 'n' entry.
                        if (isset($hit[2]) && trim($hit[2])) {
                            $pieces[] = [
                                self::TYPE => 'n',
                                self::OPERATOR => '',
                                self::COMMAND => $hit[2],
                            ];
                        }

                        $payload = substr($payload, \strlen($hit[0]));
                    }
                } while ($payload != $before);

                $payload = $pieces;
                break;

            case 'Tj':
            case "'":
            case '"':
                if ('(' == $leadChar) {
                    // Remove exactly one parenthesis from each end. trim()
                    // would also eat a balanced or escaped closing
                    // parenthesis belonging to the string itself, e.g.
                    // (String()) or (String\)).
                    $payload = preg_replace('/^\(|\)$/', '', $payload);
                } elseif ('<' == $leadChar) {
                    $payload = trim($payload, '<>');
                }
                break;

            default:
                // For any other operator, only a leading '/' is stripped.
                if ('/' == $leadChar) {
                    $payload = substr($payload, 1);
                }
        }

        // The result is always a list holding this single command.
        return [
            [
                self::TYPE => $leadChar,
                self::OPERATOR => $operator,
                self::COMMAND => $payload,
            ],
        ];
    }
1069
1070 65
    /**
     * Creates the object wrapper matching the header's Type (and Subtype).
     *
     * Dispatch, as implemented below:
     *  - XObject / Image        => Image
     *  - XObject / Form         => Form
     *  - XObject / anything else => plain PDFObject
     *  - Pages, Page, Encoding  => the class of the same name
     *  - Font                   => Font\Font<Subtype> when that class exists,
     *                              otherwise the generic Font
     *  - any other type         => plain PDFObject
     *
     * @param Document    $document document handed to the created object
     * @param Header      $header   header whose 'Type' / 'Subtype' entries select the class
     * @param string|null $content  raw content handed to the created object
     * @param Config|null $config   optional configuration; may be null
     *
     * @return self the created object (an instance of this class or of one of the classes listed above)
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is nullable, so it must not be dereferenced
                        // unconditionally. Without a config nothing asks for
                        // the image content to be discarded, so it is kept.
                        // NOTE(review): assumed to match Config's default for
                        // retainImageContent - confirm against Config.
                        $retainImageContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainImageContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                // Any other XObject subtype is wrapped as a plain object.
                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                // Prefer a subtype-specific font class when one exists.
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
1111
1112
    /**
1113
     * Returns unique id identifying the object.
1114
     */
1115 46
    protected function getUniqueId(): string
    {
        // spl_object_hash() supplies the identifier string for this instance.
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
1119
}
1120