Issues (82)

src/Smalot/PdfParser/PDFObject.php (2 issues)

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\XObject\Form;
36
use Smalot\PdfParser\XObject\Image;
37
38
/**
39
 * Class PDFObject
40
 */
41
class PDFObject
42
{
43
    /**
     * Array key of a parsed command's type marker.
     * NOTE(review): not read in the visible part of this class - confirm usage.
     */
    public const TYPE = 't';

    /**
     * Array key under which a parsed command stores its operator name
     * (e.g. 'BT', 'BDC', 'cm').
     */
    public const OPERATOR = 'o';

    /**
     * Array key under which a parsed command stores its operand text.
     */
    public const COMMAND = 'c';

    /**
     * The recursion stack.
     *
     * @var array
     */
    public static $recursionStack = [];

    /**
     * Owning document; always set by the constructor.
     *
     * @var Document
     */
    protected $document;

    /**
     * Object header; the constructor substitutes an empty Header when none
     * is supplied.
     *
     * @var Header
     */
    protected $header;

    /**
     * Raw content of the object, or null when it has none.
     *
     * @var string|null
     */
    protected $content;

    /**
     * @var Config|null
     */
    protected $config;

    /**
     * When true, text extraction inserts extra whitespace derived from
     * positioning; toggled by getText().
     *
     * @var bool
     */
    protected $addPositionWhitespace = false;
80
81 96
    /**
     * @param Document    $document document this object belongs to
     * @param Header|null $header   object header; an empty Header is created when omitted
     * @param string|null $content  raw object content, if any
     * @param Config|null $config   parser configuration, if any
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        // Guarantee that a header object is always available
        if (null === $header) {
            $header = new Header();
        }

        $this->document = $document;
        $this->header = $header;
        $this->content = $content;
        $this->config = $config;
    }
92
93 72
    /**
     * Initialization hook. Intentionally does nothing in this class.
     */
    public function init()
    {
    }
96
97 4
    /**
     * Returns the document this object was constructed with.
     */
    public function getDocument(): Document
    {
        return $this->document;
    }
101
102 72
    /**
     * Returns the header of this object.
     */
    public function getHeader(): ?Header
    {
        return $this->header;
    }
106
107 4
    /**
     * Returns the configuration passed to the constructor, or null if
     * none was given.
     */
    public function getConfig(): ?Config
    {
        return $this->config;
    }
111
112
    /**
     * Looks up a header entry by name (delegates to the header).
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        $header = $this->header;

        return $header->get($name);
    }
119
120 74
    /**
     * Tells whether the header reports an entry with the given name
     * (delegates to the header).
     */
    public function has(string $name): bool
    {
        $header = $this->header;

        return $header->has($name);
    }
124
125 4
    /**
     * Returns the header details (delegates to the header).
     *
     * @param bool $deep forwarded unchanged to Header::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        $header = $this->header;

        return $header->getDetails($deep);
    }
129
130 59
    /**
     * Returns the raw content given at construction time, or null when
     * the object has none.
     */
    public function getContent(): ?string
    {
        return $this->content;
    }
134
135
    /**
     * Creates a duplicate of the document stream with
     * strings and other items replaced by $char. Formerly
     * getSectionsText() used this output to more easily gather offset
     * values to extract text from the *actual* document stream.
     *
     * Every replacement keeps the byte length of the masked text, so
     * offsets in the returned string line up with offsets in $content.
     *
     * @deprecated function is no longer used and will be removed in a future release
     *
     * @internal
     *
     * @param string $content document stream to mask
     * @param string $char    masking character; only its first byte is used,
     *                        and an empty string falls back to 'X'
     *
     * @return string
     */
    public function cleanContent(string $content, string $char = 'X')
    {
        // An empty mask would shrink the stream and invalidate every
        // captured offset, so fall back to the default mask instead
        $char = '' === $char ? 'X' : $char[0];
        $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

        // Overwrites each captured [text, offset] pair with the mask,
        // preserving its length
        $mask = static function (string $subject, array $captures) use ($char): string {
            foreach ($captures as [$matched, $offset]) {
                $length = \strlen($matched);
                $subject = substr_replace($subject, str_repeat($char, $length), $offset, $length);
            }

            return $subject;
        };

        // Remove image bloc with binary content
        preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[0]);

        // Clean content in square brackets [.....]
        preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1]);

        // Clean content in round brackets (.....)
        preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1]);

        // Clean structure: everything nested inside < ... > is masked,
        // including the angle brackets themselves
        if ($parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE)) {
            $content = '';
            $level = 0;
            foreach ($parts as $part) {
                if ('<' === $part) {
                    ++$level;
                }

                $content .= (0 === $level ? $part : str_repeat($char, \strlen($part)));

                if ('>' === $part) {
                    --$level;
                }
            }
        }

        // Clean BDC and EMC markup. The mask is quoted with the pattern
        // delimiter so that a '/' mask cannot terminate the pattern early.
        preg_match_all(
            '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
            $content,
            $matches,
            \PREG_OFFSET_CAPTURE
        );
        $content = $mask($content, $matches[1]);

        preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1]);

        return $content;
    }
203
204
    /**
     * Takes a string of PDF document stream text and formats
     * it into a multi-line string with one PDF command on each line,
     * separated by \r\n. If the given string is null, or binary data
     * is detected instead of a document stream then return an empty
     * string.
     *
     * Inline images, (strings) and << >> dictionaries are temporarily
     * swapped for unique placeholders so that the operator splitting
     * cannot alter them, then restored at the end. Images and strings
     * are swapped at the exact offset where they were matched.
     *
     * @param string|null $content unformatted document stream
     *
     * @return string formatted stream, or '' for null/binary input
     *
     * @throws \Exception passed through from parseDictionary()
     */
    private function formatContent(?string $content): string
    {
        if (null === $content) {
            return '';
        }

        // Outside of (String) and inline image content in PDF document
        // streams, all text should conform to UTF-8. Test for binary
        // content by deleting everything after the first open-
        // parenthesis ( which indicates the beginning of a string, or
        // the first ID command which indicates the beginning of binary
        // inline image content. Then test what remains for valid
        // UTF-8. If it's not UTF-8, return an empty string as this
        // $content is most likely binary. Unfortunately, using
        // mb_check_encoding(..., 'UTF-8') is not strict enough, so the
        // following regexp, adapted from the W3, is used. See:
        // https://www.w3.org/International/questions/qa-forms-utf-8.en
        // We use preg_replace() instead of preg_match() to avoid "JIT
        // stack limit exhausted" errors on larger files.
        $utf8Filter = preg_replace('/(
            [\x09\x0A\x0D\x20-\x7E] |            # ASCII
            [\xC2-\xDF][\x80-\xBF] |             # non-overlong 2-byte
            \xE0[\xA0-\xBF][\x80-\xBF] |         # excluding overlongs
            [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} |  # straight 3-byte
            \xED[\x80-\x9F][\x80-\xBF] |         # excluding surrogates
            \xF0[\x90-\xBF][\x80-\xBF]{2} |      # planes 1-3
            [\xF1-\xF3][\x80-\xBF]{3} |          # planes 4-15
            \xF4[\x80-\x8F][\x80-\xBF]{2}        # plane 16
        )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content));

        // Anything left over after stripping valid UTF-8 means binary data
        if ('' !== $utf8Filter) {
            return '';
        }

        // Find all inline image content and replace them so they aren't
        // affected by the next steps
        $pdfInlineImages = [];
        $offsetBI = 0;
        while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) {
            // Attempt to determine if this instance of the 'BI' command
            // actually occurred within a (string) using the following
            // steps:

            // Step 1: Remove any escaped slashes and parentheses from
            // the alleged image characteristics data
            $para = str_replace(['\\\\', '\\(', '\\)'], '', $text[1][0]);

            // Step 2: Remove all correctly ordered and balanced
            // parentheses from (strings)
            do {
                $paraTest = $para;
                $para = preg_replace('/\(([^()]*)\)/', '$1', $paraTest);
            } while ($para != $paraTest);

            $paraOpen = strpos($para, '(');
            $paraClose = strpos($para, ')');

            // Check: If the remaining text contains a close parenthesis
            // ')' AND it occurs before any open parenthesis, then we
            // are almost certain to be inside a (string)
            if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) {
                // Bump the search offset forward and match again
                $offsetBI = (int) $text[1][1];
                continue;
            }

            // Step 3: Double check that this is actually inline image
            // data by parsing the alleged image characteristics as a
            // dictionary
            $dict = $this->parseDictionary('<<'.$text[1][0].'>>');

            // Check if an image Width and Height are set in the dict
            if ((isset($dict['W']) || isset($dict['Width']))
                && (isset($dict['H']) || isset($dict['Height']))) {
                $id = uniqid('IMAGE_', true);
                $pdfInlineImages[$id] = [
                    preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]),
                    preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]),
                ];

                // Swap the image for its placeholder at the exact position
                // of this match. Searching for the text again from the start
                // of the stream could hit an identical, deliberately skipped
                // occurrence, and a pattern built from large binary data can
                // be rejected by the regex engine.
                $content = substr_replace(
                    $content,
                    '^^^'.$id.'^^^',
                    $text[0][1],
                    \strlen($text[0][0])
                );
            } else {
                // If there was no valid dictionary, or a height and width
                // weren't specified, then we don't know what this is, so
                // just leave it alone; bump the search offset forward and
                // match again
                $offsetBI = (int) $text[1][1];
            }
        }

        // Find all strings () and replace them so they aren't affected
        // by the next steps
        $pdfstrings = [];
        $attempt = '(';
        while (preg_match('/'.preg_quote($attempt, '/').'.*?\)/s', $content, $text, \PREG_OFFSET_CAPTURE)) {
            [$candidate, $position] = $text[0];

            // Remove all escaped slashes and parentheses from the target text
            $para = str_replace(['\\\\', '\\(', '\\)'], '', $candidate);

            // PDF strings can contain unescaped parentheses as long as
            // they're balanced, so check for balanced parentheses
            $left = preg_match_all('/\(/', $para);
            $right = preg_match_all('/\)/', $para);

            if (')' == $para[-1] && $left == $right) {
                // Replace the string with a unique placeholder, in place
                // at the offset where it was matched
                $id = uniqid('STRING_', true);
                $pdfstrings[$id] = $candidate;
                $content = substr_replace(
                    $content,
                    '@@@'.$id.'@@@',
                    $position,
                    \strlen($candidate)
                );

                // Reset to search for the next string
                $attempt = '(';
            } else {
                // We had unbalanced parentheses, so use the current
                // match as a base to find a longer string
                $attempt = $candidate;
            }
        }

        // Remove all carriage returns and line-feeds from the document stream
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Find all dictionary << >> commands and replace them so they
        // aren't affected by the next steps
        $dictstore = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext)) {
            $dictid = uniqid('DICT_', true);
            $dictstore[$dictid] = $dicttext[1];
            $content = preg_replace(
                '/'.preg_quote($dicttext[0], '/').'/',
                ' ###'.$dictid.'###'.$dicttext[2],
                $content,
                1
            );
        }

        // Normalize white-space in the document stream
        $content = preg_replace('/\s{2,}/', ' ', $content);

        // Find all valid PDF operators and add \r\n after each; this
        // ensures there is just one command on every line
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
        //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions
        //       appear here in the list for completeness.
        $operators = [
            'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
            'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
            'g', 'G',  'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
            'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
            'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
            'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($operators as $operator) {
            $content = preg_replace(
                '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/',
                $operator."\r\n",
                $content
            );
        }

        // Restore the original content of the dictionary << >> commands
        $dictstore = array_reverse($dictstore, true);
        foreach ($dictstore as $id => $dict) {
            $content = str_replace('###'.$id.'###', $dict, $content);
        }

        // Restore the original string content
        $pdfstrings = array_reverse($pdfstrings, true);
        foreach ($pdfstrings as $id => $text) {
            // Strings may contain escaped newlines, or literal newlines
            // and we should clean these up before replacing the string
            // back into the content stream; this ensures no strings are
            // split between two lines (every command must be on one line)
            $text = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $text
            );

            $content = str_replace('@@@'.$id.'@@@', $text, $content);
        }

        // Restore the original content of any inline images
        $pdfInlineImages = array_reverse($pdfInlineImages, true);
        foreach ($pdfInlineImages as $id => $image) {
            $content = str_replace(
                '^^^'.$id.'^^^',
                "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." EI\r\n",
                $content
            );
        }

        // Collapse blank lines and strip leading spaces on each line
        $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));

        return $content;
    }
417
418
    /**
     * Takes an entire, unformatted document stream as a string, cleans
     * it with formatContent(), then filters out commands that aren't
     * needed for text positioning/extraction.
     *
     * Every command between BT and ET is kept. Outside of a text block
     * only a fixed set of commands is kept (see the list in the body).
     *
     * @internal
     *
     * @return array unprocessed PDF commands, one command per element
     */
    public function getSectionsText(?string $content): array
    {
        // A cleaned stream has one command on every line, so split the
        // cleaned stream content on line breaks into an array
        $lines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->formatContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        // Commands worth keeping outside of a text block. Care should be
        // taken to ensure a pattern *only* matches its own command. For
        // instance, a simple search for 'c' may also match the 'sc'
        // command. See the command list in the formatContent() method.
        // Add more commands to save here as you find them in weird PDFs!
        $keptOutsideText = [
            '/(?<!\w)B[DM]C$/', // Begin marked content sequence
            '/(?<!\w)[DM]P$/',  // Marked content point
            '/(?<!\w)EMC$/',    // End marked content sequence
            '/(?<!\w)cm$/',     // Graphics position change commands
            '/(?<!\w)Tf$/',     // Font change commands
            '/(?<!\w)Do$/',     // Invoke named XObject command
        ];

        $sections = [];
        $inTextBlock = false;
        foreach ($lines as $rawLine) {
            $command = trim($rawLine);

            // Skip empty lines
            if ('' === $command) {
                continue;
            }

            // A 'BT' opens a text block and is always kept
            if (preg_match('/BT$/', $command)) {
                $inTextBlock = true;
                $sections[] = $command;
                continue;
            }

            // An 'ET' closes the text block and is always kept
            if ('ET' === $command) {
                $inTextBlock = false;
                $sections[] = $command;
                continue;
            }

            // Inside BT ... ET everything is kept; so are the save and
            // restore graphics state commands (q / Q) anywhere
            $keep = $inTextBlock || 'q' === $command[-1] || 'Q' === $command[-1];

            if (!$keep) {
                foreach ($keptOutsideText as $pattern) {
                    if (preg_match($pattern, $command)) {
                        $keep = true;
                        break;
                    }
                }
            }

            if ($keep) {
                $sections[] = $command;
            }
        }

        return $sections;
    }
497
498 46
    /**
     * Picks the font to start text extraction with.
     *
     * Preference order: the first font of the given page, then the
     * first font of the document, then a freshly created empty Font.
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $candidates = null !== $page ? $page->getFonts() : [];

        // The document's first font goes after any page fonts
        $documentFont = $this->document->getFirstFont();
        if (null !== $documentFont) {
            $candidates[] = $documentFont;
        }

        if ([] === $candidates) {
            return new Font($this->document, null, null, $this->config);
        }

        return reset($candidates);
    }
516
517
    /**
     * Decode a '[]TJ' command and attempt to use alternate
     * fonts if the current font results in output that contains
     * Unicode control characters or NUL bytes.
     *
     * The page's fonts are tried in the order returned by
     * Page::getFonts(). If none of them yields clean text, the text
     * decoded with the original $font is returned.
     *
     * @internal
     *
     * @param array<int,array<string,string|bool>> $command
     * @param Page|null                            $page       source of fallback fonts; no fallback when null
     * @param float                                $fontFactor forwarded unchanged to Font::decodeText()
     */
    private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string
    {
        $orig_text = $font->decodeText($command, $fontFactor);

        // If we make this a Config option, we can add a check if it's
        // enabled here.
        if (null === $page) {
            return $orig_text;
        }

        $text = $orig_text;
        $font_ids = array_keys($page->getFonts());

        // If the decoded text contains UTF-8 control characters
        // then the font page being used is probably the wrong one.
        // Loop through the rest of the fonts to see if we can get
        // a good decode. Allow x09 to x0d which are whitespace.
        // NUL bytes are also searched for directly because the /u pattern
        // cannot match text that is not valid UTF-8. The raw bytes are
        // searched rather than a hex dump, where '00' can straddle two
        // bytes (e.g. "P\n" is "500a").
        while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
            // If we're out of font IDs, then give up and use the
            // original string
            if (0 === \count($font_ids)) {
                return $orig_text;
            }

            // Try the next font ID
            $font = $page->getFont(array_shift($font_ids));
            $text = $font->decodeText($command, $fontFactor);
        }

        return $text;
    }
555
556
    /**
     * Expects a string that is a full PDF dictionary object,
     * including the outer enclosing << >> angle brackets, and parses
     * it into a nested array.
     *
     * A /Name becomes a key and the token following it becomes its
     * value, kept as an unprocessed string. Nested << >> dictionaries
     * and [ ] arrays become nested arrays, except that an array holding
     * a single non-name string is collapsed back into the string
     * '[...]'. Closing tokens without a matching opener are ignored.
     *
     * @internal
     *
     * @throws \Exception if $dictionary does not start with '<<'
     */
    public function parseDictionary(string $dictionary): array
    {
        // Normalize whitespace
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

        if ('<<' != substr($dictionary, 0, 2)) {
            throw new \Exception('Not a valid dictionary object.');
        }

        $parsed = [];
        $stack = [];
        $currentName = '';
        $arrayTypeNumeric = false;

        // Remove outer layer of dictionary, and split on tokens
        $split = preg_split(
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
            trim(preg_replace('/^<<|>>$/', '', $dictionary)),
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        foreach ($split as $token) {
            $token = trim($token);
            switch ($token) {
                case '':
                    break;

                // Open numeric array / open hashed array
                case '[':
                case '<<':
                    $parsed[$currentName] = [];
                    $arrayTypeNumeric = ('[' === $token);

                    // Move up one level in the stack
                    $stack[\count($stack)] = &$parsed;
                    $parsed = &$parsed[$currentName];
                    $currentName = '';
                    break;

                // Close numeric array / close hashed array
                case ']':
                case '>>':
                    // A closer without a matching opener would rebind
                    // $parsed to a non-existent stack entry (null) and
                    // discard everything parsed so far, so ignore it
                    if (0 === \count($stack)) {
                        break;
                    }

                    // Revert string type arrays back to a single element
                    if (']' === $token
                        && \is_array($parsed) && 1 == \count($parsed)
                        && isset($parsed[0]) && \is_string($parsed[0])
                        && '' !== $parsed[0] && '/' != $parsed[0][0]) {
                        $parsed = '['.$parsed[0].']';
                    }

                    $arrayTypeNumeric = false;

                    // Move down one level in the stack
                    $parsed = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                    break;

                default:
                    if ('/' === substr($token, 0, 1)) {
                        // A token beginning with a slash is a name: inside
                        // a numeric array it is an element, otherwise it
                        // is the key for the next value
                        $currentName = substr($token, 1);
                        if ($arrayTypeNumeric) {
                            $parsed[] = $currentName;
                            $currentName = '';
                        }
                    } elseif ('' !== $currentName) {
                        // Value for the pending name
                        if (!$arrayTypeNumeric) {
                            $parsed[$currentName] = $token;
                        }
                        $currentName = '';
                    } else {
                        // Value without a pending name: append it
                        $parsed[] = $token;
                    }
            }
        }

        return $parsed;
    }
654
655
    /**
     * Returns the text content of a PDF as a string. Attempts to add
     * whitespace for spacing and line-breaks where appropriate.
     *
     * getText() leverages getTextArray() to get the content
     * of the document, setting the addPositionWhitespace flag to true
     * so whitespace is inserted in a logical way for reading by
     * humans. The flag is reset to false afterwards, even when
     * getTextArray() throws.
     *
     * @return string the concatenated text fragments followed by one space
     *
     * @throws \Exception passed through from getTextArray()
     */
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;

        try {
            $result = $this->getTextArray($page);
        } finally {
            // Never leave the flag switched on, otherwise later
            // getTextArray() calls on this object would add whitespace
            $this->addPositionWhitespace = false;
        }

        return implode('', $result).' ';
    }
672
673
    /**
674
     * Returns the text content of a PDF as an array of strings. No
675
     * extra whitespace is inserted besides what is actually encoded in
676
     * the PDF text.
677
     *
678
     * @throws \Exception
679
     */
680 46
    public function getTextArray(?Page $page = null): array
681
    {
682 46
        $result = [];
683 46
        $text = [];
684
685 46
        $marked_stack = [];
686 46
        $last_written_position = false;
687
688 46
        $sections = $this->getSectionsText($this->content);
689 46
        $current_font = $this->getDefaultFont($page);
690 46
        $current_font_size = 1;
691 46
        $current_text_leading = 0;
692
693 46
        $current_position = ['x' => false, 'y' => false];
694 46
        $current_position_tm = [
695 46
            'a' => 1, 'b' => 0, 'c' => 0,
696 46
            'i' => 0, 'j' => 1, 'k' => 0,
697 46
            'x' => 0, 'y' => 0, 'z' => 1,
698 46
        ];
699 46
        $current_position_td = ['x' => 0, 'y' => 0];
700 46
        $current_position_cm = [
701 46
            'a' => 1, 'b' => 0, 'c' => 0,
702 46
            'i' => 0, 'j' => 1, 'k' => 0,
703 46
            'x' => 0, 'y' => 0, 'z' => 1,
704 46
        ];
705
706 46
        $clipped_font = [];
707 46
        $clipped_position_cm = [];
708
709 46
        self::$recursionStack[] = $this->getUniqueId();
710
711 46
        foreach ($sections as $section) {
712 43
            $commands = $this->getCommandsText($section);
713 43
            foreach ($commands as $command) {
714 43
                switch ($command[self::OPERATOR]) {
715
                    // Begin text object
716 43
                    case 'BT':
717
                        // Reset text positioning matrices
718 43
                        $current_position_tm = [
719 43
                            'a' => 1, 'b' => 0, 'c' => 0,
720 43
                            'i' => 0, 'j' => 1, 'k' => 0,
721 43
                            'x' => 0, 'y' => 0, 'z' => 1,
722 43
                        ];
723 43
                        $current_position_td = ['x' => 0, 'y' => 0];
724 43
                        $current_text_leading = 0;
725 43
                        break;
726
727
                        // Begin marked content sequence with property list
728 43
                    case 'BDC':
729 16
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
730 16
                            $dict = $this->parseDictionary($match[1]);
731
732
                            // Check for ActualText block
733 16
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
734 4
                                if ('[' == $dict['ActualText'][0]) {
735
                                    // Simulate a 'TJ' command on the stack
736
                                    $marked_stack[] = [
737
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
738
                                    ];
739 4
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
740
                                    // Simulate a 'Tj' command on the stack
741 4
                                    $marked_stack[] = [
742 4
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
743 4
                                    ];
744
                                }
745
                            }
746
                        }
747 16
                        break;
748
749
                        // Begin marked content sequence
750 43
                    case 'BMC':
751 2
                        if ('ReversedChars' == $command[self::COMMAND]) {
752
                            // Upon encountering a ReversedChars command,
753
                            // add the characters we've built up so far to
754
                            // the result array
755 1
                            $result = array_merge($result, $text);
756
757
                            // Start a fresh $text array that will contain
758
                            // reversed characters
759 1
                            $text = [];
760
761
                            // Add the reversed text flag to the stack
762 1
                            $marked_stack[] = ['ReversedChars' => true];
763
                        }
764 2
                        break;
765
766
                        // set graphics position matrix
767 43
                    case 'cm':
768 29
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
769 29
                        $current_position_cm = [
770 29
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
771 29
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
772 29
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
773 29
                        ];
774 29
                        break;
775
776 43
                    case 'Do':
777 15
                        if (null !== $page) {
778 15
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
779 15
                            $id = trim(array_pop($args), '/ ');
780 15
                            $xobject = $page->getXObject($id);
781
782
                            // @todo $xobject could be a ElementXRef object, which would then throw an error
783 15
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
784
                                // Not a circular reference.
785 15
                                $text[] = $xobject->getText($page);
786
                            }
787
                        }
788 15
                        break;
789
790
                        // Marked content point with (DP) & without (MP) property list
791 43
                    case 'DP':
792 43
                    case 'MP':
793 1
                        break;
794
795
                        // End text object
796 43
                    case 'ET':
797 43
                        break;
798
799
                        // Store current selected font and graphics matrix
800 43
                    case 'q':
801 37
                        $clipped_font[] = [$current_font, $current_font_size];
802 37
                        $clipped_position_cm[] = $current_position_cm;
803 37
                        break;
804
805
                        // Restore previous selected font and graphics matrix
806 43
                    case 'Q':
807 37
                        list($current_font, $current_font_size) = array_pop($clipped_font);
808 37
                        $current_position_cm = array_pop($clipped_position_cm);
809 37
                        break;
810
811
                        // End marked content sequence
812 43
                    case 'EMC':
813 17
                        $data = false;
814 17
                        if (\count($marked_stack)) {
815 5
                            $marked = array_pop($marked_stack);
816 5
                            $action = key($marked);
817 5
                            $data = $marked[$action];
818
819
                            switch ($action) {
820
                                // If we are in ReversedChars mode...
821 5
                                case 'ReversedChars':
822
                                    // Reverse the characters we've built up so far
823 1
                                    foreach ($text as $key => $t) {
824 1
                                        $text[$key] = implode('', array_reverse(
825 1
                                            mb_str_split($t, 1, mb_internal_encoding())
826 1
                                        ));
827
                                    }
828
829
                                    // Add these characters to the result array
830 1
                                    $result = array_merge($result, $text);
831
832
                                    // Start a fresh $text array that will contain
833
                                    // non-reversed characters
834 1
                                    $text = [];
835 1
                                    break;
836
837 4
                                case 'ActualText':
838
                                    // Use the content of the ActualText as a command
839 4
                                    $command = $data;
840 4
                                    break;
841
                            }
842
                        }
843
844
                        // If this EMC command has been transformed into a 'Tj'
845
                        // or 'TJ' command because of being ActualText, then bypass
846
                        // the break to proceed to the writing section below.
847 17
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
848 17
                            break;
849
                        }
850
851
                        // no break
852 43
                    case "'":
853 43
                    case '"':
854 4
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
855
                            // Move to next line and write text
856
                            $current_position['x'] = 0;
857
                            $current_position_td['x'] = 0;
858
                            $current_position_td['y'] += $current_text_leading;
859
                        }
860
                        // no break
861 43
                    case 'Tj':
862 35
                        $command[self::COMMAND] = [$command];
863
                        // no break
864 43
                    case 'TJ':
865
                        // Check the marked content stack for flags
866 43
                        $actual_text = false;
867 43
                        $reverse_text = false;
868 43
                        foreach ($marked_stack as $marked) {
869 5
                            if (isset($marked['ActualText'])) {
870 4
                                $actual_text = true;
871
                            }
872 5
                            if (isset($marked['ReversedChars'])) {
873 1
                                $reverse_text = true;
874
                            }
875
                        }
876
877
                        // Account for text position ONLY just before we write text
878 43
                        if (false === $actual_text && \is_array($last_written_position)) {
879
                            // If $last_written_position is an array, that
880
                            // means we have stored text position coordinates
881
                            // for placing an ActualText
882 4
                            $currentX = $last_written_position[0];
883 4
                            $currentY = $last_written_position[1];
884 4
                            $last_written_position = false;
885
                        } else {
886 43
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
887 43
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
888
                        }
889 43
                        $whiteSpace = '';
890
891 43
                        $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i'];
892 43
                        $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j'];
893
894 43
                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
895 31
                            $curY = $currentY - $current_position['y'];
896 31
                            if (abs($curY) >= abs($factorY) / 4) {
897 30
                                $whiteSpace = "\n";
898
                            } else {
899 30
                                if (true === $reverse_text) {
900 1
                                    $curX = $current_position['x'] - $currentX;
901
                                } else {
902 30
                                    $curX = $currentX - $current_position['x'];
903
                                }
904
905
                                // In abs($factorX * 7) below, the 7 is chosen arbitrarily
906
                                // as the number of apparent "spaces" in a document we
907
                                // would need before considering them a "tab". In the
908
                                // future, we might offer this value to users as a config
909
                                // option.
910 30
                                if ($curX >= abs($factorX * 7)) {
911 20
                                    $whiteSpace = "\t";
912 29
                                } elseif ($curX >= abs($factorX * 2)) {
913 19
                                    $whiteSpace = ' ';
914
                                }
915
                            }
916
                        }
917
918 43
                        $newtext = $this->getTJUsingFontFallback(
919 43
                            $current_font,
920 43
                            $command[self::COMMAND],
921 43
                            $page,
922 43
                            $factorX
923 43
                        );
924
925
                        // If there is no ActualText pending then write
926 43
                        if (false === $actual_text) {
927 43
                            $newtext = str_replace(["\r", "\n"], '', $newtext);
928 43
                            if (false !== $reverse_text) {
929
                                // If we are in ReversedChars mode, add the whitespace last
930 1
                                $text[] = preg_replace('/  $/', ' ', $newtext.$whiteSpace);
931
                            } else {
932
                                // Otherwise add the whitespace first
933 43
                                if (' ' === $whiteSpace && isset($text[\count($text) - 1])) {
934 18
                                    $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]);
935
                                }
936 43
                                $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext);
937
                            }
938
939
                            // Record the position of this inserted text for comparison
940
                            // with the next text block.
941
                            // Provide a 'fudge' factor guess on how wide this text block
942
                            // is based on the number of characters. This helps limit the
943
                            // number of tabs inserted, but isn't perfect.
944 43
                            $factor = $factorX / 2;
945 43
                            $current_position = [
946 43
                                'x' => $currentX - mb_strlen($newtext) * $factor,
947 43
                                'y' => $currentY,
948 43
                            ];
949 4
                        } elseif (false === $last_written_position) {
950
                            // If there is an ActualText in the pipeline
951
                            // store the position this undisplayed text
952
                            // *would* have been written to, so the
953
                            // ActualText is displayed in the right spot
954 4
                            $last_written_position = [$currentX, $currentY];
955 4
                            $current_position['x'] = $currentX;
956
                        }
957 43
                        break;
958
959
                        // move to start of next line
960 43
                    case 'T*':
961 13
                        $current_position['x'] = 0;
962 13
                        $current_position_td['x'] = 0;
963 13
                        $current_position_td['y'] += $current_text_leading;
964 13
                        break;
965
966
                        // set character spacing
967 43
                    case 'Tc':
968 13
                        break;
969
970
                        // move text current point and set leading
971 43
                    case 'Td':
972 43
                    case 'TD':
973
                        // move text current point
974 32
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
975 32
                        $y = (float) array_pop($args);
976 32
                        $x = (float) array_pop($args);
977
978 32
                        if ('TD' == $command[self::OPERATOR]) {
979 7
                            $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j'];
980
                        }
981
982 32
                        $current_position_td = [
983 32
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
984 32
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
985 32
                        ];
986 32
                        break;
987
988 43
                    case 'Tf':
989 43
                        $args = preg_split('/\s/s', $command[self::COMMAND]);
990 43
                        $size = (float) array_pop($args);
991 43
                        $id = trim(array_pop($args), '/');
992 43
                        if (null !== $page) {
993 43
                            $new_font = $page->getFont($id);
994
                            // If an invalid font ID is given, do not update the font.
995
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
996
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
997
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
998
                            // But we want to make sure that malformed PDFs do not simply crash.
999 43
                            if (null !== $new_font) {
1000 43
                                $current_font = $new_font;
1001 43
                                $current_font_size = $size;
1002
                            }
1003
                        }
1004 43
                        break;
1005
1006
                        // set leading
1007 37
                    case 'TL':
1008 6
                        $y = (float) $command[self::COMMAND];
1009 6
                        $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j'];
1010 6
                        break;
1011
1012
                        // set text position matrix
1013 37
                    case 'Tm':
1014 34
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
1015 34
                        $current_position_tm = [
1016 34
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
1017 34
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
1018 34
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
1019 34
                        ];
1020 34
                        break;
1021
1022
                        // set text rendering mode
1023 22
                    case 'Ts':
1024
                        break;
1025
1026
                        // set super/subscripting text rise
1027 22
                    case 'Ts':
1028
                        break;
1029
1030
                        // set word spacing
1031 22
                    case 'Tw':
1032 9
                        break;
1033
1034
                        // set horizontal scaling
1035 22
                    case 'Tz':
1036
                        break;
1037
1038
                    default:
1039
                }
1040
            }
1041
        }
1042
1043 46
        $result = array_merge($result, $text);
1044
1045 46
        return $result;
1046
    }
1047
1048
    /**
     * Breaks a single, already formatted content-stream command into its
     * type, operator and operand.
     *
     * $text_part is expected to hold exactly one command on one line, such
     * as the entries produced by the companion function getSectionsText().
     * Since only one command is handled per call, $offset is neither read
     * nor written here; it remains in the signature for compatibility and
     * may be removed in a future PdfParser release. (getCommandText() would
     * be a more accurate name for this function.)
     *
     * @param string $text_part one command taken from a document stream
     * @param int    $offset    unused
     *
     * @return array an empty array when no command is recognised, otherwise
     *               a list holding one entry keyed by self::TYPE,
     *               self::OPERATOR and self::COMMAND; for the 'TJ' operator
     *               the self::COMMAND value is itself a list of string and
     *               numeric sub-entries
     */
    public function getCommandsText(string $text_part, int &$offset = 0): array
    {
        $matches = [];
        preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $matches);

        // Without all three capture groups there is no usable command
        if (!isset($matches[1], $matches[2], $matches[3])) {
            return [];
        }

        $operand = trim($matches[1]);
        $type = $matches[2];
        $operator = $matches[3];

        if ('TJ' === $operator) {
            // An array element is either a (...) string or a <...> hex
            // string, each optionally followed by a number. The patterns
            // are tried in this order on every pass.
            $elementPatterns = [
                '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
                '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
            ];

            $pieces = [];
            $remaining = trim($operand, '[]');

            // Keep consuming elements from the front until a full pass
            // leaves the remaining text unchanged
            do {
                $before = $remaining;

                foreach ($elementPatterns as $delimiter => $pattern) {
                    if (!preg_match($pattern, $remaining, $found)) {
                        continue;
                    }

                    // Hexadecimal strings are stored without any whitespace
                    $payload = '<' === $delimiter
                        ? preg_replace('/\s/', '', $found[1])
                        : $found[1];

                    $pieces[] = [
                        self::TYPE => $delimiter,
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $payload,
                    ];

                    // A trailing number becomes its own 'n' sub-entry
                    if (isset($found[2]) && trim($found[2])) {
                        $pieces[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $found[2],
                        ];
                    }

                    $remaining = substr($remaining, \strlen($found[0]));
                }
            } while ($remaining != $before);

            $operand = $pieces;
        } elseif (\in_array($operator, ['Tj', "'", '"'], true)) {
            if ('(' === $type) {
                // trim() is avoided on purpose: a () string may end with a
                // balanced or escaped right parenthesis, and trim() would
                // remove both. Both of these are valid:
                //   eg. (String())
                //   eg. (String\))
                $operand = preg_replace('/^\(|\)$/', '', $operand);
            } elseif ('<' === $type) {
                $operand = trim($operand, '<>');
            }
        } elseif ('/' === $type) {
            // Drop the leading slash of a name operand
            $operand = substr($operand, 1);
        }

        return [
            [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $operand,
            ],
        ];
    }
1142
1143 65
    /**
     * Creates the object class that matches a parsed PDF object's header.
     *
     * The class is selected from the header's "Type" entry; for "XObject"
     * and "Font" the "Subtype" entry refines the choice. Any type without a
     * dedicated class is returned as a plain PDFObject.
     *
     * @param Document    $document document passed on to the created object
     * @param Header      $header   header whose Type/Subtype drive the choice
     * @param string|null $content  raw content passed on to the created object
     * @param Config|null $config   optional configuration; may be null
     *
     * @return self the created object
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is nullable, so it must not be dereferenced
                        // blindly. Without a config the image content is kept.
                        // NOTE(review): assumes keeping the content is the
                        // right fallback when no Config is given - confirm
                        // against Config's own default.
                        $retainContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                // XObject with any other subtype
                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                // Use the subtype-specific font class when one exists
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
1184
1185
    /**
     * Produces the identifier used to recognise this object instance.
     *
     * The value is the hash that spl_object_hash() reports for $this.
     */
    protected function getUniqueId(): string
    {
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
1192
}
1193