Issues (82)

src/Smalot/PdfParser/PDFObject.php (3 issues)

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\XObject\Form;
36
use Smalot\PdfParser\XObject\Image;
37
38
/**
39
 * Class PDFObject
40
 */
41
class PDFObject
42
{
43
    public const TYPE = 't';
44
45
    public const OPERATOR = 'o';
46
47
    public const COMMAND = 'c';
48
49
    /**
50
     * The recursion stack.
51
     *
52
     * @var array
53
     */
54
    public static $recursionStack = [];
55
56
    /**
57
     * @var Document|null
58
     */
59
    protected $document;
60
61
    /**
62
     * @var Header
63
     */
64
    protected $header;
65
66
    /**
67
     * @var string
68
     */
69
    protected $content;
70
71
    /**
72
     * @var Config|null
73
     */
74
    protected $config;
75
76
    /**
77
     * @var bool
78
     */
79
    protected $addPositionWhitespace = false;
80
81 98
    /**
     * Stores the owning document, the object's header, its raw content
     * and the parser configuration on the instance.
     *
     * @param Document    $document document this object belongs to
     * @param Header|null $header   object header; when null, a new Header() is created
     * @param string|null $content  raw content of the object, may be null
     * @param Config|null $config   parser configuration, may be null
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        $this->document = $document;
        $this->content = $content;
        $this->config = $config;

        // Never leave the header unset: fall back to a new Header instance
        $this->header = null === $header ? new Header() : $header;
    }
92
93 74
    /**
     * Initialization hook. Does nothing in this class.
     */
    public function init()
    {
        // No work to do here
    }
96
97 4
    /**
     * Returns the document stored on this object.
     *
     * The constructor always assigns a Document instance, and the native
     * return type rejects a null value should the property ever be unset.
     */
    public function getDocument(): Document
    {
        return $this->document;
    }
101
102 74
    /**
     * Returns the header stored on this object.
     */
    public function getHeader(): ?Header
    {
        return $this->header;
    }
106
107 4
    /**
     * Returns the configuration given to the constructor, or null when
     * none was provided.
     */
    public function getConfig(): ?Config
    {
        return $this->config;
    }
111
112
    /**
     * Looks up an entry of this object's header by name; the lookup is
     * delegated to Header::get().
     *
     * @param string $name name of the header entry
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        $header = $this->header;

        return $header->get($name);
    }
119
120 76
    /**
     * Tells whether this object's header has an entry with the given
     * name; the check is delegated to Header::has().
     *
     * @param string $name name of the header entry
     */
    public function has(string $name): bool
    {
        $header = $this->header;

        return $header->has($name);
    }
124
125 4
    /**
     * Returns the details of this object's header; the call is delegated
     * to Header::getDetails().
     *
     * @param bool $deep forwarded unchanged to Header::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        $header = $this->header;

        return $header->getDetails($deep);
    }
129
130 60
    /**
     * Returns the raw content given to the constructor, or null when
     * none was provided.
     */
    public function getContent(): ?string
    {
        return $this->content;
    }
134
135
    /**
     * Creates a duplicate of the document stream with
     * strings and other items replaced by $char. Formerly
     * getSectionsText() used this output to more easily gather offset
     * values to extract text from the *actual* document stream.
     *
     * Every replacement has the same byte length as the text it masks,
     * so the returned string has the same length as $content after the
     * initial escape substitution.
     *
     * @deprecated function is no longer used and will be removed in a future release
     *
     * @internal
     *
     * @param string $content document stream to mask
     * @param string $char    masking character; only its first byte is used,
     *                        and an empty string falls back to 'X'
     *
     * @return string the masked stream
     */
    public function cleanContent(string $content, string $char = 'X')
    {
        // An empty mask would produce zero-length replacements and shift
        // every captured offset, so fall back to the default character
        $char = '' === $char ? 'X' : $char[0];

        // Overwrites each captured [text, offset] pair with a run of $char
        // of identical length, which keeps all later offsets valid
        $mask = static function (string $subject, array $captures) use ($char): string {
            foreach ($captures as $capture) {
                $length = \strlen($capture[0]);
                $subject = substr_replace($subject, str_repeat($char, $length), $capture[1], $length);
            }

            return $subject;
        };

        // Escaped backslashes and parentheses are two bytes long, so they
        // are replaced by two mask characters
        $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

        // Remove image bloc with binary content
        preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[0] ?? []);

        // Clean content in square brackets [.....]
        preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1] ?? []);

        // Clean content in round brackets (.....)
        preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1] ?? []);

        // Clean structure: everything between '<' and '>' (delimiters
        // included) is masked, tracking the nesting depth
        $parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        if ($parts) {
            $content = '';
            $level = 0;
            foreach ($parts as $part) {
                if ('<' === $part) {
                    ++$level;
                }

                $content .= (0 === $level ? $part : str_repeat($char, \strlen($part)));

                if ('>' === $part) {
                    --$level;
                }
            }
        }

        // Clean BDC and EMC markup. The mask character is quoted with the
        // pattern delimiter so that a '/' mask cannot terminate the regex.
        preg_match_all(
            '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
            $content,
            $matches,
            \PREG_OFFSET_CAPTURE
        );
        $content = $mask($content, $matches[1] ?? []);

        preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1] ?? []);

        return $content;
    }
203
204
    /**
     * Takes a string of PDF document stream text and formats
     * it into a multi-line string with one PDF command on each line,
     * separated by \r\n. If the given string is null, or binary data
     * is detected instead of a document stream then return an empty
     * string.
     *
     * Inline images, (strings) and << >> dictionaries are swapped for
     * unique placeholders before operators are split onto lines, and are
     * restored afterwards. Placeholders are inserted at the byte offset
     * of the match rather than through a regex built from the matched
     * text, so large or binary payloads cannot break the substitution.
     *
     * @param string|null $content raw document stream
     *
     * @return string formatted stream, or '' for null/binary input
     */
    private function formatContent(?string $content): string
    {
        if (null === $content) {
            return '';
        }

        // Outside of (String) and inline image content in PDF document
        // streams, all text should conform to UTF-8. Test for binary
        // content by deleting everything after the first open-
        // parenthesis ( which indicates the beginning of a string, or
        // the first ID command which indicates the beginning of binary
        // inline image content. Then test what remains for valid
        // UTF-8. If it's not UTF-8, return an empty string as this
        // $content is most likely binary. Unfortunately, using
        // mb_check_encoding(..., 'UTF-8') is not strict enough, so the
        // following regexp, adapted from the W3, is used. See:
        // https://www.w3.org/International/questions/qa-forms-utf-8.en
        // We use preg_replace() instead of preg_match() to avoid "JIT
        // stack limit exhausted" errors on larger files.
        $utf8Filter = preg_replace('/(
            [\x09\x0A\x0D\x20-\x7E] |            # ASCII
            [\xC2-\xDF][\x80-\xBF] |             # non-overlong 2-byte
            \xE0[\xA0-\xBF][\x80-\xBF] |         # excluding overlongs
            [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} |  # straight 3-byte
            \xED[\x80-\x9F][\x80-\xBF] |         # excluding surrogates
            \xF0[\x90-\xBF][\x80-\xBF]{2} |      # planes 1-3
            [\xF1-\xF3][\x80-\xBF]{3} |          # planes 4-15
            \xF4[\x80-\x8F][\x80-\xBF]{2}        # plane 16
        )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content));

        if ('' !== $utf8Filter) {
            return '';
        }

        // Find all inline image content and replace them so they aren't
        // affected by the next steps
        $pdfInlineImages = [];
        $offsetBI = 0;
        while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) {
            // Attempt to determine if this instance of the 'BI' command
            // actually occurred within a (string) using the following
            // steps:

            // Step 1: Remove any escaped slashes and parentheses from
            // the alleged image characteristics data
            $para = str_replace(['\\\\', '\\(', '\\)'], '', $text[1][0]);

            // Step 2: Remove all correctly ordered and balanced
            // parentheses from (strings)
            do {
                $paraTest = $para;
                $para = preg_replace('/\(([^()]*)\)/', '$1', $paraTest);
            } while ($para != $paraTest);

            $paraOpen = strpos($para, '(');
            $paraClose = strpos($para, ')');

            // Check: If the remaining text contains a close parenthesis
            // ')' AND it occurs before any open parenthesis, then we
            // are almost certain to be inside a (string)
            if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) {
                // Bump the search offset forward and match again
                $offsetBI = (int) $text[1][1];
                continue;
            }

            // Step 3: Double check that this is actually inline image
            // data by parsing the alleged image characteristics as a
            // dictionary
            $dict = $this->parseDictionary('<<'.$text[1][0].'>>');

            // Check if an image Width and Height are set in the dict
            if ((isset($dict['W']) || isset($dict['Width']))
                && (isset($dict['H']) || isset($dict['Height']))) {
                $id = uniqid('IMAGE_', true);
                $pdfInlineImages[$id] = [
                    preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]),
                    preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]),
                ];

                // Replace exactly the matched occurrence by offset. Building
                // a regex from the (binary, possibly very large) image data
                // can fail and would target the first identical occurrence,
                // which may be one that was deliberately skipped above.
                $content = substr_replace(
                    $content,
                    '^^^'.$id.'^^^',
                    (int) $text[0][1],
                    \strlen($text[0][0])
                );
            } else {
                // If there was no valid dictionary, or a height and width
                // weren't specified, then we don't know what this is, so
                // just leave it alone; bump the search offset forward and
                // match again
                $offsetBI = (int) $text[1][1];
            }
        }

        // Find all strings () and replace them so they aren't affected
        // by the next steps
        $pdfstrings = [];
        $attempt = '(';
        while (preg_match('/'.preg_quote($attempt, '/').'.*?\)/s', $content, $text, \PREG_OFFSET_CAPTURE)) {
            [$candidate, $position] = $text[0];

            // Remove all escaped slashes and parentheses from the target text
            $para = str_replace(['\\\\', '\\(', '\\)'], '', $candidate);

            // PDF strings can contain unescaped parentheses as long as
            // they're balanced, so check for balanced parentheses
            $left = preg_match_all('/\(/', $para);
            $right = preg_match_all('/\)/', $para);

            if (')' == $para[-1] && $left == $right) {
                // Replace the string with a unique placeholder, in place
                // at the offset where it was matched
                $id = uniqid('STRING_', true);
                $pdfstrings[$id] = $candidate;
                $content = substr_replace(
                    $content,
                    '@@@'.$id.'@@@',
                    (int) $position,
                    \strlen($candidate)
                );

                // Reset to search for the next string
                $attempt = '(';
            } else {
                // We had unbalanced parentheses, so use the current
                // match as a base to find a longer string
                $attempt = $candidate;
            }
        }

        // Remove all carriage returns and line-feeds from the document stream
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Find all dictionary << >> commands and replace them so they
        // aren't affected by the next steps
        $dictstore = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext, \PREG_OFFSET_CAPTURE)) {
            $dictid = uniqid('DICT_', true);
            $dictstore[$dictid] = $dicttext[1][0];
            $content = substr_replace(
                $content,
                ' ###'.$dictid.'###'.$dicttext[2][0],
                (int) $dicttext[0][1],
                \strlen($dicttext[0][0])
            );
        }

        // Normalize white-space in the document stream
        $content = preg_replace('/\s{2,}/', ' ', $content);

        // Find all valid PDF operators and add \r\n after each; this
        // ensures there is just one command on every line
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
        //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions
        //       appear here in the list for completeness.
        $operators = [
            'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
            'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
            'g', 'G',  'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
            'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
            'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
            'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($operators as $operator) {
            $content = preg_replace(
                '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/',
                $operator."\r\n",
                $content
            );
        }

        // Restore the original content of the dictionary << >> commands
        $dictstore = array_reverse($dictstore, true);
        foreach ($dictstore as $id => $dict) {
            $content = str_replace('###'.$id.'###', $dict, $content);
        }

        // Restore the original string content
        $pdfstrings = array_reverse($pdfstrings, true);
        foreach ($pdfstrings as $id => $text) {
            // Strings may contain escaped newlines, or literal newlines
            // and we should clean these up before replacing the string
            // back into the content stream; this ensures no strings are
            // split between two lines (every command must be on one line)
            $text = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $text
            );

            $content = str_replace('@@@'.$id.'@@@', $text, $content);
        }

        // Restore the original content of any inline images
        $pdfInlineImages = array_reverse($pdfInlineImages, true);
        foreach ($pdfInlineImages as $id => $image) {
            $content = str_replace(
                '^^^'.$id.'^^^',
                "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." EI\r\n",
                $content
            );
        }

        // Collapse runs of line breaks and strip spaces that follow a break
        $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));

        return $content;
    }
417
418
    /**
     * Takes an entire, unformatted document stream as a string, formats
     * it through formatContent() and keeps only the commands needed for
     * text positioning/extraction. Returns an array of unprocessed PDF
     * commands, one command per element.
     *
     * Every command between a line ending in 'BT' and an 'ET' line is
     * kept. Outside of those text blocks only the commands listed in the
     * body are kept.
     *
     * @internal
     *
     * @param string|null $content raw document stream
     *
     * @return array list of command lines
     */
    public function getSectionsText(?string $content): array
    {
        // Commands that are preserved when they occur outside of a
        // BT ... ET text block. Each pattern is anchored so it *only*
        // matches that command (a bare 'c' check would also match 'sc').
        // Add more commands to save here as you find them in weird PDFs!
        $keptOutsideText = [
            '/(?<!\w)B[DM]C$/', // Begin marked content sequence
            '/(?<!\w)[DM]P$/',  // Marked content point
            '/(?<!\w)EMC$/',    // End marked content sequence
            '/(?<!\w)cm$/',     // Graphics position change commands
            '/(?<!\w)Tf$/',     // Font change commands
            '/(?<!\w)Do$/',     // Invoke named XObject command
        ];

        // A formatted stream has one command on every line, so split it
        // on line breaks into an array
        $lines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->formatContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        $sections = [];
        $insideText = false;
        foreach ($lines as $rawLine) {
            $line = trim($rawLine);

            // Skip empty lines
            if ('' === $line) {
                continue;
            }

            // A 'BT' opens a text block
            if (preg_match('/BT$/', $line)) {
                $insideText = true;
                $sections[] = $line;
                continue;
            }

            // An 'ET' closes the text block
            if ('ET' === $line) {
                $insideText = false;
                $sections[] = $line;
                continue;
            }

            // Inside a BT ... ET text block every line is saved
            if ($insideText) {
                $sections[] = $line;
                continue;
            }

            // Save and restore graphics state commands end in q / Q
            $lastChar = $line[-1];
            $keep = 'q' === $lastChar || 'Q' === $lastChar;

            foreach ($keptOutsideText as $pattern) {
                if ($keep) {
                    break;
                }
                $keep = (bool) preg_match($pattern, $line);
            }

            if ($keep) {
                $sections[] = $line;
            }
        }

        return $sections;
    }
497
498 48
    /**
     * Picks the font to start text decoding with.
     *
     * Candidates are the fonts of the given page (if any) followed by
     * the document's first font (if any); the first candidate wins. When
     * there is no candidate at all a new Font object is created.
     *
     * @param Page|null $page page whose fonts are preferred, may be null
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $candidates = null === $page ? [] : $page->getFonts();

        // The document-wide first font is only a fallback, so it goes last
        $documentFont = $this->document->getFirstFont();
        if (null !== $documentFont) {
            $candidates[] = $documentFont;
        }

        if (0 === \count($candidates)) {
            return new Font($this->document, null, null, $this->config);
        }

        return reset($candidates);
    }
516
517
    /**
     * Decode a '[]TJ' command and attempt to use alternate
     * fonts if the current font results in output that contains
     * Unicode control characters.
     *
     * The fonts of $page are tried in the order returned by
     * Page::getFonts() until one yields text free of control characters
     * and NUL bytes. If none does, the text decoded with the original
     * font is returned. Without a page no fallback is attempted.
     *
     * @internal
     *
     * @param Font                                 $font       font to try first
     * @param array<int,array<string,string|bool>> $command
     * @param Page|null                            $page       page providing the alternate fonts
     * @param float                                $fontFactor forwarded to Font::decodeText()
     *
     * @return string the decoded text
     */
    private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string
    {
        $orig_text = $font->decodeText($command, $fontFactor);
        $text = $orig_text;

        // If we make this a Config option, we can add a check if it's
        // enabled here.
        if (null !== $page) {
            $font_ids = array_keys($page->getFonts());

            // If the decoded text contains UTF-8 control characters
            // then the font page being used is probably the wrong one.
            // Loop through the rest of the fonts to see if we can get
            // a good decode. Allow x09 to x0d which are whitespace.
            // The regex yields no match on invalid UTF-8 (because of the
            // /u modifier), so NUL bytes are also looked for directly.
            // This is done on the raw bytes: searching the hex dump for
            // '00' would also match across two bytes, e.g. "p\n" is
            // "700a", and wrongly trigger the fallback.
            while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
                // If we're out of font IDs, then give up and use the
                // original string
                if (0 === \count($font_ids)) {
                    return $orig_text;
                }

                // Try the next font ID
                $font = $page->getFont(array_shift($font_ids));
                $text = $font->decodeText($command, $fontFactor);
            }
        }

        return $text;
    }
555
556
    /**
557
     * Expects a string that is a full PDF dictionary object,
558
     * including the outer enclosing << >> angle brackets
559
     *
560
     * @internal
561
     *
562
     * @throws \Exception
563
     */
564 18
    public function parseDictionary(string $dictionary): array
565
    {
566
        // Normalize whitespace
567 18
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));
568
569 18
        if ('<<' != substr($dictionary, 0, 2)) {
570
            throw new \Exception('Not a valid dictionary object.');
571
        }
572
573 18
        $parsed = [];
574 18
        $stack = [];
575 18
        $currentName = '';
576 18
        $arrayTypeNumeric = false;
577
578
        // Remove outer layer of dictionary, and split on tokens
579 18
        $split = preg_split(
580 18
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
581 18
            trim(preg_replace('/^<<|>>$/', '', $dictionary)),
582 18
            -1,
583 18
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
584 18
        );
585
586 18
        foreach ($split as $token) {
587 18
            $token = trim($token);
588
            switch ($token) {
589 18
                case '':
590 8
                    break;
591
592
                    // Open numeric array
593 18
                case '[':
594 8
                    $parsed[$currentName] = [];
595 8
                    $arrayTypeNumeric = true;
596
597
                    // Move up one level in the stack
598 8
                    $stack[\count($stack)] = &$parsed;
599 8
                    $parsed = &$parsed[$currentName];
600 8
                    $currentName = '';
601 8
                    break;
602
603
                    // Open hashed array
604 18
                case '<<':
605 1
                    $parsed[$currentName] = [];
606 1
                    $arrayTypeNumeric = false;
607
608
                    // Move up one level in the stack
609 1
                    $stack[\count($stack)] = &$parsed;
610 1
                    $parsed = &$parsed[$currentName];
611 1
                    $currentName = '';
612 1
                    break;
613
614
                    // Close numeric array
615 18
                case ']':
616
                    // Revert string type arrays back to a single element
617 8
                    if (\is_array($parsed) && 1 == \count($parsed)
618 8
                        && isset($parsed[0]) && \is_string($parsed[0])
619 8
                        && '' !== $parsed[0] && '/' != $parsed[0][0]) {
620 6
                        $parsed = '['.$parsed[0].']';
621
                    }
622
                    // Close hashed array
623
                    // no break
624 18
                case '>>':
625 8
                    $arrayTypeNumeric = false;
626
627
                    // Move down one level in the stack
628 8
                    $parsed = &$stack[\count($stack) - 1];
629 8
                    unset($stack[\count($stack) - 1]);
630 8
                    break;
631
632
                default:
633
                    // If value begins with a slash, then this is a name
634
                    // Add it to the appropriate array
635 18
                    if ('/' == substr($token, 0, 1)) {
636 18
                        $currentName = substr($token, 1);
637 18
                        if (true == $arrayTypeNumeric) {
0 ignored issues
show
Coding Style Best Practice introduced by
It seems like you are loosely comparing two booleans. Considering using the strict comparison === instead.

When comparing two booleans, it is generally considered safer to use the strict comparison operator.

Loading history...
638 7
                            $parsed[] = $currentName;
639 18
                            $currentName = '';
640
                        }
641 18
                    } elseif ('' != $currentName) {
642 18
                        if (false == $arrayTypeNumeric) {
0 ignored issues
show
Coding Style Best Practice introduced by
It seems like you are loosely comparing two booleans. Considering using the strict comparison === instead.

When comparing two booleans, it is generally considered safer to use the strict comparison operator.

Loading history...
643 18
                            $parsed[$currentName] = $token;
644
                        }
645 18
                        $currentName = '';
646 5
                    } elseif ('' == $currentName) {
647 5
                        $parsed[] = $token;
648
                    }
649
            }
650
        }
651
652 18
        return $parsed;
653
    }
654
655
    /**
     * Returns the text content of a PDF as a string. Attempts to add
     * whitespace for spacing and line-breaks where appropriate.
     *
     * getText() leverages getTextArray() to get the content
     * of the document, setting the addPositionWhitespace flag to true
     * so whitespace is inserted in a logical way for reading by
     * humans. The flag is reset even when getTextArray() throws, so a
     * failed call cannot leave the object in whitespace-adding mode.
     *
     * @param Page|null $page forwarded unchanged to getTextArray()
     *
     * @return string the text pieces joined together, followed by one space
     */
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;
        try {
            $result = $this->getTextArray($page);
        } finally {
            $this->addPositionWhitespace = false;
        }

        return implode('', $result).' ';
    }
672
673
    /**
674
     * Returns the text content of a PDF as an array of strings. No
675
     * extra whitespace is inserted besides what is actually encoded in
676
     * the PDF text.
677
     *
678
     * @throws \Exception
679
     */
680 48
    public function getTextArray(?Page $page = null): array
681
    {
682 48
        $result = [];
683 48
        $text = [];
684
685 48
        $marked_stack = [];
686 48
        $last_written_position = false;
687
688 48
        $sections = $this->getSectionsText($this->content);
689 48
        $current_font = $this->getDefaultFont($page);
690 48
        $current_font_size = 1;
691 48
        $current_text_leading = 0;
692
693 48
        $current_position = ['x' => false, 'y' => false];
694 48
        $current_position_tm = [
695 48
            'a' => 1, 'b' => 0, 'c' => 0,
696 48
            'i' => 0, 'j' => 1, 'k' => 0,
697 48
            'x' => 0, 'y' => 0, 'z' => 1,
698 48
        ];
699 48
        $current_position_td = ['x' => 0, 'y' => 0];
700 48
        $current_position_cm = [
701 48
            'a' => 1, 'b' => 0, 'c' => 0,
702 48
            'i' => 0, 'j' => 1, 'k' => 0,
703 48
            'x' => 0, 'y' => 0, 'z' => 1,
704 48
        ];
705
706 48
        $clipped_font = [];
707 48
        $clipped_position_cm = [];
708
709 48
        self::$recursionStack[] = $this->getUniqueId();
710
711 48
        foreach ($sections as $section) {
712 45
            $commands = $this->getCommandsText($section);
713 45
            foreach ($commands as $command) {
714 45
                switch ($command[self::OPERATOR]) {
715
                    // Begin text object
716 45
                    case 'BT':
717
                        // Reset text positioning matrices
718 44
                        $current_position_tm = [
719 44
                            'a' => 1, 'b' => 0, 'c' => 0,
720 44
                            'i' => 0, 'j' => 1, 'k' => 0,
721 44
                            'x' => 0, 'y' => 0, 'z' => 1,
722 44
                        ];
723 44
                        $current_position_td = ['x' => 0, 'y' => 0];
724 44
                        $current_text_leading = 0;
725 44
                        break;
726
727
                        // Begin marked content sequence with property list
728 45
                    case 'BDC':
729 16
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
730 16
                            $dict = $this->parseDictionary($match[1]);
731
732
                            // Check for ActualText block
733 16
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
734 4
                                if ('[' == $dict['ActualText'][0]) {
735
                                    // Simulate a 'TJ' command on the stack
736
                                    $marked_stack[] = [
737
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
738
                                    ];
739 4
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
740
                                    // Simulate a 'Tj' command on the stack
741 4
                                    $marked_stack[] = [
742 4
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
743 4
                                    ];
744
                                }
745
                            }
746
                        }
747 16
                        break;
748
749
                        // Begin marked content sequence
750 45
                    case 'BMC':
751 2
                        if ('ReversedChars' == $command[self::COMMAND]) {
752
                            // Upon encountering a ReversedChars command,
753
                            // add the characters we've built up so far to
754
                            // the result array
755 1
                            $result = array_merge($result, $text);
756
757
                            // Start a fresh $text array that will contain
758
                            // reversed characters
759 1
                            $text = [];
760
761
                            // Add the reversed text flag to the stack
762 1
                            $marked_stack[] = ['ReversedChars' => true];
763
                        }
764 2
                        break;
765
766
                        // set graphics position matrix
767 45
                    case 'cm':
768 31
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
769 31
                        $current_position_cm = [
770 31
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
771 31
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
772 31
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
773 31
                        ];
774 31
                        break;
775
776 45
                    case 'Do':
777 16
                        if (null !== $page) {
778 16
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
779 16
                            $id = trim(array_pop($args), '/ ');
780 16
                            $xobject = $page->getXObject($id);
781
782
                            // @todo $xobject could be a ElementXRef object, which would then throw an error
783 16
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
784
                                // Not a circular reference.
785 16
                                $text[] = $xobject->getText($page);
786
                            }
787
                        }
788 16
                        break;
789
790
                        // Marked content point with (DP) & without (MP) property list
791 45
                    case 'DP':
792 45
                    case 'MP':
793 1
                        break;
794
795
                        // End text object
796 45
                    case 'ET':
797 44
                        break;
798
799
                        // Store current selected font and graphics matrix
800 45
                    case 'q':
801 39
                        $clipped_font[] = [$current_font, $current_font_size];
802 39
                        $clipped_position_cm[] = $current_position_cm;
803 39
                        break;
804
805
                        // Restore previous selected font and graphics matrix
806 45
                    case 'Q':
807 39
                        list($current_font, $current_font_size) = array_pop($clipped_font);
808 39
                        $current_position_cm = array_pop($clipped_position_cm);
809 39
                        break;
810
811
                        // End marked content sequence
812 44
                    case 'EMC':
813 17
                        $data = false;
814 17
                        if (\count($marked_stack)) {
815 5
                            $marked = array_pop($marked_stack);
816 5
                            $action = key($marked);
817 5
                            $data = $marked[$action];
818
819
                            switch ($action) {
820
                                // If we are in ReversedChars mode...
821 5
                                case 'ReversedChars':
822
                                    // Reverse the characters we've built up so far
823 1
                                    foreach ($text as $key => $t) {
824 1
                                        $text[$key] = implode('', array_reverse(
825 1
                                            mb_str_split($t, 1, mb_internal_encoding())
826 1
                                        ));
827
                                    }
828
829
                                    // Add these characters to the result array
830 1
                                    $result = array_merge($result, $text);
831
832
                                    // Start a fresh $text array that will contain
833
                                    // non-reversed characters
834 1
                                    $text = [];
835 1
                                    break;
836
837 4
                                case 'ActualText':
838
                                    // Use the content of the ActualText as a command
839 4
                                    $command = $data;
840 4
                                    break;
841
                            }
842
                        }
843
844
                        // If this EMC command has been transformed into a 'Tj'
845
                        // or 'TJ' command because of being ActualText, then bypass
846
                        // the break to proceed to the writing section below.
847 17
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
848 17
                            break;
849
                        }
850
851
                        // no break
852 44
                    case "'":
853 44
                    case '"':
854 4
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
855
                            // Move to next line and write text
856
                            $current_position['x'] = 0;
857
                            $current_position_td['x'] = 0;
858
                            $current_position_td['y'] += $current_text_leading;
859
                        }
860
                        // no break
861 44
                    case 'Tj':
862 35
                        $command[self::COMMAND] = [$command];
863
                        // no break
864 44
                    case 'TJ':
865
                        // Check the marked content stack for flags
866 44
                        $actual_text = false;
867 44
                        $reverse_text = false;
868 44
                        foreach ($marked_stack as $marked) {
869 5
                            if (isset($marked['ActualText'])) {
870 4
                                $actual_text = true;
871
                            }
872 5
                            if (isset($marked['ReversedChars'])) {
873 1
                                $reverse_text = true;
874
                            }
875
                        }
876
877
                        // Account for text position ONLY just before we write text
878 44
                        if (false === $actual_text && \is_array($last_written_position)) {
879
                            // If $last_written_position is an array, that
880
                            // means we have stored text position coordinates
881
                            // for placing an ActualText
882 4
                            $currentX = $last_written_position[0];
883 4
                            $currentY = $last_written_position[1];
884 4
                            $last_written_position = false;
885
                        } else {
886 44
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
887 44
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
888
                        }
889 44
                        $whiteSpace = '';
890
891 44
                        $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i'];
892 44
                        $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j'];
893
894 44
                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
895 31
                            $curY = $currentY - $current_position['y'];
896 31
                            if (abs($curY) >= abs($factorY) / 4) {
897 30
                                $whiteSpace = "\n";
898
                            } else {
899 30
                                if (true === $reverse_text) {
900 1
                                    $curX = $current_position['x'] - $currentX;
901
                                } else {
902 30
                                    $curX = $currentX - $current_position['x'];
903
                                }
904
905
                                // In abs($factorX * 7) below, the 7 is chosen arbitrarily
906
                                // as the number of apparent "spaces" in a document we
907
                                // would need before considering them a "tab". In the
908
                                // future, we might offer this value to users as a config
909
                                // option.
910 30
                                if ($curX >= abs($factorX * 7)) {
911 20
                                    $whiteSpace = "\t";
912 29
                                } elseif ($curX >= abs($factorX * 2)) {
913 19
                                    $whiteSpace = ' ';
914
                                }
915
                            }
916
                        }
917
918 44
                        $newtext = $this->getTJUsingFontFallback(
919 44
                            $current_font,
920 44
                            $command[self::COMMAND],
921 44
                            $page,
922 44
                            $factorX
923 44
                        );
924
925
                        // If there is no ActualText pending then write
926 44
                        if (false === $actual_text) {
927 44
                            $newtext = str_replace(["\r", "\n"], '', $newtext);
928 44
                            if (false !== $reverse_text) {
929
                                // If we are in ReversedChars mode, add the whitespace last
930 1
                                $text[] = preg_replace('/  $/', ' ', $newtext.$whiteSpace);
931
                            } else {
932
                                // Otherwise add the whitespace first
933 44
                                if (' ' === $whiteSpace && isset($text[\count($text) - 1])) {
934 18
                                    $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]);
935
                                }
936 44
                                $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext);
937
                            }
938
939
                            // Record the position of this inserted text for comparison
940
                            // with the next text block.
941
                            // Provide a 'fudge' factor guess on how wide this text block
942
                            // is based on the number of characters. This helps limit the
943
                            // number of tabs inserted, but isn't perfect.
944 44
                            $factor = $factorX / 2;
945 44
                            $current_position = [
946 44
                                'x' => $currentX - mb_strlen($newtext) * $factor,
947 44
                                'y' => $currentY,
948 44
                            ];
949 4
                        } elseif (false === $last_written_position) {
950
                            // If there is an ActualText in the pipeline
951
                            // store the position this undisplayed text
952
                            // *would* have been written to, so the
953
                            // ActualText is displayed in the right spot
954 4
                            $last_written_position = [$currentX, $currentY];
955 4
                            $current_position['x'] = $currentX;
956
                        }
957 44
                        break;
958
959
                        // move to start of next line
960 44
                    case 'T*':
961 13
                        $current_position['x'] = 0;
962 13
                        $current_position_td['x'] = 0;
963 13
                        $current_position_td['y'] += $current_text_leading;
964 13
                        break;
965
966
                        // set character spacing
967 44
                    case 'Tc':
968 13
                        break;
969
970
                        // move text current point and set leading
971 44
                    case 'Td':
972 44
                    case 'TD':
973
                        // move text current point
974 32
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
975 32
                        $y = (float) array_pop($args);
976 32
                        $x = (float) array_pop($args);
977
978 32
                        if ('TD' == $command[self::OPERATOR]) {
979 7
                            $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j'];
980
                        }
981
982 32
                        $current_position_td = [
983 32
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
984 32
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
985 32
                        ];
986 32
                        break;
987
988 44
                    case 'Tf':
989 44
                        $args = preg_split('/\s/s', $command[self::COMMAND]);
990 44
                        $size = (float) array_pop($args);
991 44
                        $id = trim(array_pop($args), '/');
992 44
                        if (null !== $page) {
993 44
                            $new_font = $page->getFont($id);
994
                            // If an invalid font ID is given, do not update the font.
995
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
996
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
997
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
998
                            // But we want to make sure that malformed PDFs do not simply crash.
999 44
                            if (null !== $new_font) {
1000 44
                                $current_font = $new_font;
1001 44
                                $current_font_size = $size;
1002
                            }
1003
                        }
1004 44
                        break;
1005
1006
                        // set leading
1007 38
                    case 'TL':
1008 6
                        $y = (float) $command[self::COMMAND];
1009 6
                        $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j'];
1010 6
                        break;
1011
1012
                        // set text position matrix
1013 38
                    case 'Tm':
1014 35
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
1015 35
                        $current_position_tm = [
1016 35
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
1017 35
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
1018 35
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
1019 35
                        ];
1020 35
                        break;
1021
1022
                        // set text rendering mode
1023 23
                    case 'Ts':
1024
                        break;
1025
1026
                        // set super/subscripting text rise
1027 23
                    case 'Ts':
1028
                        break;
1029
1030
                        // set word spacing
1031 23
                    case 'Tw':
1032 9
                        break;
1033
1034
                        // set horizontal scaling
1035 23
                    case 'Tz':
1036
                        break;
1037
1038
                    default:
1039
                }
1040
            }
1041
        }
1042
1043 48
        $result = array_merge($result, $text);
1044
1045 48
        return $result;
1046
    }
1047
1048
    /**
     * Splits one content-stream command into its type, operator and operand.
     *
     * getCommandsText() expects the content of $text_part to be an
     * already formatted, single-line command from a document stream.
     * The companion function getSectionsText() returns a document
     * stream as an array of single commands for just this purpose.
     * Because of this, the argument $offset is no longer used, and
     * may be removed in a future PdfParser release.
     *
     * A better name for this function would be getCommandText()
     * since it now always works on just one command.
     *
     * For the 'TJ' operator the operand array is broken down further into a
     * list of sub-commands: type '(' for literal strings, type '<' for
     * hexadecimal strings and type 'n' for the numbers found between them.
     * Numbers are accepted anywhere in the array, including before the first
     * string and directly after another number.
     *
     * @param string $text_part a single command: operand(s) first, operator last
     * @param int    $offset    unused, kept only for backward compatibility
     *
     * @return array an empty array when no operator can be detected, otherwise
     *               a list with exactly one entry keyed by self::TYPE,
     *               self::OPERATOR and self::COMMAND
     */
    public function getCommandsText(string $text_part, int &$offset = 0): array
    {
        $commands = $matches = [];

        preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $matches);

        // If no valid command is detected, return an empty array
        if (!isset($matches[1]) || !isset($matches[2]) || !isset($matches[3])) {
            return [];
        }

        $type = $matches[2];
        $operator = $matches[3];
        $command = trim($matches[1]);

        if ('TJ' === $operator) {
            $subcommand = [];
            $command = trim($command, '[]');
            do {
                $oldCommand = $command;

                // Search for a number that is not attached to a preceding
                // string, e.g. the -250 in "[-250 (Hello)]" or the second
                // number in "(a) 5 6 (b)". Without this step such a token
                // matches neither string pattern below and the loop would
                // stop, dropping every string that follows it.
                if (preg_match('/^ *(-?[\d.]+)/', $command, $tjmatch)) {
                    if (trim($tjmatch[1])) {
                        $subcommand[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $tjmatch[1],
                        ];
                    }
                    $command = substr($command, \strlen($tjmatch[0]));
                }

                // Search for parentheses string () format
                if (preg_match('/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/', $command, $tjmatch)) {
                    $subcommand[] = [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $tjmatch[1],
                    ];
                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
                        $subcommand[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $tjmatch[2],
                        ];
                    }
                    $command = substr($command, \strlen($tjmatch[0]));
                }

                // Search for hexadecimal <> format
                if (preg_match('/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i', $command, $tjmatch)) {
                    $tjmatch[1] = preg_replace('/\s/', '', $tjmatch[1]);
                    $subcommand[] = [
                        self::TYPE => '<',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $tjmatch[1],
                    ];
                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
                        $subcommand[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $tjmatch[2],
                        ];
                    }
                    $command = substr($command, \strlen($tjmatch[0]));
                }

                // Keep going for as long as a pass consumed something. Strict
                // comparison, so two different strings can never be treated
                // as equal through numeric juggling.
            } while ($command !== $oldCommand);

            $command = $subcommand;
        } elseif ('Tj' === $operator || "'" === $operator || '"' === $operator) {
            // Depending on the string type, trim the data of the
            // appropriate delimiters
            if ('(' === $type) {
                // Don't use trim() here since a () string may end with
                // a balanced or escaped right parentheses, and trim()
                // will delete both. Both strings below are valid:
                //   eg. (String())
                //   eg. (String\))
                $command = preg_replace('/^\(|\)$/', '', $command);
            } elseif ('<' === $type) {
                $command = trim($command, '<>');
            }
        } elseif ('/' === $type) {
            // Name operand: drop the leading slash
            $command = substr($command, 1);
        }

        $commands[] = [
            self::TYPE => $type,
            self::OPERATOR => $operator,
            self::COMMAND => $command,
        ];

        return $commands;
    }
1142
1143 67
    /**
     * Builds the object class that matches the header's Type entry.
     *
     * XObject and Font headers are refined further through their Subtype
     * entry. Any type that is not handled below, and any XObject subtype
     * other than Image or Form, yields a plain PDFObject.
     *
     * @param Document    $document document passed on to the created object
     * @param Header      $header   header whose Type/Subtype select the class
     * @param string|null $content  raw content passed on to the created object
     * @param Config|null $config   optional configuration; when omitted, image
     *                              content is kept
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is optional, so it must not be dereferenced
                        // blindly. Without a config nothing asks for the image
                        // data to be discarded, hence it is retained.
                        // NOTE(review): assumes "retain" is the desired default - confirm against Config.
                        $retainContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                // Use the dedicated font class when one exists for this
                // subtype, otherwise fall back to the generic Font.
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
1184
1185
    /**
     * Gives the identifier of this object instance as computed by
     * spl_object_hash().
     */
    protected function getUniqueId(): string
    {
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
1192
}
1193