Passed
Push — master ( 2939df...ddf03e )
by Konrad
02:55
created

PDFObject::getText()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 1

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 1
eloc 4
c 1
b 1
f 0
nc 1
nop 1
dl 0
loc 7
ccs 5
cts 5
cp 1
crap 1
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\XObject\Form;
36
use Smalot\PdfParser\XObject\Image;
37
38
/**
39
 * Class PDFObject
40
 */
41
class PDFObject
42
{
43
    public const TYPE = 't';
44
45
    public const OPERATOR = 'o';
46
47
    public const COMMAND = 'c';
48
49
    /**
50
     * The recursion stack.
51
     *
52
     * @var array
53
     */
54
    public static $recursionStack = [];
55
56
    /**
57
     * @var Document|null
58
     */
59
    protected $document;
60
61
    /**
62
     * @var Header
63
     */
64
    protected $header;
65
66
    /**
67
     * @var string
68
     */
69
    protected $content;
70
71
    /**
72
     * @var Config|null
73
     */
74
    protected $config;
75
76
    /**
77
     * @var bool
78
     */
79
    protected $addPositionWhitespace = false;
80
81 92
    /**
     * Stores the owning document together with the optional header,
     * raw content stream and configuration of this object.
     *
     * When no header is given, a fresh empty Header is created so the
     * header-delegating accessors always have an object to call.
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        $this->document = $document;

        if (null === $header) {
            $this->header = new Header();
        } else {
            $this->header = $header;
        }

        $this->content = $content;
        $this->config = $config;
    }
92
93 71
    /**
     * Initialisation hook. The implementation in this class is
     * intentionally empty (presumably subclasses override it - verify
     * against the callers).
     */
    public function init()
    {
        // Nothing to initialise for a generic PDF object.
    }
96
97 4
    /**
     * Returns the document this object was constructed with.
     *
     * The property is documented as nullable, but the constructor only
     * accepts a Document instance; the return type enforces non-null.
     */
    public function getDocument(): Document
    {
        $document = $this->document;

        return $document;
    }
101
102 71
    /**
     * Returns the header attached to this object.
     */
    public function getHeader(): ?Header
    {
        $header = $this->header;

        return $header;
    }
106
107 4
    /**
     * Returns the configuration passed at construction, or null when
     * none was supplied.
     */
    public function getConfig(): ?Config
    {
        $config = $this->config;

        return $config;
    }
111
112
    /**
     * Looks up a named entry by delegating to this object's header.
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        $header = $this->header;

        return $header->get($name);
    }
119
120 72
    /**
     * Tells whether this object's header reports an entry called $name.
     */
    public function has(string $name): bool
    {
        $header = $this->header;

        return $header->has($name);
    }
124
125 4
    /**
     * Returns the header details by delegating to Header::getDetails(),
     * forwarding the $deep flag unchanged.
     */
    public function getDetails(bool $deep = true): array
    {
        $header = $this->header;

        return $header->getDetails($deep);
    }
129
130 59
    /**
     * Returns the raw content given at construction, or null when this
     * object has none.
     */
    public function getContent(): ?string
    {
        $content = $this->content;

        return $content;
    }
134
135
    /**
     * Creates a duplicate of the document stream with
     * strings and other items replaced by $char. Formerly
     * getSectionsText() used this output to more easily gather offset
     * values to extract text from the *actual* document stream.
     *
     * Every replacement has the same byte length as the span it masks,
     * so offsets in the returned string line up with offsets in $content.
     *
     * @deprecated function is no longer used and will be removed in a future release
     *
     * @internal
     *
     * @param string $content raw document stream
     * @param string $char    masking character; only its first byte is used
     *
     * @return string the masked copy of $content
     */
    public function cleanContent(string $content, string $char = 'X')
    {
        $char = $char[0];

        // Overwrites each captured span (as produced by PREG_OFFSET_CAPTURE:
        // [text, offset]) with the same number of masking characters.
        $mask = static function (string $subject, array $captures) use ($char): string {
            foreach ($captures as $capture) {
                $length = \strlen($capture[0]);
                $subject = substr_replace($subject, str_repeat($char, $length), $capture[1], $length);
            }

            return $subject;
        };

        // Neutralise escaped backslashes and parentheses first so they
        // cannot be mistaken for string delimiters below
        $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

        // Remove image bloc with binary content
        preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[0] ?? []);

        // Clean content in square brackets [.....]
        preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1] ?? []);

        // Clean content in round brackets (.....)
        preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1] ?? []);

        // Clean structure: everything between '<' and its matching '>'
        // (brackets included) is masked, tracking the nesting depth
        $parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        if ($parts) {
            $content = '';
            $level = 0;
            foreach ($parts as $part) {
                if ('<' === $part) {
                    ++$level;
                }

                $content .= (0 === $level ? $part : str_repeat($char, \strlen($part)));

                if ('>' === $part) {
                    --$level;
                }
            }
        }

        // Clean BDC and EMC markup. The masking character is quoted with
        // the pattern delimiter so that a '/' mask cannot terminate the
        // regular expression early.
        preg_match_all(
            '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
            $content,
            $matches,
            \PREG_OFFSET_CAPTURE
        );
        $content = $mask($content, $matches[1] ?? []);

        preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1] ?? []);

        return $content;
    }
203
204
    /**
     * Reformats a PDF document stream so that every PDF command sits
     * on its own line, with lines separated by \r\n. A null stream, or
     * one that looks like binary data, yields an empty string.
     */
    private function formatContent(?string $content): string
    {
        if (null === $content) {
            return '';
        }

        // Everything outside of (String) content should be valid UTF-8.
        // Cut the stream at the first open-parenthesis (the start of the
        // first string) and check what is left; if that part is not
        // UTF-8 the stream is most likely binary.
        $beforeFirstString = preg_replace('/\(.*$/s', '', $content);
        if (false === mb_check_encoding($beforeFirstString, 'UTF-8')) {
            return '';
        }

        // Swap every () string for a placeholder so the following steps
        // cannot touch string content
        $stringStore = [];
        $needle = '(';
        while (preg_match('/'.preg_quote($needle, '/').'.*?(?<![^\\\\]\\\\)\)/s', $content, $found)) {
            $candidate = $found[0];

            // Unescaped parentheses are allowed inside a PDF string as
            // long as they are balanced, so count both kinds
            $opened = preg_match_all('/(?<![^\\\\]\\\\)\(/', $candidate);
            $closed = preg_match_all('/(?<![^\\\\]\\\\)\)/', $candidate);

            if ($opened != $closed) {
                // Unbalanced: retry using this match as the prefix of a
                // longer string
                $needle = $candidate;
                continue;
            }

            // Balanced: store the string under a unique placeholder
            $key = uniqid('STRING_', true);
            $stringStore[$key] = $candidate;
            $content = preg_replace(
                '/'.preg_quote($candidate, '/').'/',
                '@@@'.$key.'@@@',
                $content,
                1
            );

            // Start over for the next string
            $needle = '(';
        }

        // Turn every carriage return and line-feed into a space
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Swap every << >> dictionary that precedes a marked-content
        // operator for a placeholder as well
        $dictionaryStore = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $found)) {
            $key = uniqid('DICT_', true);
            $dictionaryStore[$key] = $found[1];
            $content = preg_replace(
                '/'.preg_quote($found[0], '/').'/',
                ' ###'.$key.'###'.$found[2],
                $content,
                1
            );
        }

        // Collapse runs of white-space
        $content = preg_replace('/\s{2,}/', ' ', $content);

        // Append \r\n after every valid PDF operator so that each line
        // holds exactly one command
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
        //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both
        //       spellings are kept in the list for completeness.
        $operators = [
            'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
            'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
            'g', 'G',  'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
            'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
            'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
            'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($operators as $operator) {
            $pattern = '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/';
            $content = preg_replace($pattern, $operator."\r\n", $content);
        }

        // Put the dictionaries back, most recently stored first
        foreach (array_reverse($dictionaryStore, true) as $key => $dictionary) {
            $content = str_replace('###'.$key.'###', $dictionary, $content);
        }

        // Put the strings back, most recently stored first
        foreach (array_reverse($stringStore, true) as $key => $string) {
            // A string may hold escaped or literal newlines; rewrite
            // them so the string cannot be split over two lines (every
            // command must stay on one line)
            $string = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $string
            );

            $content = str_replace('@@@'.$key.'@@@', $string, $content);
        }

        // Drop blank lines and leading spaces produced by the steps above
        return trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));
    }
326
327
    /**
     * Takes an entire, unformatted document stream, has it cleaned by
     * formatContent(), and keeps only the commands needed for text
     * positioning/extraction. Returns the unprocessed PDF commands,
     * one command per array element.
     *
     * @internal
     */
    public function getSectionsText(?string $content): array
    {
        // A cleaned stream carries one command per line
        $lines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->formatContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        // Commands kept even outside of a BT ... ET text block. Each
        // pattern must match *only* its own command (a bare search for
        // 'c' would also hit 'sc', for instance). Add more commands here
        // as they turn up in unusual PDFs.
        $keptOutsideText = [
            '/(?<!\w)B[DM]C$/', // Begin marked content sequence
            '/(?<!\w)[DM]P$/',  // Marked content point
            '/(?<!\w)EMC$/',    // End marked content sequence
            '/(?<!\w)cm$/',     // Graphics position change
            '/(?<!\w)Tf$/',     // Font change
            '/(?<!\w)Do$/',     // Invoke named XObject
        ];

        $sections = [];
        $insideTextBlock = false;
        foreach ($lines as $rawLine) {
            $line = trim($rawLine);
            if ('' === $line) {
                continue;
            }

            if (preg_match('/BT$/', $line)) {
                // 'BT' opens a text block and is itself kept
                $insideTextBlock = true;
                $sections[] = $line;
                continue;
            }

            if ('ET' === $line) {
                // 'ET' closes the text block and is itself kept
                $insideTextBlock = false;
                $sections[] = $line;
                continue;
            }

            if ($insideTextBlock) {
                // Inside BT ... ET every line is kept
                $sections[] = $line;
                continue;
            }

            // Save/restore graphics state commands end in 'q' or 'Q'
            $lastChar = $line[-1];
            if ('q' === $lastChar || 'Q' === $lastChar) {
                $sections[] = $line;
                continue;
            }

            foreach ($keptOutsideText as $pattern) {
                if (preg_match($pattern, $line)) {
                    $sections[] = $line;
                    break;
                }
            }
        }

        return $sections;
    }
406
407 45
    /**
     * Picks the font to start text decoding with: the first font of
     * $page when it has any, otherwise the document's first font, and
     * failing both a newly created empty Font.
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $candidates = null === $page ? [] : $page->getFonts();

        // The document-wide first font is appended last, so it is only
        // picked when the page contributed no fonts
        $documentFont = $this->document->getFirstFont();
        if (null !== $documentFont) {
            $candidates[] = $documentFont;
        }

        if (0 === \count($candidates)) {
            return new Font($this->document, null, null, $this->config);
        }

        return reset($candidates);
    }
425
426
    /**
     * Decode a '[]TJ' command and attempt to use alternate
     * fonts if the current font results in output that contains
     * Unicode control characters or NUL bytes.
     *
     * Without a page there are no alternate fonts to try, so the text
     * decoded with $font is returned as is. When every font of the page
     * has been tried without success, the text decoded with the
     * original $font is returned.
     *
     * @internal
     *
     * @param array<int,array<string,string|bool>> $command
     */
    private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string
    {
        $orig_text = $font->decodeText($command, $fontFactor);
        $text = $orig_text;

        // If we make this a Config option, we can add a check if it's
        // enabled here.
        if (null !== $page) {
            $font_ids = array_keys($page->getFonts());

            // If the decoded text contains UTF-8 control characters
            // then the font page being used is probably the wrong one.
            // Loop through the rest of the fonts to see if we can get
            // a good decode. Allow x09 to x0d which are whitespace.
            //
            // The NUL check searches the raw bytes directly: preg_match()
            // with the 'u' modifier fails on invalid UTF-8, and looking
            // for '00' in a hex dump would also match across two bytes
            // (e.g. "0\n" is 0x30 0x0a, i.e. "300a").
            while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
                // If we're out of font IDs, then give up and use the
                // original string
                if (0 === \count($font_ids)) {
                    return $orig_text;
                }

                // Try the next font ID
                $font = $page->getFont(array_shift($font_ids));
                $text = $font->decodeText($command, $fontFactor);
            }
        }

        return $text;
    }
464
465
    /**
     * Parses a complete PDF dictionary object, given as a string that
     * includes the outer enclosing << >> angle brackets, into a PHP
     * array. Nested dictionaries become nested arrays; an array holding
     * a single non-name string is returned as the string '[...]'.
     *
     * @internal
     *
     * @throws \Exception when $dictionary does not start with '<<'
     */
    public function parseDictionary(string $dictionary): array
    {
        // Collapse line breaks and runs of white-space into single spaces
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

        if ('<<' !== substr($dictionary, 0, 2)) {
            throw new \Exception('Not a valid dictionary object.');
        }

        $parsed = [];
        $stack = [];
        $currentName = '';
        $arrayTypeNumeric = false;

        // Strip the outer << >> pair, then cut the rest into tokens
        $inner = trim(preg_replace('/^<<|>>$/', '', $dictionary));
        $tokens = preg_split(
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
            $inner,
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        foreach ($tokens as $rawToken) {
            $token = trim($rawToken);
            switch ($token) {
                case '':
                    break;

                // Open a numeric array ('[') or a hashed array ('<<')
                case '[':
                case '<<':
                    $parsed[$currentName] = [];
                    $arrayTypeNumeric = ('[' === $token);

                    // Remember the current level, then descend into the
                    // container that was just created
                    $stack[\count($stack)] = &$parsed;
                    $parsed = &$parsed[$currentName];
                    $currentName = '';
                    break;

                // Close numeric array
                case ']':
                    // An array holding one plain string goes back to
                    // being a single '[...]' element
                    if (\is_array($parsed) && 1 == \count($parsed)
                        && isset($parsed[0]) && \is_string($parsed[0])
                        && '' !== $parsed[0] && '/' != $parsed[0][0]) {
                        $parsed = '['.$parsed[0].']';
                    }
                    // Closing continues exactly like a hashed array
                    // no break
                case '>>':
                    $arrayTypeNumeric = false;

                    // Return to the enclosing level
                    $parsed = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                    break;

                default:
                    if ('/' === $token[0]) {
                        // A leading slash marks a name: inside a numeric
                        // array it is an element, otherwise it is the
                        // key for the value that follows
                        $currentName = substr($token, 1);
                        if ($arrayTypeNumeric) {
                            $parsed[] = $currentName;
                            $currentName = '';
                        }
                    } elseif ('' != $currentName) {
                        // A value for the pending key
                        if (!$arrayTypeNumeric) {
                            $parsed[$currentName] = $token;
                        }
                        $currentName = '';
                    } else {
                        // A value without a key
                        $parsed[] = $token;
                    }
            }
        }

        return $parsed;
    }
563
564
    /**
     * Returns the text content of a PDF as a string. Attempts to add
     * whitespace for spacing and line-breaks where appropriate.
     *
     * getText() leverages getTextArray() to get the content
     * of the document, setting the addPositionWhitespace flag to true
     * so whitespace is inserted in a logical way for reading by
     * humans. The flag is reset to false afterwards, even when
     * getTextArray() throws, so a failed extraction cannot leave this
     * object in whitespace-inserting mode.
     *
     * @throws \Exception when getTextArray() throws
     */
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;

        try {
            $result = $this->getTextArray($page);
        } finally {
            $this->addPositionWhitespace = false;
        }

        return implode('', $result).' ';
    }
581
582
    /**
583
     * Returns the text content of a PDF as an array of strings. No
584
     * extra whitespace is inserted besides what is actually encoded in
585
     * the PDF text.
586
     *
587
     * @throws \Exception
588
     */
589 45
    public function getTextArray(?Page $page = null): array
590
    {
591 45
        $result = [];
592 45
        $text = [];
593
594 45
        $marked_stack = [];
595 45
        $last_written_position = false;
596
597 45
        $sections = $this->getSectionsText($this->content);
598 45
        $current_font = $this->getDefaultFont($page);
599 45
        $current_font_size = 1;
600 45
        $current_text_leading = 0;
601
602 45
        $current_position = ['x' => false, 'y' => false];
603 45
        $current_position_tm = [
604 45
            'a' => 1, 'b' => 0, 'c' => 0,
605 45
            'i' => 0, 'j' => 1, 'k' => 0,
606 45
            'x' => 0, 'y' => 0, 'z' => 1,
607 45
        ];
608 45
        $current_position_td = ['x' => 0, 'y' => 0];
609 45
        $current_position_cm = [
610 45
            'a' => 1, 'b' => 0, 'c' => 0,
611 45
            'i' => 0, 'j' => 1, 'k' => 0,
612 45
            'x' => 0, 'y' => 0, 'z' => 1,
613 45
        ];
614
615 45
        $clipped_font = [];
616 45
        $clipped_position_cm = [];
617
618 45
        self::$recursionStack[] = $this->getUniqueId();
619
620 45
        foreach ($sections as $section) {
621 42
            $commands = $this->getCommandsText($section);
622 42
            foreach ($commands as $command) {
623 42
                switch ($command[self::OPERATOR]) {
624
                    // Begin text object
625 42
                    case 'BT':
626
                        // Reset text positioning matrices
627 42
                        $current_position_tm = [
628 42
                            'a' => 1, 'b' => 0, 'c' => 0,
629 42
                            'i' => 0, 'j' => 1, 'k' => 0,
630 42
                            'x' => 0, 'y' => 0, 'z' => 1,
631 42
                        ];
632 42
                        $current_position_td = ['x' => 0, 'y' => 0];
633 42
                        $current_text_leading = 0;
634 42
                        break;
635
636
                        // Begin marked content sequence with property list
637 42
                    case 'BDC':
638 16
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
639 16
                            $dict = $this->parseDictionary($match[1]);
640
641
                            // Check for ActualText block
642 16
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
643 4
                                if ('[' == $dict['ActualText'][0]) {
644
                                    // Simulate a 'TJ' command on the stack
645
                                    $marked_stack[] = [
646
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
647
                                    ];
648 4
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
649
                                    // Simulate a 'Tj' command on the stack
650 4
                                    $marked_stack[] = [
651 4
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
652 4
                                    ];
653
                                }
654
                            }
655
                        }
656 16
                        break;
657
658
                        // Begin marked content sequence
659 42
                    case 'BMC':
660 2
                        if ('ReversedChars' == $command[self::COMMAND]) {
661
                            // Upon encountering a ReversedChars command,
662
                            // add the characters we've built up so far to
663
                            // the result array
664 1
                            $result = array_merge($result, $text);
665
666
                            // Start a fresh $text array that will contain
667
                            // reversed characters
668 1
                            $text = [];
669
670
                            // Add the reversed text flag to the stack
671 1
                            $marked_stack[] = ['ReversedChars' => true];
672
                        }
673 2
                        break;
674
675
                        // set graphics position matrix
676 42
                    case 'cm':
677 29
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
678 29
                        $current_position_cm = [
679 29
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
680 29
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
681 29
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
682 29
                        ];
683 29
                        break;
684
685 42
                    case 'Do':
686 15
                        if (null !== $page) {
687 15
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
688 15
                            $id = trim(array_pop($args), '/ ');
689 15
                            $xobject = $page->getXObject($id);
690
691
                            // @todo $xobject could be a ElementXRef object, which would then throw an error
692 15
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack)) {
693
                                // Not a circular reference.
694 15
                                $text[] = $xobject->getText($page);
695
                            }
696
                        }
697 15
                        break;
698
699
                        // Marked content point with (DP) & without (MP) property list
700 42
                    case 'DP':
701 42
                    case 'MP':
702 1
                        break;
703
704
                        // End text object
705 42
                    case 'ET':
706 42
                        break;
707
708
                        // Store current selected font and graphics matrix
709 42
                    case 'q':
710 36
                        $clipped_font[] = [$current_font, $current_font_size];
711 36
                        $clipped_position_cm[] = $current_position_cm;
712 36
                        break;
713
714
                        // Restore previous selected font and graphics matrix
715 42
                    case 'Q':
716 36
                        list($current_font, $current_font_size) = array_pop($clipped_font);
717 36
                        $current_position_cm = array_pop($clipped_position_cm);
718 36
                        break;
719
720
                        // End marked content sequence
721 42
                    case 'EMC':
722 17
                        $data = false;
723 17
                        if (\count($marked_stack)) {
724 5
                            $marked = array_pop($marked_stack);
725 5
                            $action = key($marked);
726 5
                            $data = $marked[$action];
727
728
                            switch ($action) {
729
                                // If we are in ReversedChars mode...
730 5
                                case 'ReversedChars':
731
                                    // Reverse the characters we've built up so far
732 1
                                    foreach ($text as $key => $t) {
733 1
                                        $text[$key] = implode('', array_reverse(
734 1
                                            mb_str_split($t, 1, mb_internal_encoding())
0 ignored issues
show
Bug introduced by
It seems like mb_internal_encoding() can also be of type true; however, parameter $encoding of mb_str_split() does only seem to accept null|string, maybe add an additional type check? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

734
                                            mb_str_split($t, 1, /** @scrutinizer ignore-type */ mb_internal_encoding())
Loading history...
735 1
                                        ));
736
                                    }
737
738
                                    // Add these characters to the result array
739 1
                                    $result = array_merge($result, $text);
740
741
                                    // Start a fresh $text array that will contain
742
                                    // non-reversed characters
743 1
                                    $text = [];
744 1
                                    break;
745
746 4
                                case 'ActualText':
747
                                    // Use the content of the ActualText as a command
748 4
                                    $command = $data;
749 4
                                    break;
750
                            }
751
                        }
752
753
                        // If this EMC command has been transformed into a 'Tj'
754
                        // or 'TJ' command because of being ActualText, then bypass
755
                        // the break to proceed to the writing section below.
756 17
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
757 17
                            break;
758
                        }
759
760
                        // no break
761 42
                    case "'":
762 42
                    case '"':
763 4
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
764
                            // Move to next line and write text
765
                            $current_position['x'] = 0;
766
                            $current_position_td['x'] = 0;
767
                            $current_position_td['y'] += $current_text_leading;
768
                        }
769
                        // no break
770 42
                    case 'Tj':
771 34
                        $command[self::COMMAND] = [$command];
772
                        // no break
773 42
                    case 'TJ':
774
                        // Check the marked content stack for flags
775 42
                        $actual_text = false;
776 42
                        $reverse_text = false;
777 42
                        foreach ($marked_stack as $marked) {
778 5
                            if (isset($marked['ActualText'])) {
779 4
                                $actual_text = true;
780
                            }
781 5
                            if (isset($marked['ReversedChars'])) {
782 1
                                $reverse_text = true;
783
                            }
784
                        }
785
786
                        // Account for text position ONLY just before we write text
787 42
                        if (false === $actual_text && \is_array($last_written_position)) {
788
                            // If $last_written_position is an array, that
789
                            // means we have stored text position coordinates
790
                            // for placing an ActualText
791 4
                            $currentX = $last_written_position[0];
792 4
                            $currentY = $last_written_position[1];
793 4
                            $last_written_position = false;
794
                        } else {
795 42
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
796 42
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
797
                        }
798 42
                        $whiteSpace = '';
799
800 42
                        $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i'];
801 42
                        $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j'];
802
803 42
                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
804 30
                            $curY = $currentY - $current_position['y'];
805 30
                            if (abs($curY) >= abs($factorY) / 4) {
806 29
                                $whiteSpace = "\n";
807
                            } else {
808 29
                                if (true === $reverse_text) {
809 1
                                    $curX = $current_position['x'] - $currentX;
810
                                } else {
811 29
                                    $curX = $currentX - $current_position['x'];
812
                                }
813
814
                                // In abs($factorX * 7) below, the 7 is chosen arbitrarily
815
                                // as the number of apparent "spaces" in a document we
816
                                // would need before considering them a "tab". In the
817
                                // future, we might offer this value to users as a config
818
                                // option.
819 29
                                if ($curX >= abs($factorX * 7)) {
820 19
                                    $whiteSpace = "\t";
821 28
                                } elseif ($curX >= abs($factorX * 2)) {
822 17
                                    $whiteSpace = ' ';
823
                                }
824
                            }
825
                        }
826
827 42
                        $newtext = $this->getTJUsingFontFallback(
828 42
                            $current_font,
829 42
                            $command[self::COMMAND],
830 42
                            $page,
831 42
                            $factorX
832 42
                        );
833
834
                        // If there is no ActualText pending then write
835 42
                        if (false === $actual_text) {
836 42
                            $newtext = str_replace(["\r", "\n"], '', $newtext);
837 42
                            if (false !== $reverse_text) {
838
                                // If we are in ReversedChars mode, add the whitespace last
839 1
                                $text[] = preg_replace('/  $/', ' ', $newtext.$whiteSpace);
840
                            } else {
841
                                // Otherwise add the whitespace first
842 42
                                if (' ' === $whiteSpace && isset($text[\count($text) - 1])) {
843 16
                                    $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]);
844
                                }
845 42
                                $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext);
846
                            }
847
848
                            // Record the position of this inserted text for comparison
849
                            // with the next text block.
850
                            // Provide a 'fudge' factor guess on how wide this text block
851
                            // is based on the number of characters. This helps limit the
852
                            // number of tabs inserted, but isn't perfect.
853 42
                            $factor = $factorX / 2;
854 42
                            $current_position = [
855 42
                                'x' => $currentX - mb_strlen($newtext) * $factor,
856 42
                                'y' => $currentY,
857 42
                            ];
858 4
                        } elseif (false === $last_written_position) {
859
                            // If there is an ActualText in the pipeline
860
                            // store the position this undisplayed text
861
                            // *would* have been written to, so the
862
                            // ActualText is displayed in the right spot
863 4
                            $last_written_position = [$currentX, $currentY];
864 4
                            $current_position['x'] = $currentX;
865
                        }
866 42
                        break;
867
868
                        // move to start of next line
869 42
                    case 'T*':
870 13
                        $current_position['x'] = 0;
871 13
                        $current_position_td['x'] = 0;
872 13
                        $current_position_td['y'] += $current_text_leading;
873 13
                        break;
874
875
                        // set character spacing
876 42
                    case 'Tc':
877 13
                        break;
878
879
                        // move text current point and set leading
880 42
                    case 'Td':
881 42
                    case 'TD':
882
                        // move text current point
883 31
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
884 31
                        $y = (float) array_pop($args);
885 31
                        $x = (float) array_pop($args);
886
887 31
                        if ('TD' == $command[self::OPERATOR]) {
888 7
                            $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j'];
889
                        }
890
891 31
                        $current_position_td = [
892 31
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
893 31
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
894 31
                        ];
895 31
                        break;
896
897 42
                    case 'Tf':
898 42
                        $args = preg_split('/\s/s', $command[self::COMMAND]);
899 42
                        $size = (float) array_pop($args);
900 42
                        $id = trim(array_pop($args), '/');
901 42
                        if (null !== $page) {
902 42
                            $new_font = $page->getFont($id);
903
                            // If an invalid font ID is given, do not update the font.
904
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
905
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
906
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
907
                            // But we want to make sure that malformed PDFs do not simply crash.
908 42
                            if (null !== $new_font) {
909 39
                                $current_font = $new_font;
910 39
                                $current_font_size = $size;
911
                            }
912
                        }
913 42
                        break;
914
915
                        // set leading
916 36
                    case 'TL':
917 6
                        $y = (float) $command[self::COMMAND];
918 6
                        $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j'];
919 6
                        break;
920
921
                        // set text position matrix
922 36
                    case 'Tm':
923 34
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
924 34
                        $current_position_tm = [
925 34
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
926 34
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
927 34
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
928 34
                        ];
929 34
                        break;
930
931
                        // set text rendering mode
932 21
                    case 'Ts':
933
                        break;
934
935
                        // set super/subscripting text rise
936 21
                    case 'Ts':
937
                        break;
938
939
                        // set word spacing
940 21
                    case 'Tw':
941 9
                        break;
942
943
                        // set horizontal scaling
944 21
                    case 'Tz':
945
                        break;
946
947
                    default:
948
                }
949
            }
950
        }
951
952 45
        $result = array_merge($result, $text);
953
954 45
        return $result;
955
    }
956
957
    /**
     * Parses one already formatted, single-line command taken from a
     * document stream into its type, operator and operand(s).
     *
     * The input is expected to be a single command, such as the entries
     * produced by the companion function getSectionsText(). For that
     * reason $offset is not used any more; it is only kept so existing
     * callers keep working and may be removed in a future PdfParser
     * release. (getCommandText() would be a more fitting name, as exactly
     * one command is handled per call.)
     *
     * @param string $text_part a single command line from a content stream
     * @param int    $offset    unused, kept for backward compatibility
     *
     * @return array an empty array if no command could be detected,
     *               otherwise a list holding exactly one entry with the
     *               keys self::TYPE, self::OPERATOR and self::COMMAND.
     *               For the 'TJ' operator self::COMMAND is itself a list
     *               of such entries (strings and numeric adjustments).
     */
    public function getCommandsText(string $text_part, int &$offset = 0): array
    {
        $parts = [];

        // Group 1: everything in front of the operator (the operands),
        // group 2: the optional leading delimiter of the operands,
        // group 3: the operator at the very end of the line.
        preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $parts);

        // Without a detectable command there is nothing to return
        if (!isset($parts[1], $parts[2], $parts[3])) {
            return [];
        }

        $type = $parts[2];
        $operator = $parts[3];
        $operands = trim($parts[1]);

        if ('TJ' === $operator) {
            // A TJ array mixes strings with optional numeric adjustments.
            // Each pass of the outer loop tries the parentheses () format
            // first and the hexadecimal <> format second, until neither
            // of them consumes anything more.
            $stringPatterns = [
                '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
                '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
            ];

            $elements = [];
            $remaining = trim($operands, '[]');

            do {
                $before = $remaining;

                foreach ($stringPatterns as $delimiter => $pattern) {
                    if (!preg_match($pattern, $remaining, $found)) {
                        continue;
                    }

                    // Hexadecimal strings may contain whitespace, drop it
                    $string = $found[1];
                    if ('<' === $delimiter) {
                        $string = preg_replace('/\s/', '', $string);
                    }

                    $elements[] = [
                        self::TYPE => $delimiter,
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $string,
                    ];

                    // Optional number following the string
                    if (isset($found[2]) && trim($found[2])) {
                        $elements[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $found[2],
                        ];
                    }

                    $remaining = substr($remaining, \strlen($found[0]));
                }
            } while ($remaining != $before);

            $operands = $elements;
        } elseif (\in_array($operator, ['Tj', "'", '"'], true)) {
            // Strip the delimiters that belong to the string type
            switch ($type) {
                case '(':
                    // trim() must not be used here: a () string may end
                    // with a balanced or an escaped right parenthesis and
                    // trim() would remove both. Both of these are valid:
                    //   eg. (String())
                    //   eg. (String\))
                    $operands = preg_replace('/^\(|\)$/', '', $operands);
                    break;

                case '<':
                    $operands = trim($operands, '<>');
                    break;
            }
        } elseif ('/' === $type) {
            // Name operand: remove the leading slash
            $operands = substr($operands, 1);
        }

        return [
            [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $operands,
            ],
        ];
    }
1051
1052 64
    /**
     * Creates the object matching the "Type" entry of the given header
     * (and its "Subtype" entry for XObject and Font types).
     *
     * XObject headers yield an Image or a Form depending on the subtype,
     * Font headers yield the class \Smalot\PdfParser\Font\Font<Subtype>
     * if such a class exists and a plain Font otherwise. Any type not
     * handled explicitly yields a plain PDFObject.
     *
     * @param Document    $document document the object belongs to
     * @param Header      $header   header describing the object
     * @param string|null $content  raw content of the object
     * @param Config|null $config   optional configuration, may be null
     *
     * @return self the created object
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is optional, so it must not be
                        // dereferenced unconditionally: calling a method
                        // on null is a fatal error.
                        // NOTE(review): keeping the image content when no
                        // Config is supplied is assumed to be the wanted
                        // default - confirm against Config's own default.
                        $retainContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                // XObject with an unhandled subtype
                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                // Use the subtype specific font class when there is one
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
1093
1094
    /**
     * Provides the identifier of this object instance, as computed by
     * spl_object_hash().
     *
     * @return string hash identifying this instance
     */
    protected function getUniqueId(): string
    {
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
1101
}
1102