Test Failed
Pull Request — master (#634)
by
unknown
01:58
created

PDFObject::getContent()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
ccs 2
cts 2
cp 1
cc 1
nc 1
nop 0
crap 1
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\XObject\Form;
36
use Smalot\PdfParser\XObject\Image;
37
38
/**
39
 * Class PDFObject
40
 */
41
class PDFObject
42
{
43
    public const TYPE = 't';
44
45
    public const OPERATOR = 'o';
46
47
    public const COMMAND = 'c';
48
49
    /**
50
     * The recursion stack.
51
     *
52
     * @var array
53
     */
54
    public static $recursionStack = [];
55
56
    /**
57
     * @var Document
58
     */
59
    protected $document;
60
61
    /**
62
     * @var Header
63
     */
64
    protected $header;
65
66
    /**
67
     * @var string
68
     */
69
    protected $content;
70
71
    /**
72
     * @var Config
73
     */
74
    protected $config;
75
76 62
    /**
77
     * @var bool
78
     */
79
    protected $addPositionWhitespace = false;
80
81
    /**
     * Stores the document, header, content and configuration on this object.
     *
     * @param Document    $document stored as the owning document
     * @param Header|null $header   stored as given; a new empty Header is created when null
     * @param string|null $content  stored as given (may be null)
     * @param Config|null $config   stored as given (may be null)
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        // Nullable types are spelled out explicitly: relying on a "= null"
        // default to make a typed parameter nullable is deprecated in PHP 8.4.
        $this->document = $document;
        $this->header = $header ?? new Header();
        $this->content = $content;
        $this->config = $config;
    }
92 3
93
    /**
     * Initialisation hook. This implementation performs no work.
     */
    public function init()
    {
        // Nothing to do here.
    }
96
97 49
    /**
     * Returns the Document that was passed to the constructor.
     */
    public function getDocument(): Document
    {
        return $this->document;
    }
101
102 3
    /**
     * Returns the Header held by this object.
     *
     * The constructor substitutes a new empty Header when none is supplied.
     */
    public function getHeader(): ?Header
    {
        return $this->header;
    }
106
107
    /**
     * Returns the Config passed to the constructor, or null when none was given.
     */
    public function getConfig(): ?Config
    {
        return $this->config;
    }
111
112 50
    /**
     * Looks up $name on this object's header and returns whatever the
     * header's get() returns for it.
     *
     * @param string $name key forwarded to the header
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        $header = $this->header;

        return $header->get($name);
    }
119
120 3
    /**
     * Reports whether this object's header has an entry called $name,
     * as answered by the header's has().
     *
     * @param string $name key forwarded to the header
     */
    public function has(string $name): bool
    {
        $header = $this->header;

        return $header->has($name);
    }
124
125 38
    /**
     * Returns the details array produced by the header's getDetails().
     *
     * @param bool $deep forwarded unchanged to the header
     */
    public function getDetails(bool $deep = true): array
    {
        $header = $this->header;

        return $header->getDetails($deep);
    }
129
130 32
    /**
     * Returns the content string passed to the constructor, or null when
     * none was given.
     */
    public function getContent(): ?string
    {
        return $this->content;
    }
134
135
    /**
     * Creates a duplicate of the document stream with strings and other
     * items replaced by $char. Formerly getSectionsText() used this
     * output to more easily gather offset values to extract text from
     * the *actual* document stream. As getSectionsText() now uses
     * formatContent() instead, this function is no longer used, and
     * could be deleted in a future version of PDFParser.
     *
     * Every replacement has the same byte length as the text it masks, so
     * byte offsets in the returned string line up with those of $content.
     *
     * @param string $content document stream to mask
     * @param string $char    filler; only its first byte is used, and an
     *                        empty string falls back to 'X'
     *
     * @return string the masked copy of $content
     *
     * @internal For internal use only, not part of the public API
     */
    public function cleanContent(string $content, string $char = 'X')
    {
        // An empty filler would make every replacement shorter than the
        // text it masks (and reading $char[0] would raise a warning), so
        // fall back to the default filler instead.
        $char = '' === $char ? 'X' : $char[0];
        $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

        // Overwrites capture group $group of every match of $pattern with
        // filler of identical length. Same-length replacement keeps the
        // offsets captured up front valid for all later matches.
        $mask = static function (string $subject, string $pattern, int $group) use ($char): string {
            if (preg_match_all($pattern, $subject, $matches, \PREG_OFFSET_CAPTURE)) {
                foreach ($matches[$group] as $part) {
                    $length = \strlen($part[0]);
                    $subject = substr_replace($subject, str_repeat($char, $length), $part[1], $length);
                }
            }

            return $subject;
        };

        // Remove image bloc with binary content
        $content = $mask($content, '/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', 0);

        // Clean content in square brackets [.....]
        $content = $mask($content, '/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', 1);

        // Clean content in round brackets (.....)
        $content = $mask($content, '/\((.*?)\)/s', 1);

        // Clean structure: everything between '<' and its matching '>'
        // (brackets included) is replaced by filler.
        if ($parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE)) {
            $content = '';
            $level = 0;
            foreach ($parts as $part) {
                if ('<' === $part) {
                    ++$level;
                }

                $content .= (0 === $level ? $part : str_repeat($char, \strlen($part)));

                if ('>' === $part) {
                    --$level;
                }
            }
        }

        // Clean BDC and EMC markup. The filler is quoted with the pattern
        // delimiter so that a '/' filler cannot terminate the pattern early.
        $content = $mask($content, '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s', 1);
        $content = $mask($content, '/\s(EMC)\s/s', 1);

        return $content;
    }
203 29
204
    /**
     * Reformats a PDF document stream so that every PDF command sits on
     * its own line, with lines separated by \r\n.
     *
     * Literal strings "( )" and the dictionaries "<< >>" that precede a
     * BDC/BMC/DP/MP operator are swapped for placeholders while the
     * operators are being split, then put back afterwards.
     *
     * @param string|null $content raw document stream
     *
     * @return string the formatted stream; an empty string when $content
     *                is null or mb_detect_encoding() rejects the stream
     */
    public function formatContent(?string $content): string
    {
        if (null === $content) {
            return '';
        }

        // Step 1: hide all strings () so the next steps cannot touch them
        $hiddenStrings = [];
        $prefix = '(';
        while (preg_match('/'.preg_quote($prefix, '/').'.*?(?<![^\\\\]\\\\)\)/s', $content, $found)) {
            $candidate = $found[0];

            // PDF strings may contain unescaped parentheses as long as
            // they are balanced, so count both kinds in the candidate
            $opening = preg_match_all('/(?<![^\\\\]\\\\)\(/', $candidate);
            $closing = preg_match_all('/(?<![^\\\\]\\\\)\)/', $candidate);

            if ($opening != $closing) {
                // Unbalanced: retry with the candidate as the required
                // prefix so the next match has to be longer
                $prefix = $candidate;
                continue;
            }

            // Balanced: swap the string for a unique placeholder
            $placeholder = uniqid('STRING_', true);
            $hiddenStrings[$placeholder] = $candidate;
            $content = preg_replace(
                '/'.preg_quote($candidate, '/').'/',
                '@@@'.$placeholder.'@@@',
                $content,
                1
            );

            // Start over for the next string
            $prefix = '(';
        }

        // Step 2: turn every carriage return and line-feed into a space
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Step 3: hide dictionary << >> commands as well
        $hiddenDicts = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $found)) {
            $placeholder = uniqid('DICT_', true);
            $hiddenDicts[$placeholder] = $found[1];
            $content = preg_replace(
                '/'.preg_quote($found[0], '/').'/',
                ' ###'.$placeholder.'###'.$found[2],
                $content,
                1
            );
        }

        // With strings and dictionaries hidden, what remains should be
        // plain text. Bail out when the encoding cannot be detected, which
        // keeps image data and the like out of the steps below (see the
        // sample file '12249.pdf' from
        // https://github.com/smalot/pdfparser/issues/458).
        if (false === mb_detect_encoding($content, null, true)) {
            return '';
        }

        // Step 4: collapse runs of white-space
        $content = preg_replace('/\s{2,}/', ' ', $content);

        // Step 5: append \r\n to every valid PDF operator so each command
        // ends its own line
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
        //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both
        //       spellings are included.
        $operators = [
            'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
            'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
            'g', 'G',  'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
            'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
            'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
            'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($operators as $op) {
            $content = preg_replace(
                '/(?<!\w|\/)'.preg_quote($op, '/').'(?![\w10\*])/',
                $op."\r\n",
                $content
            );
        }

        // Step 6: put the dictionaries back, most recently hidden first
        foreach (array_reverse($hiddenDicts, true) as $placeholder => $original) {
            $content = str_replace('###'.$placeholder.'###', $original, $content);
        }

        // Step 7: put the strings back, most recently hidden first. Escaped
        // line breaks are dropped and literal ones become the two-character
        // sequences \r and \n, so no string ends up split across two lines.
        foreach (array_reverse($hiddenStrings, true) as $placeholder => $original) {
            $singleLine = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $original
            );

            $content = str_replace('@@@'.$placeholder.'@@@', $singleLine, $content);
        }

        return trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));
    }
326
327 18
    /**
     * Runs an unformatted document stream through formatContent() and
     * keeps only the commands relevant to text positioning/extraction.
     *
     * Inside a BT ... ET text block every command is kept. Outside of one
     * only q/Q, BDC/BMC, DP/MP, EMC, cm, Tf and Do commands are kept.
     *
     * @param string|null $content unformatted document stream
     *
     * @return array the kept, unprocessed PDF commands, one per element
     */
    public function getSectionsText(?string $content): array
    {
        // The formatted stream carries one command per line, so split it
        // on line breaks
        $lines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->formatContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        // Commands kept outside of a text block. Each pattern is anchored
        // so it matches only its own operator (a bare 'c' must not match
        // 'sc', for instance). Add more here as weird PDFs require them.
        $keptOutsideText = [
            '/(?<!\w)B[DM]C$/', // Begin marked content sequence
            '/(?<!\w)[DM]P$/',  // Marked content point
            '/(?<!\w)EMC$/',    // End marked content sequence
            '/(?<!\w)cm$/',     // Graphics position change commands
            '/(?<!\w)Tf$/',     // Font change commands
            '/(?<!\w)Do$/',     // Invoke named XObject command
        ];

        $sections = [];
        $insideText = false;
        foreach ($lines as $rawLine) {
            $line = trim($rawLine);
            if ('' === $line) {
                continue;
            }

            // 'BT' opens a text block
            if (preg_match('/BT$/', $line)) {
                $insideText = true;
                $sections[] = $line;
                continue;
            }

            // 'ET' closes it
            if ('ET' === $line) {
                $insideText = false;
                $sections[] = $line;
                continue;
            }

            // Everything between BT and ET is kept
            if ($insideText) {
                $sections[] = $line;
                continue;
            }

            // Save and restore graphics state commands
            $lastChar = $line[-1];
            if ('q' === $lastChar || 'Q' === $lastChar) {
                $sections[] = $line;
                continue;
            }

            foreach ($keptOutsideText as $pattern) {
                if (preg_match($pattern, $line)) {
                    $sections[] = $line;
                    break;
                }
            }
        }

        return $sections;
    }
403 4
404
    /**
     * Picks the font to start text decoding with.
     *
     * Candidates are the fonts of $page (when a page is given) followed by
     * the document's first font (when it has one); the first candidate is
     * returned. With no candidate at all a new Font is created from the
     * document and this object's config.
     *
     * @param Page|null $page page whose fonts take precedence, if any
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        // The nullable type is explicit: implicit nullability through a
        // "= null" default is deprecated in PHP 8.4.
        $fonts = null !== $page ? $page->getFonts() : [];

        $firstFont = $this->document->getFirstFont();
        if (null !== $firstFont) {
            $fonts[] = $firstFont;
        }

        if (\count($fonts) > 0) {
            return reset($fonts);
        }

        return new Font($this->document, null, null, $this->config);
    }
422
423
    /**
     * Decode a '[]TJ' command and attempt to use alternate fonts if
     * the current font results in output that contains control
     * characters or NUL bytes. See Font::decodeText for a full
     * description of $textMatrix
     *
     * Without a page the text decoded with $font is returned as is. With
     * a page, the page's fonts are tried in order until one yields clean
     * text; if none does, the text decoded with the original font is
     * returned.
     *
     * @param array<int,array<string,string|bool>> $command
     */
    private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null): string
    {
        $orig_text = $font->decodeText($command);

        // If we make this a Config option, we can add a check if it's
        // enabled here.
        if (null === $page) {
            return $orig_text;
        }

        $text = $orig_text;
        $font_ids = array_keys($page->getFonts());

        // If the decoded text contains control characters then the font
        // being used is probably the wrong one. Loop through the rest of
        // the fonts to see if we can get a good decode. x09 to x0d are
        // allowed because they are whitespace.
        //
        // The NUL check looks at the raw bytes. Searching the hex dump for
        // '00' instead would also fire when the two zeros belong to
        // neighbouring bytes, e.g. a space followed by a line-feed is
        // "200a".
        while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
            // If we're out of font IDs, then give up and use the
            // original string
            if ([] === $font_ids) {
                return $orig_text;
            }

            // Try the next font ID
            // NOTE(review): assumes Page::getFont() returns a Font for
            // every key of getFonts() — confirm against Page.
            $font = $page->getFont(array_shift($font_ids));
            $text = $font->decodeText($command);
        }

        return $text;
    }
460
461 6
    /**
     * Parses a full PDF dictionary object, including the outer enclosing
     * << >> angle brackets, into a PHP array.
     *
     * Nested dictionaries become nested associative arrays and "[ ]"
     * arrays become nested lists. A list whose only element is a
     * non-name string is collapsed back into the string "[...]". A name
     * token ("/Foo") outside of a "[ ]" array always starts a new key, so
     * a key whose value is itself a name is not stored. Closing tokens
     * without a matching opener are ignored.
     *
     * @param string $dictionary dictionary source, e.g. "<< /MCID 0 >>"
     *
     * @return array the parsed entries
     *
     * @throws \Exception when $dictionary does not start with "<<"
     */
    public function parseDictionary(string $dictionary): array
    {
        // Normalize whitespace
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

        if ('<<' !== substr($dictionary, 0, 2)) {
            throw new \Exception('Not a valid dictionary object.');
        }

        $parsed = [];
        $stack = [];
        $currentName = '';
        $arrayTypeNumeric = false;

        // Remove outer layer of dictionary, and split on tokens
        $split = preg_split(
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
            trim(preg_replace('/^<<|>>$/', '', $dictionary)),
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        foreach ($split as $token) {
            $token = trim($token);
            switch ($token) {
                case '':
                    break;

                    // Open numeric array
                case '[':
                    $parsed[$currentName] = [];
                    $arrayTypeNumeric = true;

                    // Move up one level in the stack
                    $stack[\count($stack)] = &$parsed;
                    $parsed = &$parsed[$currentName];
                    $currentName = '';
                    break;

                    // Open hashed array
                case '<<':
                    $parsed[$currentName] = [];
                    $arrayTypeNumeric = false;

                    // Move up one level in the stack
                    $stack[\count($stack)] = &$parsed;
                    $parsed = &$parsed[$currentName];
                    $currentName = '';
                    break;

                    // Close numeric array
                case ']':
                    // Revert string type arrays back to a single element
                    if (\is_array($parsed) && 1 === \count($parsed)
                        && isset($parsed[0]) && \is_string($parsed[0])
                        && '' !== $parsed[0] && '/' !== $parsed[0][0]) {
                        $parsed = '['.$parsed[0].']';
                    }
                    // Close hashed array
                    // no break
                case '>>':
                    $arrayTypeNumeric = false;

                    // A closer with nothing left to close would rebind
                    // $parsed to a null stack slot and break the array
                    // return type, so it is skipped.
                    if ([] === $stack) {
                        break;
                    }

                    // Move down one level in the stack
                    $parsed = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                    break;

                default:
                    // If value begins with a slash, then this is a name
                    // Add it to the appropriate array
                    if ('/' === substr($token, 0, 1)) {
                        // The cast keeps $currentName a string even where
                        // substr() yields false for a bare "/" (PHP < 8.0).
                        $currentName = (string) substr($token, 1);
                        if ($arrayTypeNumeric) {
                            $parsed[] = $currentName;
                            $currentName = '';
                        }
                    } elseif ('' !== $currentName) {
                        // A value for the pending key
                        if (!$arrayTypeNumeric) {
                            $parsed[$currentName] = $token;
                        }
                        $currentName = '';
                    } else {
                        // A bare value with no pending key
                        $parsed[] = $token;
                    }
            }
        }

        return $parsed;
    }
557 4
558
    /**
     * getText() leverages getTextArray() to get the content of the
     * document, setting the addPositionWhitespace flag to true so
     * whitespace is inserted in a logical way for reading by humans.
     *
     * The flag is set back to false afterwards, including when
     * getTextArray() throws.
     *
     * @param Page|null $page forwarded unchanged to getTextArray()
     *
     * @return string the text pieces concatenated, followed by one space
     *
     * @throws \Exception propagated from getTextArray()
     */
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;
        try {
            $result = $this->getTextArray($page);
        } finally {
            // Without this, an exception would leave the flag switched on
            // and change the output of later getTextArray() calls.
            $this->addPositionWhitespace = false;
        }

        return implode('', $result).' ';
    }
571
572
    /**
573
     * getTextArray() returns the text objects of a document in an
574
     * array. By default no positioning whitespace is added to the
575
     * output unless the addPositionWhitespace flag is set to true.
576 6
     *
577
     * @throws \Exception
578
     */
579 29
    public function getTextArray(Page $page = null): array
580
    {
581 29
        $result = [];
582
        $text = [];
583 29
584 29
        $marked_stack = [];
585 29
        $last_written_position = false;
586
587 29
        $sections = $this->getSectionsText($this->content);
588 29
        $current_font = $this->getDefaultFont($page);
589 29
        $current_font_size = 1;
590
        $current_text_leading = 0;
591 29
592 29
        $current_position = ['x' => false, 'y' => false];
593 29
        $current_position_tm = [
594 29
            'a' => 1, 'b' => 0, 'c' => 0,
595 29
            'i' => 0, 'j' => 1, 'k' => 0,
596 29
            'x' => 0, 'y' => 0, 'z' => 1,
597
        ];
598
        $current_position_td = ['x' => 0, 'y' => 0];
599
        $current_position_cm = [
600 29
            'a' => 1, 'b' => 0, 'c' => 0,
601 29
            'i' => 0, 'j' => 1, 'k' => 0,
602 29
            'x' => 0, 'y' => 0, 'z' => 1,
603 11
        ];
604 11
605 11
        $clipped_font = [];
606
        $clipped_position_cm = [];
607
608
        self::$recursionStack[] = $this->getUniqueId();
609 11
610 11
        foreach ($sections as $section) {
611 11
            $commands = $this->getCommandsText($section);
612
            foreach ($commands as $command) {
613 29
                switch ($command[self::OPERATOR]) {
614
                    // Begin text object
615 29
                    case 'BT':
616 29
                        // Reset text positioning matrices
617
                        $current_position_tm = [
618 25
                            'a' => 1, 'b' => 0, 'c' => 0,
619 25
                            'i' => 0, 'j' => 1, 'k' => 0,
620 25
                            'x' => 0, 'y' => 0, 'z' => 1,
621
                        ];
622 25
                        $current_position_td = ['x' => 0, 'y' => 0];
623
                        $current_text_leading = 0;
624 25
                        break;
625 25
626 25
                        // Begin marked content sequence with property list
627
                    case 'BDC':
628
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
0 ignored issues
show
Bug introduced by
It seems like $command[self::COMMAND] can also be of type array and array<mixed,array<string,mixed|string>>; however, parameter $subject of preg_match() does only seem to accept string, maybe add an additional type check? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

628
                        if (preg_match('/(<<.*>>)$/', /** @scrutinizer ignore-type */ $command[self::COMMAND], $match)) {
Loading history...
629 25
                            $dict = $this->parseDictionary($match[1]);
630 25
631
                            // Check for ActualText block
632 25
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
633
                                if ('[' == $dict['ActualText'][0]) {
634 29
                                    // Simulate a 'TJ' command on the stack
635 29
                                    $marked_stack[] = [
636
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
637 14
                                    ];
638 14
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
639 14
                                    // Simulate a 'Tj' command on the stack
640 14
                                    $marked_stack[] = [
641 14
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
642 14
                                    ];
643
                                }
644
                            }
645 14
                        }
646 9
                        break;
647 9
648
                        // Begin marked content sequence
649 14
                    case 'BMC':
650
                        if ('ReversedChars' == $command[self::COMMAND]) {
651 29
                            // Upon encountering a ReversedChars command,
652 29
                            // add the characters we've built up so far to
653 22
                            // the result array
654 22
                            $result = array_merge($result, $text);
655 22
656 22
                            // Start a fresh $text array that will contain
657 22
                            // reversed characters
658 22
                            $text = [];
659 22
660
                            // Add the reversed text flag to the stack
661
                            $marked_stack[] = ['ReversedChars' => true];
662 22
                        }
663 22
                        break;
664 22
665
                        // set graphics position matrix
666
                    case 'cm':
667 16
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
0 ignored issues
show
Bug introduced by
It seems like $command[self::COMMAND] can also be of type array and array<mixed,array<string,mixed|string>>; however, parameter $subject of preg_split() does only seem to accept string, maybe add an additional type check? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

667
                        $args = preg_split('/\s+/s', /** @scrutinizer ignore-type */ $command[self::COMMAND]);
Loading history...
668 16
                        $current_position_cm = [
669
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
670 22
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
671
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
672
                        ];
673
                        break;
674
675 22
                    case 'Do':
676
                        if (null !== $page) {
677 22
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
678 22
                            $id = trim(array_pop($args), '/ ');
679
                            $xobject = $page->getXObject($id);
680 22
681
                            // @todo $xobject could be a ElementXRef object, which would then throw an error
682 22
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack)) {
683 22
                                // Not a circular reference.
684
                                $text[] = $xobject->getText($page);
685 22
                            }
686 18
                        }
687 18
                        break;
688
689
                        // Marked content point with (DP) & without (MP) property list
690 22
                    case 'DP':
691
                    case 'MP':
692
                        break;
693 29
694 1
                        // End text object
695 29
                    case 'ET':
696 29
                        break;
697 29
698
                        // Store current selected font and graphics matrix
699
                    case 'q':
700
                        $clipped_font[] = [$current_font, $current_font_size];
701 29
                        $clipped_position_cm[] = $current_position_cm;
702 29
                        break;
703 29
704 24
                        // Restore previous selected font and graphics matrix
705 22
                    case 'Q':
706 22
                        list($current_font, $current_font_size) = array_pop($clipped_font);
707 22
                        $current_position_cm = array_pop($clipped_position_cm);
708 17
                        break;
709 17
710 17
                        // End marked content sequence
711 17
                    case 'EMC':
712 17
                        $data = false;
713
                        if (\count($marked_stack)) {
714
                            $marked = array_pop($marked_stack);
715
                            $action = key($marked);
716 29
                            $data = $marked[$action];
717 29
718 29
                            switch ($action) {
719 29
                                // If we are in ReversedChars mode...
720 29
                                case 'ReversedChars':
721
                                    // Reverse the characters we've built up so far
722
                                    foreach ($text as $key => $t) {
723 25
                                        $text[$key] = implode('', array_reverse(
724
                                            mb_str_split($t, 1, mb_internal_encoding())
0 ignored issues
show
Bug introduced by
It seems like mb_internal_encoding() can also be of type true; however, parameter $encoding of mb_str_split() does only seem to accept null|string, maybe add an additional type check? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

724
                                            mb_str_split($t, 1, /** @scrutinizer ignore-type */ mb_internal_encoding())
Loading history...
725
                                        ));
726
                                    }
727 29
728
                                    // Add these characters to the result array
729
                                    $result = array_merge($result, $text);
730 42
731
                                    // Start a fresh $text array that will contain
732
                                    // non-reversed characters
733
                                    $text = [];
734
                                    break;
735
736 42
                                case 'ActualText':
737 42
                                    // Use the content of the ActualText as a command
738 8
                                    $command = $data;
739 8
                                    break;
740 3
                            }
741
                        }
742 6
743 6
                        // If this EMC command has been transformed into a 'Tj'
744
                        // or 'TJ' command because of being ActualText, then bypass
745
                        // the break to proceed to the writing section below.
746
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
747
                            break;
748 42
                        }
749 41
750
                        // no break
751 42
                    case "'":
752 41
                    case '"':
753
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
754 42
                            // Move to next line and write text
755 6
                            $current_position['x'] = 0;
756
                            $current_position_td['x'] = 0;
757 42
                            $current_position_td['y'] += $current_text_leading;
758 41
                        }
759 41
                        // no break
760
                    case 'Tj':
761 41
                        $command[self::COMMAND] = [$command];
762 41
                        // no break
763
                    case 'TJ':
764
                        // Check the marked content stack for flags
765
                        $actual_text = false;
766
                        $reverse_text = false;
767
                        foreach ($marked_stack as $marked) {
768 42
                            if (isset($marked['ActualText'])) {
769
                                $actual_text = true;
770
                            }
771
                            if (isset($marked['ReversedChars'])) {
772
                                $reverse_text = true;
773
                            }
774
                        }
775 20
776
                        // Account for text position ONLY just before we write text
777 20
                        if (false === $actual_text && \is_array($last_written_position)) {
778
                            // If $last_written_position is an array, that
779
                            // means we have stored text position coordinates
780
                            // for placing an ActualText
781
                            $currentX = $last_written_position[0];
782
                            $currentY = $last_written_position[1];
783
                            $last_written_position = false;
784
                        } else {
785
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
786
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
787
                        }
788
                        $whiteSpace = '';
789
790
                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
791
                            $curY = $currentY - $current_position['y'];
792
                            $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j'];
793
                            if (abs($curY) >= abs($factorY)) {
794
                                $whiteSpace = "\n";
795
                            } else {
796
                                if (true === $reverse_text) {
797
                                    $curX = $current_position['x'] - $currentX;
798
                                } else {
799
                                    $curX = $currentX - $current_position['x'];
800
                                }
801
                                $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i'];
802
803
                                // In abs($factorX * 7) below, the 7 is chosen arbitrarily
804
                                // as the number of apparent "spaces" in a document we
805
                                // would need before considering them a "tab". In the
806
                                // future, we might offer this value to users as a config
807
                                // option.
808
                                if ($curX >= abs($factorX * 7)) {
809
                                    $whiteSpace = "\t";
810
                                } elseif ($curX >= abs($factorX * 2)) {
811
                                    $whiteSpace = ' ';
812
                                }
813
                            }
814
                        }
815
816
                        $newtext = $this->getTJUsingFontFallback(
817
                            $current_font,
818
                            $command[self::COMMAND],
819
                            $page
820
                        );
821
822
                        // If there is no ActualText pending then write
823
                        if (false === $actual_text) {
824
                            $newtext = str_replace(["\r", "\n"], '', $newtext);
825
                            if (false !== $reverse_text) {
826
                                // If we are in ReversedChars mode, add the whitespace last
827
                                $text[] = preg_replace('/  $/', ' ', $newtext.$whiteSpace);
828
                            } else {
829
                                // Otherwise add the whitespace first
830
                                if (' ' === $whiteSpace && isset($text[\count($text) - 1])) {
831
                                    $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]);
832
                                }
833
                                $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext);
834
                            }
835
836
                            // Record the position of this inserted text for comparison
837
                            // with the next text block.
838
                            // Provide a 'fudge' factor guess on how wide this text block
839
                            // is based on the number of characters. This helps limit the
840
                            // number of tabs inserted, but isn't perfect.
841
                            $factor = $current_font_size / 2;
842
                            $current_position = [
843
                                'x' => $currentX + mb_strlen($newtext) * $factor,
844
                                'y' => $currentY,
845
                            ];
846
                        } elseif (false === $last_written_position) {
847
                            // If there is an ActualText in the pipeline
848
                            // store the position this undisplayed text
849
                            // *would* have been written to, so the
850
                            // ActualText is displayed in the right spot
851
                            $last_written_position = [$currentX, $currentY];
852
                            $current_position['x'] = $currentX;
853
                        }
854
                        break;
855
856
                        // move to start of next line
857
                    case 'T*':
858
                        $current_position['x'] = 0;
859
                        $current_position_td['x'] = 0;
860
                        $current_position_td['y'] += $current_text_leading;
861
                        break;
862
863
                        // set character spacing
864
                    case 'Tc':
865
                        break;
866
867
                        // move text current point and set leading
868
                    case 'Td':
869
                    case 'TD':
870
                        // move text current point
871
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
872
                        $y = (float) array_pop($args);
873
                        $x = (float) array_pop($args);
874
875
                        if ('TD' == $command[self::OPERATOR]) {
876
                            $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j'];
877
                        }
878
879
                        $current_position_td = [
880
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
881
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
882
                        ];
883
                        break;
884
885
                    case 'Tf':
886
                        $args = preg_split('/\s/s', $command[self::COMMAND]);
887
                        $size = (float) array_pop($args);
888
                        $id = trim(array_pop($args), '/');
889
                        if (null !== $page) {
890
                            $new_font = $page->getFont($id);
891
                            // If an invalid font ID is given, do not update the font.
892
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
893
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
894
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
895
                            // But we want to make sure that malformed PDFs do not simply crash.
896
                            if (null !== $new_font) {
897
                                $current_font = $new_font;
898
                                $current_font_size = $size;
899
                            }
900
                        }
901
                        break;
902
903
                        // set leading
904
                    case 'TL':
905
                        $y = (float) $command[self::COMMAND];
906
                        $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j'];
907
                        break;
908
909
                        // set text position matrix
910
                    case 'Tm':
911
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
912
                        $current_position_tm = [
913
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
914
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
915
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
916
                        ];
917
                        break;
918
919
                        // set text rendering mode
920
                    case 'Ts':
921
                        break;
922
923
                        // set super/subscripting text rise
924
                    case 'Ts':
925
                        break;
926
927
                        // set word spacing
928
                    case 'Tw':
929
                        break;
930
931
                        // set horizontal scaling
932
                    case 'Tz':
933
                        break;
934
935
                    default:
936
                }
937
            }
938
        }
939
940
        $result = array_merge($result, $text);
941
942
        return $result;
943
    }
944
945
    /**
     * Parses one already formatted, single-line command from a document
     * stream into its operand type, operator and operand(s).
     *
     * The companion function getSectionsText() returns a document
     * stream as an array of single commands for just this purpose.
     *
     * A better name for this function would be getCommandText()
     * since it now always works on just one command.
     *
     * @param string $text_part a single command: operand(s) followed by the operator
     *
     * @return array a list holding one entry keyed by self::TYPE, self::OPERATOR
     *               and self::COMMAND, or an empty list when $text_part does not
     *               end in something that looks like an operator. For a 'TJ'
     *               operator the self::COMMAND value is itself a list of
     *               string entries and numeric offset entries.
     */
    public function getCommandsText(string $text_part): array
    {
        $matches = [];

        // Group 1: everything before the operator (the operands),
        // group 2: optional leading delimiter, used as the operand type,
        // group 3: the trailing operator.
        $found = preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $matches);

        // Without a match none of the groups exist; bail out instead of
        // reading undefined offsets and passing null to trim().
        if (1 !== $found) {
            return [];
        }

        $type = $matches[2];
        $operator = $matches[3];
        $command = trim($matches[1]);

        if ('TJ' === $operator) {
            // A TJ operand is an array mixing strings and numeric offsets.
            // Consume it from the left, one string (plus optional trailing
            // number) at a time, until a full pass consumes nothing more.
            $formats = [
                // Parentheses string () format
                '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
                // Hexadecimal <> format
                '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
            ];

            $subcommand = [];
            $command = trim($command, '[]');
            do {
                $oldCommand = $command;

                foreach ($formats as $delimiter => $pattern) {
                    if (!preg_match($pattern, $command, $tjmatch)) {
                        continue;
                    }

                    // Whitespace inside a hexadecimal string is not data
                    $string = '<' === $delimiter
                        ? preg_replace('/\s/', '', $tjmatch[1])
                        : $tjmatch[1];

                    $subcommand[] = [
                        self::TYPE => $delimiter,
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $string,
                    ];

                    // Optional numeric offset following the string
                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
                        $subcommand[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $tjmatch[2],
                        ];
                    }

                    $command = substr($command, \strlen($tjmatch[0]));
                }
            } while ($command !== $oldCommand);

            $command = $subcommand;
        } elseif ('Tj' === $operator || "'" === $operator || '"' === $operator) {
            // Depending on the string type, trim the data of the
            // appropriate delimiters
            if ('(' === $type) {
                // Don't use trim() here since a () string may end with
                // a balanced or escaped right parentheses, and trim()
                // will delete both. Both strings below are valid:
                //   eg. (String())
                //   eg. (String\))
                $command = preg_replace('/^\(|\)$/', '', $command);
            } elseif ('<' === $type) {
                $command = trim($command, '<>');
            }
        } elseif ('/' === $type) {
            // Drop the leading slash of a name operand
            $command = substr($command, 1);
        }

        return [
            [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $command,
            ],
        ];
    }
1032
1033
    /**
     * Instantiates the class matching the 'Type' (and, for XObject and Font,
     * the 'Subtype') declared in the given header.
     *
     * XObject images and forms, Pages, Page, Encoding and Font headers get
     * their dedicated classes; a Font subtype is resolved to
     * \Smalot\PdfParser\Font\Font<Subtype> when such a class exists. Anything
     * else becomes a plain instance of this class.
     *
     * @param Document    $document document the object belongs to
     * @param Header      $header   header whose Type/Subtype select the class
     * @param string|null $content  raw content handed to the created object
     * @param Config|null $config   optional configuration; when omitted, image
     *                              content is kept because no setting asks for
     *                              it to be discarded
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is optional, so it must not be dereferenced
                        // blindly: only drop the image content when a config
                        // is present and says not to retain it.
                        $retainContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                // XObject with any other subtype
                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                // No dedicated class for this subtype: use the generic font
                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
1074
1075
    /**
     * Gives the identifier of this object instance, as computed by
     * spl_object_hash().
     */
    protected function getUniqueId(): string
    {
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
1082
}
1083