Test Failed
Push — master ( 52c4f6...feaf39 )
by Konrad
07:50
created

PDFObject::has()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
ccs 2
cts 2
cp 1
cc 1
nc 1
nop 1
crap 1
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <sebastien@malot.fr>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <sebastien@malot.fr>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\XObject\Form;
36
use Smalot\PdfParser\XObject\Image;
37
38
/**
39
 * Class PDFObject
40
 */
41
class PDFObject
42
{
43
    // Single-character key 't'; its use is not visible in this part of the file.
    public const TYPE = 't';
44
45
    // Key under which a parsed command entry stores its PDF operator
    // (getTextArray() switches on $command[self::OPERATOR]).
    public const OPERATOR = 'o';
46
47
    // Key under which a parsed command entry stores the operator's argument
    // text (getTextArray() reads $command[self::COMMAND]).
    public const COMMAND = 'c';
48
49
    /**
50
     * The recursion stack.
51
     *
     * getTextArray() appends this object's unique ID here, and the 'Do'
     * handler skips any XObject whose ID is already present so circular
     * references are not followed.
     *
52
     * @var array
53
     */
54
    public static $recursionStack = [];
55
56
    /**
     * The document passed to the constructor.
     *
57
     * @var Document
58
     */
59
    protected $document;
60
61
    /**
     * The header passed to the constructor, or a new Header when none was given.
     *
62
     * @var Header
63
     */
64
    protected $header;
65
66
    /**
     * The content passed to the constructor; null when none was given.
     *
67
     * @var string|null
68
     */
69
    protected $content;
70
71
    /**
     * The configuration passed to the constructor; null when none was given.
     *
72
     * @var Config|null
73
     */
74
    protected $config;
75
76 62
    /**
     * Set to true by getText() for the duration of its getTextArray() call.
     *
77
     * @var bool
78
     */
79
    protected $addPositionWhitespace = false;
80
81
    /**
     * Stores the owning document together with the optional header, content
     * and configuration of this object.
     *
     * The nullable parameters are declared explicitly (?Type) because relying
     * on a null default to make a typed parameter nullable is deprecated as
     * of PHP 8.4.
     *
     * @param Document    $document the document this object belongs to
     * @param Header|null $header   the object's header; a new Header() is created when null
     * @param string|null $content  the object's content, or null
     * @param Config|null $config   the parser configuration, or null
     */
    public function __construct(
        Document $document,
        ?Header $header = null,
        ?string $content = null,
        ?Config $config = null
    ) {
        $this->document = $document;
        $this->header = $header ?? new Header();
        $this->content = $content;
        $this->config = $config;
    }
92 3
93
    /**
     * Initialization step; intentionally empty in this class.
     */
    public function init()
    {
        // Nothing to initialize here.
    }
96
97 49
    /**
     * Returns the document that was passed to the constructor.
     */
    public function getDocument(): Document
    {
        $owner = $this->document;

        return $owner;
    }
101
102 3
    /**
     * Returns the header held by this object.
     */
    public function getHeader(): ?Header
    {
        $current = $this->header;

        return $current;
    }
106
107
    /**
     * Returns the configuration given at construction time, or null when
     * none was supplied.
     */
    public function getConfig(): ?Config
    {
        $settings = $this->config;

        return $settings;
    }
111
112 50
    /**
     * Looks up $name in this object's header and returns whatever the
     * header yields for it.
     *
     * @return Element|PDFObject|Header
     */
    public function get(string $name)
    {
        $header = $this->header;

        return $header->get($name);
    }
119
120 3
    /**
     * Asks this object's header whether it has an entry called $name.
     */
    public function has(string $name): bool
    {
        $header = $this->header;

        return $header->has($name);
    }
124
125 38
    /**
     * Returns the details reported by this object's header.
     *
     * @param bool $deep forwarded unchanged to the header's getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        $details = $this->header->getDetails($deep);

        return $details;
    }
129
130 32
    /**
     * Returns the content given at construction time, or null when the
     * object was created without content.
     */
    public function getContent(): ?string
    {
        $raw = $this->content;

        return $raw;
    }
134
135
    /**
     * Creates a duplicate of the document stream with
     * strings and other items replaced by $char. Formerly
     * getSectionsText() used this output to more easily gather offset
     * values to extract text from the *actual* document stream.
     *
     * Every replacement has the same byte length as the text it masks, so
     * byte offsets in the result line up with the input.
     *
     * @deprecated function is no longer used and will be removed in a future release
     *
     * @internal
     *
     * @param string $content the document stream to mask
     * @param string $char    masking character; only its first byte is used,
     *                        and 'X' is used when an empty string is given
     *
     * @return string the masked copy of $content
     */
    public function cleanContent(string $content, string $char = 'X')
    {
        // An empty mask would make every replacement shorter than the text it
        // covers, which shifts the captured offsets used below.
        $char = '' === $char ? 'X' : $char[0];

        // Overwrites each captured [text, offset] pair with an equally long
        // run of the mask character.
        $mask = static function (string $subject, array $captures) use ($char): string {
            foreach ($captures as [$matched, $offset]) {
                $length = \strlen($matched);
                $subject = substr_replace($subject, str_repeat($char, $length), $offset, $length);
            }

            return $subject;
        };

        // Escaped backslashes and parentheses are two bytes each
        $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

        // Remove image bloc with binary content
        preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[0]);

        // Clean content in square brackets [.....]
        preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1]);

        // Clean content in round brackets (.....)
        preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1]);

        // Clean structure: everything between '<' and its matching '>'
        // (delimiters included) is masked
        if ($parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE)) {
            $content = '';
            $level = 0;
            foreach ($parts as $part) {
                if ('<' == $part) {
                    ++$level;
                }

                $content .= (0 == $level ? $part : str_repeat($char, \strlen($part)));

                if ('>' == $part) {
                    --$level;
                }
            }
        }

        // Clean BDC and EMC markup. The delimiter is passed to preg_quote() so
        // that a '/' mask character cannot terminate the pattern early.
        preg_match_all(
            '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
            $content,
            $matches,
            \PREG_OFFSET_CAPTURE
        );
        $content = $mask($content, $matches[1]);

        preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
        $content = $mask($content, $matches[1]);

        return $content;
    }
203 29
204
    /**
     * Takes a string of PDF document stream text and formats
     * it into a multi-line string with one PDF command on each line,
     * separated by \r\n. If the given string is null, or binary data
     * is detected instead of a document stream then return an empty
     * string.
     *
     * Strings "( )" and dictionaries "<< >>" are temporarily swapped for
     * unique placeholders so the operator splitting cannot touch them. The
     * swap is done with a literal first-occurrence replacement rather than a
     * regex built from the matched text: a very long string or dictionary
     * would otherwise exceed PCRE's pattern size limit, make preg_replace()
     * return null and discard the whole stream.
     */
    private function formatContent(?string $content): string
    {
        if (null === $content) {
            return '';
        }

        // Replaces the first literal occurrence of $needle in $haystack
        $replaceFirst = static function (string $haystack, string $needle, string $replacement): string {
            $offset = strpos($haystack, $needle);
            if (false === $offset) {
                return $haystack;
            }

            return substr_replace($haystack, $replacement, $offset, \strlen($needle));
        };

        // Find all strings () and replace them so they aren't affected
        // by the next steps
        $pdfstrings = [];
        $attempt = '(';
        while (preg_match('/'.preg_quote($attempt, '/').'.*?(?<![^\\\\]\\\\)\)/s', $content, $text)) {
            // PDF strings can contain unescaped parentheses as long as
            // they're balanced, so check for balanced parentheses
            $left = preg_match_all('/(?<![^\\\\]\\\\)\(/', $text[0]);
            $right = preg_match_all('/(?<![^\\\\]\\\\)\)/', $text[0]);

            if ($left == $right) {
                // Replace the string with a unique placeholder
                $id = uniqid('STRING_', true);
                $pdfstrings[$id] = $text[0];
                $content = $replaceFirst($content, $text[0], '@@@'.$id.'@@@');

                // Reset to search for the next string
                $attempt = '(';
            } else {
                // We had unbalanced parentheses, so use the current
                // match as a base to find a longer string
                $attempt = $text[0];
            }
        }

        // Remove all carriage returns and line-feeds from the document stream
        $content = str_replace(["\r", "\n"], ' ', trim($content));

        // Find all dictionary << >> commands and replace them so they
        // aren't affected by the next steps
        $dictstore = [];
        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $dicttext)) {
            $dictid = uniqid('DICT_', true);
            $dictstore[$dictid] = $dicttext[1];
            $content = $replaceFirst($content, $dicttext[0], ' ###'.$dictid.'###'.$dicttext[2]);
        }

        // Now that all strings and dictionaries are hidden, the only
        // PDF commands left should all be plain text.
        // Detect text encoding of the current string to prevent reading
        // content streams that are images, etc. This prevents PHP
        // error messages when JPEG content is sent to this function
        // by the sample file '12249.pdf' from:
        // https://github.com/smalot/pdfparser/issues/458
        if (false === mb_detect_encoding($content, null, true)) {
            return '';
        }

        // Normalize white-space in the document stream
        $content = preg_replace('/\s{2,}/', ' ', $content);

        // Find all valid PDF operators and add \r\n after each; this
        // ensures there is just one command on every line
        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
        // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
        //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions
        //       appear here in the list for completeness.
        $operators = [
          'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
          'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
          'g', 'G',  'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
          'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
          'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
          'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
        ];
        foreach ($operators as $operator) {
            $content = preg_replace(
                '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/',
                $operator."\r\n",
                $content
            );
        }

        // Restore the original content of the dictionary << >> commands
        $dictstore = array_reverse($dictstore, true);
        foreach ($dictstore as $id => $dict) {
            $content = str_replace('###'.$id.'###', $dict, $content);
        }

        // Restore the original string content
        $pdfstrings = array_reverse($pdfstrings, true);
        foreach ($pdfstrings as $id => $text) {
            // Strings may contain escaped newlines, or literal newlines
            // and we should clean these up before replacing the string
            // back into the content stream; this ensures no strings are
            // split between two lines (every command must be on one line)
            $text = str_replace(
                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
                ['', '', '', '\r', '\n'],
                $text
            );

            $content = str_replace('@@@'.$id.'@@@', $text, $content);
        }

        $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));

        return $content;
    }
327 18
328
    /**
     * Formats an entire, unformatted document stream with formatContent()
     * and keeps only the commands needed for text positioning and
     * extraction. The result holds one unprocessed PDF command per element.
     *
     * Everything from a line ending in 'BT' up to and including the 'ET'
     * line is kept. Outside such a block a line is kept only when it ends in
     * 'q' or 'Q' or matches one of the patterns listed below.
     *
     * @internal
     */
    public function getSectionsText(?string $content): array
    {
        // Commands kept outside of a BT ... ET block. Each pattern is anchored
        // so that it only matches the command itself (a bare 'c' test would
        // also hit 'sc', for instance). Add more here as weird PDFs require.
        $keepOutsideText = [
            '/(?<!\w)B[DM]C$/', // begin marked content sequence
            '/(?<!\w)[DM]P$/',  // marked content point
            '/(?<!\w)EMC$/',    // end marked content sequence
            '/(?<!\w)cm$/',     // graphics position change
            '/(?<!\w)Tf$/',     // font change
            '/(?<!\w)Do$/',     // invoke named XObject
        ];

        // The formatted stream has one command per line
        $lines = preg_split(
            '/(\r\n|\n|\r)/',
            $this->formatContent($content),
            -1,
            \PREG_SPLIT_NO_EMPTY
        );

        $kept = [];
        $insideText = false;
        foreach ($lines as $rawLine) {
            $command = trim($rawLine);
            if ('' === $command) {
                continue;
            }

            // 'BT' opens a text block
            if (preg_match('/BT$/', $command)) {
                $insideText = true;
                $kept[] = $command;
                continue;
            }

            // 'ET' closes it
            if ('ET' === $command) {
                $insideText = false;
                $kept[] = $command;
                continue;
            }

            // Inside a text block every line is kept
            if ($insideText) {
                $kept[] = $command;
                continue;
            }

            // Save / restore graphics state
            if (\in_array($command[-1], ['q', 'Q'], true)) {
                $kept[] = $command;
                continue;
            }

            foreach ($keepOutsideText as $pattern) {
                if (preg_match($pattern, $command)) {
                    $kept[] = $command;
                    break;
                }
            }
        }

        return $kept;
    }
407 2
408
    /**
     * Picks the font to start text extraction with.
     *
     * Candidates are the fonts of $page (when a page is given) followed by
     * the document's first font; the first candidate is returned. When there
     * are no candidates a new Font is created for this document.
     *
     * The parameter is declared as ?Page because implicit nullability via a
     * null default is deprecated as of PHP 8.4.
     */
    private function getDefaultFont(?Page $page = null): Font
    {
        $fonts = null !== $page ? $page->getFonts() : [];

        $firstFont = $this->document->getFirstFont();
        if (null !== $firstFont) {
            $fonts[] = $firstFont;
        }

        if (\count($fonts) > 0) {
            return reset($fonts);
        }

        return new Font($this->document, null, null, $this->config);
    }
426
427
    /**
     * Decode a '[]TJ' command and attempt to use alternate
     * fonts if the current font results in output that contains
     * Unicode control characters.
     *
     * Without a page there are no alternate fonts to try, so the text
     * decoded with $font is returned as is. If every font of the page has
     * been tried without success, the text decoded with $font is returned.
     *
     * @internal
     *
     * @param array<int,array<string,string|bool>> $command
     */
    private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string
    {
        $orig_text = $font->decodeText($command, $fontFactor);

        // If we make this a Config option, we can add a check if it's
        // enabled here.
        if (null === $page) {
            return $orig_text;
        }

        $text = $orig_text;
        $font_ids = array_keys($page->getFonts());

        // If the decoded text contains control characters then the font
        // being used is probably the wrong one. Loop through the rest of
        // the fonts to see if we can get a good decode. Allow x09 to x0d
        // which are whitespace.
        //
        // The second test looks for a raw NUL byte directly; it still fires
        // when the /u pattern cannot run because $text is not valid UTF-8.
        // Searching the hex dump for '00' instead would also match across
        // byte boundaries (e.g. "\x20\x0a" dumps as "200a").
        while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
            // If we're out of font IDs, then give up and use the
            // original string
            if (0 == \count($font_ids)) {
                return $orig_text;
            }

            // Try the next font ID
            $font = $page->getFont(array_shift($font_ids));
            $text = $font->decodeText($command, $fontFactor);
        }

        return $text;
    }
465 6
466 6
    /**
     * Parses a complete PDF dictionary object, given with its enclosing
     * << >> angle brackets, into a nested PHP array.
     *
     * Nested "<< >>" dictionaries become arrays keyed by name and "[ ]"
     * arrays become lists. A list whose only element is a non-empty string
     * that does not start with '/' is collapsed back into the string
     * '[' . element . ']'.
     *
     * @internal
     *
     * @throws \Exception when $dictionary does not start with '<<'
     */
    public function parseDictionary(string $dictionary): array
    {
        // Collapse line breaks and whitespace runs into single spaces
        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

        if (0 !== strpos($dictionary, '<<')) {
            throw new \Exception('Not a valid dictionary object.');
        }

        $node = [];        // container currently being filled
        $parents = [];     // references to the enclosing containers
        $pendingKey = '';  // name waiting for its value
        $inList = false;   // true while directly inside a [ ] array

        // Strip the outer << >> and split into tokens, keeping delimiters
        $tokens = preg_split(
            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
            trim(preg_replace('/^<<|>>$/', '', $dictionary)),
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        foreach ($tokens as $rawToken) {
            $token = trim($rawToken);
            switch ($token) {
                case '':
                    break;

                    // Open a list ('[') or a nested dictionary ('<<')
                case '[':
                case '<<':
                    $node[$pendingKey] = [];
                    $inList = '[' === $token;

                    // Remember the enclosing container and descend. The index
                    // is computed from count() because entries are unset when
                    // a container is closed.
                    $parents[\count($parents)] = &$node;
                    $node = &$node[$pendingKey];
                    $pendingKey = '';
                    break;

                    // Close a list
                case ']':
                    // Revert string type arrays back to a single element
                    if (\is_array($node) && 1 == \count($node)
                        && isset($node[0]) && \is_string($node[0])
                        && '' !== $node[0] && '/' != $node[0][0]) {
                        $node = '['.$node[0].']';
                    }
                    // Closing a list continues exactly like closing a dictionary
                    // no break
                case '>>':
                    $inList = false;

                    // Return to the enclosing container
                    $node = &$parents[\count($parents) - 1];
                    unset($parents[\count($parents) - 1]);
                    break;

                default:
                    if ('/' == substr($token, 0, 1)) {
                        // A name: inside a list it is an element, otherwise
                        // it is the key for the next value
                        $pendingKey = substr($token, 1);
                        if ($inList) {
                            $node[] = $pendingKey;
                            $pendingKey = '';
                        }
                    } elseif ('' != $pendingKey) {
                        // A value for the pending key
                        if (!$inList) {
                            $node[$pendingKey] = $token;
                        }
                        $pendingKey = '';
                    } else {
                        // A value without a key is appended
                        $node[] = $token;
                    }
            }
        }

        return $node;
    }
564 2
565
    /**
     * Returns the text content of a PDF as a string. Attempts to add
     * whitespace for spacing and line-breaks where appropriate.
     *
     * getText() leverages getTextArray() to get the content
     * of the document, setting the addPositionWhitespace flag to true
     * so whitespace is inserted in a logical way for reading by
     * humans. The flag is reset in a finally block so it does not stay
     * enabled when getTextArray() throws.
     *
     * @param Page|null $page forwarded unchanged to getTextArray()
     *
     * @return string the concatenated text followed by a single space
     *
     * @throws \Exception propagated from getTextArray()
     */
    public function getText(?Page $page = null): string
    {
        $this->addPositionWhitespace = true;
        try {
            $result = $this->getTextArray($page);
        } finally {
            $this->addPositionWhitespace = false;
        }

        return implode('', $result).' ';
    }
582
583 29
    /**
584 29
     * Returns the text content of a PDF as an array of strings. No
585 29
     * extra whitespace is inserted besides what is actually encoded in
586
     * the PDF text.
587 29
     *
588 29
     * @throws \Exception
589 29
     */
590
    public function getTextArray(Page $page = null): array
591 29
    {
592 29
        $result = [];
593 29
        $text = [];
594 29
595 29
        $marked_stack = [];
596 29
        $last_written_position = false;
597
598
        $sections = $this->getSectionsText($this->content);
599
        $current_font = $this->getDefaultFont($page);
600 29
        $current_font_size = 1;
601 29
        $current_text_leading = 0;
602 29
603 11
        $current_position = ['x' => false, 'y' => false];
604 11
        $current_position_tm = [
605 11
            'a' => 1, 'b' => 0, 'c' => 0,
606
            'i' => 0, 'j' => 1, 'k' => 0,
607
            'x' => 0, 'y' => 0, 'z' => 1,
608
        ];
609 11
        $current_position_td = ['x' => 0, 'y' => 0];
610 11
        $current_position_cm = [
611 11
            'a' => 1, 'b' => 0, 'c' => 0,
612
            'i' => 0, 'j' => 1, 'k' => 0,
613 29
            'x' => 0, 'y' => 0, 'z' => 1,
614
        ];
615 29
616 29
        $clipped_font = [];
617
        $clipped_position_cm = [];
618 25
619 25
        self::$recursionStack[] = $this->getUniqueId();
620 25
621
        foreach ($sections as $section) {
622 25
            $commands = $this->getCommandsText($section);
623
            foreach ($commands as $command) {
624 25
                switch ($command[self::OPERATOR]) {
625 25
                    // Begin text object
626 25
                    case 'BT':
627
                        // Reset text positioning matrices
628
                        $current_position_tm = [
629 25
                            'a' => 1, 'b' => 0, 'c' => 0,
630 25
                            'i' => 0, 'j' => 1, 'k' => 0,
631
                            'x' => 0, 'y' => 0, 'z' => 1,
632 25
                        ];
633
                        $current_position_td = ['x' => 0, 'y' => 0];
634 29
                        $current_text_leading = 0;
635 29
                        break;
636
637 14
                        // Begin marked content sequence with property list
638 14
                    case 'BDC':
639 14
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
640 14
                            $dict = $this->parseDictionary($match[1]);
641 14
642 14
                            // Check for ActualText block
643
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
644
                                if ('[' == $dict['ActualText'][0]) {
645 14
                                    // Simulate a 'TJ' command on the stack
646 9
                                    $marked_stack[] = [
647 9
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
648
                                    ];
649 14
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
650
                                    // Simulate a 'Tj' command on the stack
651 29
                                    $marked_stack[] = [
652 29
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
653 22
                                    ];
654 22
                                }
655 22
                            }
656 22
                        }
657 22
                        break;
658 22
659 22
                        // Begin marked content sequence
660
                    case 'BMC':
661
                        if ('ReversedChars' == $command[self::COMMAND]) {
662 22
                            // Upon encountering a ReversedChars command,
663 22
                            // add the characters we've built up so far to
664 22
                            // the result array
665
                            $result = array_merge($result, $text);
666
667 16
                            // Start a fresh $text array that will contain
668 16
                            // reversed characters
669
                            $text = [];
670 22
671
                            // Add the reversed text flag to the stack
672
                            $marked_stack[] = ['ReversedChars' => true];
673
                        }
674
                        break;
675 22
676
                        // set graphics position matrix
677 22
                    case 'cm':
678 22
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
679
                        $current_position_cm = [
680 22
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
681
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
682 22
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
683 22
                        ];
684
                        break;
685 22
686 18
                    case 'Do':
687 18
                        if (null !== $page) {
688
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
689
                            $id = trim(array_pop($args), '/ ');
690 22
                            $xobject = $page->getXObject($id);
691
692
                            // @todo $xobject could be a ElementXRef object, which would then throw an error
693 29
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack)) {
694 1
                                // Not a circular reference.
695 29
                                $text[] = $xobject->getText($page);
696 29
                            }
697 29
                        }
698
                        break;
699
700
                        // Marked content point with (DP) & without (MP) property list
701 29
                    case 'DP':
702 29
                    case 'MP':
703 29
                        break;
704 24
705 22
                        // End text object
706 22
                    case 'ET':
707 22
                        break;
708 17
709 17
                        // Store current selected font and graphics matrix
710 17
                    case 'q':
711 17
                        $clipped_font[] = [$current_font, $current_font_size];
712 17
                        $clipped_position_cm[] = $current_position_cm;
713
                        break;
714
715
                        // Restore previous selected font and graphics matrix
716 29
                    case 'Q':
717 29
                        list($current_font, $current_font_size) = array_pop($clipped_font);
718 29
                        $current_position_cm = array_pop($clipped_position_cm);
719 29
                        break;
720 29
721
                        // End marked content sequence
722
                    case 'EMC':
723 25
                        $data = false;
724
                        if (\count($marked_stack)) {
725
                            $marked = array_pop($marked_stack);
726
                            $action = key($marked);
727 29
                            $data = $marked[$action];
728
729
                            switch ($action) {
730 42
                                // If we are in ReversedChars mode...
731
                                case 'ReversedChars':
732
                                    // Reverse the characters we've built up so far
733
                                    foreach ($text as $key => $t) {
734
                                        $text[$key] = implode('', array_reverse(
735
                                            mb_str_split($t, 1, mb_internal_encoding())
0 ignored issues
show
Bug introduced by
It seems that mb_internal_encoding() can also return true; however, the $encoding parameter of mb_str_split() only appears to accept null|string. Consider adding an additional type check. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

735
                                            mb_str_split($t, 1, /** @scrutinizer ignore-type */ mb_internal_encoding())
Loading history...
736 42
                                        ));
737 42
                                    }
738 8
739 8
                                    // Add these characters to the result array
740 3
                                    $result = array_merge($result, $text);
741
742 6
                                    // Start a fresh $text array that will contain
743 6
                                    // non-reversed characters
744
                                    $text = [];
745
                                    break;
746
747
                                case 'ActualText':
748 42
                                    // Use the content of the ActualText as a command
749 41
                                    $command = $data;
750
                                    break;
751 42
                            }
752 41
                        }
753
754 42
                        // If this EMC command has been transformed into a 'Tj'
755 6
                        // or 'TJ' command because of being ActualText, then bypass
756
                        // the break to proceed to the writing section below.
757 42
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
758 41
                            break;
759 41
                        }
760
761 41
                        // no break
762 41
                    case "'":
763
                    case '"':
764
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
765
                            // Move to next line and write text
766
                            $current_position['x'] = 0;
767
                            $current_position_td['x'] = 0;
768 42
                            $current_position_td['y'] += $current_text_leading;
769
                        }
770
                        // no break
771
                    case 'Tj':
772
                        $command[self::COMMAND] = [$command];
773
                        // no break
774
                    case 'TJ':
775 20
                        // Check the marked content stack for flags
776
                        $actual_text = false;
777 20
                        $reverse_text = false;
778
                        foreach ($marked_stack as $marked) {
779
                            if (isset($marked['ActualText'])) {
780
                                $actual_text = true;
781
                            }
782
                            if (isset($marked['ReversedChars'])) {
783
                                $reverse_text = true;
784
                            }
785
                        }
786
787
                        // Account for text position ONLY just before we write text
788
                        if (false === $actual_text && \is_array($last_written_position)) {
789
                            // If $last_written_position is an array, that
790
                            // means we have stored text position coordinates
791
                            // for placing an ActualText
792
                            $currentX = $last_written_position[0];
793
                            $currentY = $last_written_position[1];
794
                            $last_written_position = false;
795
                        } else {
796
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
797
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
798
                        }
799
                        $whiteSpace = '';
800
801
                        $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i'];
802
                        $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j'];
803
804
                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
805
                            $curY = $currentY - $current_position['y'];
806
                            if (abs($curY) >= abs($factorY) / 4) {
807
                                $whiteSpace = "\n";
808
                            } else {
809
                                if (true === $reverse_text) {
810
                                    $curX = $current_position['x'] - $currentX;
811
                                } else {
812
                                    $curX = $currentX - $current_position['x'];
813
                                }
814
815
                                // In abs($factorX * 7) below, the 7 is chosen arbitrarily
816
                                // as the number of apparent "spaces" in a document we
817
                                // would need before considering them a "tab". In the
818
                                // future, we might offer this value to users as a config
819
                                // option.
820
                                if ($curX >= abs($factorX * 7)) {
821
                                    $whiteSpace = "\t";
822
                                } elseif ($curX >= abs($factorX * 2)) {
823
                                    $whiteSpace = ' ';
824
                                }
825
                            }
826
                        }
827
828
                        $newtext = $this->getTJUsingFontFallback(
829
                            $current_font,
830
                            $command[self::COMMAND],
831
                            $page,
832
                            $factorX
833
                        );
834
835
                        // If there is no ActualText pending then write
836
                        if (false === $actual_text) {
837
                            $newtext = str_replace(["\r", "\n"], '', $newtext);
838
                            if (false !== $reverse_text) {
839
                                // If we are in ReversedChars mode, add the whitespace last
840
                                $text[] = preg_replace('/  $/', ' ', $newtext.$whiteSpace);
841
                            } else {
842
                                // Otherwise add the whitespace first
843
                                if (' ' === $whiteSpace && isset($text[\count($text) - 1])) {
844
                                    $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]);
845
                                }
846
                                $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext);
847
                            }
848
849
                            // Record the position of this inserted text for comparison
850
                            // with the next text block.
851
                            // Provide a 'fudge' factor guess on how wide this text block
852
                            // is based on the number of characters. This helps limit the
853
                            // number of tabs inserted, but isn't perfect.
854
                            $factor = $factorX / 2;
855
                            $current_position = [
856
                                'x' => $currentX - mb_strlen($newtext) * $factor,
857
                                'y' => $currentY,
858
                            ];
859
                        } elseif (false === $last_written_position) {
860
                            // If there is an ActualText in the pipeline
861
                            // store the position this undisplayed text
862
                            // *would* have been written to, so the
863
                            // ActualText is displayed in the right spot
864
                            $last_written_position = [$currentX, $currentY];
865
                            $current_position['x'] = $currentX;
866
                        }
867
                        break;
868
869
                        // move to start of next line
870
                    case 'T*':
871
                        $current_position['x'] = 0;
872
                        $current_position_td['x'] = 0;
873
                        $current_position_td['y'] += $current_text_leading;
874
                        break;
875
876
                        // set character spacing
877
                    case 'Tc':
878
                        break;
879
880
                        // move text current point and set leading
881
                    case 'Td':
882
                    case 'TD':
883
                        // move text current point
884
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
885
                        $y = (float) array_pop($args);
886
                        $x = (float) array_pop($args);
887
888
                        if ('TD' == $command[self::OPERATOR]) {
889
                            $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j'];
890
                        }
891
892
                        $current_position_td = [
893
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
894
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
895
                        ];
896
                        break;
897
898
                    case 'Tf':
899
                        $args = preg_split('/\s/s', $command[self::COMMAND]);
900
                        $size = (float) array_pop($args);
901
                        $id = trim(array_pop($args), '/');
902
                        if (null !== $page) {
903
                            $new_font = $page->getFont($id);
904
                            // If an invalid font ID is given, do not update the font.
905
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
906
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
907
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
908
                            // But we want to make sure that malformed PDFs do not simply crash.
909
                            if (null !== $new_font) {
910
                                $current_font = $new_font;
911
                                $current_font_size = $size;
912
                            }
913
                        }
914
                        break;
915
916
                        // set leading
917
                    case 'TL':
918
                        $y = (float) $command[self::COMMAND];
919
                        $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j'];
920
                        break;
921
922
                        // set text position matrix
923
                    case 'Tm':
924
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
925
                        $current_position_tm = [
926
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
927
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
928
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
929
                        ];
930
                        break;
931
932
                        // set text rendering mode
933
                    case 'Ts':
934
                        break;
935
936
                        // set super/subscripting text rise
937
                    case 'Ts':
938
                        break;
939
940
                        // set word spacing
941
                    case 'Tw':
942
                        break;
943
944
                        // set horizontal scaling
945
                    case 'Tz':
946
                        break;
947
948
                    default:
949
                }
950
            }
951
        }
952
953
        $result = array_merge($result, $text);
954
955
        return $result;
956
    }
957
958
    /**
     * Parses one already formatted, single-line command from a document
     * stream into its type, operator and operand.
     *
     * The companion function getSectionsText() returns a document stream as
     * an array of single commands for just this purpose. Because of this,
     * the argument $offset is no longer used, and may be removed in a future
     * PdfParser release.
     *
     * A better name for this function would be getCommandText() since it
     * now always works on just one command.
     *
     * @param string $text_part a single command line, e.g. "(Hello) Tj"
     * @param int    $offset    unused; kept so existing callers keep working
     *
     * @return array an empty array when no operator can be recognised,
     *               otherwise a list with exactly one entry keyed by
     *               self::TYPE, self::OPERATOR and self::COMMAND. For the
     *               'TJ' operator the self::COMMAND value is itself a list
     *               of entries with the same three keys.
     */
    public function getCommandsText(string $text_part, int &$offset = 0): array
    {
        $matches = [];

        // Group 1: everything before the operator, group 2: the optional
        // leading delimiter (/ [ ( <), group 3: the trailing operator.
        preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $matches);

        // If no valid command is detected, return an empty array
        if (!isset($matches[1], $matches[2], $matches[3])) {
            return [];
        }

        $type = $matches[2];
        $operator = $matches[3];
        $command = trim($matches[1]);

        if ('TJ' === $operator) {
            // Both string notations allowed inside a TJ array, tried in this
            // order on every pass. Each pattern captures the string body and
            // an optional numeric adjustment following it.
            $stringFormats = [
                // parentheses string () format
                '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
                // hexadecimal <> format
                '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
            ];

            $subcommand = [];
            $command = trim($command, '[]');

            do {
                $oldCommand = $command;

                foreach ($stringFormats as $stringType => $pattern) {
                    $tjmatch = [];
                    if (1 !== preg_match($pattern, $command, $tjmatch)) {
                        continue;
                    }

                    // Hexadecimal strings may contain whitespace that
                    // carries no meaning; drop it.
                    $operand = '<' === $stringType
                        ? preg_replace('/\s/', '', $tjmatch[1])
                        : $tjmatch[1];

                    $subcommand[] = [
                        self::TYPE => $stringType,
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $operand,
                    ];

                    // The truthiness test is intentional and kept from the
                    // previous implementation: a bare "0" adjustment is
                    // skipped, just like a missing one.
                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
                        $subcommand[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $tjmatch[2],
                        ];
                    }

                    // Consume what was just parsed. The cast keeps $command
                    // a string on PHP versions where substr() yields false
                    // at the end of the input.
                    $command = (string) substr($command, \strlen($tjmatch[0]));
                }

                // Stop as soon as a full pass consumed nothing.
            } while ($command !== $oldCommand);

            $command = $subcommand;
        } elseif ('Tj' === $operator || "'" === $operator || '"' === $operator) {
            // Depending on the string type, trim the data of the
            // appropriate delimiters
            if ('(' === $type) {
                // Don't use trim() here since a () string may end with
                // a balanced or escaped right parentheses, and trim()
                // will delete both. Both strings below are valid:
                //   eg. (String())
                //   eg. (String\))
                $command = preg_replace('/^\(|\)$/', '', $command);
            } elseif ('<' === $type) {
                $command = trim($command, '<>');
            }
        } elseif ('/' === $type) {
            // Drop the leading slash of a name operand.
            $command = substr($command, 1);
        }

        return [
            [
                self::TYPE => $type,
                self::OPERATOR => $operator,
                self::COMMAND => $command,
            ],
        ];
    }
1052
1053
    /**
     * Creates the most specific object class for a parsed PDF object,
     * chosen from the 'Type' (and, where relevant, 'Subtype') entries of
     * its header.
     *
     * - XObject/Image -> Image, XObject/Form -> Form, other XObject -> PDFObject
     * - Pages, Page, Encoding -> the class of the same name
     * - Font -> \Smalot\PdfParser\Font\Font<Subtype> when such a class
     *   exists, otherwise the generic Font
     * - anything else -> PDFObject
     *
     * @param Document    $document document the object belongs to
     * @param Header      $header   header dictionary of the object
     * @param string|null $content  raw content of the object, if any
     * @param Config|null $config   parser configuration; when omitted, image
     *                              content is passed through unchanged
     */
    public static function factory(
        Document $document,
        Header $header,
        ?string $content,
        ?Config $config = null
    ): self {
        switch ($header->get('Type')->getContent()) {
            case 'XObject':
                switch ($header->get('Subtype')->getContent()) {
                    case 'Image':
                        // $config is optional, so it must not be dereferenced
                        // blindly. Without a Config nothing asks for the
                        // image data to be discarded, hence it is kept.
                        $retainContent = null === $config || $config->getRetainImageContent();

                        return new Image($document, $header, $retainContent ? $content : null, $config);

                    case 'Form':
                        return new Form($document, $header, $content, $config);
                }

                // XObject with an unhandled subtype.
                return new self($document, $header, $content, $config);

            case 'Pages':
                return new Pages($document, $header, $content, $config);

            case 'Page':
                return new Page($document, $header, $content, $config);

            case 'Encoding':
                return new Encoding($document, $header, $content, $config);

            case 'Font':
                $subtype = $header->get('Subtype')->getContent();
                $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

                // Prefer a subtype-specific font class when one is available.
                if (class_exists($classname)) {
                    return new $classname($document, $header, $content, $config);
                }

                return new Font($document, $header, $content, $config);

            default:
                return new self($document, $header, $content, $config);
        }
    }
1094
1095
    /**
     * Provides an identifier for this object instance.
     *
     * The identifier is the value spl_object_hash() reports for $this.
     *
     * @return string the object hash of this instance
     */
    protected function getUniqueId(): string
    {
        $objectHash = spl_object_hash($this);

        return $objectHash;
    }
1102
}
1103