Passed
Push — master ( 2939df...ddf03e )
by Konrad
02:55
created

Font::decodeEntities()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 3
c 0
b 0
f 0
nc 1
nop 1
dl 0
loc 5
ccs 4
cts 4
cp 1
crap 1
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
36
use Smalot\PdfParser\Exception\EncodingNotFoundException;
37
38
/**
39
 * Class Font
40
 */
41
class Font extends PDFObject
42
{
43
    public const MISSING = '?';
44
45
    /**
46
     * @var array
47
     */
48
    protected $table;
49
50
    /**
51
     * @var array
52
     */
53
    protected $tableSizes;
54
55
    /**
56
     * Caches results from uchr.
57
     *
58
     * @var array
59
     */
60
    private static $uchrCache = [];
61
62
    /**
63
     * In some PDF-files encoding could be referenced by object id but object itself does not contain
64
     * `/Type /Encoding` in its dictionary. These objects wouldn't be initialized as Encoding in
65
     * \Smalot\PdfParser\PDFObject::factory() during file parsing (they would be just PDFObject).
66
     *
67
     * Therefore, we create an instance of Encoding from them during decoding and cache this value in this property.
68
     *
69
     * @var Encoding
70
     *
71
     * @see https://github.com/smalot/pdfparser/pull/500
72
     */
73
    private $initializedEncodingByPdfObject;
74
75 65
    /**
     * Prepare the font for decoding by loading its character translation table.
     */
    public function init()
    {
        // The returned array is not needed here; the call fills $this->table and $this->tableSizes.
        $this->loadTranslateTable();
    }
80
81 4
    /**
     * Return the `BaseFont` entry as a string, or "[Unknown]" when the font has no such entry.
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
85
86 4
    /**
     * Return the `Subtype` entry of the font header as a string.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
90
91 3
    /**
     * Return the font details: `Name`, `Type` and `Encoding` followed by the details of the parent object.
     *
     * `Encoding` falls back to "Ansi" when the font has no `Encoding` entry.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        $fontDetails = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        // Array union: the three keys set above win over same-named keys coming from the parent.
        return $fontDetails + parent::getDetails($deep);
    }
103
104
    /**
     * Translate the raw bytes of one character code into its decoded form.
     *
     * The translation table is consulted first, keyed by the big-endian integer value of $char.
     * On a table miss a fallback is prepared: for input shorter than two bytes whose font has an
     * `Encoding` object resolving to WinAnsiEncoding, the character whose code point equals the
     * byte value; otherwise $char itself.
     *
     * @param string $char        raw bytes of the character code
     * @param bool   $use_default on a table miss: true returns self::MISSING, false returns the fallback
     *
     * @return string|bool NOTE(review): every path visible here returns a table entry or a string
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // fallback for decoding single-byte ANSI characters that are not in the lookup table
        $fallback = $char;
        $isSingleByte = \strlen($char) < 2;

        if ($isSingleByte && $this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            try {
                if (WinAnsiEncoding::class === $this->get('Encoding')->__toString()) {
                    $fallback = self::uchr($code);
                }
            } catch (EncodingNotFoundException $e) {
                // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exists
                // See table 5.11 on PDF 1.5 specs for more info
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
134
135
    /**
     * Convert unicode character code to "utf-8" encoded string.
     *
     * Results are memoised per code in self::$uchrCache.
     *
     * @param int|float $code Unicode character code. Will be casted to int internally!
     */
    public static function uchr($code): string
    {
        // note:
        // $code was typed as int before, but changed in https://github.com/smalot/pdfparser/pull/623
        // because in some cases uchr was called with a float instead of an integer.
        $codepoint = (int) $code;

        if (isset(self::$uchrCache[$codepoint])) {
            return self::$uchrCache[$codepoint];
        }

        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore, we use mb_convert_encoding() instead
        // NOTE(review): mbstring's 'HTML-ENTITIES' pseudo-encoding is reported as deprecated since PHP 8.2 - confirm
        // before raising the minimum supported PHP version.
        $utf8 = mb_convert_encoding("&#{$codepoint};", 'UTF-8', 'HTML-ENTITIES');
        self::$uchrCache[$codepoint] = $utf8;

        return $utf8;
    }
155
156
    /**
     * Init internal chars translation table by ToUnicode CMap.
     *
     * The table is built once and cached in $this->table; later calls return the cached array.
     * Three constructs are read from the content of the `ToUnicode` entry:
     *  - `begincodespacerange`: only the first section is used, to derive the byte widths of codes;
     *  - `beginbfchar`: single code => hex string mappings (the source width is taken from the first entry);
     *  - `beginbfrange`: ranges mapped either to consecutive code points or to an explicit list of strings.
     *
     * Byte widths are always whole numbers (an odd number of hex digits is rounded up) and a
     * `bfchar` section without any parsable entry leaves the previously detected width untouched.
     *
     * @return array map of integer character code => "utf-8" encoded string
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        // Width in whole bytes of a hex code (two hex digits per byte, rounded up, at least 1).
        // Accepts false so that "no match" results from reset() collapse to the minimum width.
        $byteWidth = static function ($hex): int {
            return max(1, (int) ceil(\strlen((string) $hex) / 2));
        };

        // Decode a hex string made of 4-digit code units into "utf-8"; leftover shorter chunks
        // are converted on their own as well.
        $decodeHexString = static function (string $hex): string {
            $units = preg_split(
                '/([0-9A-F]{4})/i',
                $hex,
                0,
                \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
            );
            $text = '';
            foreach ($units as $unit) {
                $text .= self::uchr(hexdec($unit));
            }

            return $text;
        };

        $sections = [];

        // Code space range: only the first section is evaluated.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            $ranges = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $sections['sections'][0],
                $ranges
            );

            $this->tableSizes = [
                'from' => $byteWidth(reset($ranges['from'])),
                'to' => $byteWidth(reset($ranges['to'])),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                $pairs = [];
                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

                if (empty($pairs['from'])) {
                    // Nothing parsable in this section: keep the width detected so far.
                    continue;
                }

                $this->tableSizes['from'] = $byteWidth($pairs['from'][0]);

                foreach ($pairs['from'] as $index => $from) {
                    $this->table[hexdec($from)] = $decodeHexString($pairs['to'][$index]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $ranges = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $ranges
                );

                foreach ($ranges['from'] as $index => $from) {
                    $first = hexdec($from);
                    $last = hexdec($ranges['to'][$index]);
                    $target = hexdec($ranges['offset'][$index]);

                    // Consecutive source codes map to consecutive destination code points.
                    for ($code = $first; $code <= $last; ++$code) {
                        $this->table[$code] = self::uchr($code - $first + $target);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF file has 2-byte Unicode values on new lines > added \r\n
                $lists = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $lists
                );

                foreach ($lists['from'] as $index => $from) {
                    $first = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$index], $strings);

                    // The n-th listed string belongs to source code $first + n.
                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$first + $position] = $decodeHexString($string);
                    }
                }
            }
        }

        return $this->table;
    }
266
267
    /**
     * Replace the character translation table with a custom one.
     *
     * Expected layout of $table:
     * - key - integer character code;
     * - value - "utf-8" encoded value;
     *
     * The array is stored as given; no validation is performed.
     *
     * @return void
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
278
279
    /**
     * Calculate text width with data from header 'Widths'. If width of character is not found then
     * character is added to missing array.
     *
     * Missing `FirstChar` / `LastChar` details are treated as 0 instead of being read as undefined
     * indexes, and an unloaded translation table is treated as empty.
     *
     * @param string     $text    text whose characters are looked up in the translation table
     * @param array|null $missing receives (by reference) the characters for which no width was found;
     *                            always reset to an array
     *
     * @return float|null sum of the widths found, or null when no character had a width
     */
    public function calculateTextWidth(string $text, ?array &$missing = null): ?float
    {
        // decoded character => character code
        $index_map = array_flip($this->table ?? []);
        $details = $this->getDetails();

        // Usually, Widths key is set in $details array, but if it isn't use an empty array instead.
        $widths = $details['Widths'] ?? [];

        // Widths array is zero indexed but table is not. We must map them based on FirstChar and LastChar.
        // Both entries may be absent, hence the defaults and the explicit integer casts.
        $firstChar = (int) ($details['FirstChar'] ?? 0);
        $lastChar = (int) ($details['LastChar'] ?? 0);
        $width_map = array_flip(range($firstChar, $lastChar));

        $width = null;
        $missing = [];
        $textLength = mb_strlen($text);

        for ($i = 0; $i < $textLength; ++$i) {
            $char = mb_substr($text, $i, 1);

            $hasWidth = \array_key_exists($char, $index_map)
                && \array_key_exists($index_map[$char], $width_map)
                && \array_key_exists($width_map[$index_map[$char]], $widths);

            if (!$hasWidth) {
                $missing[] = $char;
                continue;
            }

            $width_index = $width_map[$index_map[$char]];
            $width += $widths[$width_index];
        }

        return $width;
    }
312
313
    /**
     * Decode hexadecimal encoded string. If $add_braces is true result value would be wrapped by parentheses.
     *
     * Every `<...>` chunk made only of hex digits and whitespace is converted to its binary value;
     * any other text is copied through unchanged. With $add_braces each decoded chunk is wrapped
     * individually and backslashes inside it are escaped. Input containing `<?xml` is returned as is.
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9\s]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            if (!preg_match('/^<[a-f0-9\s]+>$/si', $chunk)) {
                // plain text between hex chunks
                $decoded .= $chunk;
                continue;
            }

            // strip whitespace and the surrounding angle brackets
            $hex = trim(preg_replace("/\s/", '', $chunk), '<>');
            $binary = pack('H*', $hex);

            $decoded .= $add_braces
                ? '('.preg_replace('/\\\/s', '\\\\\\', $binary).')'
                : $binary;
        }

        return $decoded;
    }
348
349
    /**
     * Decode string with octal-decoded chunks.
     *
     * `\ddd` sequences (one to three octal digits) become the corresponding byte, `\(` and `\)`
     * become plain parentheses and `\\` becomes a single backslash.
     */
    public static function decodeOctal(string $text): string
    {
        $placeholder = '[**pdfparserdblslsh**]';

        // Park all double backslashes \\ behind a special string so that the
        // octal replacement below cannot mistake them for escape introducers.
        $text = str_replace('\\\\', $placeholder, $text);

        $text = preg_replace_callback(
            '/\\\\([0-7]{1,3})/',
            static function (array $match): string {
                // octdec() is typed int|float; at most three octal digits always fit an int.
                return \chr((int) octdec($match[1]));
            },
            $text
        );

        // Unescape any parentheses
        $text = str_replace(['\\(', '\\)'], ['(', ')'], $text);

        // Replace instances of the special string with a single backslash
        return str_replace($placeholder, '\\', $text);
    }
369
370
    /**
     * Decode string with html entity encoded chars.
     *
     * Concretely: every `#` followed by two hex digits is replaced by the byte with that value.
     */
    public static function decodeEntities(string $text): string
    {
        $toByte = static function (array $match): string {
            // hexdec() is typed int|float; two hex digits always fit an int.
            return \chr((int) hexdec($match[1]));
        };

        return preg_replace_callback('/#([0-9a-f]{2})/i', $toByte, $text);
    }
379
380
    /**
     * Check if given string is Unicode text (by BOM);
     * If true - decode to "utf-8" encoded string.
     * Otherwise - return text as is.
     *
     * The payload after the "\xFE\xFF" marker is read as big-endian 16-bit units. A high surrogate
     * (0xD800-0xDBFF) directly followed by a low surrogate (0xDC00-0xDFFF) is combined into one code
     * point, so characters outside the Basic Multilingual Plane are decoded as a single character.
     * Unpaired surrogates and a trailing single byte are passed to uchr() individually.
     *
     * @todo Rename in next major release to make the name correspond to reality (for ex. decodeIfUnicode())
     */
    public static function decodeUnicode(string $text): string
    {
        if ("\xFE\xFF" !== substr($text, 0, 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = substr($text, 2);
        $length = \strlen($payload);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $unit = hexdec(bin2hex(substr($payload, $i, 2)));

            // High surrogate with a complete following unit available: try to form a pair.
            if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 3 < $length) {
                $low = hexdec(bin2hex(substr($payload, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $unit = 0x10000 + (($unit - 0xD800) << 10) + ($low - 0xDC00);
                    // The low surrogate has been consumed as part of this character.
                    $i += 2;
                }
            }

            $decoded .= self::uchr($unit);
        }

        return $decoded;
    }
402
403
    /**
     * Return the font space limit taken from the configuration object.
     *
     * NOTE(review): $this->config is dereferenced without a null check; static analysis reports it
     * may be null - confirm that fonts are always created with a Config.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $config = $this->config;

        return $config->getFontSpaceLimit();
    }
410
411
    /**
     * Decode text by commands array.
     *
     * Each command is read through PDFObject::TYPE and PDFObject::COMMAND:
     *  - type 'n' carries a numeric offset; when it is below the scaled font space limit the
     *    following text starts a new word;
     *  - type '<' carries a hexadecimal string;
     *  - any other type is treated as a literal string that may contain octal escapes.
     * The collected words are decoded with decodeContent() and joined with single spaces, avoiding
     * doubled spaces where a word already starts or ends with one.
     *
     * @param array $commands   list of commands, each indexable by PDFObject::TYPE and PDFObject::COMMAND
     * @param float $fontFactor scales the font space limit (limit * |factor| / 4)
     */
    public function decodeText(array $commands, float $fontFactor = 4): string
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit() * abs($fontFactor) / 4;

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    $offset = (float) trim($command[PDFObject::COMMAND]);
                    if ($offset - (float) $font_space < 0) {
                        // Start a new word: the next text goes to a fresh slot.
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ ', '\b'],
                [\chr(92), \chr(40), \chr(41), \chr(10), \chr(13), \chr(9), \chr(12), \chr(32), \chr(8)],
                $text
            );

            // add content to result string
            $words[$word_position] = ($words[$word_position] ?? '').$text;
        }

        // Decode every word. Writing back by key instead of iterating by reference avoids
        // leaving a dangling reference to the last element behind.
        foreach ($words as $index => $word) {
            $words[$index] = str_replace("\t", ' ', $this->decodeContent($word));
        }

        // Remove internal "words" that are just spaces, but leave them
        // if they are at either end of the array of words. This fixes,
        // for   example,   lines   that   are   justified   to   fill
        // a whole row.
        for ($x = \count($words) - 2; $x >= 1; --$x) {
            if ('' === trim($words[$x], ' ')) {
                unset($words[$x]);
            }
        }
        $words = array_values($words);

        // Cut down on the number of unnecessary internal spaces by
        // imploding the string on the null byte, and checking if the
        // text includes extra spaces on either side. If so, merge
        // where appropriate.
        $joined = implode("\x00\x00", $words);

        return str_replace(
            [" \x00\x00 ", "\x00\x00 ", " \x00\x00", "\x00\x00"],
            ['  ', ' ', ' ', ' '],
            $joined
        );
    }
482
483
    /**
     * Decode given $text to "utf-8" encoded string.
     *
     * Strategies are tried in this order:
     *  1. text starting with the "\xFE\xFF" marker is decoded by decodeUnicode();
     *  2. a font with a `ToUnicode` entry decodes through its translation table;
     *  3. a font with an `Encoding` entry decodes through that encoding, when it yields a result;
     *  4. otherwise the text is kept if it is valid "utf-8", else converted from "Windows-1252".
     *
     * @param bool $unicode This parameter is deprecated and might be removed in a future release
     *                      (it is not read or written by this method)
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        // If this string begins with a UTF-16BE BOM, then decode it
        // directly as Unicode
        $startsWithBom = "\xFE\xFF" === substr($text, 0, 2);
        if ($startsWithBom) {
            return $this->decodeUnicode($text);
        }

        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
        }

        $byEncoding = $this->has('Encoding') ? $this->decodeContentByEncoding($text) : null;

        return null !== $byEncoding
            ? $byEncoding
            : $this->decodeContentByAutodetectIfNecessary($text);
    }
510
511
    /**
     * First try to decode $text by ToUnicode CMap.
     * If char translation not found in ToUnicode CMap tries:
     *  - If DescendantFonts exists tries to decode char by one of that fonts.
     *      - If have no success to decode by DescendantFonts interpret $text as a string with "Windows-1252" encoding.
     *  - If DescendantFonts does not exist just return "?" as decoded char.
     *
     * The text is consumed in steps of $this->tableSizes['from'] bytes; when that size is empty the
     * text is returned untouched.
     *
     * NOTE(review): translateChar(..., false) does not appear to return false in the code visible
     * here, so the DescendantFonts / "?" branches look unreachable - confirm before relying on them.
     *
     * @todo Seems this is invalid algorithm that do not follow pdf-format specification. Must be rewritten.
     */
    private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
    {
        $bytes = $this->tableSizes['from'];

        if (!$bytes) {
            return $text;
        }

        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            if (false !== $decoded) {
                $result .= $decoded;
                continue;
            }

            if (!$this->has('DescendantFonts')) {
                $result .= self::MISSING;
                continue;
            }

            // Collect the candidate fonts, either from the object's header or from the element content.
            if ($this->get('DescendantFonts') instanceof PDFObject) {
                $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
            } else {
                $fonts = $this->get('DescendantFonts')->getContent();
            }

            // The first descendant font able to translate the char wins.
            $decoded = false;
            foreach ($fonts as $font) {
                if (!$font instanceof self) {
                    continue;
                }

                $decoded = $font->translateChar($char, false);
                if (false !== $decoded) {
                    $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                    break;
                }
            }

            $result .= false !== $decoded
                ? $decoded
                : mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
        }

        return $result;
    }
567
568
    /**
     * Decode content by any type of Encoding (dictionary's item) instance.
     *
     * The `Encoding` entry is handled according to its type, checked in this order: PDFObject
     * (turned into an initialized Encoding first), Encoding, Element. For anything else the text is
     * kept when it is valid "utf-8" and converted from "Windows-1252" otherwise.
     *
     * @return string|null null is only produced by the Element branch (unknown encoding name)
     */
    private function decodeContentByEncoding(string $text): ?string
    {
        $encoding = $this->get('Encoding');

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself does not contain `/Type /Encoding` in it's dictionary.
        if ($encoding instanceof PDFObject) {
            $encoding = $this->getInitializedEncodingByPdfObject($encoding);
        }

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself contains `/Type /Encoding` in it's dictionary.
        if ($encoding instanceof Encoding) {
            return $this->decodeContentByEncodingEncoding($text, $encoding);
        }

        // When Encoding is just string (/Encoding /WinAnsiEncoding)
        if ($encoding instanceof Element) { // todo: ElementString class must by used?
            return $this->decodeContentByEncodingElement($text, $encoding);
        }

        // don't double-encode strings already in UTF-8
        return mb_check_encoding($text, 'UTF-8')
            ? $text
            : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
    }
597
598
    /**
     * Returns already created or create a new one if not created before Encoding instance by PDFObject instance.
     *
     * The instance is cached on the font; once cached, the argument of later calls is not consulted.
     */
    private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        if ($this->initializedEncodingByPdfObject) {
            return $this->initializedEncodingByPdfObject;
        }

        $this->initializedEncodingByPdfObject = $this->createInitializedEncodingByPdfObject($PDFObject);

        return $this->initializedEncodingByPdfObject;
    }
609
610
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Encoding.
     *
     * Each byte of $text is translated through the encoding; when the encoding yields null for a
     * byte, the byte value itself is used as the character code.
     */
    private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
    {
        $decoded = '';
        $byteCount = \strlen($text);

        for ($offset = 0; $offset < $byteCount; ++$offset) {
            $byteValue = \ord($text[$offset]);
            $translated = $encoding->translateChar($byteValue);

            $decoded .= self::uchr($translated ?? $byteValue);
        }

        return $decoded;
    }
626
627
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Element.
     *
     * The element content is taken as a PDF encoding name and mapped to an iconv encoding name.
     *
     * @return string|null null when the PDF encoding name has no known iconv counterpart
     */
    private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
    {
        // mb_convert_encoding does not support MacRoman/macintosh,
        // so we use iconv() here
        $iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($encoding->getContent());

        if (!$iconvEncodingName) {
            return null;
        }

        return iconv($iconvEncodingName, 'UTF-8//TRANSLIT//IGNORE', $text);
    }
640
641
    /**
     * Convert PDF encoding name to iconv-known encoding name.
     *
     * @return string|null null for any name other than the three listed in the map below
     */
    private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
    {
        $iconvNamesByPdfName = [
            'StandardEncoding' => 'ISO-8859-1',
            'MacRomanEncoding' => 'MACINTOSH',
            'WinAnsiEncoding' => 'CP1252',
        ];

        return $iconvNamesByPdfName[$pdfEncodingName] ?? null;
    }
656
657
    /**
     * If string seems like "utf-8" encoded string do nothing and just return given string as is.
     * Otherwise, interpret string as "Window-1252" encoded string.
     *
     * @return string|false
     */
    private function decodeContentByAutodetectIfNecessary(string $text)
    {
        // todo: Why exactly `Windows-1252` used?
        return mb_check_encoding($text, 'UTF-8')
            ? $text
            : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
    }
672
673
    /**
     * Create Encoding instance by PDFObject instance and init it.
     */
    private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        $initialized = $this->createEncodingByPdfObject($PDFObject);
        $initialized->init();

        return $initialized;
    }
683
684
    /**
     * Create Encoding instance by PDFObject instance (without init).
     *
     * The new Encoding receives the document, header, content and config of the given object.
     */
    private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        return new Encoding(
            $PDFObject->getDocument(),
            $PDFObject->getHeader(),
            $PDFObject->getContent(),
            $PDFObject->getConfig()
        );
    }
696
}
697