Passed
Pull Request — master (#500)
by
unknown
02:09
created

decodeContentByToUnicodeCMapOrDescendantFonts()   B

Complexity

Conditions 10
Paths 2

Size

Total Lines 46
Code Lines 28

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 25.6155

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 10
eloc 28
c 1
b 0
f 0
nc 2
nop 1
dl 0
loc 46
ccs 12
cts 26
cp 0.4615
crap 25.6155
rs 7.6666

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use LogicException;
34
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
35
use Smalot\PdfParser\Exception\EncodingNotFoundException;
36
37
/**
38
 * Class Font
39
 */
40
class Font extends PDFObject
41
{
42
    const MISSING = '?';
43
44
    /**
45
     * Character translation table: integer character code => "utf-8" encoded string.
     * Stays null until loadTranslateTable() builds it or setTable() assigns one.
     *
     * @var array|null
46
     */
47
    protected $table = null;
48
49
    /**
50
     * Byte widths of character codes, stored under the keys 'from' and 'to'.
     * Stays null until loadTranslateTable() runs.
     *
     * @var array|null
51
     */
52
    protected $tableSizes = null;
53
54
    /**
55
     * Caches results from uchr(), keyed by the integer code passed to it.
56
     *
57
     * @var array
58
     */
59
    private static $uchrCache = [];
60
61
    /**
62
     * In some PDF files the encoding is referenced by object id, but the object itself does not contain
63
     * `/Type /Encoding` in its dictionary. Such objects are not initialized as Encoding in
64
     * \Smalot\PdfParser\PDFObject::factory() during file parsing (they are plain PDFObject instances).
65
     *
66
     * Therefore, an Encoding instance is created from them during decoding and cached in this property.
67
     *
68
     * @var Encoding|null
69
     *
70
     * @see https://github.com/smalot/pdfparser/pull/500
71
     */
72
    private $initializedEncodingByPdfObject;
73
74 37
    /**
     * Prepare the font for use by building its character translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
79
80 2
    /**
     * Return the `BaseFont` entry as a string, or `[Unknown]` when the font has no such entry.
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
84
85 2
    /**
     * Return the `Subtype` entry of the font header as a string.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
89
90 1
    /**
     * Describe the font: its name, type and encoding, followed by the details reported by the parent.
     *
     * Font-specific keys win over identically named keys coming from the parent.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        $fontDetails = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        // Array union keeps the left-hand value on key collisions.
        return $fontDetails + parent::getDetails($deep);
    }
102
103
    /**
     * Translate one character code (given as raw bytes) through the translation table.
     *
     * On a table miss the result depends on $use_default:
     *  - true: the self::MISSING placeholder;
     *  - false: the input itself, or - for input shorter than two bytes whose font
     *    Encoding entry resolves to WinAnsiEncoding - the result of uchr() for that code.
     *
     * @param string $char        raw bytes of the character code
     * @param bool   $use_default whether a table miss yields self::MISSING
     *
     * @return string|bool
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // fallback for decoding single-byte ANSI characters that are not in the lookup table
        $fallback = $char;
        $isSingleByte = \strlen($char) < 2;

        if ($isSingleByte && $this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            try {
                $encodingClass = $this->get('Encoding')->__toString();

                if (WinAnsiEncoding::class === $encodingClass) {
                    $fallback = self::uchr($code);
                }
            } catch (EncodingNotFoundException $e) {
                // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exist
                // See table 5.11 on PDF 1.5 specs for more info
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
133
134
    /**
     * Convert a Unicode character code to a "utf-8" encoded string.
     *
     * Results are memoised in self::$uchrCache, so each code is converted at most once.
     *
     * @param int $code character code to convert
     *
     * @return string
     */
    public static function uchr(int $code): string
    {
        if (isset(self::$uchrCache[$code])) {
            return self::$uchrCache[$code];
        }

        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore, we use mb_convert_encoding() instead
        $entity = "&#{$code};";
        $character = mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
        self::$uchrCache[$code] = $character;

        return $character;
    }
150
151
    /**
     * Init internal chars translation table by ToUnicode CMap.
     *
     * The table is built on the first call and cached; later calls return the cached table.
     * Without a ToUnicode entry the table stays empty and both code widths stay at one byte.
     *
     * Three kinds of CMap sections are read:
     *  - codespacerange: only the first section is used, to derive the byte widths of codes;
     *  - bfchar: one source code mapped to one destination string;
     *  - bfrange: a range of source codes mapped either to consecutive code points or to
     *    an explicit list of destination strings.
     *
     * @return array translation table: integer character code => "utf-8" encoded string
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();
        $sections = [];

        // Support for multiple spacerange sections (only the first one is evaluated).
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            $ranges = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $sections['sections'][0],
                $ranges
            );

            $this->tableSizes = [
                'from' => self::hexStringByteLength($ranges['from'][0] ?? ''),
                'to' => self::hexStringByteLength($ranges['to'][0] ?? ''),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                $pairs = [];
                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

                // The source code width follows the first mapping of the section.
                $this->tableSizes['from'] = self::hexStringByteLength($pairs['from'][0] ?? '');

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::decodeHexCodeUnits($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $offsetRanges = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $offsetRanges
                );

                foreach ($offsetRanges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($offsetRanges['to'][$key]);
                    $offset = hexdec($offsetRanges['offset'][$key]);

                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF file has 2-byte Unicode values on new lines > added \r\n
                $listRanges = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $listRanges
                );

                foreach ($listRanges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $listRanges['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::decodeHexCodeUnits($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Number of bytes encoded by a hexadecimal string, never less than one.
     *
     * An odd number of digits is rounded up to a whole byte, so the result is always an
     * integer that is safe to use as a substr() length and as a loop step.
     */
    private static function hexStringByteLength(string $hex): int
    {
        return max(1, intdiv(\strlen($hex) + 1, 2));
    }

    /**
     * Convert a hexadecimal destination string to "utf-8" text, one group of four hex digits
     * at a time; a shorter trailing group is converted on its own.
     */
    private static function decodeHexCodeUnits(string $hex): string
    {
        $parts = preg_split('/([0-9A-F]{4})/i', $hex, 0, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $text = '';

        foreach ($parts as $part) {
            $text .= self::uchr((int) hexdec($part));
        }

        return $text;
    }
263
264
    /**
     * Replace the character translation table with a custom one.
     *
     * Expected layout of $table:
     * - key - integer character code;
     * - value - "utf-8" encoded value.
     *
     * @param array $table
     *
     * @return void
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
276
277
    /**
     * Decode every `<...>` hexadecimal chunk of a string into raw bytes; other text is kept as is.
     *
     * Input containing `<?xml` is returned untouched. When $add_braces is true each decoded
     * chunk is wrapped in parentheses and every backslash inside it is doubled.
     *
     * @param string $hexa       text that may contain hexadecimal chunks
     * @param bool   $add_braces wrap decoded chunks in parentheses
     *
     * @return string
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isBracketed = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isBracketed) {
                $decoded .= $chunk;
                continue;
            }

            // strip line breaks, then the surrounding angle brackets
            $hex = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $binary = pack('H*', $hex);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $decoded .= $binary;
            }
        }

        return $decoded;
    }
316
317
    /**
     * Replace every backslash followed by exactly three octal digits with the byte it denotes.
     *
     * All other text is copied through unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal(string $text): string
    {
        $pieces = preg_split('/(\\\\[0-7]{3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($pieces as $piece) {
            $isEscape = 1 === preg_match('/^\\\\[0-7]{3}$/', $piece);
            $decoded .= $isEscape ? \chr(octdec(trim($piece, '\\'))) : $piece;
        }

        return $decoded;
    }
338
339
    /**
     * Decode `#xx` escapes, where `xx` is a two-digit hexadecimal byte value.
     *
     * Both decimal-looking pairs (`#20`) and pairs containing the letters A-F in either
     * case (`#2F`, `#e9`) are replaced by the byte they denote. A `#` that is not followed
     * by two hexadecimal digits is left untouched.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities(string $text): string
    {
        $decoded = preg_replace_callback(
            '/#([0-9a-f]{2})/i',
            static function (array $match): string {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; fall back to the input then.
        return $decoded ?? $text;
    }
359
360
    /**
     * Check if given string is Unicode text (by the leading FE FF byte order marker);
     * if so, decode it to a "utf-8" encoded string, otherwise return the text as is.
     *
     * The payload is read in two-byte big-endian units. A high surrogate immediately
     * followed by a low surrogate is combined into a single code point, so characters
     * outside the Basic Multilingual Plane are passed to uchr() as one code.
     *
     * //todo: rename in next major release to make the name correspond to reality (for ex. decodeIfUnicode())
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode(string $text): string
    {
        if (!preg_match('/^\xFE\xFF/i', $text)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $decode = substr($text, 2);
        $length = \strlen($decode);
        $text = '';

        for ($i = 0; $i < $length; $i += 2) {
            $code = (int) hexdec(bin2hex(substr($decode, $i, 2)));

            // A high surrogate needs a complete two-byte unit after it to form a pair.
            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 3 < $length) {
                $low = (int) hexdec(bin2hex(substr($decode, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    // Skip the low surrogate that was just consumed.
                    $i += 2;
                }
            }

            $text .= self::uchr($code);
        }

        return $text;
    }
384
385
    /**
     * Return the font space limit read from the configuration object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
392
393
    /**
     * Decode text by commands array.
     *
     * Commands of type 'n' carry a number: when it is below the font space limit, the text
     * that follows starts a new word. Commands of type '<' are hexadecimal strings; every
     * other command is treated as a literal string that may contain octal escapes.
     * Each collected word is decoded with decodeContent() and the words are joined by spaces.
     *
     * @param array $commands
     *
     * @return string
     */
    public function decodeText(array $commands): string
    {
        $spaceLimit = $this->getFontSpaceLimit();
        $chunks = [];
        $index = 0;

        foreach ($commands as $command) {
            $type = $command[PDFObject::TYPE];

            // Loose comparisons on purpose: they mirror switch/case matching.
            if ($type == 'n') {
                if ((float) trim($command[PDFObject::COMMAND]) < $spaceLimit) {
                    $index = \count($chunks);
                }

                continue;
            }

            if ($type == '<') {
                // Decode hexadecimal.
                $raw = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
            } else {
                // Decode octal (if necessary).
                $raw = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $raw = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $raw
            );

            // add content to result string
            $chunks[$index] = ($chunks[$index] ?? '').$raw;
        }

        $decodedWords = [];
        foreach ($chunks as $chunk) {
            $decodedWords[] = $this->decodeContent($chunk);
        }

        return implode(' ', $decodedWords);
    }
443
444
    /**
     * Decode given $text to "utf-8" encoded string.
     *
     * Strategy, in order: the ToUnicode CMap when the font has one; otherwise the Encoding
     * entry when it exists and yields a result; otherwise encoding autodetection.
     *
     * @param bool $unicode This parameter is deprecated and might be removed in a future release
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
        }

        $decoded = $this->has('Encoding') ? $this->decodeContentByEncoding($text) : null;

        return $decoded ?? $this->decodeContentByAutodetectIfNecessary($text);
    }
465
466
    /**
     * First try to decode $text by ToUnicode CMap.
     * If char translation not found in ToUnicode CMap tries:
     *  - If DescendantFonts exists tries to decode char by one of that fonts.
     *      - If have no success to decode by DescendantFonts interpret $text as a string with "Windows-1252" encoding.
     *  - If DescendantFonts does not exist just return "?" as decoded char.
     *
     * The text is consumed in steps of the source code width ($this->tableSizes['from']);
     * when that width is missing or zero the text is returned unchanged.
     *
     * NOTE(review): translateChar() as written in this class appears never to return false
     * unless the table itself holds a false value, so the fallback branches look unreachable
     * in practice - confirm before relying on them.
     *
     * //todo: Seems this is invalid algorithm that do not follow pdf-format specification. Must be rewritten.
     *
     * @param string $text
     *
     * @return string
     */
    private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
    {
        // The width is used as substr() length and loop step, so it must be a whole number.
        $bytes = (int) $this->tableSizes['from'];

        if ($bytes < 1) {
            return $text;
        }

        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            if (false !== $decoded) {
                $result .= $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $result .= $this->decodeCharByDescendantFonts($char);
            } else {
                $result .= self::MISSING;
            }
        }

        return $result;
    }

    /**
     * Decode one character code with the first descendant font able to translate it;
     * when none can, interpret the raw bytes as "Windows-1252".
     */
    private function decodeCharByDescendantFonts(string $char): string
    {
        $descendants = $this->get('DescendantFonts');

        if ($descendants instanceof PDFObject) {
            $fonts = $descendants->getHeader()->getElements();
        } else {
            $fonts = $descendants->getContent();
        }

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $decoded = $font->translateChar($char, false);

            if (false !== $decoded) {
                return mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
            }
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }
524
525
    /**
     * Decode content by any type of Encoding (dictionary's item) instance.
     *
     * @return string|null null when the Encoding entry is a plain element whose name has no known iconv equivalent
     *
     * @throws LogicException when the Encoding entry is neither an Encoding nor an Element
     */
    private function decodeContentByEncoding(string $text): ?string
    {
        $encoding = $this->get('Encoding');

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself does not contain `/Type /Encoding` in its dictionary.
        if ($encoding instanceof PDFObject) {
            $encoding = $this->getInitializedEncodingByPdfObject($encoding);
        }

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself contains `/Type /Encoding` in its dictionary.
        if ($encoding instanceof Encoding) {
            return $this->decodeContentByEncodingEncoding($text, $encoding);
        }

        // Encoding has unintended type.
        if (!($encoding instanceof Element)) {
            $encodingClassName = \get_class($encoding);
            throw new LogicException("Unknown encoding instance type: {$encodingClassName}");
        }

        // When Encoding is just string (/Encoding /WinAnsiEncoding)
        //todo: ElementString class must be used?
        return $this->decodeContentByEncodingElement($text, $encoding);
    }
551
552
    /**
     * Return the cached Encoding instance, creating and caching it from the given
     * PDFObject on first use.
     *
     * @param PDFObject $PDFObject
     *
     * @return Encoding
     */
    private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        if ($this->initializedEncodingByPdfObject) {
            return $this->initializedEncodingByPdfObject;
        }

        $encoding = $this->createInitializedEncodingByPdfObject($PDFObject);
        $this->initializedEncodingByPdfObject = $encoding;

        return $encoding;
    }
566
567
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Encoding.
     *
     * Every byte of $text is translated through the encoding; a byte the encoding does not
     * map (null result) is converted using its own value.
     */
    private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
    {
        $decoded = '';
        $total = \strlen($text);
        $position = 0;

        while ($position < $total) {
            $byte = \ord($text[$position]);
            $mapped = $encoding->translateChar($byte);
            $decoded .= self::uchr(null === $mapped ? $byte : $mapped);
            ++$position;
        }

        return $decoded;
    }
583
584
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Element.
     *
     * @return string|null null when the PDF encoding name has no known iconv equivalent
     */
    private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
    {
        // mb_convert_encoding does not support MacRoman/macintosh,
        // so we use iconv() here
        $iconvName = $this->getIconvEncodingNameOrNullByPdfEncodingName($encoding->getContent());

        if (!$iconvName) {
            return null;
        }

        return iconv($iconvName, 'UTF-8', $text);
    }
597
598
    /**
     * Convert PDF encoding name to iconv-known encoding name.
     *
     * @return string|null null for a PDF encoding name that is not in the map
     */
    private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
    {
        $iconvNamesByPdfName = [
            'StandardEncoding' => 'ISO-8859-1',
            'MacRomanEncoding' => 'MACINTOSH',
            'WinAnsiEncoding' => 'CP1252',
        ];

        return $iconvNamesByPdfName[$pdfEncodingName] ?? null;
    }
613
614
    /**
     * Return the text unchanged when it is valid "utf-8";
     * otherwise interpret it as "Windows-1252" and convert it to "utf-8".
     *
     * @param string $text
     *
     * @return string
     */
    private function decodeContentByAutodetectIfNecessary(string $text): string
    {
        $isUtf8 = mb_check_encoding($text, 'UTF-8');

        //todo: Why exactly `Windows-1252` used?
        return $isUtf8 ? $text : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
    }
630
631
    /**
     * Build an Encoding from the given PDFObject and call init() on it before returning it.
     */
    private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        $initialized = $this->createEncodingByPdfObject($PDFObject);
        $initialized->init();

        return $initialized;
    }
641
642
    /**
     * Create Encoding instance by PDFObject instance (without init).
     *
     * The new instance receives the document, header, content and config of the given object.
     */
    private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        return new Encoding(
            $PDFObject->getDocument(),
            $PDFObject->getHeader(),
            $PDFObject->getContent(),
            $PDFObject->getConfig()
        );
    }
654
}
655