Passed
Pull Request — master (#500)
by
unknown
02:28
created

decodeContentByToUnicodeCMapOrDescendantFonts()   B

Complexity

Conditions 10
Paths 2

Size

Total Lines 46
Code Lines 28

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 25.6155

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 10
eloc 28
c 1
b 0
f 0
nc 2
nop 1
dl 0
loc 46
ccs 12
cts 26
cp 0.4615
crap 25.6155
rs 7.6666

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use LogicException;
34
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
35
use Smalot\PdfParser\Exception\EncodingNotFoundException;
36
37
/**
38
 * Class Font
39
 */
40
class Font extends PDFObject
41
{
42
    const MISSING = '?';
43
44
    /**
45
     * @var array
46
     */
47
    protected $table = null;
48
49
    /**
50
     * @var array
51
     */
52
    protected $tableSizes = null;
53
54
    /**
55
     * Caches results from uchr.
56
     *
57
     * @var array
58
     */
59
    private static $uchrCache = [];
60
61
    /**
62
     * In some pdf-files (@see https://github.com/smalot/pdfparser/pull/500) encoding could be referenced by object id
63
     * but the object itself does not contain `/Type /Encoding` in its dictionary. These objects wouldn't be initialized as
64
     * Encoding in \Smalot\PdfParser\PDFObject::factory() during file parsing (they would be just PDFObject).
65
     *
66
     * Therefore, we create Encoding instance from them during decoding and cache this value in this property.
67
     *
68
     * @var Encoding
69
     */
70
    private $initializedEncodingByPdfObject;
71
72 37
    /**
     * Prepare the font for decoding by building its character translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
77
78 2
    /**
     * Name of the font as stored in its "BaseFont" entry.
     *
     * @return string the "BaseFont" value cast to string, or "[Unknown]" when the entry is absent
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
82
83 2
    /**
     * Font type, read from the "Subtype" entry of the object header.
     *
     * @return string the "Subtype" value cast to string
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
87
88 1
    /**
     * Collect the font details: name, type and encoding, completed by the parent's details.
     *
     * @param bool $deep forwarded unchanged to the parent implementation
     *
     * @return array "Name", "Type" and "Encoding" ("Ansi" when no "Encoding" entry exists)
     *               followed by whatever the parent reports
     */
    public function getDetails(bool $deep = true): array
    {
        $fontDetails = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        // Array union: the font-specific entries win when the parent reports the same keys.
        return $fontDetails + parent::getDetails($deep);
    }
100
101
    /**
     * Translate one character code (given as raw bytes) to its decoded string.
     *
     * The code is looked up in the translation table first. When it is not there:
     *  - with $use_default = true, self::MISSING is returned;
     *  - with $use_default = false, a single-byte code is converted with uchr() if the font's
     *    "Encoding" entry is an Encoding instance whose string form is the WinAnsiEncoding class
     *    name; in every other case the raw input bytes are returned unchanged.
     *
     * @param string $char        raw bytes of the character code
     * @param bool   $use_default whether a code missing from the table yields self::MISSING
     *
     * @return string|bool annotated as string|bool for backward compatibility; every path in this
     *                     method returns a table entry, self::MISSING or a string, never false
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        // The table is normally built by init() or injected through setTable(). Build it on demand
        // so that a lookup on a not-yet-initialised font does not pass null to array_key_exists().
        if (null === $this->table) {
            $this->loadTranslateTable();
        }

        $dec = hexdec(bin2hex($char));

        if (\array_key_exists($dec, $this->table)) {
            return $this->table[$dec];
        }

        // The fallback below is only ever returned when no default is requested,
        // so skip the encoding lookup entirely otherwise.
        if ($use_default) {
            return self::MISSING;
        }

        // Fallback for decoding single-byte ANSI characters that are not in the lookup table.
        $fallbackDecoded = $char;

        if (
            \strlen($char) < 2
            && $this->has('Encoding')
            && $this->get('Encoding') instanceof Encoding
        ) {
            try {
                if (WinAnsiEncoding::class === $this->get('Encoding')->__toString()) {
                    $fallbackDecoded = self::uchr($dec);
                }
            } catch (EncodingNotFoundException $e) {
                // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exist.
                // See table 5.11 on PDF 1.5 specs for more info.
            }
        }

        return $fallbackDecoded;
    }
131
132
    /**
     * Convert a Unicode code point to a "utf-8" encoded string.
     *
     * Results are memoised per code point in self::$uchrCache.
     *
     * @param int $code Unicode code point
     *
     * @return string
     */
    public static function uchr(int $code): string
    {
        if (isset(self::$uchrCache[$code])) {
            return self::$uchrCache[$code];
        }

        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore the numeric entity is run through mb_convert_encoding() instead.
        $encoded = mb_convert_encoding("&#{$code};", 'UTF-8', 'HTML-ENTITIES');
        self::$uchrCache[$code] = $encoded;

        return $encoded;
    }
148
149
    /**
     * Build (once) the internal character translation table from the ToUnicode CMap.
     *
     * The table maps integer character codes to decoded strings. $this->tableSizes is filled
     * alongside it with the byte width of source ("from") and destination ("to") codes, both
     * defaulting to 1. When the table already exists it is returned untouched.
     *
     * @return array the translation table (empty when the font has no "ToUnicode" entry)
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = ['from' => 1, 'to' => 1];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $cmap = $this->get('ToUnicode')->getContent();

        // Destination strings are read as groups of four hex digits (a shorter remainder forms
        // its own group); every group is converted separately and the pieces are concatenated.
        $hexToText = static function (string $hex): string {
            $groups = preg_split('/([0-9A-F]{4})/i', $hex, 0, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
            $decoded = '';

            foreach ($groups as $group) {
                $decoded .= self::uchr(hexdec($group));
            }

            return $decoded;
        };

        // codespacerange: only the first section is used; it fixes the byte width of the codes.
        $spaceSections = [];
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $cmap, $spaceSections)) {
            $bounds = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $spaceSections['sections'][0],
                $bounds
            );

            // Two hex digits per byte, never less than one byte.
            $this->tableSizes = [
                'from' => max(1, \strlen(current($bounds['from'])) / 2),
                'to' => max(1, \strlen(current($bounds['to'])) / 2),
            ];
        }

        // bfchar sections: one source code mapped to one destination string.
        $charSections = [];
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $cmap, $charSections)) {
            foreach ($charSections['sections'] as $charSection) {
                $pairs = [];
                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $charSection, $pairs);

                $this->tableSizes['from'] = max(1, \strlen(current($pairs['from'])) / 2);

                foreach ($pairs['from'] as $index => $sourceCode) {
                    $this->table[hexdec($sourceCode)] = $hexToText($pairs['to'][$index]);
                }
            }
        }

        // bfrange sections: a range of source codes mapped to destinations.
        $rangeSections = [];
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $cmap, $rangeSections)) {
            foreach ($rangeSections['sections'] as $rangeSection) {
                // Form: <srcCode1> <srcCode2> <dstString>
                $linear = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $rangeSection,
                    $linear
                );

                foreach ($linear['from'] as $index => $sourceCode) {
                    $firstCode = hexdec($sourceCode);
                    $lastCode = hexdec($linear['to'][$index]);
                    $firstTarget = hexdec($linear['offset'][$index]);

                    for ($code = $firstCode; $code <= $lastCode; ++$code) {
                        $this->table[$code] = self::uchr($code - $firstCode + $firstTarget);
                    }
                }

                // Form: <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the 2-byte Unicode values on new lines, hence \r\n in the class.
                $listed = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $rangeSection,
                    $listed
                );

                foreach ($listed['from'] as $index => $sourceCode) {
                    $firstCode = hexdec($sourceCode);
                    $destinations = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $listed['strings'][$index], $destinations);

                    foreach ($destinations['string'] as $position => $hex) {
                        $this->table[$firstCode + $position] = $hexToText($hex);
                    }
                }
            }
        }

        return $this->table;
    }
261
262
    /**
     * Replace the character translation table with a custom one.
     *
     * Expected layout of $table:
     * - key: integer character code;
     * - value: "utf-8" encoded string for that code.
     *
     * @param array $table
     *
     * @return void
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
274
275
    /**
     * Decode every "<hex digits>" chunk found in the given string to raw bytes.
     *
     * Text outside such chunks is copied through unchanged, and input containing "<?xml" is
     * returned untouched. With $add_braces set, each decoded chunk is wrapped in parentheses
     * and the backslashes inside it are doubled.
     *
     * @param string $hexa       text that may contain hexadecimal strings
     * @param bool   $add_braces wrap decoded chunks in "(" and ")"
     *
     * @return string
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isHexString = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isHexString) {
                $decoded .= $chunk;
                continue;
            }

            // Strip line breaks and the surrounding angle brackets before unpacking the digits.
            $digits = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $bytes = pack('H*', $digits);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $bytes).')';
            } else {
                $decoded .= $bytes;
            }
        }

        return $decoded;
    }
314
315
    /**
     * Decode the octal escape sequences of a PDF literal string.
     *
     * A backslash followed by one to three octal digits is replaced by the byte with that value
     * (only the low 8 bits are kept). An escaped backslash ("\\") is consumed as a pair and left
     * untouched, so digits following it are not mistaken for an octal escape.
     *
     * @param string $text text that may contain octal escape sequences
     *
     * @return string
     */
    public static function decodeOctal(string $text): string
    {
        $decoded = preg_replace_callback(
            '/\\\\(\\\\|[0-7]{1,3})/',
            static function (array $match): string {
                // "\\" is an escaped backslash, not the start of an octal escape: keep it as is.
                if ('\\' === $match[1]) {
                    return $match[0];
                }

                return \chr(octdec($match[1]) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; fall back to the untouched input.
        return null === $decoded ? $text : $decoded;
    }
336
337
    /**
     * Decode "#xx" escapes, where xx is a two-digit hexadecimal byte value.
     *
     * Each escape is replaced by the byte it denotes; both upper- and lower-case hex digits are
     * accepted. A "#" that is not followed by two hex digits is left unchanged.
     *
     * @param string $text text that may contain "#xx" escapes
     *
     * @return string
     */
    public static function decodeEntities(string $text): string
    {
        $decoded = preg_replace_callback(
            '/#([0-9A-Fa-f]{2})/',
            static function (array $match): string {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; fall back to the untouched input.
        return null === $decoded ? $text : $decoded;
    }
357
358
    /**
     * Decode the given string to "utf-8" if it starts with the UTF-16BE byte order mark (FE FF);
     * otherwise return it as is.
     *
     * The payload after the mark is converted as UTF-16BE, so surrogate pairs (characters outside
     * the Basic Multilingual Plane) are decoded as one character. A dangling last byte of an
     * odd-length payload is decoded on its own as the code point of that byte.
     *
     * //todo: rename in next major release to make the name correspond to reality (for ex. decodeIfUnicode())
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode(string $text): string
    {
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = (string) substr($text, 2);
        $tail = '';

        if (1 === \strlen($payload) % 2) {
            // Incomplete trailing code unit: keep decoding the lone byte as its own code point.
            $tail = self::uchr(\ord($payload[\strlen($payload) - 1]));
            $payload = (string) substr($payload, 0, -1);
        }

        return (string) mb_convert_encoding($payload, 'UTF-8', 'UTF-16BE').$tail;
    }
382
383
    /**
     * Font space limit as reported by the configuration object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
390
391
    /**
     * Decode text from an array of commands.
     *
     * Each command is read through its PDFObject::TYPE and PDFObject::COMMAND entries:
     *  - type "n": when the numeric command value is below the font space limit, the text that
     *    follows starts a new word; the command itself produces no text;
     *  - type "<": the command is decoded as a hexadecimal string;
     *  - any other type: the command is decoded for octal escapes.
     * Backslash escapes are then resolved, the pieces are grouped into words, every word is run
     * through decodeContent() and the words are joined with single spaces.
     *
     * @param array $commands
     *
     * @return string
     */
    public function decodeText(array $commands): string
    {
        // Resolved with strtr() in a single left-to-right pass: a sequential str_replace() would
        // rescan its own output and turn an escaped backslash followed by "n" into a newline.
        $escapes = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars.
            $text = strtr($text, $escapes);

            // Append to the current word, creating it on first use.
            $words[$word_position] = ($words[$word_position] ?? '').$text;
        }

        // Iterate by key rather than by reference so no dangling reference is left behind.
        foreach ($words as $index => $word) {
            $words[$index] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
441
442
    /**
     * Decode given $text to "utf-8" encoded string.
     *
     * Strategy, in order: the ToUnicode CMap when the font has one; otherwise the "Encoding"
     * entry when present and able to decode the text; otherwise auto-detection.
     *
     * @param bool $unicode This parameter is deprecated and might be removed in a future release
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
        }

        $decoded = $this->has('Encoding') ? $this->decodeContentByEncoding($text) : null;

        return null !== $decoded ? $decoded : $this->decodeContentByAutodetectIfNecessary($text);
    }
463
464
    /**
     * Decode $text chunk by chunk, using the ToUnicode CMap first.
     *
     * The text is cut into chunks of $this->tableSizes['from'] bytes. For a chunk whose
     * translation comes back as false:
     *  - if DescendantFonts exists, each descendant font is asked in turn and the first result is
     *    converted from "Windows-1252"; when none succeeds the chunk itself is converted from
     *    "Windows-1252";
     *  - if DescendantFonts does not exist, "?" is used.
     * When the chunk size is falsy the text is returned unchanged.
     *
     * //todo: Seems this is invalid algorithm that do not follow pdf-format specification. Must be rewritten.
     *
     * @param string $text
     *
     * @return string
     */
    private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
    {
        $chunkSize = $this->tableSizes['from'];

        if (!$chunkSize) {
            return $text;
        }

        $decodedText = '';
        $total = \strlen($text);

        for ($offset = 0; $offset < $total; $offset += $chunkSize) {
            $chunk = substr($text, $offset, $chunkSize);
            $translated = $this->translateChar($chunk, false);

            if (false !== $translated) {
                $decodedText .= $translated;
                continue;
            }

            if (!$this->has('DescendantFonts')) {
                $decodedText .= self::MISSING;
                continue;
            }

            $descendants = $this->get('DescendantFonts');
            $fonts = $descendants instanceof PDFObject
                ? $descendants->getHeader()->getElements()
                : $descendants->getContent();

            $fromDescendant = false;

            foreach ($fonts as $font) {
                if (!$font instanceof self) {
                    continue;
                }

                $fromDescendant = $font->translateChar($chunk, false);

                if (false !== $fromDescendant) {
                    $fromDescendant = mb_convert_encoding($fromDescendant, 'UTF-8', 'Windows-1252');
                    break;
                }
            }

            $decodedText .= false !== $fromDescendant
                ? $fromDescendant
                : mb_convert_encoding($chunk, 'UTF-8', 'Windows-1252');
        }

        return $decodedText;
    }
522
523
    /**
     * Decode content using the font's "Encoding" entry, whatever kind of instance it is.
     *
     * @return string|null the decoded text, or null when the element-based decoder cannot handle
     *                     the encoding
     *
     * @throws LogicException when the entry is neither a PDFObject, an Encoding nor an Element
     */
    private function decodeContentByEncoding(string $text): ?string
    {
        $encoding = $this->get('Encoding');

        // Encoding referenced by object id (/Encoding 520 0 R) whose object does not contain
        // `/Type /Encoding` in its dictionary: build (or reuse) an Encoding instance from it.
        if ($encoding instanceof PDFObject) {
            $encoding = $this->getInitializedEncodingByPdfObject($encoding);
        }

        // Encoding referenced by object id (/Encoding 520 0 R) whose object contains
        // `/Type /Encoding` in its dictionary, or the instance created just above.
        if ($encoding instanceof Encoding) {
            return $this->decodeContentByEncodingEncoding($text, $encoding);
        }

        // Anything that is not an Element at this point has an unintended type.
        if (!$encoding instanceof Element) {
            $encodingClassName = \get_class($encoding);
            throw new LogicException("Unknown encoding instance type: {$encodingClassName}");
        }

        // Encoding is just a name (/Encoding /WinAnsiEncoding).
        //todo: ElementString class must by used?
        return $this->decodeContentByEncodingElement($text, $encoding);
    }
549
550
    /**
     * Return the Encoding instance built from a PDFObject, creating it on first use.
     *
     * The instance is cached on the font; once it exists it is returned for every later call,
     * regardless of the argument.
     *
     * @param PDFObject $PDFObject
     *
     * @return Encoding
     */
    private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        if (null === $this->initializedEncodingByPdfObject) {
            $this->initializedEncodingByPdfObject = $this->createInitializedEncodingByPdfObject($PDFObject);
        }

        return $this->initializedEncodingByPdfObject;
    }
564
565
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Encoding.
     *
     * Every byte of $text is translated through the encoding; when the encoding has no
     * translation for a byte (null), the byte value itself is used as the code point.
     */
    private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
    {
        $decoded = '';
        $byteCount = \strlen($text);

        for ($index = 0; $index < $byteCount; ++$index) {
            $byteValue = \ord($text[$index]);
            $mapped = $encoding->translateChar($byteValue);
            $decoded .= self::uchr(null !== $mapped ? $mapped : $byteValue);
        }

        return $decoded;
    }
581
582
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Element.
     *
     * The element content is taken as the PDF encoding name and mapped to an iconv encoding name.
     *
     * @return string|null the converted text; null when the encoding name is not supported or
     *                     when iconv() fails, so that the caller can fall back to another strategy
     */
    private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
    {
        $pdfEncodingName = $encoding->getContent();

        // mb_convert_encoding does not support MacRoman/macintosh,
        // so we use iconv() here
        $iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($pdfEncodingName);

        if (null === $iconvEncodingName) {
            return null;
        }

        $converted = iconv($iconvEncodingName, 'UTF-8', $text);

        // iconv() returns false on failure; report "not decoded" instead of letting the
        // return type coerce false into an empty string and silently drop the text.
        return false === $converted ? null : $converted;
    }
595
596
    /**
     * Convert PDF encoding name to iconv-known encoding name.
     *
     * @return string|null the iconv name, or null for a PDF encoding name that is not in the map
     */
    private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
    {
        $iconvNamesByPdfName = [
            'StandardEncoding' => 'ISO-8859-1',
            'MacRomanEncoding' => 'MACINTOSH',
            'WinAnsiEncoding' => 'CP1252',
        ];

        return $iconvNamesByPdfName[$pdfEncodingName] ?? null;
    }
611
612
    /**
     * If the string is valid "utf-8", return it as is.
     * Otherwise, interpret it as a "Windows-1252" encoded string and convert it to "utf-8".
     *
     * //todo: Why exactly `Windows-1252` used?
     *
     * @param string $text
     *
     * @return string
     */
    private function decodeContentByAutodetectIfNecessary(string $text): string
    {
        return mb_check_encoding($text, 'UTF-8')
            ? $text
            : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
    }
628
629
    /**
     * Create an Encoding instance from a PDFObject instance and call init() on it.
     */
    private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        $created = $this->createEncodingByPdfObject($PDFObject);
        $created->init();

        return $created;
    }
639
640
    /**
     * Create an Encoding instance from a PDFObject instance (without init).
     *
     * The new instance receives the document, header, content and config of the given object.
     */
    private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        return new Encoding(
            $PDFObject->getDocument(),
            $PDFObject->getHeader(),
            $PDFObject->getContent(),
            $PDFObject->getConfig()
        );
    }
652
}
653