Passed
Pull Request — master (#500)
by
unknown
03:02
created

decodeContentByToUnicodeCMapOrDescendantFonts()   B

Complexity

Conditions 10
Paths 2

Size

Total Lines 46
Code Lines 28

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 25.6155

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 10
eloc 28
c 1
b 0
f 0
nc 2
nop 1
dl 0
loc 46
ccs 12
cts 26
cp 0.4615
crap 25.6155
rs 7.6666

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use LogicException;
34
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
35
use Smalot\PdfParser\Exception\EncodingNotFoundException;
36
37
/**
38
 * Class Font
39
 */
40
class Font extends PDFObject
41
{
42
    const MISSING = '?';
43
44
    /**
45
     * @var array
46
     */
47
    protected $table = null;
48
49
    /**
50
     * @var array
51
     */
52
    protected $tableSizes = null;
53
54
    /**
55
     * Caches results from uchr.
56
     *
57
     * @var array
58
     */
59
    private static $uchrCache = [];
60
61
    /**
62
     * In some PDF-files encoding could be referenced by object id but object itself does not contain
63
     * `/Type /Encoding` in its dictionary. These objects wouldn't be initialized as Encoding in
64
     * \Smalot\PdfParser\PDFObject::factory() during file parsing (they would be just PDFObject).
65
     *
66
     * Therefore, we create an instance of Encoding from them during decoding and cache this value in this property.
67
     *
68
     * @var Encoding
69
     *
70
     * @see https://github.com/smalot/pdfparser/pull/500
71
     */
72
    private $initializedEncodingByPdfObject;
73
74 37
    /**
     * Initialise the font by building its character translation table.
     */
    public function init()
    {
        // The returned table is not needed here; the call is made for its caching side effect.
        $this->loadTranslateTable();
    }
79
80 2
    /**
     * Return the font name taken from the "BaseFont" entry, or "[Unknown]" when that entry is absent.
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
84
85 2
    /**
     * Return the "Subtype" header entry of this font, cast to string.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
89
90 1
    /**
     * Collect font details: name, type and encoding first, followed by whatever the parent reports.
     *
     * Because the arrays are combined with "+", the "Name", "Type" and "Encoding" values computed
     * here take precedence over same-named keys returned by the parent.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        $fontDetails = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        return $fontDetails + parent::getDetails($deep);
    }
102
103
    /**
     * Translate a raw character (one or more bytes) using the translation table.
     *
     * When the character code is not in the table:
     *  - with $use_default = true the MISSING placeholder is returned;
     *  - with $use_default = false the character itself is returned, except for single-byte
     *    characters of a font whose Encoding resolves to WinAnsiEncoding, which are converted
     *    to "utf-8" through uchr().
     *
     * @return string|bool
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // fallback for decoding single-byte ANSI characters that are not in the lookup table
        $fallback = $char;

        if (\strlen($char) < 2 && $this->has('Encoding')) {
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                try {
                    if (WinAnsiEncoding::class === $encoding->__toString()) {
                        $fallback = self::uchr($code);
                    }
                } catch (EncodingNotFoundException $e) {
                    // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exists
                    // See table 5.11 on PDF 1.5 specs for more info
                }
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
133
134
    /**
     * Convert a unicode character code to a "utf-8" encoded string.
     *
     * Results are memoised in self::$uchrCache, keyed by the code.
     */
    public static function uchr(int $code): string
    {
        if (isset(self::$uchrCache[$code])) {
            return self::$uchrCache[$code];
        }

        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore, we use mb_convert_encoding() instead
        $encoded = mb_convert_encoding("&#{$code};", 'UTF-8', 'HTML-ENTITIES');
        self::$uchrCache[$code] = $encoded;

        return $encoded;
    }
147
148
    /**
     * Init internal chars translation table by ToUnicode CMap.
     *
     * The table is built only once; later calls return the cached array. Three kinds of CMap
     * sections are read from the content of the "ToUnicode" entry:
     *  - codespacerange: the first entry of the first section sets the byte sizes in $tableSizes;
     *  - bfchar: one source code mapped to one destination string (also updates the 'from' size);
     *  - bfrange: a range of source codes mapped either to consecutive code points starting at
     *    an offset, or to an explicit list of destination strings.
     *
     * Byte sizes are always whole numbers: a hex code with an odd number of digits is rounded up
     * to the next full byte, so the sizes can be used safely as substr() lengths and loop steps.
     *
     * @return array map of integer character code => "utf-8" encoded string
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        // Support for multiple spacerange sections (only the first section is taken into account)
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

            preg_match_all($regexp, $sections['sections'][0], $ranges);

            $this->tableSizes = [
                'from' => self::hexByteLength($ranges['from'][0] ?? ''),
                'to' => self::hexByteLength($ranges['to'][0] ?? ''),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                $regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

                preg_match_all($regexp, $section, $pairs);

                // The source code size of the last bfchar section wins.
                $this->tableSizes['from'] = self::hexByteLength($pairs['from'][0] ?? '');

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::hexToUtf8String($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';

                preg_match_all($regexp, $section, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($ranges['to'][$key]);
                    $offset = hexdec($ranges['offset'][$key]);

                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF file has 2-byte Unicode values on new lines > added \r\n
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';

                preg_match_all($regexp, $section, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $ranges['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::hexToUtf8String($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Number of bytes needed to hold the given hex digits: rounded up, and never less than 1.
     */
    private static function hexByteLength(string $hex): int
    {
        return max(1, (int) ceil(\strlen($hex) / 2));
    }

    /**
     * Convert a hex string to "utf-8": every group of 4 hex digits (and any shorter remainder)
     * is read as one character code and passed through uchr().
     */
    private static function hexToUtf8String(string $hex): string
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr(hexdec($part));
        }

        return $text;
    }
258
259
    /**
     * Replace the character translation table with a custom one.
     *
     * Expected layout of $table:
     * - key - integer character code;
     * - value - "utf-8" encoded value;
     *
     * @return void
     */
    public function setTable(array $table)
    {
        // Plain assignment: the given table is stored as is, without validation or merging.
        $this->table = $table;
    }
270
271
    /**
     * Decode every "<hex digits>" token found in $hexa into its binary form; other text is kept as is.
     *
     * Input containing "<?xml" is returned untouched. If $add_braces is true each decoded token
     * is wrapped by parentheses and backslashes inside it are doubled.
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isHexToken = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isHexToken) {
                $decoded .= $chunk;
                continue;
            }

            // strip line breaks, then the surrounding angle brackets
            $hexDigits = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $binary = pack('H*', $hexDigits);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $decoded .= $binary;
            }
        }

        return $decoded;
    }
306
307
    /**
     * Replace every backslash followed by exactly three octal digits with the byte it denotes.
     *
     * All other text is left untouched.
     */
    public static function decodeOctal(string $text): string
    {
        return preg_replace_callback(
            '/\\\\([0-7]{3})/',
            static function (array $match): string {
                return \chr(octdec($match[1]));
            },
            $text
        );
    }
325
326
    /**
     * Decode "#XX" escapes, where XX are two hexadecimal digits, into the byte they denote.
     *
     * The digits are interpreted as hexadecimal, so they are matched as hexadecimal too
     * (case-insensitively): "#2F" becomes "/" and "#20" becomes a space. A "#" that is not
     * followed by two hex digits is left untouched.
     */
    public static function decodeEntities(string $text): string
    {
        return preg_replace_callback(
            '/#([0-9a-f]{2})/i',
            static function (array $match): string {
                return \chr(hexdec($match[1]));
            },
            $text
        );
    }
344
345
    /**
     * Check if given string is Unicode text (by the UTF-16BE byte order mark "\xFE\xFF");
     * If true - decode to "utf-8" encoded string.
     * Otherwise - return text as is.
     *
     * The payload is decoded as UTF-16BE as a whole, so surrogate pairs (characters outside the
     * Basic Multilingual Plane) are combined into a single character instead of being converted
     * one 16-bit unit at a time.
     *
     * @todo Rename in next major release to make the name correspond to reality (for ex. decodeIfUnicode())
     */
    public static function decodeUnicode(string $text): string
    {
        // Exact byte comparison: a case-insensitive regex has no meaning for raw BOM bytes.
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $utf16 = (string) substr($text, 2);

        // A dangling last byte is treated as a character code in the range 0-255,
        // which in UTF-16BE means prefixing it with a zero byte.
        if (1 === \strlen($utf16) % 2) {
            $utf16 = substr($utf16, 0, -1)."\x00".substr($utf16, -1);
        }

        return mb_convert_encoding($utf16, 'UTF-8', 'UTF-16BE');
    }
367
368
    /**
     * Return the font space limit taken from the configuration object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
375
376
    /**
     * Decode text by commands array.
     *
     * Commands of type 'n' carry a number: when it is below the font space limit a new word is
     * started. Commands of type '<' are hexadecimal strings, everything else is treated as a
     * string with octal escapes. Each collected word is passed through decodeContent() and the
     * words are joined by a single space.
     */
    public function decodeText(array $commands): string
    {
        $font_space = $this->getFontSpaceLimit();
        $words = [];
        $current = 0;

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    // A sufficiently small number starts a new word; the command itself carries no text.
                    if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
                        $current = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $piece = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $piece = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $piece = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $piece
            );

            // add content to result string
            $words[$current] = ($words[$current] ?? '').$piece;
        }

        $decodedWords = [];
        foreach ($words as $index => $word) {
            $decodedWords[$index] = $this->decodeContent($word);
        }

        return implode(' ', $decodedWords);
    }
423
424
    /**
     * Decode given $text to "utf-8" encoded string.
     *
     * Strategy, in order: the ToUnicode CMap when the font has one; otherwise the font's
     * Encoding entry when present and able to decode; otherwise autodetection.
     *
     * @param bool $unicode This parameter is deprecated and might be removed in a future release
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
        }

        $decoded = $this->has('Encoding') ? $this->decodeContentByEncoding($text) : null;

        if (null === $decoded) {
            return $this->decodeContentByAutodetectIfNecessary($text);
        }

        return $decoded;
    }
445
446
    /**
     * First try to decode $text by ToUnicode CMap.
     * If char translation not found in ToUnicode CMap tries:
     *  - If DescendantFonts exists tries to decode char by one of that fonts.
     *      - If have no success to decode by DescendantFonts interpret $text as a string with "Windows-1252" encoding.
     *  - If DescendantFonts does not exist just return "?" as decoded char.
     *
     * The text is consumed in chunks of $this->tableSizes['from'] bytes; when that size is
     * empty the text is returned unchanged.
     *
     * @todo Seems this is invalid algorithm that do not follow pdf-format specification. Must be rewritten.
     */
    private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
    {
        $chunkSize = $this->tableSizes['from'];

        if (!$chunkSize) {
            return $text;
        }

        $output = '';
        $total = \strlen($text);

        for ($offset = 0; $offset < $total; $offset += $chunkSize) {
            $char = substr($text, $offset, $chunkSize);
            $translated = $this->translateChar($char, false);

            if (false !== $translated) {
                $output .= $translated;
                continue;
            }

            if (!$this->has('DescendantFonts')) {
                $output .= self::MISSING;
                continue;
            }

            $descendants = $this->get('DescendantFonts');
            $fonts = $descendants instanceof PDFObject
                ? $descendants->getHeader()->getElements()
                : $descendants->getContent();

            // The first descendant font that can translate the char wins.
            $fromDescendant = false;
            foreach ($fonts as $font) {
                if (!$font instanceof self) {
                    continue;
                }

                $candidate = $font->translateChar($char, false);
                if (false !== $candidate) {
                    $fromDescendant = mb_convert_encoding($candidate, 'UTF-8', 'Windows-1252');
                    break;
                }
            }

            $output .= false !== $fromDescendant
                ? $fromDescendant
                : mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
        }

        return $output;
    }
502
503
    /**
     * Decode content by any type of Encoding (dictionary's item) instance.
     *
     * @return string|null decoded text, or null when the Element-based decoder could not decode it
     *
     * @throws LogicException if unknown encoding instance type is used (given by $this->get('Encoding'))
     */
    private function decodeContentByEncoding(string $text): ?string
    {
        $encoding = $this->get('Encoding');

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself does not contain `/Type /Encoding` in its dictionary.
        if ($encoding instanceof PDFObject) {
            $encoding = $this->getInitializedEncodingByPdfObject($encoding);
        }

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself contains `/Type /Encoding` in its dictionary.
        if ($encoding instanceof Encoding) {
            return $this->decodeContentByEncodingEncoding($text, $encoding);
        }

        // Anything that is not an Element at this point has an unintended type.
        if (!$encoding instanceof Element) {
            $unexpectedClass = \get_class($encoding);
            throw new LogicException("Unknown encoding instance type: {$unexpectedClass}");
        }

        // When Encoding is just string (/Encoding /WinAnsiEncoding)
        //todo: ElementString class must by used?
        return $this->decodeContentByEncodingElement($text, $encoding);
    }
531
532
    /**
     * Return the cached Encoding instance, creating and caching it from $PDFObject on first use.
     *
     * Once an instance has been cached it is returned for every later call,
     * whatever $PDFObject is passed.
     */
    private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        if ($this->initializedEncodingByPdfObject) {
            return $this->initializedEncodingByPdfObject;
        }

        $created = $this->createInitializedEncodingByPdfObject($PDFObject);
        $this->initializedEncodingByPdfObject = $created;

        return $created;
    }
543
544
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Encoding.
     *
     * Each byte of $text is translated through the encoding; when the encoding yields null for
     * a byte, the byte value itself is used as the character code.
     */
    private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
    {
        $output = '';

        for ($position = 0, $size = \strlen($text); $position < $size; ++$position) {
            $sourceCode = \ord($text[$position]);
            $mappedCode = $encoding->translateChar($sourceCode);

            $output .= self::uchr(null !== $mappedCode ? $mappedCode : $sourceCode);
        }

        return $output;
    }
560
561
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Element.
     *
     * @return string|null "utf-8" text, or null when the encoding name is not supported or the
     *                     conversion failed, so that the caller can fall back to another strategy
     */
    private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
    {
        $pdfEncodingName = $encoding->getContent();

        // mb_convert_encoding does not support MacRoman/macintosh,
        // so we use iconv() here
        $iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($pdfEncodingName);

        if (null === $iconvEncodingName) {
            return null;
        }

        $converted = iconv($iconvEncodingName, 'UTF-8', $text);

        // iconv() returns false on failure (for example on a byte that is illegal in the source
        // encoding). Returning it directly would be coerced to an empty string by the return
        // type and silently drop the text; null lets the caller try its fallback instead.
        return false === $converted ? null : $converted;
    }
574
575
    /**
     * Convert PDF encoding name to iconv-known encoding name.
     *
     * @return string|null the iconv name, or null when the PDF encoding name is not in the map
     */
    private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
    {
        $iconvNamesByPdfName = [
            'StandardEncoding' => 'ISO-8859-1',
            'MacRomanEncoding' => 'MACINTOSH',
            'WinAnsiEncoding' => 'CP1252',
        ];

        return $iconvNamesByPdfName[$pdfEncodingName] ?? null;
    }
590
591
    /**
     * If string seems like "utf-8" encoded string do nothing and just return given string as is.
     * Otherwise, interpret string as "Windows-1252" encoded string.
     *
     * @return string|false
     */
    private function decodeContentByAutodetectIfNecessary(string $text)
    {
        $looksLikeUtf8 = mb_check_encoding($text, 'UTF-8');

        //todo: Why exactly `Windows-1252` used?
        return $looksLikeUtf8 ? $text : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
    }
606
607
    /**
     * Create Encoding instance by PDFObject instance and init it.
     */
    private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        $initialized = $this->createEncodingByPdfObject($PDFObject);

        // init() is called before the instance is handed out.
        $initialized->init();

        return $initialized;
    }
617
618
    /**
     * Create Encoding instance by PDFObject instance (without init).
     *
     * The new instance receives the document, header, content and config of the given object.
     */
    private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        return new Encoding(
            $PDFObject->getDocument(),
            $PDFObject->getHeader(),
            $PDFObject->getContent(),
            $PDFObject->getConfig()
        );
    }
630
}
631