Passed
Pull Request — master (#500)
by Jeremy
02:37
created

decodeContentByToUnicodeCMapOrDescendantFonts()   B

Complexity

Conditions 10
Paths 2

Size

Total Lines 46
Code Lines 28

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 25.6155

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 10
eloc 28
c 1
b 0
f 0
nc 2
nop 1
dl 0
loc 46
ccs 12
cts 26
cp 0.4615
crap 25.6155
rs 7.6666

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php

/**
 * @file
 *          This file is part of the PdfParser library.
 *
 * @author  Sébastien MALOT <[email protected]>
 * @date    2017-01-03
 *
 * @license LGPLv3
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
use Smalot\PdfParser\Exception\EncodingNotFoundException;
35
36
/**
37
 * Class Font
38
 */
39
class Font extends PDFObject
40
{
41
    const MISSING = '?';

    /**
     * Character translation table (integer character code => "utf-8" encoded string).
     * Stays null until loadTranslateTable() or setTable() assigns it.
     *
     * @var array|null
     */
    protected $table = null;

    /**
     * Widths, in bytes, of the source ('from') and destination ('to') codes of the translation table.
     * Stays null until loadTranslateTable() assigns it.
     *
     * @var array|null
     */
    protected $tableSizes = null;

    /**
     * Caches results from uchr.
     *
     * @var array
     */
    private static $uchrCache = [];

    /**
     * In some PDF-files encoding could be referenced by object id but object itself does not contain
     * `/Type /Encoding` in its dictionary. These objects wouldn't be initialized as Encoding in
     * \Smalot\PdfParser\PDFObject::factory() during file parsing (they would be just PDFObject).
     *
     * Therefore, we create an instance of Encoding from them during decoding and cache this value in this property.
     *
     * @var Encoding|null
     *
     * @see https://github.com/smalot/pdfparser/pull/500
     */
    private $initializedEncodingByPdfObject;
72
73 38
    /**
     * Initialise the font by building its character translation table.
     */
    public function init()
    {
        // Load translate table.
        $this->loadTranslateTable();
    }
78
79 2
    /**
     * Return the font's BaseFont entry as a string, or "[Unknown]" when the entry is absent.
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
83
84 2
    /**
     * Return the Subtype entry of the font header as a string.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
88
89 1
    /**
     * Return font details: Name, Type and Encoding ("Ansi" when no Encoding entry exists),
     * completed with the details reported by the parent class.
     *
     * Keys set here take precedence: the parent's details are merged with the array union
     * operator, which never overwrites existing keys.
     */
    public function getDetails(bool $deep = true): array
    {
        $details = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        return $details + parent::getDetails($deep);
    }
101
102
    /**
     * Translate a raw character (one or more bytes) through the translation table.
     *
     * When the character code is not in the table:
     *  - with $use_default = true, self::MISSING is returned;
     *  - with $use_default = false, the given $char is returned unchanged, except for a
     *    single-byte char of a font whose Encoding entry resolves to WinAnsiEncoding, which is
     *    returned as the "utf-8" encoded character with that code.
     *
     * @return string|bool
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        $dec = hexdec(bin2hex($char));

        // The table is only built by loadTranslateTable()/setTable(); build it on demand so a
        // call made before init() does not hand null to array_key_exists().
        $table = null !== $this->table ? $this->table : $this->loadTranslateTable();

        if (\array_key_exists($dec, $table)) {
            return $table[$dec];
        }

        if ($use_default) {
            return self::MISSING;
        }

        // fallback for decoding single-byte ANSI characters that are not in the lookup table
        $fallbackDecoded = $char;

        if (
            \strlen($char) < 2
            && $this->has('Encoding')
            && $this->get('Encoding') instanceof Encoding
        ) {
            try {
                if (WinAnsiEncoding::class === $this->get('Encoding')->__toString()) {
                    $fallbackDecoded = self::uchr((int) $dec);
                }
            } catch (EncodingNotFoundException $e) {
                // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exists
                // See table 5.11 on PDF 1.5 specs for more info
            }
        }

        return $fallbackDecoded;
    }
132
133
    /**
     * Convert unicode character code to "utf-8" encoded string.
     *
     * Results are memoised per code in self::$uchrCache.
     *
     * NOTE(review): the 'HTML-ENTITIES' mbstring encoding is reported as deprecated since PHP 8.2;
     * confirm against the supported PHP versions before relying on it long-term.
     */
    public static function uchr(int $code): string
    {
        if (isset(self::$uchrCache[$code])) {
            return self::$uchrCache[$code];
        }

        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore, we use mb_convert_encoding() instead
        $char = mb_convert_encoding("&#{$code};", 'UTF-8', 'HTML-ENTITIES');
        self::$uchrCache[$code] = $char;

        return $char;
    }
146
147
    /**
     * Init internal chars translation table by ToUnicode CMap.
     *
     * The table is built once and kept in $this->table; subsequent calls return it as is.
     * When the font has a ToUnicode entry, its content is scanned for codespacerange, bfchar
     * and bfrange sections (in that order).
     *
     * @return array translation table (integer character code => "utf-8" encoded string)
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if ($this->has('ToUnicode')) {
            $content = (string) $this->get('ToUnicode')->getContent();

            $this->loadCodeSpaceRangeSizes($content);
            $this->loadBfCharSections($content);
            $this->loadBfRangeSections($content);
        }

        return $this->table;
    }

    /**
     * Set the 'from'/'to' code widths from the first pair of the first codespacerange section.
     */
    private function loadCodeSpaceRangeSizes(string $content): void
    {
        $section = [];
        if (!preg_match('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $section)) {
            return;
        }

        $range = [];
        preg_match('/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section['sections'], $range);

        $this->tableSizes = [
            'from' => self::hexByteLength($range['from'] ?? ''),
            'to' => self::hexByteLength($range['to'] ?? ''),
        ];
    }

    /**
     * Add the one-to-one mappings of every bfchar section to the table.
     */
    private function loadBfCharSections(string $content): void
    {
        $sections = [];
        if (!preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            return;
        }

        foreach ($sections['sections'] as $section) {
            $pairs = [];
            preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

            // The width of the source codes is taken from the first pair of the section.
            $this->tableSizes['from'] = self::hexByteLength($pairs['from'][0] ?? '');

            foreach ($pairs['from'] ?? [] as $key => $from) {
                $this->table[hexdec($from)] = self::decodeHexCodeUnits($pairs['to'][$key]);
            }
        }
    }

    /**
     * Add the range mappings of every bfrange section to the table.
     */
    private function loadBfRangeSections(string $content): void
    {
        $sections = [];
        if (!preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            return;
        }

        foreach ($sections['sections'] as $section) {
            // Support for : <srcCode1> <srcCode2> <dstString>
            $ranges = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                $section,
                $ranges
            );

            foreach ($ranges['from'] ?? [] as $key => $from) {
                $char_from = hexdec($from);
                $char_to = hexdec($ranges['to'][$key]);
                $offset = hexdec($ranges['offset'][$key]);

                for ($char = $char_from; $char <= $char_to; ++$char) {
                    $this->table[$char] = self::uchr((int) ($char - $char_from + $offset));
                }
            }

            // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
            // Some PDF files have 2-byte Unicode values on new lines, hence \r\n in the pattern.
            $lists = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                $section,
                $lists
            );

            foreach ($lists['from'] ?? [] as $key => $from) {
                $char_from = hexdec($from);
                $strings = [];

                preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                foreach ($strings['string'] as $position => $string) {
                    $this->table[$char_from + $position] = self::decodeHexCodeUnits($string);
                }
            }
        }
    }

    /**
     * Number of bytes described by a hexadecimal string, never less than 1.
     *
     * Always an integer: an odd number of hex digits is rounded up instead of producing a
     * fractional width.
     */
    private static function hexByteLength(string $hex): int
    {
        return max(1, intdiv(\strlen($hex) + 1, 2));
    }

    /**
     * Decode a hexadecimal destination string, 4 hex digits per character, to a "utf-8" encoded string.
     * A trailing group shorter than 4 digits is decoded as one character as well.
     */
    private static function decodeHexCodeUnits(string $hex): string
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr((int) hexdec($part));
        }

        return $text;
    }
257
258
    /**
     * Set custom char translation table where:
     * - key - integer character code;
     * - value - "utf-8" encoded value;
     *
     * Only the table itself is replaced; $this->tableSizes is left untouched.
     *
     * @return void
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
269
270
    /**
     * Decode hexadecimal encoded string. If $add_braces is true result value would be wrapped by parentheses.
     *
     * Every `<hex digits>` chunk of $hexa is replaced by the bytes it encodes; text outside such
     * chunks is copied unchanged. With $add_braces, each decoded chunk is wrapped in parentheses
     * and its backslashes are doubled. Input containing `<?xml` is returned untouched.
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $text = '';
        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);

        foreach ($parts as $part) {
            $isHexChunk = preg_match('/^<.*>$/s', $part) && false === stripos($part, '<?xml');

            if (!$isHexChunk) {
                $text .= $part;
                continue;
            }

            // strip line breaks
            $hex = trim(preg_replace("/[\r\n]/", '', $part), '<>');
            $decoded = pack('H*', $hex);

            if ($add_braces) {
                $text .= '('.preg_replace('/\\\/s', '\\\\\\', $decoded).')';
            } else {
                $text .= $decoded;
            }
        }

        return $text;
    }
305
306
    /**
     * Decode string with octal-decoded chunks.
     *
     * Every backslash followed by exactly three octal digits is replaced by the byte with that
     * value; all other text is copied unchanged.
     */
    public static function decodeOctal(string $text): string
    {
        $parts = preg_split('/(\\\\[0-7]{3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($parts as $part) {
            if (preg_match('/^\\\\[0-7]{3}$/', $part)) {
                // octdec() is declared as int|float; three octal digits always fit in an int.
                $decoded .= \chr((int) octdec(trim($part, '\\')));
            } else {
                $decoded .= $part;
            }
        }

        return $decoded;
    }
324
325
    /**
     * Decode string with `#xx` escaped chars, where `xx` is a two-digit hexadecimal byte value.
     *
     * The escape value is interpreted with hexdec(), so the pattern accepts hexadecimal digits
     * (0-9, A-F, a-f) and not only decimal ones; otherwise escapes such as `#2F` stay undecoded.
     */
    public static function decodeEntities(string $text): string
    {
        $parts = preg_split('/(#[0-9A-Fa-f]{2})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($parts as $part) {
            if (preg_match('/^#[0-9A-Fa-f]{2}$/', $part)) {
                // hexdec() is declared as int|float; two hex digits always fit in an int.
                $decoded .= \chr((int) hexdec(trim($part, '#')));
            } else {
                $decoded .= $part;
            }
        }

        return $decoded;
    }
343
344
    /**
     * Check if given string is Unicode text (by BOM);
     * If true - decode to "utf-8" encoded string.
     * Otherwise - return text as is.
     *
     * The text after the \xFE\xFF marker is read as big-endian 16-bit units. A high surrogate
     * followed by a low surrogate is combined into a single code point, so that characters
     * outside the Basic Multilingual Plane are decoded as one character.
     *
     * @todo Rename in next major release to make the name correspond to reality (for ex. decodeIfUnicode())
     */
    public static function decodeUnicode(string $text): string
    {
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $decode = (string) substr($text, 2);
        $length = \strlen($decode);
        $text = '';

        for ($i = 0; $i < $length; $i += 2) {
            $code = (int) hexdec(bin2hex(substr($decode, $i, 2)));

            // A high surrogate needs the following unit (two more bytes) to form a code point.
            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 3 < $length) {
                $low = (int) hexdec(bin2hex(substr($decode, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2;
                }
            }

            $text .= self::uchr($code);
        }

        return $text;
    }
366
367
    /**
     * Return the font space limit taken from the config object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
374
375
    /**
     * Decode text by commands array.
     *
     * Commands of type 'n' carry a number: when it is below the font space limit the following
     * text starts a new word. Commands of type '<' carry hexadecimal text; any other type is
     * treated as text that may contain octal escapes. Each word is finally passed through
     * decodeContent() and the words are joined with a single space.
     */
    public function decodeText(array $commands): string
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        // Escape sequences are replaced in a single pass (strtr), so the backslash produced by
        // an escaped backslash is never combined with the character that follows it.
        $escapes = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = strtr($text, $escapes);

            // add content to result string
            $words[$word_position] = ($words[$word_position] ?? '').$text;
        }

        foreach ($words as $position => $word) {
            $words[$position] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
422
423
    /**
     * Decode given $text to "utf-8" encoded string.
     *
     * Strategy, in order:
     *  - font has a ToUnicode entry: decode by ToUnicode CMap / DescendantFonts;
     *  - font has an Encoding entry: decode by that encoding, unless it yields null;
     *  - otherwise: autodetect.
     *
     * @param bool $unicode This parameter is deprecated and might be removed in a future release
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
        }

        $decoded = $this->has('Encoding') ? $this->decodeContentByEncoding($text) : null;

        if (null !== $decoded) {
            return $decoded;
        }

        return $this->decodeContentByAutodetectIfNecessary($text);
    }
444
445
    /**
     * First try to decode $text by ToUnicode CMap.
     * If char translation not found in ToUnicode CMap tries:
     *  - If DescendantFonts exists tries to decode char by one of that fonts.
     *      - If have no success to decode by DescendantFonts interpret $text as a string with "Windows-1252" encoding.
     *  - If DescendantFonts does not exist just return "?" as decoded char.
     *
     * The text is consumed in chunks of $this->tableSizes['from'] bytes. When that width is not
     * available (or not positive) the text is returned unchanged.
     *
     * @todo Seems this is invalid algorithm that do not follow pdf-format specification. Must be rewritten.
     */
    private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
    {
        // Force an integer width: it is used both as substr() length and as loop step.
        $bytes = (int) ($this->tableSizes['from'] ?? 0);

        if ($bytes < 1) {
            return $text;
        }

        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            if (false !== $decoded) {
                $result .= $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $result .= $this->decodeCharByDescendantFonts($char);
            } else {
                $result .= self::MISSING;
            }
        }

        return $result;
    }

    /**
     * Decode one char with the first descendant font able to translate it; when none can,
     * interpret the char itself as "Windows-1252" encoded.
     */
    private function decodeCharByDescendantFonts(string $char): string
    {
        $descendantFonts = $this->get('DescendantFonts');

        if ($descendantFonts instanceof PDFObject) {
            $fonts = $descendantFonts->getHeader()->getElements();
        } else {
            $fonts = $descendantFonts->getContent();
        }

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $decoded = $font->translateChar($char, false);

            if (false !== $decoded) {
                return mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
            }
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }
501
502
    /**
     * Decode content by any type of Encoding (dictionary's item) instance.
     *
     * Returns null when the Encoding entry is an Element naming an encoding that has no iconv
     * equivalent (see decodeContentByEncodingElement()).
     */
    private function decodeContentByEncoding(string $text): ?string
    {
        $encoding = $this->get('Encoding');

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself does not contain `/Type /Encoding` in its dictionary.
        if ($encoding instanceof PDFObject) {
            $encoding = $this->getInitializedEncodingByPdfObject($encoding);
        }

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself contains `/Type /Encoding` in its dictionary.
        if ($encoding instanceof Encoding) {
            return $this->decodeContentByEncodingEncoding($text, $encoding);
        }

        // When Encoding is just string (/Encoding /WinAnsiEncoding)
        if ($encoding instanceof Element) { //todo: ElementString class must by used?
            return $this->decodeContentByEncodingElement($text, $encoding);
        }

        // don't double-encode strings already in UTF-8
        if (mb_check_encoding($text, 'UTF-8')) {
            return $text;
        }

        return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
    }
531
532
    /**
     * Returns already created or create a new one if not created before Encoding instance by PDFObject instance.
     *
     * The instance is cached per font: once created, the same Encoding is returned for every
     * later call, whatever $PDFObject is passed.
     */
    private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        if ($this->initializedEncodingByPdfObject) {
            return $this->initializedEncodingByPdfObject;
        }

        $this->initializedEncodingByPdfObject = $this->createInitializedEncodingByPdfObject($PDFObject);

        return $this->initializedEncodingByPdfObject;
    }
543
544
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Encoding.
     *
     * Each byte of $text is translated by $encoding; when the translation is null the byte value
     * itself is used as the character code.
     */
    private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
    {
        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; ++$i) {
            // ord() yields the byte value as an integer, which is what uchr() expects.
            $dec_av = \ord($text[$i]);
            $dec_ap = $encoding->translateChar($dec_av);
            $result .= self::uchr($dec_ap ?? $dec_av);
        }

        return $result;
    }
560
561
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Element.
     *
     * Returns null when the encoding name has no iconv equivalent, or when iconv() fails to
     * convert $text, so that the caller can fall back to another decoding strategy instead of
     * receiving an empty string.
     */
    private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
    {
        $pdfEncodingName = $encoding->getContent();

        // mb_convert_encoding does not support MacRoman/macintosh,
        // so we use iconv() here
        $iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($pdfEncodingName);

        if (!$iconvEncodingName) {
            return null;
        }

        $converted = iconv($iconvEncodingName, 'UTF-8', $text);

        // iconv() returns false on failure.
        return false === $converted ? null : $converted;
    }
574
575
    /**
     * Convert PDF encoding name to iconv-known encoding name.
     *
     * Returns null for any name other than StandardEncoding, MacRomanEncoding and WinAnsiEncoding.
     */
    private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
    {
        $pdfToIconvEncodingNameMap = [
            'StandardEncoding' => 'ISO-8859-1',
            'MacRomanEncoding' => 'MACINTOSH',
            'WinAnsiEncoding' => 'CP1252',
        ];

        return $pdfToIconvEncodingNameMap[$pdfEncodingName] ?? null;
    }
590
591
    /**
     * If string seems like "utf-8" encoded string do nothing and just return given string as is.
     * Otherwise, interpret string as "Windows-1252" encoded string.
     *
     * @todo Why exactly `Windows-1252` used?
     *
     * @return string|false
     */
    private function decodeContentByAutodetectIfNecessary(string $text)
    {
        $looksLikeUtf8 = mb_check_encoding($text, 'UTF-8');

        if (!$looksLikeUtf8) {
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }
606
607
    /**
     * Create Encoding instance by PDFObject instance and init it.
     */
    private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        $initialized = $this->createEncodingByPdfObject($PDFObject);
        $initialized->init();

        return $initialized;
    }
617
618
    /**
     * Create Encoding instance by PDFObject instance (without init).
     *
     * The new Encoding is built from the document, header, content and config of $PDFObject.
     */
    private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        return new Encoding(
            $PDFObject->getDocument(),
            $PDFObject->getHeader(),
            $PDFObject->getContent(),
            $PDFObject->getConfig()
        );
    }
630
}
631