Passed
Pull Request — master (#500)
by
unknown
02:28
created

Font::createEncodingByPdfObject()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 8
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 1

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 1
eloc 5
c 1
b 1
f 0
nc 1
nop 1
dl 0
loc 8
ccs 6
cts 6
cp 1
crap 1
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use LogicException;
34
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
35
use Smalot\PdfParser\Exception\EncodingNotFoundException;
36
37
/**
38
 * Class Font
39
 */
40
class Font extends PDFObject
41
{
42
    const MISSING = '?';
43
44
    /**
45
     * @var array
46
     */
47
    protected $table = null;
48
49
    /**
50
     * @var array
51
     */
52
    protected $tableSizes = null;
53
54
    /**
55
     * Caches results from uchr.
56
     *
57
     * @var array
58
     */
59
    private static $uchrCache = [];
60
61
    /**
     * In some PDF files (@see https://github.com/smalot/pdfparser/pull/500) an encoding can be referenced by object id
     * while the object itself does not contain `/Type /Encoding` in its dictionary. Such objects are not initialized as
     * Encoding in \Smalot\PdfParser\PDFObject::factory() during file parsing (they are plain PDFObject instances).
     *
     * Therefore, we create an Encoding instance from them during decoding and cache it in this property.
     *
     * @var Encoding
     */
70
    private $initializedEncodingByPdfObject;
71
72 37
    /**
     * Initialise the font by building its character translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
77
78 2
    /**
     * Return the font's BaseFont entry cast to string, or '[Unknown]' when the entry is absent.
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
82
83 2
    /**
     * Return the Subtype entry of the font header cast to string.
     */
    public function getType(): string
    {
        return (string) $this->header->get('Subtype');
    }
87
88 1
    /**
     * Describe the font as an array: Name, Type and Encoding, completed by the parent's details.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        $details = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            // Fonts without an Encoding entry are reported as 'Ansi'.
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        // Array union: the keys set above take precedence over the parent's values.
        return $details + parent::getDetails($deep);
    }
100
101
    /**
     * Translate one character code (given as raw bytes) through the translation table.
     *
     * When the code is present in the table its mapped value is returned. Otherwise:
     *  - with $use_default = true, self::MISSING is returned;
     *  - with $use_default = false, $char itself is returned, except for a single byte when the
     *    font's Encoding entry is an Encoding instance whose string form is WinAnsiEncoding::class,
     *    in which case the byte value is converted with self::uchr().
     *
     * @param string $char        raw bytes of the character code
     * @param bool   $use_default return self::MISSING instead of the fallback when the code is unknown
     *
     * @return string|bool
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        // $this->table defaults to null and is only filled by loadTranslateTable()/setTable();
        // load it lazily so array_key_exists() never receives null (a TypeError on PHP 8).
        $table = $this->table ?? $this->loadTranslateTable();

        $dec = hexdec(bin2hex($char));

        if (\array_key_exists($dec, $table)) {
            return $table[$dec];
        }

        // fallback for decoding single-byte ANSI characters that are not in the lookup table
        $fallbackDecoded = $char;

        $isSingleByte = \strlen($char) < 2;
        if ($isSingleByte && $this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            try {
                if (WinAnsiEncoding::class === $this->get('Encoding')->__toString()) {
                    $fallbackDecoded = self::uchr($dec);
                }
            } catch (EncodingNotFoundException $e) {
                // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exist.
                // See table 5.11 on PDF 1.5 specs for more info. Deliberately ignored: keep the raw fallback.
            }
        }

        return $use_default ? self::MISSING : $fallbackDecoded;
    }
131
132
    /**
     * Convert unicode character code to "utf-8" encoded string.
     *
     * Results are memoised in self::$uchrCache, keyed by code.
     *
     * NOTE(review): the 'HTML-ENTITIES' mbstring encoding used below is deprecated as of PHP 8.2;
     * a replacement (e.g. mb_chr()) should be evaluated against invalid code points before switching.
     *
     * @param int $code
     * @return string
     */
    public static function uchr(int $code): string
    {
        if (isset(self::$uchrCache[$code])) {
            return self::$uchrCache[$code];
        }

        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore, we use mb_convert_encoding() instead
        self::$uchrCache[$code] = mb_convert_encoding("&#{$code};", 'UTF-8', 'HTML-ENTITIES');

        return self::$uchrCache[$code];
    }
148
149
    /**
     * Init internal chars translation table by ToUnicode CMap.
     *
     * The table is built once; later calls return the cached array. Three CMap constructs are read
     * from the ToUnicode stream content:
     *  - begincodespacerange: sets $this->tableSizes (bytes per source/destination code);
     *  - beginbfchar: single code => string mappings;
     *  - beginbfrange: ranges, either with a numeric start offset or with an array of strings.
     *
     * @return array
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        // Code space ranges: only the first section is taken into account.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

            preg_match_all($regexp, $sections['sections'][0], $ranges);

            // Two hex digits per byte; fall back to 1 when nothing matched.
            $this->tableSizes = [
                'from' => max(1, \strlen($ranges['from'][0] ?? '') / 2),
                'to' => max(1, \strlen($ranges['to'][0] ?? '') / 2),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            $regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

            foreach ($sections['sections'] as $section) {
                preg_match_all($regexp, $section, $pairs);

                $this->tableSizes['from'] = max(1, \strlen($pairs['from'][0] ?? '') / 2);

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::decodeHexCodeUnits($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';

                preg_match_all($regexp, $section, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($ranges['to'][$key]);
                    $offset = hexdec($ranges['offset'][$key]);

                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF file has 2-byte Unicode values on new lines > added \r\n
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';

                preg_match_all($regexp, $section, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $ranges['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::decodeHexCodeUnits($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Convert a hex string into text by reading it in groups of four hex digits
     * (a shorter trailing group is read as-is) and passing each value to self::uchr().
     */
    private static function decodeHexCodeUnits(string $hex): string
    {
        $text = '';

        foreach (str_split($hex, 4) as $unit) {
            // At most four hex digits, so hexdec() always yields an int here.
            $text .= self::uchr((int) hexdec($unit));
        }

        return $text;
    }
261
262
    /**
     * Set custom char translation table where:
     * - key - integer character code;
     * - value - "utf-8" encoded value;
     *
     * Because loadTranslateTable() returns early once the table is non-null,
     * a table set here is not rebuilt from the ToUnicode CMap afterwards.
     *
     * @param array $table
     * @return void
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
274
275
    /**
     * Decode hexadecimal encoded string. If $add_braces is true result value would be wrapped by parentheses.
     *
     * Every `<...>` chunk is turned into the bytes its hex digits describe; text outside such chunks is
     * copied unchanged. Input containing `<?xml` (case-insensitive) is returned untouched.
     * With $add_braces, backslashes inside a decoded chunk are doubled before wrapping it in `(` `)`.
     *
     * @param string $hexa
     * @param bool $add_braces
     * @return string
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $text = '';
        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);

        foreach ($parts as $part) {
            if (!preg_match('/^<.*>$/s', $part)) {
                $text .= $part;

                continue;
            }

            // strip line breaks and the surrounding angle brackets, then decode the hex digits
            $hexDigits = trim(str_replace(["\r", "\n"], '', $part), '<>');
            $bytes = pack('H*', $hexDigits);

            if ($add_braces) {
                // Double every backslash so the result stays a valid literal string.
                $text .= '('.str_replace('\\', '\\\\', $bytes).')';
            } else {
                $text .= $bytes;
            }
        }

        return $text;
    }
314
315
    /**
     * Decode string with octal-decoded chunks.
     *
     * Every backslash followed by exactly three octal digits is replaced by the byte with that
     * value (chr() wraps values above 255); everything else is left untouched.
     *
     * @param string $text
     * @return string
     */
    public static function decodeOctal(string $text): string
    {
        $decoded = preg_replace_callback(
            '/\\\\[0-7]{3}/',
            static function (array $match): string {
                // Drop the leading backslash, read the three digits as an octal number.
                return \chr((int) octdec(substr($match[0], 1)));
            },
            $text
        );

        // preg_replace_callback() yields null only on a PCRE error; keep the input in that case.
        return $decoded ?? $text;
    }
336
337
    /**
     * Decode `#NN` sequences: a `#` followed by two decimal digits is replaced by the byte
     * whose value is those two digits read as a hexadecimal number.
     *
     * NOTE(review): the pattern only accepts decimal digits, so sequences containing hex
     * letters (e.g. `#2F`) are left as-is — confirm whether that is intended.
     *
     * @param string $text
     * @return string
     */
    public static function decodeEntities(string $text): string
    {
        $decoded = preg_replace_callback(
            '/#\d{2}/',
            static function (array $match): string {
                // Skip the leading '#'; two hex digits always fit in an int.
                return \chr((int) hexdec(substr($match[0], 1)));
            },
            $text
        );

        // preg_replace_callback() yields null only on a PCRE error; keep the input in that case.
        return $decoded ?? $text;
    }
357
358
    /**
     * Check if given string is Unicode text (by BOM);
     * If true - decode to "utf-8" encoded string.
     * Otherwise - return text as is.
     *
     * The text following the FE FF byte order mark is converted from UTF-16BE, so characters
     * encoded as surrogate pairs are decoded as a single code point.
     *
     * //todo: rename in next major release to make the name correspond to reality (for ex. decodeIfUnicode())
     * @param string $text
     * @return string
     */
    public static function decodeUnicode(string $text): string
    {
        // Byte-exact BOM check (a case-insensitive regex could match other bytes under some locales).
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $utf16 = substr($text, 2);

        // A dangling last byte is read as a code unit of that value: zero-extend it to two bytes.
        if (1 === \strlen($utf16) % 2) {
            $utf16 = substr($utf16, 0, -1)."\x00".substr($utf16, -1);
        }

        return mb_convert_encoding($utf16, 'UTF-8', 'UTF-16BE');
    }
382
383
    /**
     * Return the font space limit taken from the configuration object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        return $this->config->getFontSpaceLimit();
    }
390
391
    /**
     * Decode text by commands array.
     *
     * Each command is an array indexed by PDFObject::TYPE and PDFObject::COMMAND:
     *  - type 'n' carries a number; when it is below the font space limit a new word is started;
     *  - type '<' carries a hexadecimal string;
     *  - any other type is treated as a literal string that may contain octal escapes.
     * Words are decoded with decodeContent() and joined with a single space.
     *
     * @param array $commands
     * @return string
     */
    public function decodeText(array $commands): string
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        // Escape sequences and the characters they stand for.
        $escapes = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars in a single pass: strtr() never rescans its own output, so an
            // escaped backslash followed by "n" stays a backslash and an "n" instead of a newline.
            $text = strtr($text, $escapes);

            // add content to result string
            $words[$word_position] = ($words[$word_position] ?? '').$text;
        }

        $decodedWords = [];
        foreach ($words as $word) {
            $decodedWords[] = $this->decodeContent($word);
        }

        return implode(' ', $decodedWords);
    }
441
442
    /**
     * Decode given $text to "utf-8" encoded string.
     *
     * Strategy, in order:
     *  1. a ToUnicode entry exists: decode through the translation table;
     *  2. an Encoding entry exists and yields a result: use that result;
     *  3. otherwise: autodetect (keep valid UTF-8, else treat as Windows-1252).
     *
     * @param bool $unicode This parameter is deprecated and might be removed in a future release
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
        }

        $decoded = $this->has('Encoding') ? $this->decodeContentByEncoding($text) : null;

        return $decoded ?? $this->decodeContentByAutodetectIfNecessary($text);
    }
463
464
    /**
     * First try to decode $text by ToUnicode CMap.
     * If char translation not found in ToUnicode CMap tries:
     *  - If DescendantFonts exists tries to decode char by one of that fonts.
     *      - If have no success to decode by DescendantFonts interpret $text as a string with "Windows-1252" encoding.
     *  - If DescendantFonts does not exist just return "?" as decoded char.
     *
     * NOTE(review): translateChar($char, false) as defined in this class returns either a table
     * value or a string fallback, so the branches below only run if a table entry is literally false.
     *
     * //todo: Seems this is invalid algorithm that do not follow pdf-format specification. Must be rewritten.
     * @param string $text
     * @return string
     */
    private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
    {
        // Number of bytes per source character code.
        $bytes = $this->tableSizes['from'];

        if (!$bytes) {
            return $text;
        }

        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            if (false !== $decoded) {
                $result .= $decoded;

                continue;
            }

            if (!$this->has('DescendantFonts')) {
                $result .= self::MISSING;

                continue;
            }

            $descendants = $this->get('DescendantFonts');
            $fonts = $descendants instanceof PDFObject
                ? $descendants->getHeader()->getElements()
                : $descendants->getContent();

            // Ask each descendant font in turn; the first one that knows the code wins.
            $decoded = false;
            foreach ($fonts as $font) {
                if (!$font instanceof self) {
                    continue;
                }

                $candidate = $font->translateChar($char, false);
                if (false !== $candidate) {
                    $decoded = mb_convert_encoding($candidate, 'UTF-8', 'Windows-1252');
                    break;
                }
            }

            $result .= false !== $decoded
                ? $decoded
                : mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
        }

        return $result;
    }
522
523
    /**
     * Decode content by any type of Encoding (dictionary's item) instance.
     *
     * @return string|null the decoded text, or null when the Element-based decoder has no mapping
     *
     * @throws LogicException when the Encoding entry is neither a PDFObject, an Encoding nor an Element
     */
    private function decodeContentByEncoding(string $text): ?string
    {
        $encoding = $this->get('Encoding');

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself does not contain `/Type /Encoding` in its dictionary.
        if ($encoding instanceof PDFObject) {
            $encoding = $this->getInitializedEncodingByPdfObject($encoding);
        }

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself contains `/Type /Encoding` in its dictionary.
        if ($encoding instanceof Encoding) {
            return $this->decodeContentByEncodingEncoding($text, $encoding);
        }

        // When Encoding is just string (/Encoding /WinAnsiEncoding)
        if ($encoding instanceof Element) { //todo: ElementString class must by used?
            return $this->decodeContentByEncodingElement($text, $encoding);
        }

        // Encoding has unintended type.
        $encodingClassName = \get_class($encoding);
        throw new LogicException("Unknown encoding instance type: {$encodingClassName}");
    }
549
550
    /**
     * Return the cached Encoding instance, creating and initialising it from $PDFObject on first use.
     *
     * The cache holds a single instance: once it is filled, $PDFObject is ignored.
     *
     * @param PDFObject $PDFObject
     * @return Encoding
     */
    private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        if (null === $this->initializedEncodingByPdfObject) {
            $this->initializedEncodingByPdfObject = $this->createInitializedEncodingByPdfObject($PDFObject);
        }

        return $this->initializedEncodingByPdfObject;
    }
564
565
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Encoding.
     *
     * Each byte of $text is passed through $encoding->translateChar(); when that yields null the
     * byte value itself is used. The resulting code is converted with self::uchr().
     */
    private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
    {
        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; ++$i) {
            // ord() gives the same value as hexdec(bin2hex()) for one byte, but is always an int.
            $byteValue = \ord($text[$i]);
            $translated = $encoding->translateChar($byteValue);
            $result .= self::uchr($translated ?? $byteValue);
        }

        return $result;
    }
581
582
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Element.
     *
     * @return string|null the converted text, or null when the encoding name has no iconv
     *                     equivalent or iconv() fails to convert $text
     */
    private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
    {
        $pdfEncodingName = $encoding->getContent();

        // mb_convert_encoding does not support MacRoman/macintosh,
        // so we use iconv() here
        $iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($pdfEncodingName);

        if (!$iconvEncodingName) {
            return null;
        }

        $converted = iconv($iconvEncodingName, 'UTF-8', $text);

        // iconv() returns false on failure; report "not decoded" instead of letting the
        // return type coerce false into an empty string and silently drop the text.
        return false === $converted ? null : $converted;
    }
595
596
    /**
     * Convert PDF encoding name to iconv-known encoding name.
     *
     * @return string|null the iconv name, or null when $pdfEncodingName is not one of the mapped names
     */
    private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
    {
        $pdfToIconvEncodingNameMap = [
            'StandardEncoding' => 'ISO-8859-1',
            'MacRomanEncoding' => 'MACINTOSH',
            'WinAnsiEncoding' => 'CP1252',
        ];

        return $pdfToIconvEncodingNameMap[$pdfEncodingName] ?? null;
    }
611
612
    /**
     * If string seems like "utf-8" encoded string do nothing and just return given string as is.
     * Otherwise, interpret string as "Windows-1252" encoded string.
     *
     * @param string $text
     * @return string
     */
    private function decodeContentByAutodetectIfNecessary(string $text): string
    {
        if (mb_check_encoding($text, 'UTF-8')) {
            return $text;
        }

        //todo: Why exactly `Windows-1252` used?
        return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
    }
628
629
    /**
     * Create Encoding instance by PDFObject instance and init it.
     */
    private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        $encoding = $this->createEncodingByPdfObject($PDFObject);
        $encoding->init();

        return $encoding;
    }
639
640
    /**
     * Create Encoding instance by PDFObject instance (without init).
     *
     * The new instance receives the document, header, content and config of $PDFObject.
     */
    private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        return new Encoding(
            $PDFObject->getDocument(),
            $PDFObject->getHeader(),
            $PDFObject->getContent(),
            $PDFObject->getConfig()
        );
    }
652
}
653