Passed
Pull Request — master (#500)
by
unknown
02:09
created

Font::getName()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
eloc 1
c 1
b 0
f 0
nc 2
nop 0
dl 0
loc 3
ccs 2
cts 2
cp 1
crap 2
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use LogicException;
34
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
35
use Smalot\PdfParser\Exception\EncodingNotFoundException;
36
37
/**
38
 * Class Font
39
 */
40
class Font extends PDFObject
41
{
42
    const MISSING = '?';
43
44
    /**
45
     * @var array
46
     */
47
    protected $table = null;
48
49
    /**
50
     * @var array
51
     */
52
    protected $tableSizes = null;
53
54
    /**
55
     * Caches results from uchr.
56
     *
57
     * @var array
58
     */
59
    private static $uchrCache = [];
60
61
    /**
62
     * In some PDF-files encoding could be referenced by object id but object itself does not contain
63
     * `/Type /Encoding` in its dictionary. These objects wouldn't be initialized as Encoding in
64
     * \Smalot\PdfParser\PDFObject::factory() during file parsing (they would be just PDFObject).
65
     *
66
     * Therefore, we create an instance of Encoding from them during decoding and cache this value in this property.
67
     *
68
     * @var Encoding
69
     *
70
     * @see https://github.com/smalot/pdfparser/pull/500
71
     */
72
    private $initializedEncodingByPdfObject;
73
74 37
    /**
     * Initialise the font by building its character translation table.
     */
    public function init()
    {
        // The returned table is not needed here; the call is made for its
        // effect of filling $this->table and $this->tableSizes.
        $this->loadTranslateTable();
    }
79
80 2
    /**
     * Return the font name taken from the "BaseFont" entry.
     *
     * @return string the BaseFont value cast to string, or "[Unknown]" when the entry is absent
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
84
85 2
    /**
     * Return the font type, i.e. the "Subtype" entry of the object header cast to string.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
89
90 1
    /**
     * Collect the font details: name, type and encoding, completed by the parent's details.
     *
     * The font-specific keys ("Name", "Type", "Encoding") take precedence over
     * identically named keys returned by the parent implementation.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        $name = $this->getName();
        $type = $this->getType();

        // "Ansi" is reported when the font dictionary has no Encoding entry.
        $encoding = 'Ansi';
        if ($this->has('Encoding')) {
            $encoding = (string) $this->get('Encoding');
        }

        $fontDetails = [
            'Name' => $name,
            'Type' => $type,
            'Encoding' => $encoding,
        ];

        // Array union keeps the left-hand value for keys present on both sides.
        return $fontDetails + parent::getDetails($deep);
    }
102
103
    /**
     * Translate a raw character code into its decoded string.
     *
     * The translation table is consulted first. When the code is not in the
     * table the result depends on $use_default:
     *  - true:  self::MISSING is returned;
     *  - false: a single-byte code of a font whose "Encoding" entry is an
     *           Encoding instance resolving to WinAnsiEncoding is converted
     *           with self::uchr(); in every other case $char is returned as is.
     *
     * @param string $char        raw bytes of the character code
     * @param bool   $use_default whether unknown codes yield self::MISSING
     *
     * @return string|bool declared type kept for backward compatibility;
     *                     this implementation always returns a string
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        $dec = hexdec(bin2hex($char));

        // The table is null until init()/loadTranslateTable() ran; build it on
        // demand instead of handing null to array_key_exists().
        $table = $this->table ?? $this->loadTranslateTable();

        if (\array_key_exists($dec, $table)) {
            return $table[$dec];
        }

        // The fallback below is only ever returned when $use_default is false,
        // so there is no point in computing it otherwise.
        if ($use_default) {
            return self::MISSING;
        }

        // Fallback for decoding single-byte ANSI characters that are not in the lookup table.
        if (\strlen($char) < 2 && $this->has('Encoding')) {
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                try {
                    if (WinAnsiEncoding::class === $encoding->__toString()) {
                        return self::uchr($dec);
                    }
                } catch (EncodingNotFoundException $e) {
                    // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exist.
                    // See table 5.11 on PDF 1.5 specs for more info.
                }
            }
        }

        return $char;
    }
133
134
    /**
     * Convert a character code to a "utf-8" encoded string.
     *
     * Results are memoised in self::$uchrCache, keyed by the code.
     *
     * @param int $code character code, rendered as the numeric entity "&#<code>;" before conversion
     *
     * @return string
     */
    public static function uchr(int $code): string
    {
        if (isset(self::$uchrCache[$code])) {
            return self::$uchrCache[$code];
        }

        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore, we use mb_convert_encoding() instead
        self::$uchrCache[$code] = mb_convert_encoding("&#{$code};", 'UTF-8', 'HTML-ENTITIES');

        return self::$uchrCache[$code];
    }
150
151
    /**
     * Init internal chars translation table by ToUnicode CMap.
     *
     * The table is built once; later calls return the cached array. Besides
     * the table itself, $this->tableSizes is filled with the byte length of
     * source ("from") and destination ("to") codes, both defaulting to 1.
     *
     * Three CMap constructs are read from the "ToUnicode" stream content:
     *  - begincodespacerange: only the first section is consulted, its first
     *    pair determines the code sizes;
     *  - beginbfchar: single code mappings (every section is read);
     *  - beginbfrange: range mappings, in both the "<from> <to> <start>" and
     *    the "<from> <to> [<s1> <s2> ...]" form (every section is read).
     *
     * @return array map of integer character code => "utf-8" encoded string
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();
        $sections = [];

        // Code space range: only the first section is used.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            $pairs = [];
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';
            preg_match_all($regexp, $sections['sections'][0], $pairs);

            // intdiv() keeps the sizes integral: an odd number of hex digits
            // must not yield a fractional byte count, because the size is used
            // later as substr() length and loop step.
            $this->tableSizes = [
                'from' => max(1, intdiv(\strlen($pairs['from'][0] ?? ''), 2)),
                'to' => max(1, intdiv(\strlen($pairs['to'][0] ?? ''), 2)),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            $regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

            foreach ($sections['sections'] as $section) {
                $pairs = [];
                preg_match_all($regexp, $section, $pairs);

                // A section without any entry carries no size information and
                // must not reset the size found so far.
                if ([] !== $pairs['from']) {
                    $this->tableSizes['from'] = max(1, intdiv(\strlen($pairs['from'][0]), 2));
                }

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::decodeUtf16HexString($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            // Support for : <srcCode1> <srcCode2> <dstString>
            $rangeRegexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';
            // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
            // Some PDF file has 2-byte Unicode values on new lines > added \r\n
            $listRegexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';

            foreach ($sections['sections'] as $section) {
                $ranges = [];
                preg_match_all($rangeRegexp, $section, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($ranges['to'][$key]);
                    $offset = hexdec($ranges['offset'][$key]);

                    // Consecutive source codes map to consecutive destination codes.
                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                $lists = [];
                preg_match_all($listRegexp, $section, $lists);

                foreach ($lists['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                    // The n-th listed string belongs to source code $char_from + n.
                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::decodeUtf16HexString($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Convert a string of hexadecimal digits into "utf-8" text, reading it in
     * groups of four digits (a shorter trailing group is taken as is) and
     * converting each group with self::uchr().
     */
    private static function decodeUtf16HexString(string $hex): string
    {
        $text = '';

        foreach (str_split($hex, 4) as $part) {
            $text .= self::uchr((int) hexdec($part));
        }

        return $text;
    }
263
264
    /**
     * Replace the char translation table with a custom one.
     *
     * Expected layout of $table:
     * - key   - integer character code;
     * - value - "utf-8" encoded value.
     *
     * @param array $table
     *
     * @return void
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
276
277
    /**
     * Decode the hexadecimal chunks (<...>) found in a string; text outside of
     * such chunks is copied unchanged.
     *
     * Input containing "<?xml" (case-insensitive) is returned untouched.
     *
     * @param string $hexa       text possibly containing <hex> chunks
     * @param bool   $add_braces when true every decoded chunk is wrapped in parentheses
     *                           and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $pieces = [];

        foreach ($chunks as $chunk) {
            // Anything not enclosed in angle brackets is plain text.
            // (No chunk can contain "<?xml" here: the whole input was checked above.)
            if (!preg_match('/^<.*>$/s', $chunk)) {
                $pieces[] = $chunk;

                continue;
            }

            // Strip line breaks, then the surrounding angle brackets.
            $digits = trim(str_replace(["\r", "\n"], '', $chunk), '<>');
            $binary = pack('H*', $digits);

            $pieces[] = $add_braces
                ? '('.str_replace('\\', '\\\\', $binary).')'
                : $binary;
        }

        return implode('', $pieces);
    }
316
317
    /**
     * Decode the octal escape sequences of a string.
     *
     * A backslash followed by one to three octal digits is replaced by the
     * byte with that value (only the low 8 bits are kept). An escaped
     * backslash (two consecutive backslashes) is left untouched, so that
     * digits following it are not mistaken for an octal escape.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal(string $text): string
    {
        $decoded = preg_replace_callback(
            // First alternative: an escaped backslash, consumed so that it cannot start an octal escape.
            // Second alternative: backslash + 1..3 octal digits.
            '/\\\\\\\\|\\\\([0-7]{1,3})/',
            static function (array $match): string {
                // Group 1 is absent when the escaped-backslash alternative matched.
                if (!isset($match[1]) || '' === $match[1]) {
                    return $match[0];
                }

                return \chr(((int) octdec($match[1])) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; fall back to the input.
        return $decoded ?? $text;
    }
338
339
    /**
     * Decode "#xx" escapes, where xx are two hexadecimal digits, into the
     * corresponding byte (e.g. "#20" becomes a space, "#2F" a slash).
     *
     * A "#" not followed by two hexadecimal digits is kept as is.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities(string $text): string
    {
        $decoded = preg_replace_callback(
            // The two characters are parsed as hexadecimal, so A-F must be accepted as well as 0-9.
            '/#([0-9A-Fa-f]{2})/',
            static function (array $match): string {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; fall back to the input.
        return $decoded ?? $text;
    }
359
360
    /**
     * Check if given string is Unicode text (by BOM);
     * If true - decode to "utf-8" encoded string.
     * Otherwise - return text as is.
     *
     * The text is recognised by the leading bytes FE FF and then read as
     * big-endian 16-bit units. A high surrogate directly followed by a low
     * surrogate is combined into a single code point; any other unit
     * (including a trailing single byte) is converted on its own.
     *
     * //todo: rename in next major release to make the name correspond to reality (for ex. decodeIfUnicode())
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode(string $text): string
    {
        // Exact byte comparison: the marker has no "case" to ignore.
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $units = (string) substr($text, 2);
        $length = \strlen($units);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $code = (int) hexdec(bin2hex(substr($units, $i, 2)));

            // High surrogate with a complete 16-bit unit after it.
            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 3 < $length) {
                $low = (int) hexdec(bin2hex(substr($units, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    // The low surrogate has been consumed as well.
                    $i += 2;
                }
            }

            $decoded .= self::uchr($code);
        }

        return $decoded;
    }
384
385
    /**
     * Return the font space limit taken from the configuration object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
392
393
    /**
     * Decode text by commands array.
     *
     * Each command is an array indexed by PDFObject::TYPE and PDFObject::COMMAND:
     *  - type "n": a number; when it is lower than the font space limit the
     *    following text starts a new word;
     *  - type "<": a hexadecimal string;
     *  - any other type: a string that may contain octal escapes.
     *
     * The text of every command is unescaped and appended to the current
     * word; finally each word is passed through decodeContent() and the words
     * are joined with a single space.
     *
     * @param array $commands
     *
     * @return string
     */
    public function decodeText(array $commands): string
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        // Escape sequences and their replacements. They are applied with
        // strtr(), i.e. in a single pass: the output of one replacement is
        // never scanned again, so an escaped backslash followed by "n" stays
        // a backslash and an "n" instead of turning into a line feed.
        $escapes = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        // Point past the last word: the next text opens a new one.
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = strtr($text, $escapes);

            // add content to result string
            $words[$word_position] = ($words[$word_position] ?? '').$text;
        }

        $decodedWords = [];
        foreach ($words as $word) {
            $decodedWords[] = $this->decodeContent($word);
        }

        return implode(' ', $decodedWords);
    }
443
444
    /**
     * Decode given $text to "utf-8" encoded string.
     *
     * Strategy, in order of precedence:
     *  1. a "ToUnicode" entry exists: decode through the translation table;
     *  2. an "Encoding" entry exists and decoding by it gives a non-null result: use that result;
     *  3. otherwise: autodetect.
     *
     * @param bool $unicode This parameter is deprecated and might be removed in a future release
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
        }

        $byEncoding = $this->has('Encoding') ? $this->decodeContentByEncoding($text) : null;

        return $byEncoding ?? $this->decodeContentByAutodetectIfNecessary($text);
    }
465
466
    /**
     * Decode $text code by code, the code size being $this->tableSizes['from'] bytes.
     *
     * For every code:
     *  - the result of translateChar() is used unless it is false;
     *  - else, if DescendantFonts exists, the first descendant Font able to
     *    translate the code is used (its result converted from "Windows-1252");
     *    when none can, the raw code itself is converted from "Windows-1252";
     *  - else the code is replaced by self::MISSING.
     *
     * When the code size is falsy, $text is returned unchanged.
     *
     * //todo: Seems this is invalid algorithm that do not follow pdf-format specification. Must be rewritten.
     *
     * @param string $text
     *
     * @return string
     */
    private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
    {
        $codeSize = $this->tableSizes['from'];

        if (!$codeSize) {
            return $text;
        }

        $output = '';
        $total = \strlen($text);

        for ($offset = 0; $offset < $total; $offset += $codeSize) {
            $code = substr($text, $offset, $codeSize);
            $translated = $this->translateChar($code, false);

            if (false !== $translated) {
                $output .= $translated;

                continue;
            }

            if (!$this->has('DescendantFonts')) {
                $output .= self::MISSING;

                continue;
            }

            $descendants = $this->get('DescendantFonts');
            $fonts = $descendants instanceof PDFObject
                ? $descendants->getHeader()->getElements()
                : $descendants->getContent();

            $fromDescendant = false;

            foreach ($fonts as $font) {
                if (!$font instanceof self) {
                    continue;
                }

                $candidate = $font->translateChar($code, false);

                if (false !== $candidate) {
                    $fromDescendant = mb_convert_encoding($candidate, 'UTF-8', 'Windows-1252');

                    break;
                }
            }

            $output .= false !== $fromDescendant
                ? $fromDescendant
                : mb_convert_encoding($code, 'UTF-8', 'Windows-1252');
        }

        return $output;
    }
524
525
    /**
     * Decode content according to the kind of value stored under "Encoding".
     *
     * @return string|null the decoded text; null is possible when the encoding is a plain Element
     *
     * @throws LogicException when the "Encoding" value is neither a PDFObject, an Encoding nor an Element
     */
    private function decodeContentByEncoding(string $text): ?string
    {
        $encoding = $this->get('Encoding');

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself
        // does not contain `/Type /Encoding` in its dictionary: build an Encoding from it.
        if ($encoding instanceof PDFObject) {
            $encoding = $this->getInitializedEncodingByPdfObject($encoding);
        }

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself
        // contains `/Type /Encoding` in its dictionary.
        if ($encoding instanceof Encoding) {
            return $this->decodeContentByEncodingEncoding($text, $encoding);
        }

        // Anything that is not a plain Element (/Encoding /WinAnsiEncoding) is unexpected.
        //todo: ElementString class must by used?
        if (!$encoding instanceof Element) {
            throw new LogicException('Unknown encoding instance type: '.\get_class($encoding));
        }

        return $this->decodeContentByEncodingElement($text, $encoding);
    }
551
552
    /**
     * Return the Encoding instance built from a PDFObject, creating and caching
     * it in $this->initializedEncodingByPdfObject on first use.
     *
     * Once an instance is cached it is returned regardless of the argument.
     *
     * @param PDFObject $PDFObject
     *
     * @return Encoding
     */
    private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        if ($this->initializedEncodingByPdfObject) {
            return $this->initializedEncodingByPdfObject;
        }

        $created = $this->createInitializedEncodingByPdfObject($PDFObject);
        $this->initializedEncodingByPdfObject = $created;

        return $created;
    }
566
567
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Encoding.
     *
     * Every byte of $text is translated through $encoding; when the
     * translation is null the byte value itself is used as character code.
     */
    private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
    {
        $decoded = '';

        for ($index = 0, $size = \strlen($text); $index < $size; ++$index) {
            $byteValue = \ord($text[$index]);
            $mapped = $encoding->translateChar($byteValue);

            $decoded .= self::uchr($mapped ?? $byteValue);
        }

        return $decoded;
    }
583
584
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Element.
     *
     * @return string|null null when the encoding name has no known iconv counterpart
     */
    private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
    {
        // mb_convert_encoding does not support MacRoman/macintosh,
        // so we use iconv() here
        $iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($encoding->getContent());

        if (!$iconvEncodingName) {
            return null;
        }

        return iconv($iconvEncodingName, 'UTF-8', $text);
    }
597
598
    /**
     * Convert PDF encoding name to iconv-known encoding name.
     *
     * @return string|null the iconv name, or null for a PDF encoding name that is not mapped
     */
    private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
    {
        $iconvNames = [
            'StandardEncoding' => 'ISO-8859-1',
            'MacRomanEncoding' => 'MACINTOSH',
            'WinAnsiEncoding' => 'CP1252',
        ];

        // No mapped value is null, so "??" is equivalent to a key-existence check here.
        return $iconvNames[$pdfEncodingName] ?? null;
    }
613
614
    /**
     * If the string is valid "utf-8", return it as is.
     * Otherwise, interpret it as a "Windows-1252" encoded string and convert it to "utf-8".
     *
     * @param string $text
     *
     * @return string
     */
    private function decodeContentByAutodetectIfNecessary(string $text): string
    {
        //todo: Why exactly `Windows-1252` used?
        return mb_check_encoding($text, 'UTF-8')
            ? $text
            : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
    }
630
631
    /**
     * Build an Encoding instance from a PDFObject instance and call init() on it.
     */
    private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        $created = $this->createEncodingByPdfObject($PDFObject);
        $created->init();

        return $created;
    }
641
642
    /**
     * Build an Encoding instance from the document, header, content and config
     * of a PDFObject instance. init() is not called on the result.
     */
    private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        return new Encoding(
            $PDFObject->getDocument(),
            $PDFObject->getHeader(),
            $PDFObject->getContent(),
            $PDFObject->getConfig()
        );
    }
654
}
655