Passed
Pull Request — master (#500)
by Konrad
05:56 queued 03:54
created

decodeContentByToUnicodeCMapOrDescendantFonts()   B

Complexity

Conditions 10
Paths 2

Size

Total Lines 46
Code Lines 28

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 25.6155

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 10
eloc 28
c 1
b 0
f 0
nc 2
nop 1
dl 0
loc 46
ccs 12
cts 26
cp 0.4615
crap 25.6155
rs 7.6666

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use LogicException;
34
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
35
use Smalot\PdfParser\Exception\EncodingNotFoundException;
36
37
/**
38
 * Class Font
39
 */
40
class Font extends PDFObject
41
{
42
    const MISSING = '?';
43
44
    /**
45
     * @var array
46
     */
47
    protected $table = null;
48
49
    /**
50
     * @var array
51
     */
52
    protected $tableSizes = null;
53
54
    /**
55
     * Caches results from uchr.
56
     *
57
     * @var array
58
     */
59
    private static $uchrCache = [];
60
61
    /**
62
     * In some PDF-files encoding could be referenced by object id but object itself does not contain
63
     * `/Type /Encoding` in its dictionary. These objects wouldn't be initialized as Encoding in
64
     * \Smalot\PdfParser\PDFObject::factory() during file parsing (they would be just PDFObject).
65
     *
66
     * Therefore, we create an instance of Encoding from them during decoding and cache this value in this property.
67
     *
68
     * @var Encoding
69
     *
70
     * @see https://github.com/smalot/pdfparser/pull/500
71
     */
72
    private $initializedEncodingByPdfObject;
73
74 38
    /**
     * Initialise the font by building its character translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
79
80 2
    /**
     * Name of the font as given by its `BaseFont` entry.
     *
     * @return string the `BaseFont` value cast to string, or "[Unknown]" when the entry is absent
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
84
85 2
    /**
     * Font type, read from the `Subtype` entry of the object header.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
89
90 1
    /**
     * Collect the font details: name, type and encoding, completed by the parent's details.
     *
     * Keys set here ("Name", "Type", "Encoding") take precedence over equally named keys
     * returned by the parent implementation.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        $fontDetails = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            // "Ansi" is reported when the font declares no Encoding entry.
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        return $fontDetails + parent::getDetails($deep);
    }
102
103
    /**
     * Translate one character code (given as raw bytes) through the translation table.
     *
     * When the code is not in the table:
     *  - with $use_default = true the placeholder self::MISSING is returned;
     *  - otherwise the raw bytes are returned, except for a single byte of a font whose
     *    Encoding entry is an Encoding instance resolving to WinAnsiEncoding, in which case
     *    the byte value is converted to "utf-8" as a code point.
     *
     * @return string|bool
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // Fallback for decoding single-byte ANSI characters that are not in the lookup table.
        $fallback = $char;
        $isSingleByteWithEncodingObject = \strlen($char) < 2
            && $this->has('Encoding')
            && $this->get('Encoding') instanceof Encoding;

        if ($isSingleByteWithEncodingObject) {
            try {
                if (WinAnsiEncoding::class === $this->get('Encoding')->__toString()) {
                    $fallback = self::uchr($code);
                }
            } catch (EncodingNotFoundException $e) {
                // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exist.
                // See table 5.11 on PDF 1.5 specs for more info.
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
133
134
    /**
     * Convert unicode character code to "utf-8" encoded string.
     *
     * Results are memoised in self::$uchrCache, keyed by the code.
     */
    public static function uchr(int $code): string
    {
        if (isset(self::$uchrCache[$code])) {
            return self::$uchrCache[$code];
        }

        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore, we use mb_convert_encoding() instead
        $utf8 = mb_convert_encoding("&#{$code};", 'UTF-8', 'HTML-ENTITIES');
        self::$uchrCache[$code] = $utf8;

        return $utf8;
    }
147
148
    /**
     * Init internal chars translation table by ToUnicode CMap.
     *
     * The table is built once and cached in $this->table; later calls return the cached array.
     * When the font has no `ToUnicode` entry the table stays empty and both code sizes
     * ($this->tableSizes) keep their default of 1 byte.
     *
     * @return array map of integer source character code => "utf-8" encoded replacement text
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        // Code sizes are derived from the first entry of the first codespacerange section only.
        if (preg_match('/begincodespacerange(?P<section>.*?)endcodespacerange/s', $content, $codeSpace)) {
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $codeSpace['section'],
                $ranges
            );

            $this->tableSizes = [
                'from' => self::hexByteLength($ranges['from'][0] ?? ''),
                'to' => self::hexByteLength($ranges['to'][0] ?? ''),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $bfcharBlocks)) {
            foreach ($bfcharBlocks['sections'] as $section) {
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $pairs
                );

                // The source code size follows the first entry of the section being read.
                $this->tableSizes['from'] = self::hexByteLength($pairs['from'][0] ?? '');

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[(int) hexdec($from)] = self::decodeUtf16HexString($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $bfrangeBlocks)) {
            foreach ($bfrangeBlocks['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $ranges
                );

                foreach ($ranges['from'] as $key => $from) {
                    $firstCode = (int) hexdec($from);
                    $lastCode = (int) hexdec($ranges['to'][$key]);
                    $firstTarget = (int) hexdec($ranges['offset'][$key]);

                    // Consecutive source codes map to consecutive destination code points.
                    for ($code = $firstCode; $code <= $lastCode; ++$code) {
                        $this->table[$code] = self::uchr($code - $firstCode + $firstTarget);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF file has 2-byte Unicode values on new lines > added \r\n
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $arrays
                );

                foreach ($arrays['from'] as $key => $from) {
                    $firstCode = (int) hexdec($from);
                    $targets = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $arrays['strings'][$key], $targets);

                    foreach ($targets['string'] as $position => $string) {
                        $this->table[$firstCode + $position] = self::decodeUtf16HexString($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Number of bytes covered by a hexadecimal code string, always a whole number and at least 1.
     *
     * An odd number of hex digits is rounded up, so the result can safely be used as a
     * substr() length and as a loop step.
     */
    private static function hexByteLength(string $hex): int
    {
        return max(1, intdiv(\strlen($hex) + 1, 2));
    }

    /**
     * Convert a CMap destination string (hex digits read as 16-bit units) to "utf-8".
     *
     * The string is cut into groups of 4 hex digits; a leftover shorter group is converted as
     * a code point of its own. A high surrogate directly followed by a low surrogate is merged
     * into the single supplementary code point the pair stands for, instead of converting the
     * two halves separately.
     */
    private static function decodeUtf16HexString(string $hex): string
    {
        $units = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        $count = \count($units);

        for ($i = 0; $i < $count; ++$i) {
            $code = (int) hexdec($units[$i]);

            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 1 < $count && 4 === \strlen($units[$i + 1])) {
                $low = (int) hexdec($units[$i + 1]);

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    ++$i; // the low surrogate has been consumed
                }
            }

            $text .= self::uchr($code);
        }

        return $text;
    }
258
259
    /**
     * Replace the char translation table with a custom one.
     *
     * Expected layout of $table:
     * - key - integer character code;
     * - value - "utf-8" encoded value;
     *
     * @return void
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
270
271
    /**
     * Decode hexadecimal encoded chunks (`<...>`) found in the given string.
     *
     * Text outside of such chunks is copied unchanged, and a string containing `<?xml` is
     * returned untouched. If $add_braces is true every decoded chunk is wrapped by parentheses
     * and the backslashes it contains are doubled.
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $decoded = '';
        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);

        foreach ($chunks as $chunk) {
            $isHexChunk = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isHexChunk) {
                $decoded .= $chunk;
                continue;
            }

            // Strip line breaks and the surrounding angle brackets before unpacking.
            $hexDigits = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $binary = pack('H*', $hexDigits);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $decoded .= $binary;
            }
        }

        return $decoded;
    }
306
307
    /**
     * Decode string with octal-encoded chunks.
     *
     * Every backslash followed by exactly three octal digits is replaced by the byte with that
     * value; everything else is copied unchanged.
     */
    public static function decodeOctal(string $text): string
    {
        $chunks = preg_split('/(\\\\[0-7]{3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isOctalEscape = 1 === preg_match('/^\\\\[0-7]{3}$/', $chunk);
            $decoded .= $isOctalEscape ? \chr(octdec(trim($chunk, '\\'))) : $chunk;
        }

        return $decoded;
    }
325
326
    /**
     * Decode `#xx` escapes, where xx are two hexadecimal digits, to the byte with that value.
     *
     * Both digits are read as hexadecimal (upper or lower case), so `#20` becomes a space and
     * `#C3#A9` becomes the two bytes 0xC3 0xA9. Text outside of such escapes is copied unchanged.
     */
    public static function decodeEntities(string $text): string
    {
        $parts = preg_split(
            '/(#[0-9A-Fa-f]{2})/s',
            $text,
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );
        $decoded = '';

        foreach ($parts as $part) {
            if (preg_match('/^#[0-9A-Fa-f]{2}$/D', $part)) {
                // Two hex digits never exceed 255, so the cast to int is lossless.
                $decoded .= \chr((int) hexdec(substr($part, 1)));
            } else {
                $decoded .= $part;
            }
        }

        return $decoded;
    }
344
345
    /**
     * Check if given string is Unicode text (by the leading byte order marker 0xFE 0xFF);
     * If true - decode it, two bytes per code unit, to "utf-8" encoded string.
     * Otherwise - return text as is.
     *
     * A high surrogate directly followed by a low surrogate is decoded as the single
     * supplementary code point the pair stands for.
     *
     * @todo Rename in next major release to make the name correspond to reality (for ex. decodeIfUnicode())
     */
    public static function decodeUnicode(string $text): string
    {
        // Exact byte comparison: the marker has no meaningful "case" to ignore.
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = substr($text, 2);
        $length = \strlen($payload);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $code = (int) hexdec(bin2hex(substr($payload, $i, 2)));

            // Merge a surrogate pair, but only when a complete second unit is available.
            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 3 < $length) {
                $low = (int) hexdec(bin2hex(substr($payload, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2; // the low surrogate has been consumed
                }
            }

            $decoded .= self::uchr($code);
        }

        return $decoded;
    }
367
368
    /**
     * Font space limit as provided by the configuration object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
375
376
    /**
     * Decode text by commands array.
     *
     * Commands are processed in order:
     *  - type 'n' (number): when its value is below the font space limit, the following
     *    text starts a new word;
     *  - type '<': the command is decoded as a hexadecimal string;
     *  - any other type: octal escapes of the command are decoded.
     * Backslash escapes are then resolved, the text is appended to the current word, and at
     * the end every word is passed through decodeContent() and the words are joined by a space.
     *
     * @param array $commands list of commands, each indexed by PDFObject::TYPE and PDFObject::COMMAND
     */
    public function decodeText(array $commands): string
    {
        // Resolved in a single pass by strtr(), so text produced by one replacement is never
        // scanned again (an escaped backslash followed by "n" stays a backslash and an "n").
        $escapeMap = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = strtr($text, $escapeMap);

            // add content to result string
            $words[$word_position] = ($words[$word_position] ?? '').$text;
        }

        $decodedWords = array_map(
            function (string $word): string {
                return $this->decodeContent($word);
            },
            $words
        );

        return implode(' ', $decodedWords);
    }
423
424
    /**
     * Decode given $text to "utf-8" encoded string.
     *
     * Strategy, first match wins:
     *  1. the font has a ToUnicode entry: decode by the ToUnicode CMap (or descendant fonts);
     *  2. the font has an Encoding entry and decoding by it yields a result: use that result;
     *  3. otherwise fall back to autodetection.
     *
     * @param bool $unicode This parameter is deprecated and might be removed in a future release
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
        }

        $decodedByEncoding = $this->has('Encoding') ? $this->decodeContentByEncoding($text) : null;

        return $decodedByEncoding ?? $this->decodeContentByAutodetectIfNecessary($text);
    }
445
446
    /**
     * First try to decode $text by ToUnicode CMap, reading it in chunks of the CMap's source code size.
     * If char translation not found in ToUnicode CMap tries:
     *  - If DescendantFonts exists tries to decode char by one of that fonts.
     *      - If have no success to decode by DescendantFonts interpret $text as a string with "Windows-1252" encoding.
     *  - If DescendantFonts does not exist just return "?" as decoded char.
     *
     * @todo Seems this is invalid algorithm that do not follow pdf-format specification. Must be rewritten.
     */
    private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
    {
        $chunkSize = $this->tableSizes['from'];

        // Without a usable code size the text cannot be cut into codes: hand it back untouched.
        if (!$chunkSize) {
            return $text;
        }

        $decodedText = '';
        $textLength = \strlen($text);

        for ($offset = 0; $offset < $textLength; $offset += $chunkSize) {
            $chunk = substr($text, $offset, $chunkSize);
            $translated = $this->translateChar($chunk, false);

            if (false !== $translated) {
                $decodedText .= $translated;
                continue;
            }

            if (!$this->has('DescendantFonts')) {
                $decodedText .= self::MISSING;
                continue;
            }

            if ($this->get('DescendantFonts') instanceof PDFObject) {
                $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
            } else {
                $fonts = $this->get('DescendantFonts')->getContent();
            }

            // Ask each descendant font in turn; the first one that knows the code wins.
            $fromDescendant = false;

            foreach ($fonts as $font) {
                if (!$font instanceof self) {
                    continue;
                }

                $fromDescendant = $font->translateChar($chunk, false);

                if (false !== $fromDescendant) {
                    $fromDescendant = mb_convert_encoding($fromDescendant, 'UTF-8', 'Windows-1252');
                    break;
                }
            }

            $decodedText .= (false !== $fromDescendant)
                ? $fromDescendant
                : mb_convert_encoding($chunk, 'UTF-8', 'Windows-1252');
        }

        return $decodedText;
    }
502
503
    /**
     * Decode content by any type of Encoding (dictionary's item) instance.
     *
     * @return string|null the decoded text, or null when the Element based decoding has no result
     *
     * @throws LogicException if unknown encoding instance type is used (given by $this->get('Encoding'))
     */
    private function decodeContentByEncoding(string $text): ?string
    {
        $encoding = $this->get('Encoding');

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself does not contain `/Type /Encoding` in its dictionary.
        if ($encoding instanceof PDFObject) {
            $encoding = $this->getInitializedEncodingByPdfObject($encoding);
        }

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself contains `/Type /Encoding` in its dictionary.
        if ($encoding instanceof Encoding) {
            return $this->decodeContentByEncodingEncoding($text, $encoding);
        }

        // Anything that is not an Element at this point has an unintended type.
        if (!$encoding instanceof Element) {
            throw new LogicException('Unknown encoding instance type: '.\get_class($encoding));
        }

        // When Encoding is just string (/Encoding /WinAnsiEncoding)
        //todo: ElementString class must by used?
        return $this->decodeContentByEncodingElement($text, $encoding);
    }
531
532
    /**
     * Return the cached Encoding instance, creating it from the given PDFObject on first use.
     *
     * Once an instance is cached it is returned for every later call, whatever object is passed.
     */
    private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        if ($this->initializedEncodingByPdfObject) {
            return $this->initializedEncodingByPdfObject;
        }

        $this->initializedEncodingByPdfObject = $this->createInitializedEncodingByPdfObject($PDFObject);

        return $this->initializedEncodingByPdfObject;
    }
543
544
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Encoding.
     *
     * Each byte of $text is translated by the encoding; when the encoding has no translation
     * (null) the byte value itself is used as the code point.
     */
    private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
    {
        $decoded = '';
        $byteCount = \strlen($text);

        for ($index = 0; $index < $byteCount; ++$index) {
            $byteValue = \ord($text[$index]);
            $mapped = $encoding->translateChar($byteValue);

            $decoded .= self::uchr(null !== $mapped ? $mapped : $byteValue);
        }

        return $decoded;
    }
560
561
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Element.
     *
     * @return string|null the text converted to "utf-8", or null when the encoding name is not
     *                     known or the conversion itself fails, so that the caller can fall back
     *                     to another decoding strategy instead of receiving an empty string
     */
    private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
    {
        $pdfEncodingName = $encoding->getContent();

        // mb_convert_encoding does not support MacRoman/macintosh,
        // so we use iconv() here
        $iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($pdfEncodingName);

        if (!$iconvEncodingName) {
            return null;
        }

        // iconv() returns false when it cannot convert the input (for ex. an illegal byte).
        $converted = iconv($iconvEncodingName, 'UTF-8', $text);

        return false === $converted ? null : $converted;
    }
574
575
    /**
     * Convert PDF encoding name to iconv-known encoding name.
     *
     * @return string|null the iconv name, or null when the PDF encoding name is not in the map
     */
    private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
    {
        $iconvNameByPdfName = [
            'StandardEncoding' => 'ISO-8859-1',
            'MacRomanEncoding' => 'MACINTOSH',
            'WinAnsiEncoding' => 'CP1252',
        ];

        return $iconvNameByPdfName[$pdfEncodingName] ?? null;
    }
590
591
    /**
     * If string seems like "utf-8" encoded string do nothing and just return given string as is.
     * Otherwise, interpret string as "Windows-1252" encoded string.
     *
     * @return string|false
     */
    private function decodeContentByAutodetectIfNecessary(string $text)
    {
        $looksLikeUtf8 = mb_check_encoding($text, 'UTF-8');

        //todo: Why exactly `Windows-1252` used?
        return $looksLikeUtf8 ? $text : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
    }
606
607
    /**
     * Create Encoding instance by PDFObject instance and init it.
     */
    private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        $initialized = $this->createEncodingByPdfObject($PDFObject);
        $initialized->init();

        return $initialized;
    }
617
618
    /**
     * Create Encoding instance by PDFObject instance (without init).
     *
     * The new instance receives the document, header, content and config of the given object.
     */
    private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        return new Encoding(
            $PDFObject->getDocument(),
            $PDFObject->getHeader(),
            $PDFObject->getContent(),
            $PDFObject->getConfig()
        );
    }
630
}
631