Passed
Pull Request — master (#500)
by Konrad
05:56 queued 03:54
created

Font::decodeEntities()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 14
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 3

Importance

Changes 0
Metric Value
eloc 8
dl 0
loc 14
rs 10
c 0
b 0
f 0
ccs 8
cts 8
cp 1
cc 3
nc 3
nop 1
crap 3
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use LogicException;
34
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
35
use Smalot\PdfParser\Exception\EncodingNotFoundException;
36
37
/**
38
 * Class Font
39
 */
40
class Font extends PDFObject
41
{
42
    /**
     * Placeholder returned by translateChar() for a character that has no entry in the
     * translation table (when the caller asked for the default).
     */
    const MISSING = '?';

    /**
     * Character translation table: integer character code => "utf-8" encoded string.
     * Null until loadTranslateTable() or setTable() assigns it.
     *
     * @var array|null
     */
    protected $table = null;

    /**
     * Byte widths under the keys 'from' and 'to'.
     * Null until loadTranslateTable() assigns it.
     *
     * @var array|null
     */
    protected $tableSizes = null;

    /**
     * Caches results from uchr(), keyed by the integer code passed to it.
     *
     * @var array
     */
    private static $uchrCache = [];

    /**
     * In some PDF-files encoding could be referenced by object id but object itself does not contain
     * `/Type /Encoding` in its dictionary. These objects wouldn't be initialized as Encoding in
     * \Smalot\PdfParser\PDFObject::factory() during file parsing (they would be just PDFObject).
     *
     * Therefore, we create an instance of Encoding from them during decoding and cache this value in this property.
     *
     * @var Encoding|null
     *
     * @see https://github.com/smalot/pdfparser/pull/500
     */
    private $initializedEncodingByPdfObject;
73
74 38
    /**
     * Initialise the font by loading its character translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
79
80 2
    /**
     * Returns the 'BaseFont' entry cast to string, or '[Unknown]' when the font has no such entry.
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
84
85 2
    /**
     * Returns the 'Subtype' entry of the font header cast to string.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
89
90 1
    /**
     * Returns the font's name, type and encoding merged with the details reported by the parent class.
     *
     * The font specific keys ('Name', 'Type', 'Encoding') come first and win over
     * identically named keys from the parent, because the arrays are combined with "+".
     * 'Encoding' falls back to 'Ansi' when the font has no Encoding entry.
     */
    public function getDetails(bool $deep = true): array
    {
        $own = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => ($this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi'),
        ];

        return $own + parent::getDetails($deep);
    }
102
103
    /**
     * Translate one character code, given as raw bytes, through the translation table.
     *
     * Lookup order:
     *  1. the translation table (built on demand if it has not been loaded yet);
     *  2. if $use_default is true: the MISSING placeholder;
     *  3. otherwise, for a single byte whose font Encoding resolves to WinAnsiEncoding:
     *     the byte value converted with uchr();
     *  4. otherwise the unchanged input.
     *
     * @param string $char        raw bytes of the character code
     * @param bool   $use_default return self::MISSING instead of a fallback when the code is not in the table
     *
     * @return string|bool this implementation always returns a string; "bool" stays in the
     *                     annotation because existing callers compare the result against false
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        // Build the table lazily: before init() has run the property is still null and
        // array_key_exists() would be handed null (a TypeError on PHP 8).
        $table = $this->table ?? $this->loadTranslateTable();

        $dec = hexdec(bin2hex($char));

        if (\array_key_exists($dec, $table)) {
            return $table[$dec];
        }

        // The fallback below is only ever returned when no default is requested,
        // so there is no point in computing it otherwise.
        if ($use_default) {
            return self::MISSING;
        }

        // fallback for decoding single-byte ANSI characters that are not in the lookup table
        if (\strlen($char) >= 2 || !$this->has('Encoding')) {
            return $char;
        }

        $encoding = $this->get('Encoding');

        if (!$encoding instanceof Encoding) {
            return $char;
        }

        try {
            if (WinAnsiEncoding::class === $encoding->__toString()) {
                return self::uchr($dec);
            }
        } catch (EncodingNotFoundException $e) {
            // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exists
            // See table 5.11 on PDF 1.5 specs for more info
        }

        return $char;
    }
133
134
    /**
     * Convert unicode character code to "utf-8" encoded string.
     *
     * Results are memoised in self::$uchrCache.
     *
     * mb_chr() is preferred when it exists, because the 'HTML-ENTITIES' pseudo-encoding of
     * mb_convert_encoding() is deprecated as of PHP 8.2. The entity based conversion is kept
     * as a fallback for PHP versions without mb_chr() and for codes mb_chr() rejects
     * (it returns false for them), so those inputs keep producing what they produced before.
     *
     * @param int $code unicode code point
     */
    public static function uchr(int $code): string
    {
        if (!isset(self::$uchrCache[$code])) {
            $char = \function_exists('mb_chr') ? mb_chr($code, 'UTF-8') : false;

            if (false === $char) {
                // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
                // therefore, we use mb_convert_encoding() instead
                $char = mb_convert_encoding("&#{$code};", 'UTF-8', 'HTML-ENTITIES');
            }

            self::$uchrCache[$code] = $char;
        }

        return self::$uchrCache[$code];
    }
147
148
    /**
     * Init internal chars translation table by ToUnicode CMap.
     *
     * The table is built only once; later calls return the existing table.
     * Without a ToUnicode entry the table stays empty and both byte widths stay 1.
     *
     * Sections read from the CMap content:
     *  - begincodespacerange: only the first section is used, to derive the byte widths;
     *  - beginbfchar:  <srcCode> <dstString> pairs (every section also resets the 'from' width);
     *  - beginbfrange: <srcCode1> <srcCode2> <dstString> and
     *                  <srcCode1> <srcCodeN> [<dstString1> ... <dstStringN>].
     *
     * @return array integer character code => "utf-8" encoded string
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        // Cast so that a missing (null) content is not handed to preg_match_all().
        $content = (string) $this->get('ToUnicode')->getContent();

        // Support for multiple spacerange sections (only the first one is evaluated)
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $spaceRanges)) {
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

            preg_match_all($regexp, $spaceRanges['sections'][0], $pairs);

            $this->tableSizes = [
                'from' => self::cmapHexByteLength($pairs['from']),
                'to' => self::cmapHexByteLength($pairs['to']),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $bfChars)) {
            foreach ($bfChars['sections'] as $section) {
                $regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

                preg_match_all($regexp, $section, $pairs);

                $this->tableSizes['from'] = self::cmapHexByteLength($pairs['from']);

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[(int) hexdec($from)] = self::cmapHexToUtf8($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $bfRanges)) {
            foreach ($bfRanges['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';

                preg_match_all($regexp, $section, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = (int) hexdec($from);
                    $char_to = (int) hexdec($ranges['to'][$key]);
                    $offset = (int) hexdec($ranges['offset'][$key]);

                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF file has 2-byte Unicode values on new lines > added \r\n
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';

                preg_match_all($regexp, $section, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = (int) hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $ranges['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::cmapHexToUtf8($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Number of bytes encoded by the first hexadecimal string of $hexStrings (at least 1).
     *
     * The length is rounded up, so an odd number of hex digits never yields a fractional
     * byte width (a fractional width breaks the substr() stepping of the decoder).
     *
     * @param array $hexStrings list of hexadecimal strings as captured by preg_match_all()
     */
    private static function cmapHexByteLength(array $hexStrings): int
    {
        $first = $hexStrings[0] ?? '';

        return max(1, (int) ceil(\strlen($first) / 2));
    }

    /**
     * Convert a hexadecimal destination string to "utf-8": every group of four hex digits
     * (and a shorter trailing rest, if any) is treated as one code and passed to uchr().
     */
    private static function cmapHexToUtf8(string $hex): string
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr((int) hexdec($part));
        }

        return $text;
    }
258
259
    /**
     * Replace the character translation table with a custom one.
     *
     * Expected layout of $table:
     * - key - integer character code;
     * - value - "utf-8" encoded value;
     *
     * @return void
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
270
271
    /**
     * Decode every "<hex digits>" chunk of $hexa to the bytes it encodes; all other text is copied as is.
     *
     * Input containing "<?xml" is returned untouched.
     * If $add_braces is true each decoded chunk is wrapped by parentheses and the
     * backslashes inside it are doubled.
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isBracketed = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isBracketed) {
                $decoded .= $chunk;
                continue;
            }

            // strip line breaks, then the surrounding angle brackets
            $hex = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $binary = pack('H*', $hex);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $decoded .= $binary;
            }
        }

        return $decoded;
    }
306
307
    /**
     * Replace every backslash followed by exactly three octal digits with the byte it denotes.
     * Everything else is left untouched.
     */
    public static function decodeOctal(string $text): string
    {
        return (string) preg_replace_callback(
            '/\\\\[0-7]{3}/s',
            static function (array $match): string {
                // drop the leading backslash, keep the three digits
                return \chr(octdec(substr($match[0], 1)));
            },
            $text
        );
    }
325
326
    /**
     * Decode "#xx" escapes, where xx is a two-digit hexadecimal byte value
     * (the escape form used by PDF name objects, e.g. "File#20Type" => "File Type").
     *
     * Both digits are matched as hexadecimal. Matching decimal digits only would leave
     * escapes such as "#2F" or "#C3" undecoded although the value is parsed as hexadecimal.
     * A "#" that is not followed by two hex digits is kept as is.
     */
    public static function decodeEntities(string $text): string
    {
        return (string) preg_replace_callback(
            '/#([0-9A-Fa-f]{2})/',
            static function (array $match): string {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );
    }
344
345
    /**
     * Check if given string is Unicode text (by the big-endian BOM "\xFE\xFF");
     * If true - decode to "utf-8" encoded string.
     * Otherwise - return text as is.
     *
     * The text after the BOM is read as 16-bit big-endian units. A high surrogate
     * (D800-DBFF) directly followed by a low surrogate (DC00-DFFF) is combined into one
     * code point, so characters outside the Basic Multilingual Plane are decoded correctly
     * instead of being turned into two invalid code points. A trailing single byte is
     * converted on its own.
     *
     * @todo Rename in next major release to make the name correspond to reality (for ex. decodeIfUnicode())
     */
    public static function decodeUnicode(string $text): string
    {
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $decode = substr($text, 2);
        $length = \strlen($decode);
        $text = '';

        for ($i = 0; $i < $length; $i += 2) {
            $code = (int) hexdec(bin2hex(substr($decode, $i, 2)));

            // High surrogate with a complete 16-bit unit following it
            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 3 < $length) {
                $low = (int) hexdec(bin2hex(substr($decode, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2; // the low surrogate has been consumed as well
                }
            }

            $text .= self::uchr($code);
        }

        return $text;
    }
367
368
    /**
     * Returns the font space limit taken from the config object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
375
376
    /**
     * Decode text by commands array.
     *
     * Every command is an array with the keys PDFObject::TYPE and PDFObject::COMMAND:
     *  - type 'n': a number; when it is below the font space limit a new word is started;
     *  - type '<': a hexadecimal string, decoded with decodeHexadecimal();
     *  - any other type: a string that may contain octal chunks, decoded with decodeOctal().
     *
     * Escape sequences are then resolved in a single pass, each word is converted with
     * decodeContent() and the words are joined by a space.
     */
    public function decodeText(array $commands): string
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars in ONE pass. Replacing them one after another re-scans
            // already unescaped text: an escaped backslash followed by "n" would first
            // become backslash + "n" and then wrongly a line feed.
            $text = strtr($text, [
                '\\\\' => '\\',
                '\(' => '(',
                '\)' => ')',
                '\n' => "\n",
                '\r' => "\r",
                '\t' => "\t",
                '\f' => "\f",
                '\ ' => ' ',
            ]);

            // add content to result string
            $words[$word_position] = ($words[$word_position] ?? '').$text;
        }

        // Iterate over the keys instead of by reference, so no reference outlives the loop.
        foreach (array_keys($words) as $key) {
            $words[$key] = $this->decodeContent($words[$key]);
        }

        return implode(' ', $words);
    }
423
424
    /**
     * Decode given $text to "utf-8" encoded string.
     *
     * Strategy, first match wins:
     *  1. font has a ToUnicode entry: decode by ToUnicode CMap / DescendantFonts;
     *  2. font has an Encoding entry and decoding by it yields a non-null result;
     *  3. autodetection fallback.
     *
     * @param bool $unicode This parameter is deprecated and might be removed in a future release
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
        }

        $byEncoding = $this->has('Encoding') ? $this->decodeContentByEncoding($text) : null;

        return $byEncoding ?? $this->decodeContentByAutodetectIfNecessary($text);
    }
445
446
    /**
     * First try to decode $text by ToUnicode CMap.
     * If char translation not found in ToUnicode CMap tries:
     *  - If DescendantFonts exists tries to decode char by one of that fonts.
     *      - If have no success to decode by DescendantFonts interpret $text as a string with "Windows-1252" encoding.
     *  - If DescendantFonts does not exist just return "?" as decoded char.
     *
     * The text is consumed in chunks of $this->tableSizes['from'] bytes; when that
     * width is empty the text is returned unchanged.
     *
     * @todo Seems this is invalid algorithm that do not follow pdf-format specification. Must be rewritten.
     */
    private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
    {
        $width = $this->tableSizes['from'];

        if (!$width) {
            return $text;
        }

        $decodedText = '';
        $total = \strlen($text);

        for ($offset = 0; $offset < $total; $offset += $width) {
            $chunk = substr($text, $offset, $width);
            $translated = $this->translateChar($chunk, false);

            if (false !== $translated) {
                $decodedText .= $translated;
                continue;
            }

            if (!$this->has('DescendantFonts')) {
                $decodedText .= self::MISSING;
                continue;
            }

            $descendants = $this->get('DescendantFonts');
            $fonts = $descendants instanceof PDFObject
                ? $descendants->getHeader()->getElements()
                : $descendants->getContent();

            $byDescendant = false;

            foreach ($fonts as $font) {
                if (!$font instanceof self) {
                    continue;
                }

                $byDescendant = $font->translateChar($chunk, false);

                if (false !== $byDescendant) {
                    $byDescendant = mb_convert_encoding($byDescendant, 'UTF-8', 'Windows-1252');
                    break;
                }
            }

            $decodedText .= false !== $byDescendant
                ? $byDescendant
                : mb_convert_encoding($chunk, 'UTF-8', 'Windows-1252');
        }

        return $decodedText;
    }
502
503
    /**
     * Decode content by any type of Encoding (dictionary's item) instance.
     *
     * The order of the checks matters: a PDFObject is first turned into an initialized
     * Encoding, which is then handled by the Encoding branch.
     *
     * @return string|null null when the Element branch knows no conversion for the encoding name
     *
     * @throws LogicException if unknown encoding instance type is used (given by $this->get('Encoding'))
     */
    private function decodeContentByEncoding(string $text): ?string
    {
        $encoding = $this->get('Encoding');

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself does not contain `/Type /Encoding` in it's dictionary.
        if ($encoding instanceof PDFObject) {
            $encoding = $this->getInitializedEncodingByPdfObject($encoding);
        }

        // When Encoding referenced by object id (/Encoding 520 0 R) but object itself contains `/Type /Encoding` in it's dictionary.
        if ($encoding instanceof Encoding) {
            return $this->decodeContentByEncodingEncoding($text, $encoding);
        }

        // When Encoding is just string (/Encoding /WinAnsiEncoding)
        if ($encoding instanceof Element) { //todo: ElementString class must by used?
            return $this->decodeContentByEncodingElement($text, $encoding);
        }

        // Encoding has unintended type.
        throw new LogicException('Unknown encoding instance type: '.\get_class($encoding));
    }
531
532
    /**
     * Returns the cached Encoding instance; on the first call it is created from the
     * given PDFObject, initialized and stored for later calls.
     */
    private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        if ($this->initializedEncodingByPdfObject) {
            return $this->initializedEncodingByPdfObject;
        }

        $this->initializedEncodingByPdfObject = $this->createInitializedEncodingByPdfObject($PDFObject);

        return $this->initializedEncodingByPdfObject;
    }
543
544
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Encoding.
     *
     * Every byte of $text is translated by the encoding; when the encoding returns null
     * for a byte, the byte value itself is used as the code. The code is then converted with uchr().
     */
    private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
    {
        $decoded = '';

        for ($offset = 0, $total = \strlen($text); $offset < $total; ++$offset) {
            $byte = \ord($text[$offset]);
            $mapped = $encoding->translateChar($byte);

            $decoded .= self::uchr(null === $mapped ? $byte : $mapped);
        }

        return $decoded;
    }
560
561
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Element.
     *
     * @return string|null null when there is no iconv name for the PDF encoding name
     */
    private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
    {
        // mb_convert_encoding does not support MacRoman/macintosh,
        // so we use iconv() here
        $iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($encoding->getContent());

        if (!$iconvEncodingName) {
            return null;
        }

        return iconv($iconvEncodingName, 'UTF-8', $text);
    }
574
575
    /**
     * Convert PDF encoding name to iconv-known encoding name.
     *
     * @return string|null null for a PDF encoding name that is not in the map
     */
    private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
    {
        $iconvNamesByPdfName = [
            'StandardEncoding' => 'ISO-8859-1',
            'MacRomanEncoding' => 'MACINTOSH',
            'WinAnsiEncoding' => 'CP1252',
        ];

        return $iconvNamesByPdfName[$pdfEncodingName] ?? null;
    }
590
591
    /**
     * If string seems like "utf-8" encoded string do nothing and just return given string as is.
     * Otherwise, interpret string as "Windows-1252" encoded string and convert it to "utf-8".
     *
     * @todo Why exactly `Windows-1252` used?
     *
     * @return string|false
     */
    private function decodeContentByAutodetectIfNecessary(string $text)
    {
        return mb_check_encoding($text, 'UTF-8')
            ? $text
            : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
    }
606
607
    /**
     * Create Encoding instance by PDFObject instance and call init() on it before returning it.
     */
    private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        $created = $this->createEncodingByPdfObject($PDFObject);
        $created->init();

        return $created;
    }
617
618
    /**
     * Create Encoding instance by PDFObject instance (without init).
     *
     * The new instance gets the document, header, content and config of the given object.
     */
    private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        return new Encoding(
            $PDFObject->getDocument(),
            $PDFObject->getHeader(),
            $PDFObject->getContent(),
            $PDFObject->getConfig()
        );
    }
630
}
631