Passed
Pull Request — master (#517)
by
unknown
02:11
created

Font::calculateTextWidth()   A

Complexity

Conditions 5
Paths 3

Size

Total Lines 27
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 18
CRAP Score 5

Importance

Changes 0
Metric Value
cc 5
eloc 18
nc 3
nop 2
dl 0
loc 27
ccs 18
cts 18
cp 1
crap 5
rs 9.3554
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
use Smalot\PdfParser\Exception\EncodingNotFoundException;
35
36
/**
37
 * Class Font
38
 */
39
class Font extends PDFObject
40
{
41
    const MISSING = '?';
42
43
    /**
44
     * @var array
45
     */
46
    protected $table = null;
47
48
    /**
49
     * @var array
50
     */
51
    protected $tableSizes = null;
52
53
    /**
54
     * Caches results from uchr.
55
     *
56
     * @var array
57
     */
58
    private static $uchrCache = [];
59
60
    /**
61
     * In some PDF-files encoding could be referenced by object id but object itself does not contain
62
     * `/Type /Encoding` in its dictionary. These objects wouldn't be initialized as Encoding in
63
     * \Smalot\PdfParser\PDFObject::factory() during file parsing (they would be just PDFObject).
64
     *
65
     * Therefore, we create an instance of Encoding from them during decoding and cache this value in this property.
66
     *
67
     * @var Encoding
68
     *
69
     * @see https://github.com/smalot/pdfparser/pull/500
70
     */
71
    private $initializedEncodingByPdfObject;
72
73 39
    /**
     * Initialize the font by building its character translation table.
     */
    public function init()
    {
        // The returned table is not needed here; the call populates $this->table.
        $this->loadTranslateTable();
    }
78
79 3
    /**
     * Return the 'BaseFont' entry cast to string, or '[Unknown]' when the entry is absent.
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
83
84 3
    /**
     * Return the 'Subtype' entry of the font header cast to string.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
88
89 2
    /**
     * Return font details: 'Name', 'Type' and 'Encoding' followed by the parent's details.
     *
     * 'Encoding' falls back to 'Ansi' when the font has no 'Encoding' entry. Keys set here take
     * precedence over identically named keys returned by the parent implementation.
     */
    public function getDetails(bool $deep = true): array
    {
        $name = $this->getName();
        $type = $this->getType();
        $encoding = 'Ansi';
        if ($this->has('Encoding')) {
            $encoding = (string) $this->get('Encoding');
        }

        $own = [
            'Name' => $name,
            'Type' => $type,
            'Encoding' => $encoding,
        ];

        // Array union keeps the left-hand value for duplicate keys.
        return $own + parent::getDetails($deep);
    }
101
102
    /**
     * Translate a raw character (one or more bytes) using the translation table.
     *
     * The bytes of $char are read as a big-endian number and looked up in $this->table.
     * When there is no table entry:
     *  - with $use_default = true the MISSING placeholder is returned;
     *  - with $use_default = false the original bytes are returned, except for a single-byte
     *    character of a font whose 'Encoding' is an Encoding instance reporting WinAnsiEncoding,
     *    in which case the code is converted with uchr().
     *
     * @return string|bool NOTE(review): every path visible here returns a string
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // Fallback for decoding single-byte ANSI characters that are not in the lookup table.
        $fallback = $char;

        if (\strlen($char) < 2 && $this->has('Encoding')) {
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                try {
                    if (WinAnsiEncoding::class === $encoding->__toString()) {
                        $fallback = self::uchr($code);
                    }
                } catch (EncodingNotFoundException $e) {
                    // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exist.
                    // See table 5.11 on PDF 1.5 specs for more info.
                }
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
132
133
    /**
     * Convert a unicode character code to a "utf-8" encoded string.
     *
     * Results are memoized per code in the static $uchrCache.
     */
    public static function uchr(int $code): string
    {
        if (isset(self::$uchrCache[$code])) {
            return self::$uchrCache[$code];
        }

        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore mb_convert_encoding() is used on a numeric entity instead.
        $utf8 = mb_convert_encoding("&#{$code};", 'UTF-8', 'HTML-ENTITIES');
        self::$uchrCache[$code] = $utf8;

        return $utf8;
    }
146
147
    /**
     * Init internal chars translation table by ToUnicode CMap.
     *
     * The table is built once; later calls return the cached table. Without a 'ToUnicode'
     * entry the table stays empty and the code sizes stay at one byte.
     *
     * Parsed CMap parts:
     *  - codespacerange: only the first section is inspected, to derive source/target code sizes;
     *  - bfchar: single code to string mappings;
     *  - bfrange: both "<from> <to> <offset>" and "<from> <to> [<dst1> <dst2> ...]" forms.
     *
     * @return array map of integer character code to "utf-8" encoded value
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = ['from' => 1, 'to' => 1];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $cmap = $this->get('ToUnicode')->getContent();

        // Turns a hex string into UTF-8 text: it is cut into 4-hex-digit groups (any shorter
        // remainder is kept as its own group) and each group is converted with uchr().
        $hexToUtf8 = static function (string $hex): string {
            $groups = preg_split(
                '/([0-9A-F]{4})/i',
                $hex,
                0,
                \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
            );

            $utf8 = '';
            foreach ($groups as $group) {
                $utf8 .= self::uchr(hexdec($group));
            }

            return $utf8;
        };

        $found = [];

        // Code space ranges: only the first section is used to determine the code sizes.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $cmap, $found)) {
            $pairs = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $found['sections'][0],
                $pairs
            );

            // Two hex digits per byte; never less than one byte.
            $this->tableSizes = [
                'from' => max(1, \strlen(reset($pairs['from'])) / 2),
                'to' => max(1, \strlen(reset($pairs['to'])) / 2),
            ];
        }

        // Support for multiple bfchar sections.
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $cmap, $found)) {
            foreach ($found['sections'] as $section) {
                $pairs = [];
                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

                // The source code size is re-derived from the first pair of every section.
                $this->tableSizes['from'] = max(1, \strlen(reset($pairs['from'])) / 2);

                foreach ($pairs['from'] as $index => $sourceHex) {
                    $this->table[hexdec($sourceHex)] = $hexToUtf8($pairs['to'][$index]);
                }
            }
        }

        // Support for multiple bfrange sections.
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $cmap, $found)) {
            foreach ($found['sections'] as $section) {
                // Form: <srcCode1> <srcCode2> <dstString>
                $ranges = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $ranges
                );

                foreach ($ranges['from'] as $index => $sourceHex) {
                    $first = hexdec($sourceHex);
                    $last = hexdec($ranges['to'][$index]);
                    $target = hexdec($ranges['offset'][$index]);

                    for ($code = $first; $code <= $last; ++$code) {
                        $this->table[$code] = self::uchr($code - $first + $target);
                    }
                }

                // Form: <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the destination values on new lines, hence \r\n in the class.
                $lists = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $lists
                );

                foreach ($lists['from'] as $index => $sourceHex) {
                    $first = hexdec($sourceHex);
                    $targets = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$index], $targets);

                    foreach ($targets['string'] as $position => $targetHex) {
                        $this->table[$first + $position] = $hexToUtf8($targetHex);
                    }
                }
            }
        }

        return $this->table;
    }
257
258
    /**
     * Replace the char translation table with a custom one.
     *
     * Expected layout of $table:
     * - key - integer character code;
     * - value - "utf-8" encoded value.
     *
     * @return void
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
269
270
    /**
     * Calculate text width with data from header 'Widths'.
     *
     * Each character of $text is mapped back to its character code through the translation
     * table, then to an index in 'Widths' through the FirstChar..LastChar range. A character
     * that cannot be resolved on any of these steps is appended to $missing and contributes
     * nothing to the width.
     *
     * Missing 'Widths', 'FirstChar' or 'LastChar' details no longer raise errors: an absent
     * 'Widths' is treated as an empty list and absent range bounds as 0.
     *
     * @param string     $text    "utf-8" encoded text
     * @param array|null $missing receives the list of characters without a known width
     *
     * @return float sum of the widths of all resolvable characters
     */
    public function calculateTextWidth($text, &$missing = null): float
    {
        $missing = [];
        $width = 0.0;

        $index_map = array_flip($this->table ?? []);
        $details = $this->getDetails();

        $widths = $details['Widths'] ?? [];
        if (!\is_array($widths)) {
            $widths = [];
        }

        // Widths array is zero indexed but table is not. We must map them based on FirstChar and LastChar.
        $first_char = (int) ($details['FirstChar'] ?? 0);
        $last_char = (int) ($details['LastChar'] ?? 0);
        $width_map = array_flip(range($first_char, $last_char));

        $text = (string) $text;
        $textLength = mb_strlen($text, 'UTF-8');

        for ($i = 0; $i < $textLength; ++$i) {
            $char = mb_substr($text, $i, 1, 'UTF-8');

            if (
                !\array_key_exists($char, $index_map)
                || !\array_key_exists($index_map[$char], $width_map)
                || !\array_key_exists($width_map[$index_map[$char]], $widths)
            ) {
                $missing[] = $char;
                continue;
            }

            $width += $widths[$width_map[$index_map[$char]]];
        }

        return (float) $width;
    }
301
302
    /**
     * Decode hexadecimal encoded chunks ("<48656C6C6F>") found in the given string.
     *
     * Text outside of such chunks is copied unchanged. A string containing "<?xml" is returned
     * as is. If $add_braces is true every decoded chunk is wrapped by parentheses and its
     * backslashes are passed through an escaping preg_replace().
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $pieces = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($pieces as $piece) {
            $isHexChunk = preg_match('/^<.*>$/s', $piece) && false === stripos($piece, '<?xml');

            if (!$isHexChunk) {
                $decoded .= $piece;
                continue;
            }

            // Strip line breaks and the surrounding angle brackets, then turn hex digits into bytes.
            $hexDigits = trim(preg_replace("/[\r\n]/", '', $piece), '<>');
            $bytes = pack('H*', $hexDigits);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $bytes).')';
            } else {
                $decoded .= $bytes;
            }
        }

        return $decoded;
    }
337
338
    /**
     * Decode string with octal-encoded chunks (a backslash followed by one to three octal digits).
     *
     * An escaped backslash (two consecutive backslashes) is left untouched and the digits that
     * follow it are not treated as an octal escape. Everything else is copied unchanged.
     */
    public static function decodeOctal(string $text): string
    {
        $decoded = preg_replace_callback(
            // A backslash followed by either another backslash or 1-3 octal digits.
            '/\\\\(?:\\\\|([0-7]{1,3}))/',
            static function (array $match): string {
                if (!isset($match[1]) || '' === $match[1]) {
                    // Escaped backslash: keep both characters for the caller to unescape.
                    return $match[0];
                }

                return \chr((int) octdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; fall back to the input then.
        return $decoded ?? $text;
    }
356
357
    /**
     * Decode "#XX" escapes, where XX is a two-digit hexadecimal character code.
     *
     * Both upper and lower case hex digits are accepted. Text outside of the escapes is
     * copied unchanged.
     */
    public static function decodeEntities(string $text): string
    {
        $decoded = preg_replace_callback(
            '/#([0-9a-f]{2})/i',
            static function (array $match): string {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; fall back to the input then.
        return $decoded ?? $text;
    }
375
376
    /**
     * Check if given string is Unicode text (by the leading 0xFE 0xFF byte order mark);
     * If true - decode it, two bytes per code unit, to a "utf-8" encoded string.
     * Otherwise - return text as is.
     *
     * A high surrogate directly followed by a low surrogate is combined into a single
     * supplementary code point before conversion.
     *
     * @todo Rename in next major release to make the name correspond to reality (for ex. decodeIfUnicode())
     */
    public static function decodeUnicode(string $text): string
    {
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = (string) substr($text, 2);
        $length = \strlen($payload);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $unit = (int) hexdec(bin2hex(substr($payload, $i, 2)));

            // Combine a surrogate pair when a complete second code unit is available.
            if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 3 < $length) {
                $next = (int) hexdec(bin2hex(substr($payload, $i + 2, 2)));

                if ($next >= 0xDC00 && $next <= 0xDFFF) {
                    $unit = 0x10000 + (($unit - 0xD800) << 10) + ($next - 0xDC00);
                    $i += 2;
                }
            }

            $decoded .= self::uchr($unit);
        }

        return $decoded;
    }
398
399
    /**
     * Return the font space limit taken from the config object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
406
407
    /**
     * Decode text by commands array.
     *
     * Commands are collected into words: a numeric ('n') command whose value is below the font
     * space limit moves the write position to a new word slot, any other command appends its
     * decoded operand to the current slot. Every word is then passed through decodeContent()
     * and the words are joined with a single space.
     */
    public function decodeText(array $commands): string
    {
        $spaceLimit = $this->getFontSpaceLimit();
        $chunks = [];
        $slot = 0;

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) trim($command[PDFObject::COMMAND]) < $spaceLimit) {
                        $slot = \count($chunks);
                    }

                    // Numeric commands carry no text of their own.
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $raw = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;
                default:
                    // Decode octal (if necessary).
                    $raw = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars.
            $unescaped = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $raw
            );

            // Append to the current word, creating it on first use.
            $chunks[$slot] = ($chunks[$slot] ?? '').$unescaped;
        }

        $decodedWords = array_map(
            function (string $chunk): string {
                return $this->decodeContent($chunk);
            },
            $chunks
        );

        return implode(' ', $decodedWords);
    }
454
455
    /**
     * Decode given $text to "utf-8" encoded string.
     *
     * Strategy, in order: the ToUnicode CMap when the font has one; otherwise the font's
     * 'Encoding' entry when present and usable; otherwise autodetection.
     *
     * @param bool $unicode This parameter is deprecated and might be removed in a future release
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
        }

        // null means "no Encoding entry" or "Encoding could not be applied".
        $byEncoding = $this->has('Encoding') ? $this->decodeContentByEncoding($text) : null;

        return $byEncoding ?? $this->decodeContentByAutodetectIfNecessary($text);
    }
476
477
    /**
     * First try to decode $text by ToUnicode CMap, reading it in chunks of the source code size.
     * If char translation not found in ToUnicode CMap tries:
     *  - If DescendantFonts exists tries to decode char by one of that fonts.
     *      - If have no success to decode by DescendantFonts interpret $text as a string with "Windows-1252" encoding.
     *  - If DescendantFonts does not exist just return "?" as decoded char.
     *
     * When the source code size is falsy the text is returned untouched.
     *
     * @todo Seems this is invalid algorithm that do not follow pdf-format specification. Must be rewritten.
     */
    private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
    {
        $chunkSize = $this->tableSizes['from'];

        if (!$chunkSize) {
            return $text;
        }

        $decodedText = '';
        $total = \strlen($text);

        for ($offset = 0; $offset < $total; $offset += $chunkSize) {
            $chunk = substr($text, $offset, $chunkSize);
            $translated = $this->translateChar($chunk, false);

            if (false !== $translated) {
                $decodedText .= $translated;
                continue;
            }

            if (!$this->has('DescendantFonts')) {
                $decodedText .= self::MISSING;
                continue;
            }

            if ($this->get('DescendantFonts') instanceof PDFObject) {
                $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
            } else {
                $fonts = $this->get('DescendantFonts')->getContent();
            }

            // The first descendant font that yields a translation wins.
            $fromDescendant = false;
            foreach ($fonts as $font) {
                if (!$font instanceof self) {
                    continue;
                }

                $fromDescendant = $font->translateChar($chunk, false);
                if (false !== $fromDescendant) {
                    $fromDescendant = mb_convert_encoding($fromDescendant, 'UTF-8', 'Windows-1252');
                    break;
                }
            }

            $decodedText .= false !== $fromDescendant
                ? $fromDescendant
                : mb_convert_encoding($chunk, 'UTF-8', 'Windows-1252');
        }

        return $decodedText;
    }
533
534
    /**
     * Decode content by any type of Encoding (dictionary's item) instance.
     *
     * @return string|null decoded text; null can come back from the Element based decoding
     *                     when the encoding name is not known
     */
    private function decodeContentByEncoding(string $text): ?string
    {
        $encoding = $this->get('Encoding');

        // Encoding referenced by object id (/Encoding 520 0 R) whose object lacks `/Type /Encoding`
        // in its dictionary: wrap it into an initialized Encoding instance.
        if ($encoding instanceof PDFObject) {
            $encoding = $this->getInitializedEncodingByPdfObject($encoding);
        }

        // Encoding object (either referenced with `/Type /Encoding` or created just above).
        if ($encoding instanceof Encoding) {
            return $this->decodeContentByEncodingEncoding($text, $encoding);
        }

        // Encoding is just a name (/Encoding /WinAnsiEncoding).
        if ($encoding instanceof Element) { //todo: ElementString class must by used?
            return $this->decodeContentByEncodingElement($text, $encoding);
        }

        // Don't double-encode strings already in UTF-8.
        return mb_check_encoding($text, 'UTF-8')
            ? $text
            : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
    }
563
564
    /**
     * Return the cached Encoding instance, creating and initializing it from the given
     * PDFObject on first use.
     *
     * Once an instance is cached it is returned for every later call, whatever object is passed.
     */
    private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        if ($this->initializedEncodingByPdfObject) {
            return $this->initializedEncodingByPdfObject;
        }

        $this->initializedEncodingByPdfObject = $this->createInitializedEncodingByPdfObject($PDFObject);

        return $this->initializedEncodingByPdfObject;
    }
575
576
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Encoding.
     *
     * Every byte is translated through the encoding; a byte without a translation keeps its
     * own value. The resulting code is converted with uchr().
     */
    private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
    {
        $decoded = '';
        $byteCount = \strlen($text);

        for ($position = 0; $position < $byteCount; ++$position) {
            $byte = \ord($text[$position]);
            $mapped = $encoding->translateChar($byte);
            $decoded .= self::uchr($mapped ?? $byte);
        }

        return $decoded;
    }
592
593
    /**
     * Decode content when $encoding (given by $this->get('Encoding')) is instance of Element.
     *
     * @return string|null converted text, or null when the encoding name has no iconv counterpart
     */
    private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
    {
        // mb_convert_encoding does not support MacRoman/macintosh, so iconv() is used here.
        $iconvName = $this->getIconvEncodingNameOrNullByPdfEncodingName($encoding->getContent());

        if (!$iconvName) {
            return null;
        }

        return iconv($iconvName, 'UTF-8', $text);
    }
606
607
    /**
     * Convert PDF encoding name to iconv-known encoding name.
     *
     * @return string|null iconv name, or null for a PDF encoding name that is not mapped
     */
    private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
    {
        $iconvNames = [
            'StandardEncoding' => 'ISO-8859-1',
            'MacRomanEncoding' => 'MACINTOSH',
            'WinAnsiEncoding' => 'CP1252',
        ];

        return $iconvNames[$pdfEncodingName] ?? null;
    }
622
623
    /**
     * If string seems like "utf-8" encoded string do nothing and just return given string as is.
     * Otherwise, interpret string as "Windows-1252" encoded string and convert it to "utf-8".
     *
     * @return string|false
     */
    private function decodeContentByAutodetectIfNecessary(string $text)
    {
        //todo: Why exactly `Windows-1252` used?
        return mb_check_encoding($text, 'UTF-8')
            ? $text
            : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
    }
638
639
    /**
     * Create Encoding instance by PDFObject instance and call init() on it before returning.
     */
    private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        $created = $this->createEncodingByPdfObject($PDFObject);
        $created->init();

        return $created;
    }
649
650
    /**
     * Create Encoding instance by PDFObject instance (without init).
     *
     * The new instance receives the document, header, content and config of the given object.
     */
    private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        return new Encoding(
            $PDFObject->getDocument(),
            $PDFObject->getHeader(),
            $PDFObject->getContent(),
            $PDFObject->getConfig()
        );
    }
662
}
663