Test Failed
Pull Request — master (#500)
by
unknown
08:29
created

Font::decodeText()   B

Complexity

Conditions 7
Paths 14

Size

Total Lines 43
Code Lines 25

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 23
CRAP Score 7

Importance

Changes 5
Bugs 1 Features 0
Metric Value
cc 7
eloc 25
c 5
b 1
f 0
nc 14
nop 1
dl 0
loc 43
ccs 23
cts 23
cp 1
crap 7
rs 8.5866
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
use Smalot\PdfParser\Exception\EncodingNotFoundException;
35
36
/**
37
 * Class Font
38
 */
39
class Font extends PDFObject
40
{
41
    /** Placeholder returned by translateChar() for codes missing from the table when defaults are requested. */
    const MISSING = '?';

    /**
     * Character code => decoded text lookup.
     *
     * Stays null until loadTranslateTable() builds it or setTable() injects one.
     *
     * @var array|null
     */
    protected $table = null;

    /**
     * Sizes stored under the 'from' and 'to' keys by loadTranslateTable().
     *
     * Stays null until loadTranslateTable() has run.
     *
     * @var array|null
     */
    protected $tableSizes = null;

    /**
     * Caches results from uchr.
     *
     * @var array
     */
    private static $uchrCache = [];

    /**
     * Encoding instance memoised by getInitializedEncodingByPdfObject(); not set until that method creates one.
     *
     * @var Encoding|null
     */
    private $initializedEncodingByPdfObject;
62
63 34
    /**
     * Initialises the font by delegating to loadTranslateTable().
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
68
69 2
    /**
     * Returns the 'BaseFont' entry cast to string, or '[Unknown]' when the entry is absent.
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
73
74 2
    /**
     * Returns the header's 'Subtype' entry cast to string.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
78
79 1
    /**
     * Collects the font's name, type and encoding, then appends the parent's details.
     *
     * The font-specific keys ('Name', 'Type', 'Encoding') take precedence: the array
     * union keeps the left-hand value when the parent reports the same key.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        $name = $this->getName();
        $type = $this->getType();

        // 'Ansi' is reported when the font carries no explicit Encoding entry.
        $encoding = 'Ansi';
        if ($this->has('Encoding')) {
            $encoding = (string) $this->get('Encoding');
        }

        $fontDetails = [
            'Name' => $name,
            'Type' => $type,
            'Encoding' => $encoding,
        ];

        return $fontDetails + parent::getDetails($deep);
    }
91
92
    /**
     * Translates a raw character through the font's lookup table.
     *
     * The bytes of $char are read as one big-endian number and used as the table key.
     * On a miss, the result is self::MISSING when $use_default is true; otherwise it is
     * the fallback value (the input itself, or its WinAnsi interpretation for single bytes).
     *
     * @param string $char        raw character bytes
     * @param bool   $use_default whether a table miss yields self::MISSING instead of the fallback
     *
     * @return string|bool
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // Fallback for single-byte ANSI characters that are not in the lookup table.
        $fallback = $char;

        $hasSingleByteEncoding = \strlen($char) < 2
            && $this->has('Encoding')
            && $this->get('Encoding') instanceof Encoding;

        if ($hasSingleByteEncoding) {
            try {
                if (WinAnsiEncoding::class === $this->get('Encoding')->__toString()) {
                    $fallback = self::uchr($code);
                }
            } catch (EncodingNotFoundException $e) {
                // Encoding->getEncodingClass() throws EncodingNotFoundException when the
                // BaseEncoding does not exist; the raw character is kept in that case.
                // See table 5.11 of the PDF 1.5 specification for more information.
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
122
123 32
    /**
     * Converts a numeric character code into a UTF-8 string, memoising the result per code.
     *
     * @param int $code numeric character code
     */
    public static function uchr(int $code): string
    {
        if (isset(self::$uchrCache[$code])) {
            return self::$uchrCache[$code];
        }

        // html_entity_decode() does not work with UTF-16 or UTF-32 char entities,
        // so the numeric entity is decoded through mb_convert_encoding() instead.
        $character = mb_convert_encoding("&#{$code};", 'UTF-8', 'HTML-ENTITIES');
        self::$uchrCache[$code] = $character;

        return self::$uchrCache[$code];
    }
133
134 34
    /**
     * Builds the translation table from the font's 'ToUnicode' content and returns it.
     *
     * The table is built only once: when $this->table is already set (by a previous
     * call or through setTable()), it is returned as is. Without a 'ToUnicode' entry
     * the table stays empty and both sizes stay at 1.
     *
     * Three kinds of sections are read from the content:
     *  - codespacerange: only the first section is used, to derive the 'from'/'to' sizes;
     *  - bfchar: one source code mapped to one destination hex string;
     *  - bfrange: a source range mapped either to consecutive codes starting at an
     *    offset, or to an explicit list of destination hex strings.
     *
     * @return array the translation table, keyed by source character code
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $cmap = $this->get('ToUnicode')->getContent();

        // Converts a destination hex string to text: each group of four hex digits is
        // one code, and any shorter leftover piece is converted as a code of its own.
        $hexToText = static function (string $hex): string {
            $pieces = preg_split(
                '/([0-9A-F]{4})/i',
                $hex,
                0,
                \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
            );

            $decoded = '';
            foreach ($pieces as $piece) {
                $decoded .= self::uchr(hexdec($piece));
            }

            return $decoded;
        };

        // codespacerange: the first pair of the first section gives the code sizes
        // (two hex digits per byte, never less than 1).
        $blocks = [];
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $cmap, $blocks)) {
            $pairs = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $blocks['sections'][0],
                $pairs
            );

            $this->tableSizes = [
                'from' => max(1, \strlen(current($pairs['from'])) / 2),
                'to' => max(1, \strlen(current($pairs['to'])) / 2),
            ];
        }

        // bfchar: every section is processed; each one also refreshes the 'from' size.
        $blocks = [];
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $cmap, $blocks)) {
            foreach ($blocks['sections'] as $section) {
                $pairs = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $pairs
                );

                $this->tableSizes['from'] = max(1, \strlen(current($pairs['from'])) / 2);

                foreach ($pairs['from'] as $index => $source) {
                    $this->table[hexdec($source)] = $hexToText($pairs['to'][$index]);
                }
            }
        }

        // bfrange: every section is scanned twice, once per supported notation.
        $blocks = [];
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $cmap, $blocks)) {
            foreach ($blocks['sections'] as $section) {
                // Notation: <srcCode1> <srcCode2> <dstString>
                $ranges = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $ranges
                );

                foreach ($ranges['from'] as $index => $source) {
                    $first = hexdec($source);
                    $last = hexdec($ranges['to'][$index]);
                    $base = hexdec($ranges['offset'][$index]);

                    for ($code = $first; $code <= $last; ++$code) {
                        $this->table[$code] = self::uchr($code - $first + $base);
                    }
                }

                // Notation: <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the 2-byte Unicode values on new lines, hence \r\n in the class.
                $lists = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $lists
                );

                foreach ($lists['from'] as $index => $source) {
                    $first = hexdec($source);

                    $targets = [];
                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$index], $targets);

                    foreach ($targets['string'] as $position => $hex) {
                        $this->table[$first + $position] = $hexToText($hex);
                    }
                }
            }
        }

        return $this->table;
    }
241
242 1
    /**
     * Replaces the translation table with the given one.
     *
     * Because loadTranslateTable() returns early once a table is set, a table
     * injected here will not be rebuilt from the 'ToUnicode' content afterwards.
     *
     * @param array $table character code => decoded text
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
246
247 37
    /**
     * Replaces every <hex> token found in the input with the bytes it encodes.
     *
     * Text outside such tokens is copied through unchanged. Input containing '<?xml'
     * is returned as is.
     *
     * @param string $hexa       text that may contain <hex> tokens
     * @param bool   $add_braces when true, each decoded token is wrapped in parentheses
     *                           and its backslashes are escaped
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $tokens = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($tokens as $token) {
            $isBracketed = preg_match('/^<.*>$/s', $token) && false === stripos($token, '<?xml');

            if (!$isBracketed) {
                $decoded .= $token;
                continue;
            }

            // Drop line breaks and the surrounding angle brackets before unpacking.
            $digits = trim(preg_replace("/[\r\n]/", '', $token), '<>');
            $bytes = pack('H*', $digits);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $bytes).')';
            } else {
                $decoded .= $bytes;
            }
        }

        return $decoded;
    }
279
280 37
    /**
     * Replaces every backslash followed by exactly three octal digits with the byte it denotes.
     *
     * All other text, including shorter escapes, is left untouched.
     */
    public static function decodeOctal(string $text): string
    {
        return (string) preg_replace_callback(
            '/(\\\\[0-7]{3})/s',
            static function (array $match): string {
                // $match[1] is the backslash plus three digits; skip the backslash.
                return \chr(octdec(substr($match[1], 1)));
            },
            $text
        );
    }
295
296 51
    /**
     * Decodes '#xx' escapes, where xx are two hexadecimal digits, into the byte they denote.
     *
     * The escape value is interpreted as hexadecimal, so the pattern accepts the
     * digits A-F in either case as well as 0-9; '#2F' becomes '/' and '#20' a space.
     * Text that contains no such escape is returned unchanged.
     *
     * @param string $text text that may contain '#xx' escapes
     */
    public static function decodeEntities(string $text): string
    {
        $decoded = preg_replace_callback(
            '/#([0-9A-Fa-f]{2})/',
            static function (array $match): string {
                // Two hex digits always fit in one byte, so the cast is lossless.
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; keep the input in that case.
        return $decoded ?? $text;
    }
311
312 37
    /**
     * Converts text that starts with the bytes FE FF into UTF-8, two bytes at a time.
     *
     * Text without that leading marker is returned unchanged. Each two-byte unit
     * (or a trailing single byte) is read as one big-endian number and passed to uchr().
     */
    public static function decodeUnicode(string $text): string
    {
        if (!preg_match('/^\xFE\xFF/i', $text)) {
            return $text;
        }

        // Strip the U+FEFF byte order marker.
        $payload = substr($text, 2);
        $size = \strlen($payload);
        $utf8 = '';

        for ($offset = 0; $offset < $size; $offset += 2) {
            $unit = substr($payload, $offset, 2);
            $utf8 .= self::uchr(hexdec(bin2hex($unit)));
        }

        return $utf8;
    }
327
328
    /**
     * Returns the font space limit held by the configuration object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
335
336 17
    /**
     * Decodes a list of text commands into a single string, with words joined by spaces.
     *
     * Each command is an array read through the PDFObject::TYPE and PDFObject::COMMAND keys:
     *  - type 'n' carries a number; when it is below the font space limit, the text
     *    that follows starts a new word;
     *  - type '<' carries hexadecimal text;
     *  - any other type carries text that may contain octal escapes.
     *
     * @param array $commands list of command arrays
     */
    public function decodeText(array $commands): string
    {
        $spaceLimit = $this->getFontSpaceLimit();
        $chunks = [];
        $slot = 0;

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    $adjustment = (float) (trim($command[PDFObject::COMMAND]));
                    if ($adjustment < $spaceLimit) {
                        // Point at the next free slot so the following text opens a new word.
                        $slot = \count($chunks);
                    }
                    continue 2;

                case '<':
                    // Hexadecimal text: restore the brackets the decoder looks for.
                    $piece = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Literal text: resolve octal escapes, if any.
                    $piece = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped characters.
            $piece = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $piece
            );

            // Append to the current word, creating it on first use.
            $chunks[$slot] = ($chunks[$slot] ?? '').$piece;
        }

        foreach ($chunks as $index => $chunk) {
            $chunks[$index] = $this->decodeContent($chunk);
        }

        return implode(' ', $chunks);
    }
380
381
    /**
     * Decodes raw text content into UTF-8 using the best information the font offers.
     *
     * Strategies are tried in this order:
     *  1. a 'ToUnicode' entry: translation through the lookup table;
     *  2. an 'Encoding' entry that is (or resolves to) an Encoding object: per-byte translation;
     *  3. an 'Encoding' entry that is an Element naming a known encoding: iconv() conversion;
     *  4. otherwise, or when iconv() fails: the text is kept if it is already valid
     *     UTF-8 and converted from Windows-1252 if not.
     *
     * @param string $text    raw text to decode
     * @param bool   $unicode This parameter is deprecated and might be removed in a future release
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentToUnicode($text);
        }

        if ($this->has('Encoding')) {
            $encoding = $this->get('Encoding');

            if ($encoding instanceof PDFObject) {
                $encoding = $this->getInitializedEncodingByPdfObject($encoding);
            }

            if ($encoding instanceof Encoding) {
                return $this->decodeContentByEncoding($text, $encoding);
            }

            if ($encoding instanceof Element) {
                $pdfEncodingName = $encoding->getContent();

                // mb_convert_encoding does not support MacRoman/macintosh,
                // so we use iconv() here
                $iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($pdfEncodingName);

                if ($iconvEncodingName) {
                    $converted = iconv($iconvEncodingName, 'UTF-8', $text);

                    // iconv() returns false when it cannot convert the input (e.g. a byte
                    // with no mapping in the source charset). Returning that value would
                    // either be coerced to '' or violate the string return type, so the
                    // generic fallback below is used instead of discarding the text.
                    if (false !== $converted) {
                        return $converted;
                    }
                }
            }
        }

        return $this->decodeContentToUtf8IfNecessary($text);
    }
416
417 16
    /**
     * Translates text chunk by chunk through the lookup table.
     *
     * The chunk size is the 'from' size stored by loadTranslateTable(); when that
     * size is empty the text is returned unchanged. A chunk for which translateChar()
     * yields false is looked up in the 'DescendantFonts' entry, if the font has one,
     * and replaced by self::MISSING otherwise.
     */
    private function decodeContentToUnicode(string $text): string
    {
        $width = $this->tableSizes['from'];

        if (!$width) {
            return $text;
        }

        $decodedText = '';
        $total = \strlen($text);

        for ($pos = 0; $pos < $total; $pos += $width) {
            $chunk = substr($text, $pos, $width);
            $translated = $this->translateChar($chunk, false);

            if (false !== $translated) {
                $decodedText .= $translated;
                continue;
            }

            if (!$this->has('DescendantFonts')) {
                $decodedText .= self::MISSING;
                continue;
            }

            // The entry either wraps its fonts in a header or exposes them as content.
            $descendants = $this->get('DescendantFonts');
            if ($descendants instanceof PDFObject) {
                $fonts = $descendants->getHeader()->getElements();
            } else {
                $fonts = $descendants->getContent();
            }

            // Ask each descendant font in turn; the first one that knows the chunk wins.
            $fromDescendant = false;
            foreach ($fonts as $font) {
                if (!($font instanceof self)) {
                    continue;
                }

                $fromDescendant = $font->translateChar($chunk, false);
                if (false !== $fromDescendant) {
                    $fromDescendant = mb_convert_encoding($fromDescendant, 'UTF-8', 'Windows-1252');
                    break;
                }
            }

            if (false !== $fromDescendant) {
                $decodedText .= $fromDescendant;
            } else {
                $decodedText .= mb_convert_encoding($chunk, 'UTF-8', 'Windows-1252');
            }
        }

        return $decodedText;
    }
464
465
    /**
     * Decodes text byte by byte through the given encoding.
     *
     * Each byte value is passed to the encoding's translateChar(); when that returns
     * null the byte value itself is used as the character code.
     */
    private function decodeContentByEncoding(string $text, Encoding $encoding): string
    {
        $decoded = '';

        for ($offset = 0, $size = \strlen($text); $offset < $size; ++$offset) {
            $rawCode = \ord($text[$offset]);
            $mappedCode = $encoding->translateChar($rawCode);

            $decoded .= self::uchr($mappedCode ?? $rawCode);
        }

        return $decoded;
    }
478
479 7
    /**
     * Maps a PDF encoding name to the charset name passed to iconv().
     *
     * @return string|null the iconv charset name, or null when the PDF name is not in the map
     */
    private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
    {
        $iconvNamesByPdfName = [
            'StandardEncoding' => 'ISO-8859-1',
            'MacRomanEncoding' => 'MACINTOSH',
            'WinAnsiEncoding' => 'CP1252',
        ];

        return $iconvNamesByPdfName[$pdfEncodingName] ?? null;
    }
491
492 8
    /**
     * Returns the text unchanged when it is valid UTF-8, otherwise converts it from Windows-1252.
     */
    private function decodeContentToUtf8IfNecessary($text)
    {
        $alreadyUtf8 = mb_check_encoding($text, 'UTF-8');

        return $alreadyUtf8
            ? $text
            : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
    }
500
501
    /**
     * Returns the initialised Encoding for the given PDF object, creating it on first use.
     *
     * The instance is memoised on the font: later calls return the same Encoding
     * without looking at $PDFObject again.
     *
     * @param PDFObject $PDFObject object the Encoding is built from on the first call
     */
    private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        // Build the encoding only while none is cached. Testing for a cached instance
        // *before* creating one would leave the cache empty forever and make this
        // method return null, violating its return type.
        if (null === $this->initializedEncodingByPdfObject) {
            $this->initializedEncodingByPdfObject = $this->createInitializedEncodingByPdfObject($PDFObject);
        }

        return $this->initializedEncodingByPdfObject;
    }
509
510
    /**
     * Creates an Encoding from the given PDF object and initialises it before returning it.
     */
    private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        $freshEncoding = $this->createEncodingByPdfObject($PDFObject);
        $this->initEncoding($freshEncoding);

        return $freshEncoding;
    }
518
519
    /**
     * Creates an Encoding that shares the document, header, content and config of the given PDF object.
     */
    private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
    {
        return new Encoding(
            $PDFObject->getDocument(),
            $PDFObject->getHeader(),
            $PDFObject->getContent(),
            $PDFObject->getConfig()
        );
    }
528
529
    /**
     * Initialises the encoding's header first, then the encoding itself.
     */
    private function initEncoding(Encoding $encoding)
    {
        $header = $encoding->getHeader();
        $header->init();

        $encoding->init();
    }
534
}
535