Test Failed
Pull Request — master (#510)
by Jeremy
04:34 queued 02:12
created

Font::uchr()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 9
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 2

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 2
eloc 3
nc 2
nop 1
dl 0
loc 9
ccs 4
cts 4
cp 1
crap 2
rs 10
c 2
b 0
f 0
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
use Smalot\PdfParser\Exception\EncodingNotFoundException;
35
36
/**
37
 * Class Font
38
 */
39
class Font extends PDFObject
40
{
41
    const MISSING = '?';
42
43
    /**
44
     * @var array|null character code => decoded string map; null until loadTranslateTable() or setTable() has run
45
     */
46
    protected $table = null;
47
48
    /**
49
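     * @var array|null byte sizes of the source ('from') and target ('to') codes; null until loadTranslateTable() has run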
50
     */
51
    protected $tableSizes = null;
52
53
    /**
54
     * Caches results from uchr.
55
     *
56
     * @var array
57
     */
58
    private static $uchrCache = [];
59
60 36
    /**
     * Initialises the font by loading its character translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
65
66 1
    /**
     * Returns the "BaseFont" entry as a string, or the placeholder
     * "[Unknown]" when the font has no such entry.
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
70
71 1
    /**
     * Returns the "Subtype" entry of the font header as a string.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
75
76
    /**
     * Collects the font name, type and encoding and completes them with the
     * details reported by the parent implementation.
     *
     * The array union keeps the font's own "Name", "Type" and "Encoding"
     * entries when the parent reports keys with the same name.
     * "Encoding" falls back to "Ansi" when the font has no such entry.
     */
    public function getDetails(bool $deep = true): array
    {
        $fontDetails = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        return $fontDetails + parent::getDetails($deep);
    }
88
89
    /**
     * Translates one raw character code through the translation table.
     *
     * The bytes of $char are read as a single big-endian integer and looked up
     * in the table. When the table has no entry for that code:
     * - with $use_default = true the placeholder self::MISSING is returned;
     * - with $use_default = false the raw $char is returned, except for
     *   single-byte codes of a font whose "Encoding" entry is an Encoding
     *   object resolving to WinAnsiEncoding, which are converted with uchr().
     *
     * @param string $char        raw bytes of one character code
     * @param bool   $use_default whether unknown codes yield self::MISSING
     *
     * @return string|bool the table entry, self::MISSING or the fallback value
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        // The table stays null until loadTranslateTable() or setTable() has run.
        // Load it on demand so array_key_exists() never receives null, which
        // is a TypeError on PHP 8.
        $table = $this->table ?? $this->loadTranslateTable();

        $dec = hexdec(bin2hex($char));

        if (\array_key_exists($dec, $table)) {
            return $table[$dec];
        }

        // fallback for decoding single-byte ANSI characters that are not in the lookup table
        $fallbackDecoded = $char;
        if (
            \strlen($char) < 2
            && $this->has('Encoding')
            && $this->get('Encoding') instanceof Encoding
        ) {
            try {
                if (WinAnsiEncoding::class === $this->get('Encoding')->__toString()) {
                    $fallbackDecoded = self::uchr($dec);
                }
            } catch (EncodingNotFoundException $e) {
                // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exist.
                // See table 5.11 on PDF 1.5 specs for more info.
            }
        }

        return $use_default ? self::MISSING : $fallbackDecoded;
    }
119
120
    /**
     * Returns the "utf-8" encoded string for a unicode character code.
     *
     * Results are memoised in self::$uchrCache, so each code is converted
     * only once.
     */
    public static function uchr(int $code): string
    {
        if (isset(self::$uchrCache[$code])) {
            return self::$uchrCache[$code];
        }

        // mb_convert_encoding() is used because html_entity_decode() does not
        // work with UTF-16 or UTF-32 char entities.
        $utf8 = mb_convert_encoding("&#{$code};", 'UTF-8', 'HTML-ENTITIES');
        self::$uchrCache[$code] = $utf8;

        return $utf8;
    }
133
134
    /**
     * Init internal chars translation table by ToUnicode CMap.
     *
     * The table is built once; later calls return the existing table. When the
     * font has a "ToUnicode" entry its content is scanned for:
     * - the first "codespacerange" section, which sets the byte sizes of the codes;
     * - every "bfchar" section (single code => string mappings);
     * - every "bfrange" section, both in the "<from> <to> <offset>" form and
     *   in the "<from> <to> [<string1> ... <stringN>]" form.
     *
     * @return array translation table (integer character code => "utf-8" string)
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();
        $sections = [];

        // Only the first codespacerange section determines the code sizes.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';
            $ranges = [];

            preg_match_all($regexp, $sections['sections'][0], $ranges);

            $this->tableSizes = [
                'from' => self::hexByteLength(current($ranges['from'])),
                'to' => self::hexByteLength(current($ranges['to'])),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            $regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

            foreach ($sections['sections'] as $section) {
                $pairs = [];
                preg_match_all($regexp, $section, $pairs);

                $this->tableSizes['from'] = self::hexByteLength(current($pairs['from']));

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::hexToUtf8($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';
                $ranges = [];

                preg_match_all($regexp, $section, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($ranges['to'][$key]);
                    $offset = hexdec($ranges['offset'][$key]);

                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        // hexdec() may return a float, uchr() expects an int.
                        $this->table[$char] = self::uchr((int) ($char - $char_from + $offset));
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files have 2-byte Unicode values on new lines > added \r\n
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';
                $lists = [];

                preg_match_all($regexp, $section, $lists);

                foreach ($lists['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::hexToUtf8($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Returns the number of bytes encoded by a hexadecimal string (at least 1).
     *
     * The result is always an integer: an odd number of digits is rounded up,
     * and a missing value (false or an empty string) counts as one byte.
     *
     * @param string|false $hex hexadecimal digits, or false when nothing matched
     */
    private static function hexByteLength($hex): int
    {
        return max(1, (int) ((\strlen((string) $hex) + 1) / 2));
    }

    /**
     * Converts a hexadecimal destination string into "utf-8" text by decoding
     * it in groups of four hexadecimal digits (any shorter remainder is
     * decoded as its own code).
     */
    private static function hexToUtf8(string $hex): string
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            // hexdec() may return a float, uchr() expects an int.
            $text .= self::uchr((int) hexdec($part));
        }

        return $text;
    }
244
245
    /**
     * Replaces the char translation table with a custom one.
     *
     * Expected layout of $table:
     * - key: integer character code;
     * - value: "utf-8" encoded value.
     *
     * @return void
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
256
257
    /**
     * Decodes the hexadecimal strings ("<...>") found in the given text into raw bytes.
     *
     * Text outside of hexadecimal strings is kept as is. Input containing
     * "<?xml" is returned untouched. When $add_braces is true every decoded
     * string is wrapped in parentheses and its backslashes are escaped.
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // XML content is passed through unchanged.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isHexString = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isHexString) {
                $decoded .= $chunk;

                continue;
            }

            // Drop line breaks and the enclosing angle brackets, then turn
            // the remaining hexadecimal digits into bytes.
            $digits = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $bytes = pack('H*', $digits);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $bytes).')';
            } else {
                $decoded .= $bytes;
            }
        }

        return $decoded;
    }
292
293
    /**
     * Replaces every backslash followed by exactly three octal digits with
     * the byte of that octal value; all other text is left unchanged.
     */
    public static function decodeOctal(string $text): string
    {
        return (string) preg_replace_callback(
            '/(\\\\[0-7]{3})/s',
            static function (array $match): string {
                // Strip the leading backslash and convert the octal digits to a single byte.
                return \chr(octdec(trim($match[1], '\\')));
            },
            $text
        );
    }
311
312
    /**
     * Decodes "#xx" escapes, where xx are two hexadecimal digits, into the
     * byte they stand for (for example "#20" becomes a space and "#2F" a slash).
     *
     * Both digits may be any hexadecimal digit (0-9, a-f, A-F); a "#" that is
     * not followed by two hexadecimal digits is left unchanged.
     */
    public static function decodeEntities(string $text): string
    {
        return (string) preg_replace_callback(
            '/#([0-9A-Fa-f]{2})/',
            static function (array $match): string {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );
    }
330
331
    /**
     * Check if given string is Unicode text (by BOM);
     * If true - decode to "utf-8" encoded string.
     * Otherwise - return text as is.
     *
     * Only the big-endian UTF-16 byte order mark (bytes 0xFE 0xFF) is
     * recognised. The text after the mark is converted as UTF-16BE, so
     * surrogate pairs (characters outside the Basic Multilingual Plane) are
     * decoded as a single character.
     *
     * @todo Rename in next major release to make the name correspond to reality (for ex. decodeIfUnicode())
     */
    public static function decodeUnicode(string $text): string
    {
        if ("\xFE\xFF" !== substr($text, 0, 2)) {
            return $text;
        }

        // Strip the U+FEFF byte order marker and convert the remainder.
        return (string) mb_convert_encoding(substr($text, 2), 'UTF-8', 'UTF-16BE');
    }
353
354
    /**
     * Returns the font space limit provided by the configuration object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
361
362
    /**
     * Decode text by commands array.
     *
     * Each command is an array holding a type (PDFObject::TYPE) and a value
     * (PDFObject::COMMAND):
     * - type "n": a numeric value; when it is below the font space limit the
     *   following text starts a new word;
     * - type "<": a hexadecimal string;
     * - any other type: a string that may contain octal escapes.
     *
     * Escape sequences are resolved, every word is passed through
     * decodeContent() and the words are joined with a single space.
     */
    public function decodeText(array $commands): string
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        // strtr() resolves all escapes in one pass and never rescans its own
        // output, so an escaped backslash followed by "n" stays a backslash
        // plus "n" instead of being turned into a line break by a later rule.
        $escapes = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = strtr($text, $escapes);

            // add content to result string
            $words[$word_position] = ($words[$word_position] ?? '').$text;
        }

        // Iterate by key instead of by reference so no reference outlives the loop.
        foreach ($words as $position => $word) {
            $words[$position] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
409
410
    /**
     * Decodes raw text bytes of this font into a "utf-8" encoded string.
     *
     * The first matching strategy is used:
     * 1. the font has a "ToUnicode" entry: the text is cut into codes of
     *    $this->tableSizes['from'] bytes and each code goes through translateChar();
     * 2. the "Encoding" entry is an Encoding object: every character is
     *    mapped through that object and converted with uchr();
     * 3. the "Encoding" entry is an Element equal to "MacRomanEncoding":
     *    the text is converted from "macintosh" with iconv();
     * 4. the text is not valid UTF-8: it is converted from Windows-1252.
     * Otherwise the text is returned unchanged.
     *
     * @param bool $unicode This parameter is deprecated and might be removed in a future release
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            // The byte width is used as substr() length and as loop step, so it must be an integer.
            $bytes = (int) ($this->tableSizes['from'] ?? 0);

            if ($bytes > 0) {
                $result = '';
                $length = \strlen($text);

                for ($i = 0; $i < $length; $i += $bytes) {
                    $char = substr($text, $i, $bytes);

                    if (false !== ($decoded = $this->translateChar($char, false))) {
                        $char = $decoded;
                    } elseif ($this->has('DescendantFonts')) {
                        $descendants = $this->get('DescendantFonts');

                        if ($descendants instanceof PDFObject) {
                            $fonts = $descendants->getHeader()->getElements();
                        } else {
                            $fonts = $descendants->getContent();
                        }
                        $decoded = false;

                        // The first descendant font that knows the code wins.
                        foreach ($fonts as $font) {
                            if ($font instanceof self) {
                                if (false !== ($decoded = $font->translateChar($char, false))) {
                                    $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                                    break;
                                }
                            }
                        }

                        if (false !== $decoded) {
                            $char = $decoded;
                        } else {
                            $char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
                        }
                    } else {
                        $char = self::MISSING;
                    }

                    $result .= $char;
                }

                $text = $result;
            }
        } elseif ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');
            $unicode = mb_check_encoding($text, 'UTF-8');

            // Valid UTF-8 is split into characters, anything else into single bytes.
            $chars = preg_split(
                $unicode ? '//su' : '//s',
                $text,
                -1,
                \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY
            );

            $result = '';
            foreach ($chars as $char) {
                $dec_av = hexdec(bin2hex($char));
                $dec_ap = $encoding->translateChar($dec_av);
                $result .= self::uchr($dec_ap ?? $dec_av);
            }
            $text = $result;
        } elseif ($this->get('Encoding') instanceof Element &&
                  $this->get('Encoding')->equals('MacRomanEncoding')) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            $converted = iconv('macintosh', 'UTF-8', $text);

            // iconv() returns false on failure; keep the original text then
            // instead of returning a non-string value.
            if (false !== $converted) {
                $text = $converted;
            }
        } elseif (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            $text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }
498
}
499