Test Failed
Pull Request — master (#510)
by Jeremy
04:34 queued 02:12
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 7

Size

Total Lines 84
Code Lines 54

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 36
CRAP Score 30.1752

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 54
c 5
b 0
f 1
nc 7
nop 2
dl 0
loc 84
ccs 36
cts 51
cp 0.7059
crap 30.1752
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
use Smalot\PdfParser\Exception\EncodingNotFoundException;
35
36
/**
37
 * Class Font
38
 */
39
class Font extends PDFObject
40
{
41
    const MISSING = '?';
42
43
    /**
44
     * @var array
45
     */
46
    protected $table = null;
47
48
    /**
49
     * @var array
50
     */
51
    protected $tableSizes = null;
52
53
    /**
54
     * Caches results from uchr.
55
     *
56
     * @var array
57
     */
58
    private static $uchrCache = [];
59
60 36
    /**
     * Prepares the font for decoding by building its character translation table.
     */
    public function init()
    {
        // loadTranslateTable() returns early when the table is already set.
        $this->loadTranslateTable();
    }
65
66 1
    /**
     * Returns the "BaseFont" entry as a string, or "[Unknown]" when the entry is absent.
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
70
71 1
    /**
     * Returns the "Subtype" entry of the object header as a string.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
75
76
    /**
     * Describes the font: its name, type and encoding, completed by the parent's details.
     *
     * The arrays are combined with "+", so the "Name", "Type" and "Encoding"
     * keys set here win over same-named keys returned by the parent.
     *
     * @param bool $deep Forwarded unchanged to parent::getDetails()
     */
    public function getDetails(bool $deep = true): array
    {
        $own = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            // "Ansi" is reported when the font declares no Encoding entry.
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        return $own + parent::getDetails($deep);
    }
88
89
    /**
     * Translates one character code through the font's lookup table.
     *
     * When the code is not in the table the result depends on $use_default:
     * the MISSING placeholder if true; otherwise the input itself or - for a
     * single-byte input on a font whose Encoding resolves to WinAnsiEncoding -
     * the UTF-8 character having that code point.
     *
     * @param string $char        Raw bytes of the character code
     * @param bool   $use_default Return self::MISSING for codes missing from the table
     *
     * @return string|bool
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // Fallback for single-byte ANSI characters that are not in the lookup table.
        $fallback = $char;
        $isSingleByte = \strlen($char) < 2;

        if ($isSingleByte && $this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            try {
                $encodingClass = $this->get('Encoding')->__toString();

                if (WinAnsiEncoding::class === $encodingClass) {
                    $fallback = self::uchr($code);
                }
            } catch (EncodingNotFoundException $e) {
                // Encoding->getEncodingClass() throws EncodingNotFoundException when
                // BaseEncoding does not exist (see table 5.11 of the PDF 1.5 specs);
                // the raw input is kept as fallback in that case.
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
119
120
    /**
     * Convert unicode character code to "utf-8" encoded string.
     *
     * Results are memoized in self::$uchrCache, so every code is converted only once.
     */
    public static function uchr(int $code): string
    {
        if (isset(self::$uchrCache[$code])) {
            return self::$uchrCache[$code];
        }

        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore mb_convert_encoding() is used on a numeric entity instead.
        $entity = "&#{$code};";
        self::$uchrCache[$code] = mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');

        return self::$uchrCache[$code];
    }
133
134
    /**
     * Init internal chars translation table by ToUnicode CMap.
     *
     * The table is built on the first call only; later calls return the stored
     * table (also when it was injected through setTable()). Without a
     * "ToUnicode" entry the table stays empty and both code sizes stay at 1.
     *
     * Three kinds of CMap sections are read:
     * - codespacerange: only the first section is used, to derive the byte
     *   length of source ("from") and destination ("to") codes;
     * - bfchar: single code mappings;
     * - bfrange: range mappings, in both the offset form and the array form.
     *
     * @return array Map of integer character code => "utf-8" encoded text
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = (string) $this->get('ToUnicode')->getContent();

        // Code sizes are taken from the first codespacerange section only.
        if (preg_match('/begincodespacerange(?P<section>.*?)endcodespacerange/s', $content, $spaceRange)) {
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $spaceRange['section'],
                $pairs
            );

            $this->tableSizes = [
                'from' => self::hexCodeByteLength($pairs['from'][0] ?? ''),
                'to' => self::hexCodeByteLength($pairs['to'][0] ?? ''),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $bfChar)) {
            foreach ($bfChar['sections'] as $section) {
                $this->loadBfCharSection($section);
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $bfRange)) {
            foreach ($bfRange['sections'] as $section) {
                $this->loadBfRangeSection($section);
            }
        }

        return $this->table;
    }

    /**
     * Returns the number of bytes encoded by a hexadecimal code, never less than 1.
     *
     * The result is always an integer: an odd number of digits is rounded up
     * instead of producing a fractional size.
     */
    private static function hexCodeByteLength(string $hex): int
    {
        return max(1, intdiv(\strlen($hex) + 1, 2));
    }

    /**
     * Converts a hexadecimal destination string to "utf-8" text.
     *
     * Each group of four hex digits is converted on its own through uchr();
     * leftover digits that do not fill a group are converted as one more code.
     */
    private static function utf16HexToText(string $hex): string
    {
        $units = preg_split('/([0-9A-F]{4})/i', $hex, 0, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $text = '';

        foreach ($units as $unit) {
            $text .= self::uchr((int) hexdec($unit));
        }

        return $text;
    }

    /**
     * Adds the "<srcCode> <dstString>" mappings of one bfchar section to the table.
     *
     * The source code size is updated from the first mapping of the section;
     * a section without any mapping leaves the current size untouched.
     */
    private function loadBfCharSection(string $section): void
    {
        preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

        if ([] === $pairs['from']) {
            return;
        }

        $this->tableSizes['from'] = self::hexCodeByteLength($pairs['from'][0]);

        foreach ($pairs['from'] as $key => $from) {
            $this->table[hexdec($from)] = self::utf16HexToText($pairs['to'][$key]);
        }
    }

    /**
     * Adds the mappings of one bfrange section to the table.
     */
    private function loadBfRangeSection(string $section): void
    {
        // Support for : <srcCode1> <srcCode2> <dstString>
        preg_match_all(
            '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
            $section,
            $ranges
        );

        foreach ($ranges['from'] as $key => $from) {
            $first = (int) hexdec($from);
            $last = (int) hexdec($ranges['to'][$key]);
            $offset = (int) hexdec($ranges['offset'][$key]);

            // Consecutive source codes map to consecutive destination codes.
            for ($code = $first; $code <= $last; ++$code) {
                $this->table[$code] = self::uchr($code - $first + $offset);
            }
        }

        // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
        // Some PDF files have the 2-byte Unicode values on new lines, hence \r\n in the pattern.
        preg_match_all(
            '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
            $section,
            $ranges
        );

        foreach ($ranges['from'] as $key => $from) {
            $first = (int) hexdec($from);
            $strings = [];

            preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $ranges['strings'][$key], $strings);

            // The n-th destination string belongs to source code "first + n".
            foreach ($strings['string'] as $position => $string) {
                $this->table[$first + $position] = self::utf16HexToText($string);
            }
        }
    }
244
245
    /**
     * Replaces the char translation table with a custom one.
     *
     * Expected layout of $table:
     * - key - integer character code;
     * - value - "utf-8" encoded value;
     *
     * @param array $table Translation table stored as is, without validation
     *
     * @return void
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
256
257
    /**
     * Decode hexadecimal encoded string. If $add_braces is true result value would be wrapped by parentheses.
     *
     * Every "<...>" chunk is converted from hexadecimal digits to raw bytes; text
     * outside of such chunks is copied unchanged. White-space inside a chunk
     * (space, tab, line breaks, form feed, NUL) is dropped before decoding.
     * Content containing "<?xml" is returned untouched.
     *
     * @param string $hexa       Text holding zero or more "<hex digits>" chunks
     * @param bool   $add_braces Wrap each decoded chunk in "(...)" and double its backslashes
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        if (false === $parts) {
            // The split failed: nothing can be decoded, keep the input.
            return $hexa;
        }

        $text = '';

        foreach ($parts as $part) {
            if (!preg_match('/^<.*>$/s', $part)) {
                $text .= $part;
                continue;
            }

            // White-space is not significant inside a hexadecimal string and
            // would make pack() fail on an illegal digit, so remove all of it.
            $digits = preg_replace('/[\x00\t\n\f\r ]+/', '', trim($part, '<>'));
            $decoded = pack('H*', $digits);

            if ($add_braces) {
                // Backslashes are doubled so that they stay literal inside "(...)".
                $text .= '('.str_replace('\\', '\\\\', $decoded).')';
            } else {
                $text .= $decoded;
            }
        }

        return $text;
    }
292
293
    /**
     * Decode string with octal-decoded chunks.
     *
     * A backslash followed by one to three octal digits is replaced by the
     * byte having that value (modulo 256). A pair of backslashes is an escaped
     * backslash: it is left unchanged and digits following it stay literal.
     *
     * @param string $text Text that may contain "\d", "\dd" or "\ddd" escapes
     */
    public static function decodeOctal(string $text): string
    {
        $decoded = preg_replace_callback(
            // First alternative: escaped backslash. Second: octal escape.
            '/\\\\\\\\|\\\\([0-7]{1,3})/',
            static function (array $match): string {
                if (!isset($match[1])) {
                    // Escaped backslash, consumed here so that its second
                    // backslash cannot start an octal escape.
                    return $match[0];
                }

                return \chr(((int) octdec($match[1])) % 256);
            },
            $text
        );

        // preg_replace_callback() returns null on failure; keep the input then.
        return $decoded ?? $text;
    }
311
312
    /**
     * Decode string with "#xx" encoded chars.
     *
     * Every "#" followed by two hexadecimal digits (either case) is replaced
     * by the byte having that value; everything else is copied unchanged.
     *
     * @param string $text Text that may contain "#xx" sequences
     */
    public static function decodeEntities(string $text): string
    {
        $decoded = preg_replace_callback(
            '/#([0-9A-Fa-f]{2})/',
            static function (array $match): string {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() returns null on failure; keep the input then.
        return $decoded ?? $text;
    }
330
331
    /**
     * Decodes the text to "utf-8" when it starts with the bytes FE FF (byte order marker);
     * any other text is returned as is.
     *
     * After the marker, the text is consumed two bytes at a time and each
     * chunk is converted on its own through uchr().
     *
     * @todo Rename in next major release to make the name correspond to reality (for ex. decodeIfUnicode())
     */
    public static function decodeUnicode(string $text): string
    {
        if (!preg_match('/^\xFE\xFF/i', $text)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = substr($text, 2);
        $size = \strlen($payload);
        $utf8 = '';
        $offset = 0;

        while ($offset < $size) {
            $unit = substr($payload, $offset, 2);
            $utf8 .= self::uchr(hexdec(bin2hex($unit)));
            $offset += 2;
        }

        return $utf8;
    }
353
354
    /**
     * Returns the font space limit provided by the configuration object.
     *
     * @deprecated Use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
361
362
    /**
     * Decode text by commands array.
     *
     * Commands are read in order:
     * - type "n": a number; when it is lower than the font space limit, the
     *   following text starts a new word;
     * - type "<": a hexadecimal string, decoded with decodeHexadecimal();
     * - any other type: a string decoded with decodeOctal().
     * Escape sequences are then replaced, the text is appended to the current
     * word, and finally every word goes through decodeContent() before the
     * words are joined with a single space.
     *
     * @param array $commands List of commands, each holding PDFObject::TYPE and PDFObject::COMMAND
     */
    public function decodeText(array $commands): string
    {
        // Replaced in one pass by strtr(), so that the backslash produced by
        // "\\" is never read again as the start of another escape sequence.
        $escapes = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        // Next text chunk opens a new word.
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // NOTE(review): escapes are also replaced in hexadecimal-decoded text,
            // as before - confirm whether that is intended.
            $text = strtr($text, $escapes);

            // add content to result string
            $words[$word_position] = ($words[$word_position] ?? '').$text;
        }

        foreach ($words as $index => $word) {
            $words[$index] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
409
410
    /**
     * Decodes raw text of this font to "utf-8".
     *
     * The first matching strategy is applied:
     * 1. the font has a "ToUnicode" entry: the text is cut in codes of
     *    tableSizes['from'] bytes, each translated through the lookup table;
     * 2. the "Encoding" entry is an Encoding object: every character is
     *    translated by that object;
     * 3. the "Encoding" entry equals "MacRomanEncoding": the text is converted
     *    from the "macintosh" charset;
     * 4. the text is not valid UTF-8: it is converted from Windows-1252.
     * Otherwise the text is returned unchanged.
     *
     * @param string $text    Raw text to decode
     * @param bool   $unicode This parameter is deprecated and might be removed in a future release.
     *                        It is only written in case 2, with the result of the UTF-8 check of $text.
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicode($text);
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            $unicode = mb_check_encoding($text, 'UTF-8');

            return $this->decodeContentByEncoding($text, $this->get('Encoding'), $unicode);
        }

        if ($this->get('Encoding') instanceof Element && $this->get('Encoding')->equals('MacRomanEncoding')) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            $converted = iconv('macintosh', 'UTF-8', $text);

            // iconv() returns false on failure: keep the input instead of
            // handing a non-string to the "string" return type.
            return false === $converted ? $text : $converted;
        }

        if (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }

    /**
     * Decodes text code by code through the lookup table built from "ToUnicode".
     *
     * The text is returned unchanged when no usable code size is known.
     */
    private function decodeContentByToUnicode(string $text): string
    {
        // Always step by a whole number of bytes.
        $bytes = (int) ($this->tableSizes['from'] ?? 0);

        if ($bytes < 1) {
            return $text;
        }

        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            if (false === $decoded) {
                $decoded = $this->decodeCharByDescendantFonts($char);
            }

            $result .= $decoded;
        }

        return $result;
    }

    /**
     * Decodes one code that this font could not translate, using "DescendantFonts".
     *
     * Returns self::MISSING when the font has no "DescendantFonts" entry.
     * Otherwise the first descendant font able to translate the code is used;
     * when none is, the code itself is used. In both cases the value is
     * converted from Windows-1252 to "utf-8".
     */
    private function decodeCharByDescendantFonts(string $char): string
    {
        if (!$this->has('DescendantFonts')) {
            return self::MISSING;
        }

        $descendants = $this->get('DescendantFonts');

        if ($descendants instanceof PDFObject) {
            $fonts = $descendants->getHeader()->getElements();
        } else {
            $fonts = $descendants->getContent();
        }

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $decoded = $font->translateChar($char, false);

            if (false !== $decoded) {
                return mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
            }
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }

    /**
     * Decodes text character by character with the font's Encoding object.
     *
     * Valid UTF-8 text is split in UTF-8 characters, any other text in single
     * bytes. A character the encoding does not translate keeps its own code.
     *
     * @param Encoding $encoding Object stored in the "Encoding" entry of the font
     * @param bool     $isUtf8   Whether $text is valid UTF-8
     */
    private function decodeContentByEncoding(string $text, Encoding $encoding, bool $isUtf8): string
    {
        if ($isUtf8) {
            $chars = preg_split('//su', $text, -1, \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY);
        } else {
            $chars = '' === $text ? [] : str_split($text);
        }

        $result = '';

        foreach ($chars as $char) {
            // NOTE(review): for a multi-byte UTF-8 character this is the value
            // of its bytes, not its code point - confirm this is intended.
            $code = (int) hexdec(bin2hex($char));
            $translated = $encoding->translateChar($code);
            $result .= self::uchr($translated ?? $code);
        }

        return $result;
    }
498
}
499