Passed
Pull Request — master (#481)
by Konrad
02:38
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 7

Size

Total Lines 84
Code Lines 54

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 37
CRAP Score 28.2734

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 54
c 5
b 0
f 1
nc 7
nop 2
dl 0
loc 84
ccs 37
cts 51
cp 0.7255
crap 28.2734
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
use Smalot\PdfParser\Exception\EncodingNotFoundException;
35
36
/**
37
 * Class Font
38
 */
39
class Font extends PDFObject
40
{
41
    /**
     * Placeholder returned by translateChar() for codes missing from the table
     * when the caller asks for the default.
     */
    const MISSING = '?';

    /**
     * Lookup table mapping numeric character codes to decoded text.
     * Stays null until loadTranslateTable() or setTable() assigns it.
     *
     * @var array|null
     */
    protected $table = null;

    /**
     * Sizes stored under the 'from' and 'to' keys by loadTranslateTable().
     * Stays null until loadTranslateTable() has run.
     *
     * @var array|null
     */
    protected $tableSizes = null;

    /**
     * Caches results from uchr.
     *
     * @var array
     */
    private static $uchrCache = [];
59
60 35
    /**
     * Initialises the font by building its character translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
65
66 2
    /**
     * Returns the font's BaseFont entry as a string, or '[Unknown]' when the
     * font has no such entry.
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
70
71 2
    /**
     * Returns the Subtype entry of the font header, cast to string.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
75
76 1
    /**
     * Returns the font's details: its name, type and encoding, completed with
     * whatever the parent implementation reports.
     *
     * Keys set here ('Name', 'Type', 'Encoding') take precedence over
     * identically named keys coming from the parent.
     *
     * @param bool $deep forwarded unchanged to the parent implementation
     */
    public function getDetails(bool $deep = true): array
    {
        $name = $this->getName();
        $type = $this->getType();
        $encoding = $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi';

        $own = [
            'Name' => $name,
            'Type' => $type,
            'Encoding' => $encoding,
        ];

        // Array union keeps the left-hand values on key collisions.
        return $own + parent::getDetails($deep);
    }
88
89
    /**
     * Translates a raw character (one or more bytes) through the lookup table.
     *
     * The bytes of $char are read as one big-endian number, which is the key
     * into the table. When the key is missing:
     *  - with $use_default = true, self::MISSING is returned;
     *  - with $use_default = false, the raw $char is returned, except for a
     *    single byte of a font whose Encoding resolves to WinAnsiEncoding,
     *    which is returned as the character for that code.
     *
     * @return string|bool
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // Fallback for decoding single-byte ANSI characters that are not in the lookup table.
        $fallback = $char;
        $isSingleByte = \strlen($char) < 2;

        if ($isSingleByte && $this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            try {
                $encodingClass = $this->get('Encoding')->__toString();

                if (WinAnsiEncoding::class === $encodingClass) {
                    $fallback = self::uchr($code);
                }
            } catch (EncodingNotFoundException $e) {
                // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exist.
                // See table 5.11 on PDF 1.5 specs for more info.
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
119
120 34
    /**
     * Converts a numeric character code into its UTF-8 representation.
     *
     * Results are memoised in self::$uchrCache, keyed by code.
     */
    public static function uchr(int $code): string
    {
        if (isset(self::$uchrCache[$code])) {
            return self::$uchrCache[$code];
        }

        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore, we use mb_convert_encoding() instead
        $character = mb_convert_encoding("&#{$code};", 'UTF-8', 'HTML-ENTITIES');
        self::$uchrCache[$code] = $character;

        return $character;
    }
130
131 35
    /**
     * Builds the character translation table from the font's ToUnicode
     * content and returns it.
     *
     * The table is built only once: later calls (or calls made after
     * setTable()) return the existing table untouched. Without a ToUnicode
     * entry the table is empty and both sizes default to 1.
     *
     * Sections are processed in a fixed order - codespacerange, bfchar,
     * bfrange - so entries written later overwrite earlier ones for the same code.
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        $this->loadCodeSpaceRangeSizes($content);
        $this->loadBfCharSections($content);
        $this->loadBfRangeSections($content);

        return $this->table;
    }

    /**
     * Derives the 'from'/'to' sizes from the first codespacerange section.
     *
     * Each size is half the number of hex digits of the first matched value,
     * with a minimum of 1. Only the first section is considered.
     *
     * @param mixed $content ToUnicode content as returned by getContent()
     */
    private function loadCodeSpaceRangeSizes($content): void
    {
        $sections = [];
        if (!preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            return;
        }

        $pairs = [];
        preg_match_all(
            '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
            $sections['sections'][0],
            $pairs
        );

        // current() yields false when nothing matched; strlen() then gives 0 and max() falls back to 1.
        $this->tableSizes = [
            'from' => max(1, \strlen(current($pairs['from'])) / 2),
            'to' => max(1, \strlen(current($pairs['to'])) / 2),
        ];
    }

    /**
     * Loads every "<src> <dst>" pair found in the bfchar sections into the table.
     *
     * The 'from' size is refreshed from the first pair of each section.
     *
     * @param mixed $content ToUnicode content as returned by getContent()
     */
    private function loadBfCharSections($content): void
    {
        $sections = [];
        if (!preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            return;
        }

        foreach ($sections['sections'] as $section) {
            $pairs = [];
            preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

            $this->tableSizes['from'] = max(1, \strlen(current($pairs['from'])) / 2);

            foreach ($pairs['from'] as $key => $from) {
                $this->table[hexdec($from)] = self::hexCodeUnitsToText($pairs['to'][$key]);
            }
        }
    }

    /**
     * Loads the bfrange sections into the table.
     *
     * Within each section the "<from> <to> <offset>" form is applied first,
     * then the "<from> <to> [<dst1> <dst2> ...]" form.
     *
     * @param mixed $content ToUnicode content as returned by getContent()
     */
    private function loadBfRangeSections($content): void
    {
        $sections = [];
        if (!preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            return;
        }

        foreach ($sections['sections'] as $section) {
            // Support for : <srcCode1> <srcCode2> <dstString>
            $ranges = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                $section,
                $ranges
            );

            foreach ($ranges['from'] as $key => $from) {
                $char_from = hexdec($from);
                $char_to = hexdec($ranges['to'][$key]);
                $offset = hexdec($ranges['offset'][$key]);

                // Consecutive source codes map to consecutive destination codes.
                for ($char = $char_from; $char <= $char_to; ++$char) {
                    $this->table[$char] = self::uchr($char - $char_from + $offset);
                }
            }

            // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
            // Some PDF file has 2-byte Unicode values on new lines > added \r\n
            $lists = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                $section,
                $lists
            );

            foreach ($lists['from'] as $key => $from) {
                $char_from = hexdec($from);
                $strings = [];

                preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                // The n-th destination string belongs to source code (from + n).
                foreach ($strings['string'] as $position => $string) {
                    $this->table[$char_from + $position] = self::hexCodeUnitsToText($string);
                }
            }
        }
    }

    /**
     * Converts a hex string into text, one character per group of four hex digits.
     *
     * Any leftover fragment shorter than four digits is converted as a
     * number of its own.
     */
    private static function hexCodeUnitsToText(string $hex): string
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr(hexdec($part));
        }

        return $text;
    }
238
239 1
    /**
     * Replaces the character translation table with the given one.
     *
     * Once a table is set, loadTranslateTable() returns it as is instead of
     * rebuilding one.
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
243
244 38
    /**
     * Replaces every "<hex digits>" group found in $hexa with the bytes it encodes.
     *
     * Text outside such groups is copied through unchanged, and input
     * containing "<?xml" is returned as is.
     *
     * @param bool $add_braces when true, each decoded group is wrapped in
     *                         parentheses and its backslashes are doubled
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isHexGroup = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isHexGroup) {
                $decoded .= $chunk;
                continue;
            }

            // Strip line breaks and the surrounding angle brackets, then turn the digits into bytes.
            $digits = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $bytes = pack('H*', $digits);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $bytes).')';
            } else {
                $decoded .= $bytes;
            }
        }

        return $decoded;
    }
276
277 38
    /**
     * Replaces octal escape sequences (a backslash followed by one to three
     * octal digits) with the byte they denote.
     *
     * PDF literal strings allow one, two or three octal digits and ignore
     * high-order overflow, so all three lengths are decoded and the value is
     * reduced to a single byte. An escaped backslash ("\\") is left in place
     * so that digits following it are not mistaken for an octal code; it is
     * up to the caller to unescape it afterwards.
     */
    public static function decodeOctal(string $text): string
    {
        $decoded = preg_replace_callback(
            '/\\\\(\\\\|[0-7]{1,3})/',
            static function (array $match): string {
                // "\\" is an escaped backslash, not the start of an octal code: keep it verbatim.
                if ('\\' === $match[1]) {
                    return $match[0];
                }

                return \chr(octdec($match[1]) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE failure; fall back to the untouched input.
        return $decoded ?? $text;
    }
292
293 52
    /**
     * Replaces every "#" followed by two digits with the byte whose
     * hexadecimal value those two digits spell out.
     *
     * NOTE(review): the pattern only accepts decimal digits although the
     * value is read as hexadecimal, so sequences such as "#2F" are left
     * untouched - confirm whether that is intended.
     */
    public static function decodeEntities(string $text): string
    {
        $pieces = preg_split('/(#\d{2})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($pieces as $piece) {
            $isEscape = (bool) preg_match('/^#\d{2}$/', $piece);
            $decoded .= $isEscape ? \chr(hexdec(trim($piece, '#'))) : $piece;
        }

        return $decoded;
    }
308
309 38
    /**
     * Decodes a string that starts with the U+FEFF byte order marker
     * (bytes FE FF) from big-endian UTF-16 into UTF-8.
     *
     * Strings without that marker are returned unchanged. A high surrogate
     * immediately followed by a low surrogate is combined into the single
     * code point it encodes, so characters outside the Basic Multilingual
     * Plane are decoded as one character. Unpaired surrogates are passed to
     * uchr() as they are.
     */
    public static function decodeUnicode(string $text): string
    {
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $decode = substr($text, 2);
        $length = \strlen($decode);
        $text = '';

        for ($i = 0; $i < $length; $i += 2) {
            $code = hexdec(bin2hex(substr($decode, $i, 2)));

            // High surrogate with a complete 2-byte unit behind it: try to pair them up.
            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 3 < $length) {
                $low = hexdec(bin2hex(substr($decode, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2; // the low surrogate has been consumed as well
                }
            }

            $text .= self::uchr($code);
        }

        return $text;
    }
324
325
    /**
     * Returns the font space limit reported by the configuration object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
332
333 19
    /**
     * Decodes a list of text commands into a single string.
     *
     * Commands are concatenated into words: a numeric ('n') command whose
     * value is below the font space limit makes the next piece of text start
     * a new word. Hexadecimal ('<') commands are hex-decoded, every other
     * command is octal-decoded. Each word is then passed through
     * decodeContent() and the words are joined with single spaces.
     *
     * @param array $commands entries indexed by PDFObject::TYPE and PDFObject::COMMAND
     */
    public function decodeText(array $commands): string
    {
        $spaceLimit = $this->getFontSpaceLimit();
        $chunks = [];
        $index = 0;

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    // Numeric command: below the limit, the next text opens a new word.
                    if ((float) trim($command[PDFObject::COMMAND]) < $spaceLimit) {
                        $index = \count($chunks);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $piece = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $piece = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars.
            $piece = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $piece
            );

            // Append to the current word, creating it on first use.
            $chunks[$index] = ($chunks[$index] ?? '').$piece;
        }

        $decodedWords = array_map(
            function (string $chunk): string {
                return $this->decodeContent($chunk);
            },
            $chunks
        );

        return implode(' ', $decodedWords);
    }
377
378
    /**
     * Converts raw text drawn with this font into a decoded string.
     *
     * The first matching strategy wins:
     *  1. the font has a ToUnicode entry: the text is cut into chunks of
     *     tableSizes['from'] bytes and each chunk goes through the translation table;
     *  2. the font's Encoding is an Encoding object: every character is
     *     translated through that object;
     *  3. the font's Encoding is an Element equal to 'MacRomanEncoding': the
     *     text is converted from 'macintosh' to UTF-8 with iconv();
     *  4. otherwise text that is not valid UTF-8 is converted from Windows-1252,
     *     and valid UTF-8 is returned unchanged.
     *
     * @param bool $unicode This parameter is deprecated and might be removed in a future release
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeWithTranslateTable($text);
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            // $unicode is only ever written in this branch.
            $unicode = mb_check_encoding($text, 'UTF-8');

            return $this->decodeWithEncoding($text, $this->get('Encoding'), $unicode);
        }

        if ($this->get('Encoding') instanceof Element && $this->get('Encoding')->equals('MacRomanEncoding')) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            return iconv('macintosh', 'UTF-8', $text);
        }

        if (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }

    /**
     * Decodes text chunk by chunk through the translation table.
     *
     * A chunk translateChar() cannot resolve is handed to the descendant
     * fonts when the font has any, and replaced by self::MISSING otherwise.
     * The text is returned unchanged when no chunk size is known.
     */
    private function decodeWithTranslateTable(string $text): string
    {
        $bytes = $this->tableSizes['from'] ?? 0;

        if (!$bytes) {
            return $text;
        }

        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            if (false !== $decoded) {
                $result .= $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $result .= $this->translateCharWithDescendantFonts($char);
            } else {
                $result .= self::MISSING;
            }
        }

        return $result;
    }

    /**
     * Translates one chunk with the first descendant font able to do so.
     *
     * The translation is converted from Windows-1252 to UTF-8; when no
     * descendant font yields a usable result, the raw chunk is converted instead.
     */
    private function translateCharWithDescendantFonts(string $char): string
    {
        $descendants = $this->get('DescendantFonts');
        $fonts = $descendants instanceof PDFObject
            ? $descendants->getHeader()->getElements()
            : $descendants->getContent();

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $decoded = $font->translateChar($char, false);

            if (false !== $decoded) {
                $converted = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');

                if (false !== $converted) {
                    return $converted;
                }

                // Conversion failed: stop searching and fall back to the raw chunk.
                break;
            }
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }

    /**
     * Translates every character of $text through the given Encoding object.
     *
     * Valid UTF-8 text is split into UTF-8 characters, anything else into
     * single bytes. The bytes of each character are read as one number; when
     * the encoding has no translation for it, that number itself is used.
     *
     * @param bool $isUtf8 whether $text is valid UTF-8
     */
    private function decodeWithEncoding(string $text, Encoding $encoding, bool $isUtf8): string
    {
        $chars = $isUtf8
            ? preg_split('//su', $text, -1, \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY)
            : str_split($text);

        $result = '';

        foreach ($chars as $char) {
            $dec_av = hexdec(bin2hex($char));
            $dec_ap = $encoding->translateChar($dec_av);
            $result .= self::uchr($dec_ap ?? $dec_av);
        }

        return $result;
    }
466
}
467