Test Failed
Push — php8 ( cf16a0...fa58a5 )
by Konrad
04:36 queued 03:06
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 7

Size

Total Lines 84
Code Lines 54

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 36
CRAP Score 28.7808

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 54
c 5
b 0
f 1
nc 7
nop 2
dl 0
loc 84
ccs 36
cts 50
cp 0.72
crap 28.7808
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

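Commonly applied refactorings include Extract Method: move a cohesive part of the long method into a new, well-named method and call it from the original.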

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
35
/**
36
 * Class Font
37
 */
38
class Font extends PDFObject
39
{
40
    const MISSING = '?';
41
42
    /**
43
     * @var array
44
     */
45
    protected $table = null;
46
47
    /**
48
     * @var array
49
     */
50
    protected $tableSizes = null;
51
52 27
    /**
     * Initializes the font by building its character translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
57
58
    /**
     * Returns the `BaseFont` entry cast to a string, or the placeholder
     * `[Unknown]` when the font has no such entry.
     *
     * @return string
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
65
66
    /**
     * Returns the `Subtype` entry of the font header cast to a string.
     *
     * @return string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
73
74
    /**
     * Describes the font: its name, type and encoding, completed with the
     * details reported by the parent class.
     *
     * Because the arrays are combined with `+`, the `Name`, `Type` and
     * `Encoding` keys set here win over same-named keys from the parent.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $own = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            // 'Ansi' is reported when the font declares no Encoding entry.
            'Encoding' => ($this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi'),
        ];

        return $own + parent::getDetails($deep);
    }
89
90
    /**
     * Translates one raw character code through the translation table.
     *
     * The bytes of $char are read as a single hexadecimal number and used as
     * the key into the table. When the key is missing, the result depends on
     * $use_default.
     *
     * @param string $char        raw byte(s) of one character code
     * @param bool   $use_default true: a missing code yields self::MISSING;
     *                            false: a missing code yields the fallback
     *                            decoding (see below)
     *
     * @return string|bool the table entry, self::MISSING, or the fallback decoding
     */
    public function translateChar($char, $use_default = true)
    {
        $dec = hexdec(bin2hex($char));

        if (\array_key_exists($dec, $this->table)) {
            return $this->table[$dec];
        }

        // The caller only wants the placeholder: return it right away instead
        // of computing a fallback that would be thrown away.
        if ($use_default) {
            return self::MISSING;
        }

        // fallback for decoding single-byte ANSI characters that are not in the lookup table
        if (
            \strlen($char) < 2
            && $this->has('Encoding')
            && $this->get('Encoding') instanceof Encoding
            && WinAnsiEncoding::class === $this->get('Encoding')->__toString()
        ) {
            return self::uchr($dec);
        }

        // No usable encoding information: hand the raw input back unchanged.
        return $char;
    }
117
118
    /**
     * Converts a numeric code into the matching character, encoded as UTF-8.
     *
     * The code is written as a decimal HTML numeric entity (`&#NNN;`) and
     * converted with mb_convert_encoding().
     *
     * NOTE(review): the 'HTML-ENTITIES' pseudo-encoding of mbstring is
     * deprecated since PHP 8.2; a replacement should be considered.
     *
     * @param int $code value cast to int before conversion
     *
     * @return string always a string; a failed conversion yields ''
     */
    public static function uchr($code)
    {
        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore, we use mb_convert_encoding() instead
        $entity = '&#'.((int) $code).';';

        // mb_convert_encoding() may return false; the cast keeps the
        // documented string return type.
        return (string) mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
129
130
    /**
     * Builds the translation table (character code => decoded text) from the
     * content of the font's `ToUnicode` entry, when it has one.
     *
     * The table is built only once: when it is already set it is returned
     * as is. Without a `ToUnicode` entry the table stays empty and both code
     * sizes stay at 1.
     *
     * @return array the translation table, keyed by the numeric character code
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        $this->loadCodeSpaceRange($content);
        $this->loadBfCharSections($content);
        $this->loadBfRangeSections($content);

        return $this->table;
    }

    /**
     * Derives the code sizes from the first `begincodespacerange` section.
     *
     * Each size is half the length of the first hexadecimal value found
     * (two hex digits per byte), with a minimum of 1.
     *
     * @param string $content content of the `ToUnicode` entry
     */
    private function loadCodeSpaceRange($content)
    {
        $sections = [];
        if (!preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            return;
        }

        // Only the first section is taken into account.
        $section = $sections['sections'][0];

        $ranges = [];
        preg_match_all('/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $ranges);

        $this->tableSizes = [
            'from' => max(1, \strlen(current($ranges['from'])) / 2),
            'to' => max(1, \strlen(current($ranges['to'])) / 2),
        ];
    }

    /**
     * Loads every `beginbfchar` section: one `<source> <destination>` pair per entry.
     *
     * Each section also resets the source code size from its first entry.
     *
     * @param string $content content of the `ToUnicode` entry
     */
    private function loadBfCharSections($content)
    {
        $sections = [];
        if (!preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            return;
        }

        foreach ($sections['sections'] as $section) {
            $pairs = [];
            preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

            $this->tableSizes['from'] = max(1, \strlen(current($pairs['from'])) / 2);

            foreach ($pairs['from'] as $key => $from) {
                $this->table[hexdec($from)] = self::decodeHexCodePoints($pairs['to'][$key]);
            }
        }
    }

    /**
     * Loads every `beginbfrange` section, in both of its supported forms.
     *
     * @param string $content content of the `ToUnicode` entry
     */
    private function loadBfRangeSections($content)
    {
        $sections = [];
        if (!preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            return;
        }

        foreach ($sections['sections'] as $section) {
            // Support for : <srcCode1> <srcCode2> <dstString>
            $ranges = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                $section,
                $ranges
            );

            foreach ($ranges['from'] as $key => $from) {
                $char_from = hexdec($from);
                $char_to = hexdec($ranges['to'][$key]);
                $offset = hexdec($ranges['offset'][$key]);

                // Consecutive source codes map to consecutive destination codes.
                for ($char = $char_from; $char <= $char_to; ++$char) {
                    $this->table[$char] = self::uchr($char - $char_from + $offset);
                }
            }

            // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
            // Some PDF file has 2-byte Unicode values on new lines > added \r\n
            $lists = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                $section,
                $lists
            );

            foreach ($lists['from'] as $key => $from) {
                $char_from = hexdec($from);
                $strings = [];

                preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                // The n-th listed string is the text for source code (first code + n).
                foreach ($strings['string'] as $position => $string) {
                    $this->table[$char_from + $position] = self::decodeHexCodePoints($string);
                }
            }
        }
    }

    /**
     * Converts a hexadecimal string into text: the string is cut into groups
     * of four hex digits (any shorter remainder forms its own group) and each
     * group is converted with uchr().
     *
     * @param string $hex hexadecimal digits, without the surrounding `<` `>`
     *
     * @return string
     */
    private static function decodeHexCodePoints($hex)
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr(hexdec($part));
        }

        return $text;
    }
240
241
    /**
     * Replaces the translation table consulted by translateChar().
     *
     * @param array $table lookup table keyed by the numeric character code
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
248
249
    /**
     * Replaces every `<...>` hexadecimal string found in the input by the
     * bytes it encodes; everything else is copied through unchanged.
     *
     * Input containing `<?xml` is returned as is.
     *
     * @param string $hexa       text that may contain `<hex digits>` strings
     * @param bool   $add_braces when true, each decoded string is wrapped in
     *                           parentheses and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isHexString = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isHexString) {
                $decoded .= $chunk;

                continue;
            }

            // strip line breaks, then the enclosing angle brackets
            $digits = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $binary = pack('H*', $digits);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $decoded .= $binary;
            }
        }

        return $decoded;
    }
287
288
    /**
     * Replaces every three-digit octal escape (a backslash followed by three
     * octal digits, e.g. `\101`) by the byte it designates.
     *
     * Only the digits 0-7 are accepted: a sequence such as `\089` is not an
     * octal escape and is left untouched instead of being handed to octdec().
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        $decoded = preg_replace_callback(
            '/\\\\([0-7]{3})/',
            static function (array $match) {
                // octdec() may return a float; chr() expects an int.
                return \chr((int) octdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() returns null on a PCRE error: keep the input then.
        return null === $decoded ? $text : $decoded;
    }
308
309
    /**
     * Replaces every `#xx` escape, where `xx` are two hexadecimal digits, by
     * the byte it designates (e.g. `#20` becomes a space, `#2F` a slash).
     *
     * The value is parsed as hexadecimal, so both digits may be 0-9 or A-F
     * (either case).
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        $decoded = preg_replace_callback(
            '/#([0-9A-Fa-f]{2})/',
            static function (array $match) {
                // hexdec() may return a float; chr() expects an int.
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() returns null on a PCRE error: keep the input then.
        return null === $decoded ? $text : $decoded;
    }
329
330
    /**
     * Decodes text that starts with the byte order mark FE FF: the mark is
     * dropped and the remaining bytes are converted from UTF-16BE to UTF-8.
     *
     * Text that does not start with this mark is returned unchanged.
     * Converting the whole string at once lets surrogate pairs be combined
     * into the character they encode.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = (string) substr($text, 2);

        return (string) mb_convert_encoding($payload, 'UTF-8', 'UTF-16BE');
    }
350
351
    /**
     * Returns the font space limit reported by the configuration object.
     *
     * @return int
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit()
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
360
361
    /**
     * Decodes a list of text commands into a string of space-separated words.
     *
     * Each command is an array read through the PDFObject::TYPE and
     * PDFObject::COMMAND keys:
     *  - type `n`: a number; when it is lower than the font space limit, the
     *    text that follows starts a new word;
     *  - type `<`: a hexadecimal string;
     *  - any other type: a string that may contain octal escapes.
     *
     * Every word is finally passed through decodeContent().
     *
     * @param array $commands
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $text
            );

            // add content to result string
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        // Assign by key rather than by reference, so that no reference to
        // the last element outlives the loop.
        foreach ($words as $position => $word) {
            $words[$position] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
412
413
    /**
     * Decodes raw text according to the information the font provides.
     *
     * The first applicable strategy is used:
     *  1. the font has a `ToUnicode` entry: the text is cut into codes and
     *     each code is translated through the translation table;
     *  2. the font's `Encoding` entry is an Encoding object: each character
     *     is translated through that object;
     *  3. the `Encoding` entry is an Element equal to `MacRomanEncoding`:
     *     the text is converted from `macintosh` to UTF-8 with iconv();
     *  4. otherwise, text that is not valid UTF-8 is converted from
     *     Windows-1252 to UTF-8, and valid UTF-8 is returned unchanged.
     *
     * @param string $text
     * @param bool   $unicode This parameter is deprecated and might be removed in a future release.
     *                        It is only written in case 2, where it receives whether $text is valid UTF-8.
     *
     * @return string
     */
    public function decodeContent($text, &$unicode = null)
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicode($text);
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            $unicode = mb_check_encoding($text, 'UTF-8');

            return $this->decodeContentByEncoding($text, $this->get('Encoding'), $unicode);
        }

        if (
            $this->get('Encoding') instanceof Element
            && $this->get('Encoding')->equals('MacRomanEncoding')
        ) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            return iconv('macintosh', 'UTF-8', $text);
        }

        if (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }

    /**
     * Decodes text by cutting it into codes of `tableSizes['from']` bytes and
     * translating each code. The text is returned unchanged when that size
     * is falsy.
     *
     * @param string $text
     *
     * @return string
     */
    private function decodeContentByToUnicode($text)
    {
        $bytes = $this->tableSizes['from'];

        if (!$bytes) {
            return $text;
        }

        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $result .= $this->decodeCharByToUnicode(substr($text, $i, $bytes));
        }

        return $result;
    }

    /**
     * Translates one code: first through this font, then, when that yields
     * false, through the fonts listed under `DescendantFonts`.
     *
     * @param string $char raw byte(s) of one character code
     *
     * @return string|bool
     */
    private function decodeCharByToUnicode($char)
    {
        $decoded = $this->translateChar($char, false);
        if (false !== $decoded) {
            return $decoded;
        }

        if (!$this->has('DescendantFonts')) {
            return self::MISSING;
        }

        $descendants = $this->get('DescendantFonts');
        if ($descendants instanceof PDFObject) {
            $fonts = $descendants->getHeader()->getElements();
        } else {
            $fonts = $descendants->getContent();
        }

        $decoded = false;

        // The first descendant font able to translate the code wins.
        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $decoded = $font->translateChar($char, false);
            if (false !== $decoded) {
                $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                break;
            }
        }

        if (false !== $decoded) {
            return $decoded;
        }

        // No descendant font could translate the code: treat it as Windows-1252.
        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }

    /**
     * Decodes text one character at a time through an Encoding object.
     *
     * Valid UTF-8 text is cut into UTF-8 characters, any other text into
     * single bytes. The bytes of each piece are read as one hexadecimal
     * number, translated by the encoding, and converted with uchr().
     *
     * @param string   $text
     * @param Encoding $encoding
     * @param bool     $isUtf8   whether $text is valid UTF-8
     *
     * @return string
     */
    private function decodeContentByEncoding($text, $encoding, $isUtf8)
    {
        $result = '';

        if ($isUtf8) {
            $chars = preg_split(
                '//su',
                $text,
                -1,
                \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY
            );

            foreach ($chars as $char) {
                $result .= self::uchr($encoding->translateChar(hexdec(bin2hex($char))));
            }

            return $result;
        }

        $length = \strlen($text);

        for ($i = 0; $i < $length; ++$i) {
            $result .= self::uchr($encoding->translateChar(hexdec(bin2hex($text[$i]))));
        }

        return $result;
    }
504
}
505