Passed
Push — master ( 5d3746...a4bb6d )
by Konrad
02:09
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 7

Size

Total Lines 84
Code Lines 54

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 37
CRAP Score 28.2734

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 54
c 5
b 0
f 1
nc 7
nop 2
dl 0
loc 84
ccs 37
cts 51
cp 0.7255
crap 28.2734
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
35
/**
36
 * Class Font
37
 */
38
class Font extends PDFObject
39
{
40
    const MISSING = '?';
41
42
    /**
43
     * @var array
44
     */
45
    protected $table = null;
46
47
    /**
48
     * @var array
49
     */
50
    protected $tableSizes = null;
51
52 27
    /**
     * Prepares the font for use by building its character translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
57
58
    /**
     * Returns the font name taken from the "BaseFont" entry.
     *
     * @return string the "BaseFont" value cast to string, or "[Unknown]" when the entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
65
66
    /**
     * Returns the font type as stored in the "Subtype" entry of the header.
     *
     * @return string the "Subtype" value cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
73
74
    /**
     * Collects descriptive details about the font.
     *
     * The font specific keys ("Name", "Type", "Encoding") take precedence over
     * identically named keys reported by the parent implementation, because the
     * arrays are combined with the "+" operator.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $own = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            // "Ansi" is reported when the font declares no encoding at all.
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        return $own + parent::getDetails($deep);
    }
89
90
    /**
     * Translates one raw character code into its decoded text.
     *
     * The bytes of $char are read as one big-endian number and looked up in the
     * translation table. On a miss, either the MISSING placeholder or a fallback
     * value is returned, depending on $use_default.
     *
     * @param string $char        raw bytes of a single character code
     * @param bool   $use_default when true a miss yields Font::MISSING, otherwise the fallback value
     *
     * @return string|bool the table entry when the code is mapped; on a miss Font::MISSING or the
     *                     fallback string (a miss never produces false here)
     */
    public function translateChar($char, $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // Fallback for single-byte ANSI characters that are not in the lookup table:
        // by default the raw input is kept, but for WinAnsi encoded fonts the byte
        // value is converted to the matching character.
        $fallback = $char;
        if (\strlen($char) < 2 && $this->has('Encoding')) {
            $encodingName = $this->get('Encoding')->__toString();

            if (WinAnsiEncoding::class === $encodingName) {
                $fallback = self::uchr($code);
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
116
117
    /**
     * Converts a Unicode code point into the corresponding UTF-8 encoded character.
     *
     * The conversion result is memoised per code point, because this method is
     * called once per character while building translation tables and decoding
     * text, and the same code points recur constantly.
     *
     * @param int $code code point; the value is cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        // Conversion results already computed during this request, keyed by code point.
        static $cache = [];

        $code = (int) $code;

        if (!isset($cache[$code])) {
            // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
            // therefore, we use mb_convert_encoding() instead
            $cache[$code] = mb_convert_encoding('&#'.$code.';', 'UTF-8', 'HTML-ENTITIES');
        }

        return $cache[$code];
    }
128
129
    /**
     * Builds (once) and returns the table mapping character codes to decoded text.
     *
     * When a table is already present it is returned untouched. Otherwise the
     * table is filled from the content of the "ToUnicode" entry, if the font has
     * one, by reading its codespacerange, bfchar and bfrange sections. The byte
     * widths of source/destination codes are stored in $this->tableSizes.
     *
     * @return array
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        // Splits a hex string into groups of 4 hex digits (plus any shorter
        // remainder) and converts every group to a character through uchr().
        $hexToText = static function ($hex) {
            $groups = preg_split(
                '/([0-9A-F]{4})/i',
                $hex,
                0,
                PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
            );

            $decoded = '';
            foreach ($groups as $group) {
                $decoded .= self::uchr(hexdec($group));
            }

            return $decoded;
        };

        $cmap = $this->get('ToUnicode')->getContent();
        $found = [];

        // Code space range: only the first section is used to derive the code widths.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $cmap, $found)) {
            $firstSection = reset($found['sections']);
            $pairs = [];

            preg_match_all('/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $firstSection, $pairs);

            // Two hex digits make one byte; never go below one byte.
            $this->tableSizes = [
                'from' => max(1, \strlen(current($pairs['from'])) / 2),
                'to' => max(1, \strlen(current($pairs['to'])) / 2),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $cmap, $found)) {
            foreach ($found['sections'] as $section) {
                $pairs = [];

                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

                $this->tableSizes['from'] = max(1, \strlen(current($pairs['from'])) / 2);

                foreach ($pairs['from'] as $index => $sourceHex) {
                    $this->table[hexdec($sourceHex)] = $hexToText($pairs['to'][$index]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $cmap, $found)) {
            foreach ($found['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $ranges = [];

                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $ranges
                );

                foreach ($ranges['from'] as $index => $sourceHex) {
                    $first = hexdec($sourceHex);
                    $last = hexdec($ranges['to'][$index]);
                    $target = hexdec($ranges['offset'][$index]);

                    // Consecutive source codes map to consecutive destination code points.
                    for ($code = $first; $code <= $last; ++$code) {
                        $this->table[$code] = self::uchr($code - $first + $target);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the 2-byte Unicode values on new lines, hence \r\n in the pattern.
                $lists = [];

                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $lists
                );

                foreach ($lists['from'] as $index => $sourceHex) {
                    $first = hexdec($sourceHex);
                    $entries = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$index], $entries);

                    // The n-th listed destination string belongs to source code $first + n.
                    foreach ($entries['string'] as $position => $destinationHex) {
                        $this->table[$first + $position] = $hexToText($destinationHex);
                    }
                }
            }
        }

        return $this->table;
    }
239
240
    /**
     * Replaces the translation table used by translateChar().
     *
     * Once a non-null table is set, loadTranslateTable() returns it as is instead
     * of building one from the font's "ToUnicode" content.
     *
     * @param array $table map of character code => decoded text
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
247
248
    /**
     * Replaces every hexadecimal string (<...>) found in the input by its binary value.
     *
     * Only parts made exclusively of hex digits and whitespace between "<" and ">"
     * are decoded; whitespace inside such a string is ignored. Anything else,
     * for example a dictionary like "<</Type /Font>>", is copied unchanged.
     * Input containing "<?xml" is returned untouched.
     *
     * @param string $hexa       text possibly containing hexadecimal strings
     * @param bool   $add_braces when true each decoded string is wrapped in parentheses and its
     *                           backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        // A hex string: hex digits and optional whitespace (including line breaks) in angle brackets.
        $hexString = '<[a-f0-9\s]*>';

        $parts = preg_split('/('.$hexString.')/i', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $text = '';

        foreach ($parts as $part) {
            if (!preg_match('/^'.$hexString.'$/Di', $part)) {
                // Not a hex string: keep the text as it is.
                $text .= $part;
                continue;
            }

            // Drop the brackets and any whitespace, keeping only the hex digits.
            $digits = preg_replace('/[^a-f0-9]/i', '', $part);
            $bytes = pack('H*', $digits);

            if ($add_braces) {
                // Double the backslashes so the result is a valid literal string body.
                $text .= '('.str_replace('\\', '\\\\', $bytes).')';
            } else {
                $text .= $bytes;
            }
        }

        return $text;
    }
286
287
    /**
     * Replaces octal escape sequences (a backslash followed by one to three
     * octal digits) by the byte they denote.
     *
     * Escaped backslashes ("\\") are skipped and left in place, so that digits
     * following an escaped backslash are not mistaken for an octal escape.
     * All other text is returned unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        $decoded = preg_replace_callback(
            // First alternative: an escaped backslash. Second: backslash + 1-3 octal digits.
            '/\\\\\\\\|\\\\([0-7]{1,3})/',
            static function (array $match) {
                if (!isset($match[1])) {
                    // Escaped backslash: consume it without decoding anything.
                    return $match[0];
                }

                // Values above 255 overflow one byte; only the low-order byte is kept.
                return \chr(((int) octdec($match[1])) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE failure; fall back to the input.
        return null === $decoded ? $text : $decoded;
    }
307
308
    /**
     * Replaces every "#xx" sequence, where xx are two hexadecimal digits, by
     * the byte with that hexadecimal value.
     *
     * Upper and lower case hex digits are both accepted. All other text is
     * returned unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        $decoded = preg_replace_callback(
            '/#([0-9a-f]{2})/i',
            static function (array $match) {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE failure; fall back to the input.
        return null === $decoded ? $text : $decoded;
    }
328
329
    /**
     * Converts text that starts with the UTF-16BE byte order mark (FE FF) to UTF-8.
     *
     * The text is read as a sequence of 2-byte big-endian code units. A high
     * surrogate directly followed by a low surrogate is combined into a single
     * code point, so characters outside the Basic Multilingual Plane are kept
     * intact. Text without the byte order mark is returned unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if ("\xFE\xFF" !== substr($text, 0, 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = (string) substr($text, 2);
        $length = \strlen($payload);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $unit = (int) hexdec(bin2hex(substr($payload, $i, 2)));

            // High surrogate with a complete code unit after it: try to combine the pair.
            if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 3 < $length) {
                $next = (int) hexdec(bin2hex(substr($payload, $i + 2, 2)));

                if ($next >= 0xDC00 && $next <= 0xDFFF) {
                    $unit = 0x10000 + (($unit - 0xD800) << 10) + ($next - 0xDC00);
                    // The low surrogate has been consumed as well.
                    $i += 2;
                }
            }

            $decoded .= self::uchr($unit);
        }

        return $decoded;
    }
349
350
    /**
     * Returns the font space limit taken from the configuration object.
     *
     * @return int
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit()
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
359
360
    /**
     * Decodes a list of text commands into a string of space separated words.
     *
     * Each command is an array holding its type under PDFObject::TYPE and its
     * payload under PDFObject::COMMAND:
     *  - "n": a number; when it is lower than the font space limit, the
     *         following text starts a new word;
     *  - "<": a hexadecimal string;
     *  - anything else: a literal string, possibly containing octal escapes.
     *
     * Every collected word is finally passed through decodeContent().
     *
     * @param array $commands
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        // Escape sequences and their replacement. They are applied in a single
        // left-to-right pass (strtr), so the output of one replacement is never
        // re-interpreted as the start of another escape sequence.
        $escapes = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    // A value below the limit makes the next text start a new word.
                    if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = strtr($text, $escapes);

            // add content to result string
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        $decoded = [];
        foreach ($words as $word) {
            $decoded[] = $this->decodeContent($word);
        }

        return implode(' ', $decoded);
    }
411
412
    /**
     * Converts raw text drawn with this font into UTF-8.
     *
     * The strategy depends on what the font provides, checked in this order:
     *  1. a "ToUnicode" entry: the text is cut into codes of the width stored in
     *     $this->tableSizes['from'] and each code is translated with translateChar();
     *  2. an "Encoding" entry that is an Encoding object: each character is
     *     translated through that object;
     *  3. an "Encoding" element equal to "MacRomanEncoding": iconv() conversion;
     *  4. otherwise: text that is not valid UTF-8 is converted from Windows-1252.
     *
     * @param string $text
     * @param bool   $unicode This parameter is deprecated and might be removed in a future release.
     *                        It is only written in case 2, where it receives whether $text was valid UTF-8.
     *
     * @return string
     */
    public function decodeContent($text, &$unicode = null)
    {
        if ($this->has('ToUnicode')) {
            $width = $this->tableSizes['from'];

            if (!$width) {
                return $text;
            }

            $decodedText = '';
            $total = \strlen($text);

            for ($offset = 0; $offset < $total; $offset += $width) {
                $chunk = substr($text, $offset, $width);
                $mapped = $this->translateChar($chunk, false);

                if (false !== $mapped) {
                    $decodedText .= $mapped;
                    continue;
                }

                if (!$this->has('DescendantFonts')) {
                    $decodedText .= self::MISSING;
                    continue;
                }

                // Ask the descendant fonts for a translation of this code.
                if ($this->get('DescendantFonts') instanceof PDFObject) {
                    $candidates = $this->get('DescendantFonts')->getHeader()->getElements();
                } else {
                    $candidates = $this->get('DescendantFonts')->getContent();
                }

                $fromDescendant = false;
                foreach ($candidates as $candidate) {
                    if (!($candidate instanceof self)) {
                        continue;
                    }

                    $fromDescendant = $candidate->translateChar($chunk, false);
                    if (false !== $fromDescendant) {
                        $fromDescendant = mb_convert_encoding($fromDescendant, 'UTF-8', 'Windows-1252');
                        break;
                    }
                }

                if (false !== $fromDescendant) {
                    $decodedText .= $fromDescendant;
                } else {
                    // No descendant could translate the code: treat the raw bytes as Windows-1252.
                    $decodedText .= mb_convert_encoding($chunk, 'UTF-8', 'Windows-1252');
                }
            }

            return $decodedText;
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');
            $unicode = mb_check_encoding($text, 'UTF-8');

            if ($unicode) {
                // Valid UTF-8: work on whole characters.
                $pieces = preg_split(
                    '//su',
                    $text,
                    -1,
                    PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
                );
            } else {
                // Otherwise work byte by byte.
                $pieces = [];
                $total = \strlen($text);
                for ($index = 0; $index < $total; ++$index) {
                    $pieces[] = $text[$index];
                }
            }

            $converted = '';
            foreach ($pieces as $piece) {
                $rawCode = hexdec(bin2hex($piece));
                $translatedCode = $encoding->translateChar($rawCode);
                $converted .= self::uchr($translatedCode);
            }

            return $converted;
        }

        if ($this->get('Encoding') instanceof Element && $this->get('Encoding')->equals('MacRomanEncoding')) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            return iconv('macintosh', 'UTF-8', $text);
        }

        if (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }
503
}
504