Passed
Push — master ( 73204b...b6db6a )
by Konrad
02:05
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 7

Size

Total Lines 84
Code Lines 54

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 37
CRAP Score 28.2734

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 54
c 5
b 0
f 1
nc 7
nop 2
dl 0
loc 84
ccs 37
cts 51
cp 0.7255
crap 28.2734
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
35
/**
36
 * Class Font
37
 */
38
class Font extends PDFObject
39
{
40
    // Placeholder returned by translateChar() / decodeContent() for codes that cannot be translated.
    const MISSING = '?';
41
42
    /**
43
     * @var array|null character code => decoded text; null until loadTranslateTable() or setTable() assigns it
44
     */
45
    protected $table = null;
46
47
    /**
48
     * @var array|null byte widths under the keys 'from' and 'to'; null until loadTranslateTable() has run
49
     */
50
    protected $tableSizes = null;
51
52 28
    /**
     * Initialises the font by building its character translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
57
58
    /**
     * Returns the font's base name.
     *
     * @return string the 'BaseFont' entry cast to string, or '[Unknown]' when that entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
65
66
    /**
     * Returns the font subtype.
     *
     * @return string the 'Subtype' entry of the object header, cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
73
74
    /**
     * Collects the font's descriptive details.
     *
     * The font-specific entries ('Name', 'Type', 'Encoding') are combined with the
     * details reported by the parent class; on a key collision the font-specific
     * entry is kept.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $fontDetails = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            // 'Ansi' is reported when the font has no 'Encoding' entry.
            'Encoding' => ($this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi'),
        ];

        return $fontDetails + parent::getDetails($deep);
    }
89
90
    /**
     * Translates one character code through the font's lookup table.
     *
     * When the code is missing from the table a fallback is prepared: the input
     * itself, or - for inputs shorter than two bytes on a font whose 'Encoding'
     * entry is an Encoding object reporting WinAnsiEncoding - the UTF-8 character
     * whose code point equals the numeric value of the input.
     *
     * @param string $char        raw bytes of the character code
     * @param bool   $use_default true: unmapped codes yield self::MISSING; false: they yield the fallback
     *
     * @return string|bool the mapped text, self::MISSING or the fallback
     *                     (this implementation always returns a string)
     */
    public function translateChar($char, $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // fallback for decoding single-byte ANSI characters that are not in the lookup table
        $fallback = $char;

        if (\strlen($char) < 2 && $this->has('Encoding')) {
            if (
                $this->get('Encoding') instanceof Encoding
                && WinAnsiEncoding::class === $this->get('Encoding')->__toString()
            ) {
                $fallback = self::uchr($code);
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
117
118
    /**
     * Converts a Unicode code point into its UTF-8 encoded character.
     *
     * @param int $code code point; cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // so the numeric entity is decoded with mb_convert_encoding() instead.
        $entity = '&#'.((int) $code).';';

        return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
129
130
    /**
     * Builds (once) and returns the table mapping character codes to UTF-8 text.
     *
     * The table is parsed from the content of the font's 'ToUnicode' entry when
     * present: the first codespacerange section sets the byte widths stored in
     * $this->tableSizes, and every bfchar / bfrange section contributes
     * code => text entries. Subsequent calls return the cached table.
     *
     * @return array character code => decoded UTF-8 string
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        // Only the first codespacerange section determines the code widths.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $blocks)) {
            $section = $blocks['sections'][0];
            preg_match_all('/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $ranges);

            // Two hex digits per byte; current() yields false (length 0) when nothing matched.
            $this->tableSizes = [
                'from' => max(1, \strlen(current($ranges['from'])) / 2),
                'to' => max(1, \strlen(current($ranges['to'])) / 2),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $blocks)) {
            foreach ($blocks['sections'] as $section) {
                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

                // The source-code width follows the most recently parsed bfchar section.
                $this->tableSizes['from'] = max(1, \strlen(current($pairs['from'])) / 2);

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::decodeHexCodePoints($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $blocks)) {
            foreach ($blocks['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $ranges
                );

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($ranges['to'][$key]);
                    $offset = hexdec($ranges['offset'][$key]);

                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF file has 2-byte Unicode values on new lines > added \r\n
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $lists
                );

                foreach ($lists['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::decodeHexCodePoints($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Decodes a hex string into UTF-8, one code point per group of four hex digits.
     *
     * Digits left over after the last full group are decoded as a code point of their own.
     *
     * @param string $hex hexadecimal digits without the surrounding angle brackets
     *
     * @return string
     */
    private static function decodeHexCodePoints($hex)
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr(hexdec($part));
        }

        return $text;
    }
240
241
    /**
     * Replaces the character translation table.
     *
     * @param array $table new table; stored as given, without validation
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
248
249
    /**
     * Replaces every <hex> string found in the input with the bytes it encodes.
     *
     * Input containing '<?xml' is returned untouched. Text outside angle
     * brackets is copied through unchanged.
     *
     * @param string $hexa       text that may contain <hex> strings
     * @param bool   $add_braces when true, each decoded string is wrapped in parentheses
     *                           and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $decoded = '';
        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);

        foreach ($chunks as $chunk) {
            $isBracketed = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isBracketed) {
                $decoded .= $chunk;
                continue;
            }

            // strip line breaks, then the surrounding angle brackets
            $digits = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $binary = pack('H*', $digits);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $decoded .= $binary;
            }
        }

        return $decoded;
    }
287
288
    /**
     * Decodes backslash-octal escapes (\d, \dd or \ddd) into the bytes they denote.
     *
     * Only the octal digits 0-7 are accepted and an escape may have one to three
     * digits; values above 255 keep their low eight bits only. An escaped
     * backslash (two backslashes) is consumed as a unit and left unchanged, so
     * digits that follow it are not mistaken for an octal escape. All other
     * text, including other backslash escapes, is copied through unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        return preg_replace_callback(
            '/\\\\(\\\\|[0-7]{1,3})/',
            function ($match) {
                if ('\\' === $match[1]) {
                    // Escaped backslash: not an octal escape, keep both characters.
                    return $match[0];
                }

                return \chr(octdec($match[1]) & 0xFF);
            },
            $text
        );
    }
308
309
    /**
     * Decodes #xx escapes, where xx is a two-digit hexadecimal byte value.
     *
     * Both upper- and lower-case hex digits are accepted. A '#' that is not
     * followed by two hex digits is left unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        return preg_replace_callback(
            '/#([0-9A-Fa-f]{2})/',
            function ($match) {
                return \chr(hexdec($match[1]));
            },
            $text
        );
    }
329
330
    /**
     * Converts text that starts with the byte order mark 0xFE 0xFF into UTF-8.
     *
     * The bytes after the mark are read two at a time as big-endian 16-bit
     * units. A high surrogate (0xD800-0xDBFF) immediately followed by a low
     * surrogate (0xDC00-0xDFFF) is combined into a single code point; every
     * other unit is converted on its own. Text without the mark is returned
     * unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if (!preg_match('/^\xFE\xFF/i', $text)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = substr($text, 2);
        $length = \strlen($payload);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $unit = hexdec(bin2hex(substr($payload, $i, 2)));

            // Combine a surrogate pair, but only when a complete second unit follows.
            if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 3 < $length) {
                $low = hexdec(bin2hex(substr($payload, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $unit = 0x10000 + (($unit - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2;
                }
            }

            $decoded .= self::uchr($unit);
        }

        return $decoded;
    }
350
351
    /**
     * Returns the threshold decodeText() compares numeric commands against
     * when deciding whether a new word starts.
     *
     * @return int
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit()
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
360
361
    /**
     * Decodes a list of text commands into a single UTF-8 string.
     *
     * String commands are appended to the current word. A numeric command
     * ('n') whose value is below the font space limit starts a new word.
     * Each word is then passed through decodeContent() and the words are
     * joined with single spaces.
     *
     * @param array $commands commands indexed by PDFObject::TYPE and PDFObject::COMMAND
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $text
            );

            // add content to result string
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        // Assign by key rather than by reference so no reference outlives the loop.
        foreach ($words as $index => $word) {
            $words[$index] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
412
413
    /**
     * Decodes raw string bytes into UTF-8 using this font's mappings.
     *
     * The first applicable strategy is used:
     *  1. the font has a 'ToUnicode' entry: fixed-width codes are translated
     *     through the lookup table;
     *  2. the 'Encoding' entry is an Encoding object: each character is
     *     translated through it;
     *  3. the 'Encoding' entry is an Element equal to 'MacRomanEncoding': the
     *     text is converted with iconv();
     *  4. otherwise text that is not valid UTF-8 is converted from Windows-1252.
     *
     * @param string $text
     * @param bool   $unicode This parameter is deprecated and might be removed in a future release.
     *                        It is only written to in case 2, with the result of the UTF-8 check.
     *
     * @return string
     */
    public function decodeContent($text, &$unicode = null)
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicodeTable($text);
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            return $this->decodeContentByEncodingObject($text, $unicode);
        }

        if (
            $this->get('Encoding') instanceof Element
            && $this->get('Encoding')->equals('MacRomanEncoding')
        ) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            return iconv('macintosh', 'UTF-8', $text);
        }

        if (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }

    /**
     * Decodes text by cutting it into codes of $this->tableSizes['from'] bytes
     * and translating each code through the lookup table.
     *
     * @param string $text
     *
     * @return string the text unchanged when no code width is available
     */
    private function decodeContentByToUnicodeTable($text)
    {
        $bytes = $this->tableSizes['from'];

        if (!$bytes) {
            return $text;
        }

        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            if (false !== $decoded) {
                $result .= $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $result .= $this->decodeCharByDescendantFonts($char);
            } else {
                $result .= self::MISSING;
            }
        }

        return $result;
    }

    /**
     * Translates one code through the first descendant font able to map it.
     *
     * The translated value - or the raw code when no descendant font maps it -
     * is converted from Windows-1252 to UTF-8.
     *
     * @param string $char raw bytes of the character code
     *
     * @return string
     */
    private function decodeCharByDescendantFonts($char)
    {
        if ($this->get('DescendantFonts') instanceof PDFObject) {
            $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
        } else {
            $fonts = $this->get('DescendantFonts')->getContent();
        }

        $decoded = false;

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            if (false !== ($decoded = $font->translateChar($char, false))) {
                $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                break;
            }
        }

        if (false !== $decoded) {
            return $decoded;
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }

    /**
     * Decodes text character by character through the font's Encoding object.
     *
     * Valid UTF-8 input is split into UTF-8 characters, any other input into
     * single bytes.
     *
     * @param string $text
     * @param bool   $unicode receives whether $text was valid UTF-8
     *
     * @return string
     */
    private function decodeContentByEncodingObject($text, &$unicode)
    {
        /** @var Encoding $encoding */
        $encoding = $this->get('Encoding');
        $unicode = mb_check_encoding($text, 'UTF-8');

        if ($unicode) {
            $chars = preg_split('//su', $text, -1, \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY);
        } else {
            // Reached only for text that is not valid UTF-8, which is never empty.
            $chars = str_split($text);
        }

        $result = '';

        foreach ($chars as $char) {
            $dec_av = hexdec(bin2hex($char));
            $dec_ap = $encoding->translateChar($dec_av);
            $result .= self::uchr($dec_ap);
        }

        return $result;
    }
504
}
505