Passed
Pull Request — master (#384)
by
unknown
02:35
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 7

Size

Total Lines 84
Code Lines 54

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 37
CRAP Score 28.2734

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 54
c 5
b 0
f 1
nc 7
nop 2
dl 0
loc 84
ccs 37
cts 51
cp 0.7255
crap 28.2734
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
35
/**
36
 * Class Font
37
 */
38
class Font extends PDFObject
39
{
40
    /**
     * Placeholder returned by translateChar() for a code without a table entry
     * when the default is requested.
     */
    const MISSING = '?';

    /**
     * Lookup from character code to decoded text. Null until loadTranslateTable()
     * or setTable() has filled it.
     *
     * @var array|null
     */
    protected $table;

    /**
     * Widths in bytes, stored under the keys 'from' and 'to'. Null until
     * loadTranslateTable() has run.
     *
     * @var array|null
     */
    protected $tableSizes;
51
52 27
    /**
     * Prepares the font for decoding by building its translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
57
58
    /**
     * Returns the font name taken from the 'BaseFont' entry.
     *
     * @return string the entry cast to string, or '[Unknown]' when the entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
65
66
    /**
     * Returns the font type taken from the header's 'Subtype' entry.
     *
     * @return string the entry cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
73
74
    /**
     * Describes the font as an array: name, type and encoding first, followed by
     * whatever the parent implementation reports.
     *
     * @param bool $deep forwarded unchanged to the parent implementation
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $fontDetails = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        // Array union: the three keys above take precedence over same-named parent entries.
        return $fontDetails + parent::getDetails($deep);
    }
89
90
    /**
     * Translates one character code through the font's lookup table.
     *
     * @param string $char        raw bytes of a single character code
     * @param bool   $use_default when true, a code without a table entry yields self::MISSING;
     *                            when false, it yields a fallback instead
     *
     * @return string the table entry when there is one; otherwise self::MISSING or the
     *                fallback, which is $char itself or, for a single byte under
     *                WinAnsiEncoding, the character built from that byte value
     */
    public function translateChar($char, $use_default = true)
    {
        // The table is normally built by init(); build it on demand so that a font
        // which was never initialised does not hand null to array_key_exists().
        if (null === $this->table) {
            $this->loadTranslateTable();
        }

        $dec = hexdec(bin2hex($char));

        if (\array_key_exists($dec, $this->table)) {
            return $this->table[$dec];
        }

        // The fallback is only ever returned when no default is wanted, so the
        // Encoding lookups are skipped entirely in the default case.
        if ($use_default) {
            return self::MISSING;
        }

        // Fallback for decoding single-byte ANSI characters that are not in the lookup table.
        if (
            \strlen($char) < 2
            && $this->has('Encoding')
            && $this->get('Encoding') instanceof Encoding
            && WinAnsiEncoding::class === $this->get('Encoding')->__toString()
        ) {
            return self::uchr($dec);
        }

        return $char;
    }
117
118
    /**
     * Builds the UTF-8 encoded character for a numeric code point.
     *
     * @param int $code code point; cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        // html_entity_decode() cannot handle UTF-16 or UTF-32 character entities,
        // so the numeric entity is converted with mb_convert_encoding() instead.
        $entity = '&#'.(int) $code.';';

        return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
129
130
    /**
     * Builds the character translation table from the font's 'ToUnicode' stream.
     *
     * The table is built once; later calls return the cached array. Without a
     * 'ToUnicode' entry the table stays empty and both sizes stay at 1.
     *
     * @return array lookup from source character code to decoded UTF-8 text
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();
        $sections = [];

        // Code widths come from the first codespacerange section only.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';
            $ranges = [];
            preg_match_all($regexp, $sections['sections'][0], $ranges);

            // Without any range entry the defaults of one byte each stay in place.
            if (!empty($ranges['from'])) {
                $this->tableSizes = [
                    'from' => max(1, \strlen($ranges['from'][0]) / 2),
                    'to' => max(1, \strlen($ranges['to'][0]) / 2),
                ];
            }
        }

        // Support for multiple bfchar sections.
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            $regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

            foreach ($sections['sections'] as $section) {
                $pairs = [];
                preg_match_all($regexp, $section, $pairs);

                // A section without entries says nothing about the code width, so the
                // width established so far must not be reset by it.
                if (empty($pairs['from'])) {
                    continue;
                }

                $this->tableSizes['from'] = max(1, \strlen($pairs['from'][0]) / 2);

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::hexCodeUnitsToUtf8($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections.
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';
                $ranges = [];
                preg_match_all($regexp, $section, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($ranges['to'][$key]);
                    $offset = hexdec($ranges['offset'][$key]);

                    // Consecutive source codes map to consecutive destination code points.
                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the 2-byte Unicode values on new lines, hence \r\n in the class.
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';
                $lists = [];
                preg_match_all($regexp, $section, $lists);

                foreach ($lists['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                    // The n-th listed string belongs to the n-th code counted from $char_from.
                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::hexCodeUnitsToUtf8($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Converts a hexadecimal destination string to UTF-8, one character per group
     * of four hex digits; a leftover shorter group is converted on its own.
     *
     * @param string $hex hex digits without the surrounding angle brackets
     *
     * @return string
     */
    private static function hexCodeUnitsToUtf8($hex)
    {
        $parts = preg_split('/([0-9A-F]{4})/i', $hex, 0, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $text = '';

        foreach ($parts as $part) {
            $text .= self::uchr(hexdec($part));
        }

        return $text;
    }
240
241
    /**
     * Replaces the translation table used by translateChar().
     *
     * @param array $table lookup from character code to decoded text
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
248
249
    /**
     * Replaces every <hex> group in a string by the bytes it encodes.
     *
     * @param string $hexa       text that may contain <hex> groups
     * @param bool   $add_braces when true, each decoded group is wrapped in parentheses
     *                           and its backslashes are doubled
     *
     * @return string the input itself when it contains '<?xml', the decoded text otherwise
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // XML payloads are full of angle brackets that are not hex groups: leave them alone.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $decodedText = '';

        foreach ($chunks as $chunk) {
            $isBracketed = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isBracketed) {
                $decodedText .= $chunk;
                continue;
            }

            // Drop line breaks and the angle brackets, then turn the hex digits into bytes.
            $digits = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $bytes = pack('H*', $digits);

            if ($add_braces) {
                $decodedText .= '('.preg_replace('/\\\/s', '\\\\\\', $bytes).')';
            } else {
                $decodedText .= $bytes;
            }
        }

        return $decodedText;
    }
287
288
    /**
     * Replaces octal escapes (a backslash followed by one to three octal digits)
     * by the byte they denote.
     *
     * An escaped backslash (two backslashes in a row) is passed through untouched,
     * so digits that follow it are never mistaken for an octal escape.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        // The first alternative consumes an escaped backslash as a unit; the second
        // one is the octal escape itself, limited to the digits 0-7.
        $parts = preg_split(
            '/(\\\\\\\\|\\\\[0-7]{1,3})/',
            $text,
            -1,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );
        $text = '';

        foreach ($parts as $part) {
            if (preg_match('/^\\\\[0-7]{1,3}$/D', $part)) {
                // Codes above 0377 overflow a byte; only the low eight bits are kept.
                $text .= \chr(octdec(substr($part, 1)) & 0xFF);
            } else {
                $text .= $part;
            }
        }

        return $text;
    }
308
309
    /**
     * Replaces #xx escapes (a number sign followed by two hexadecimal digits)
     * by the byte they denote.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        // The two digits are hexadecimal, so A-F must be accepted in either case.
        $parts = preg_split('/(#[0-9A-Fa-f]{2})/', $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $text = '';

        foreach ($parts as $part) {
            if (preg_match('/^#[0-9A-Fa-f]{2}$/D', $part)) {
                $text .= \chr(hexdec(substr($part, 1)));
            } else {
                $text .= $part;
            }
        }

        return $text;
    }
329
330
    /**
     * Converts a string that starts with the bytes FE FF (the big-endian UTF-16
     * byte order mark) to UTF-8; any other string is returned unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if ("\xFE\xFF" !== substr($text, 0, 2)) {
            return $text;
        }

        // Strip the U+FEFF byte order mark.
        $utf16 = (string) substr($text, 2);
        if ('' === $utf16) {
            return '';
        }

        // A dangling final byte is widened to a full 16-bit unit so that it still
        // comes out as the character with that byte value.
        if (1 === \strlen($utf16) % 2) {
            $utf16 = substr($utf16, 0, -1)."\x00".substr($utf16, -1);
        }

        // Converting the whole string at once keeps surrogate pairs together, which
        // a unit-by-unit conversion cannot do.
        return mb_convert_encoding($utf16, 'UTF-8', 'UTF-16BE');
    }
350
351
    /**
     * Reads the font space limit from the configuration object.
     *
     * @return int
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit()
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
360
361
    /**
     * Turns a list of text commands into a decoded string.
     *
     * String commands are collected into words; a numeric command smaller than the
     * font space limit starts a new word. Each word is decoded with decodeContent()
     * and the words are joined with single spaces.
     *
     * @param array $commands entries indexed by PDFObject::TYPE and PDFObject::COMMAND
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        // strtr() walks the input once and never re-scans what it has produced, so an
        // escaped backslash followed by "n" stays a backslash and an "n" instead of
        // turning into a line break.
        $escapes = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    // A numeric command carries no text; it can only open a new word.
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars.
            $text = strtr($text, $escapes);

            // Add content to the current word.
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        foreach ($words as $index => $word) {
            $words[$index] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
412
413
    /**
     * Decodes raw text to UTF-8 using the first applicable source, in this order:
     * the 'ToUnicode' table, an Encoding object, MacRomanEncoding, and finally a
     * Windows-1252 conversion for text that is not already valid UTF-8.
     *
     * @param string $text
     * @param bool   $unicode This parameter is deprecated and might be removed in a future release.
     *                        It is only written in the Encoding case, where it receives
     *                        whether $text was valid UTF-8.
     *
     * @return string
     */
    public function decodeContent($text, &$unicode = null)
    {
        if ($this->has('ToUnicode')) {
            $bytes = $this->tableSizes['from'];

            // Without a usable code width the text is handed back untouched.
            return $bytes ? $this->decodeContentWithToUnicode($text, $bytes) : $text;
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');
            $unicode = mb_check_encoding($text, 'UTF-8');

            return $this->decodeContentWithEncoding($text, $encoding, $unicode);
        }

        if ($this->get('Encoding') instanceof Element && $this->get('Encoding')->equals('MacRomanEncoding')) {
            // mb_convert_encoding does not support MacRoman/macintosh, so iconv() is used here.
            $converted = iconv('macintosh', 'UTF-8', $text);

            // iconv() reports failure with false; keep the undecoded text in that case
            // so that the method still returns a string.
            return false === $converted ? $text : $converted;
        }

        if (!mb_check_encoding($text, 'UTF-8')) {
            // Don't double-encode strings already in UTF-8.
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }

    /**
     * Decodes text code by code through translateChar(), reading $bytes bytes per code.
     *
     * @param string    $text
     * @param int|float $bytes width of one character code, taken from the table sizes
     *
     * @return string
     */
    private function decodeContentWithToUnicode($text, $bytes)
    {
        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            if (false !== $decoded) {
                $result .= $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $result .= $this->decodeCharWithDescendantFonts($char);
            } else {
                $result .= self::MISSING;
            }
        }

        return $result;
    }

    /**
     * Decodes one character code with the first descendant font that can translate it;
     * when none can, the code itself is converted from Windows-1252.
     *
     * @param string $char
     *
     * @return string
     */
    private function decodeCharWithDescendantFonts($char)
    {
        $descendants = $this->get('DescendantFonts');

        if ($descendants instanceof PDFObject) {
            $fonts = $descendants->getHeader()->getElements();
        } else {
            $fonts = $descendants->getContent();
        }

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $decoded = $font->translateChar($char, false);
            if (false !== $decoded) {
                return mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
            }
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }

    /**
     * Decodes text through an Encoding object, character by character when the text
     * is valid UTF-8 and byte by byte otherwise.
     *
     * @param string   $text
     * @param Encoding $encoding
     * @param bool     $unicode  whether $text is valid UTF-8
     *
     * @return string
     */
    private function decodeContentWithEncoding($text, $encoding, $unicode)
    {
        $result = '';

        if ($unicode) {
            $chars = preg_split('//su', $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

            foreach ($chars as $char) {
                $result .= self::uchr($encoding->translateChar(hexdec(bin2hex($char))));
            }

            return $result;
        }

        $length = \strlen($text);

        for ($i = 0; $i < $length; ++$i) {
            $result .= self::uchr($encoding->translateChar(hexdec(bin2hex($text[$i]))));
        }

        return $result;
    }
504
}
505