Passed
Pull Request — master (#358)
by butschster
02:09
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 7

Size

Total Lines 84
Code Lines 54

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 30
CRAP Score 47.933

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 54
c 5
b 0
f 1
nc 7
nop 2
dl 0
loc 84
ccs 30
cts 51
cp 0.5881
crap 47.933
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
34
 * Class Font
35
 */
36
class Font extends PDFObject
37
{
38
    const MISSING = '?';
39
40
    /**
41
     * @var int
42
     */
43
    public static $font_space = -50;
44
45
    /**
46
     * @var array
47
     */
48
    protected $table = null;
49
50
    /**
51
     * @var array
52
     */
53
    protected $tableSizes = null;
54
55 24
    /**
     * Initialises the font by building its character translation table.
     */
    public function init()
    {
        // Build the code => text lookup table up front.
        $this->loadTranslateTable();
    }
60
61
    /**
     * Returns the font's 'BaseFont' entry as a string.
     *
     * @return string the 'BaseFont' value, or '[Unknown]' when the entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
68
69
    /**
     * Returns the 'Subtype' entry of the font header as a string.
     *
     * @return string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
76
77
    /**
     * Collects the font's name, type and encoding together with the details
     * reported by the parent class.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array 'Name', 'Type' and 'Encoding' first, followed by the
     *               parent's entries
     */
    public function getDetails($deep = true)
    {
        $own = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            // 'Ansi' is reported when the font declares no encoding.
            'Encoding' => 'Ansi',
        ];

        if ($this->has('Encoding')) {
            $own['Encoding'] = (string) $this->get('Encoding');
        }

        // Array union: keys set above win over same-named keys from the parent.
        return $own + parent::getDetails($deep);
    }
92
93
    /**
     * Looks up the text mapped to a raw character code.
     *
     * The bytes of $char are read as one hexadecimal number and used as the
     * key into the translation table.
     *
     * @param string $char        raw bytes of a single character code
     * @param bool   $use_default when true, an unmapped code yields self::MISSING;
     *                            when false, it yields the fallback described below
     *
     * @return string|bool the table entry when the code is mapped; otherwise
     *                     self::MISSING or the fallback, which is the code
     *                     converted with uchr() for a code shorter than two bytes
     *                     in a 'WinAnsiEncoding' font, or $char itself
     */
    public function translateChar($char, $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // fallback for decoding single-byte ANSI characters that are not in the lookup table
        $isSingleByteWinAnsi = \strlen($char) < 2
            && $this->has('Encoding')
            && 'WinAnsiEncoding' === $this->get('Encoding')->__toString();

        $fallback = $isSingleByteWinAnsi ? self::uchr($code) : $char;

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
115
116
    /**
     * Converts a numeric character code to its UTF-8 representation.
     *
     * @param int $code character code; cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore the numeric entity is run through mb_convert_encoding() instead.
        $entity = '&#'.((int) $code).';';

        return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
127
128
    /**
     * Builds (once) and returns the character translation table of this font.
     *
     * When the font has a 'ToUnicode' entry, its content is scanned for
     * codespacerange, bfchar and bfrange sections. The table maps numeric source
     * codes to UTF-8 text; $this->tableSizes records the byte width of the
     * source ('from') and destination ('to') codes.
     *
     * @return array map of numeric character code => UTF-8 text
     */
    public function loadTranslateTable()
    {
        // The table is built only once; later calls return the cached result.
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        // Only the first codespacerange section determines the code widths.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $spaces)) {
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $spaces['sections'][0],
                $range
            );

            $this->tableSizes = [
                'from' => self::hexByteLength($range['from']),
                'to' => self::hexByteLength($range['to']),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $charSections)) {
            foreach ($charSections['sections'] as $section) {
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $pairs
                );

                // Each bfchar section overrides the source width with that of its first entry.
                $this->tableSizes['from'] = self::hexByteLength($pairs['from']);

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::decodeHexCodeUnits($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $rangeSections)) {
            foreach ($rangeSections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $simple
                );

                foreach ($simple['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($simple['to'][$key]);
                    $offset = hexdec($simple['offset'][$key]);

                    // Consecutive source codes map to consecutive destination codes.
                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF file has 2-byte Unicode values on new lines > added \r\n
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $listed
                );

                foreach ($listed['from'] as $key => $from) {
                    $char_from = hexdec($from);

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $listed['strings'][$key], $strings);

                    // The n-th listed string belongs to source code $char_from + n.
                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::decodeHexCodeUnits($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Returns the byte width of the first hexadecimal string in a match list.
     *
     * The width is rounded up and cast to int, so an odd number of hex digits
     * can never produce a fractional width.
     *
     * @param array $hexStrings captured hexadecimal strings, possibly empty
     *
     * @return int width in bytes, at least 1
     */
    private static function hexByteLength(array $hexStrings)
    {
        $first = isset($hexStrings[0]) ? $hexStrings[0] : '';

        return max(1, (int) ceil(\strlen($first) / 2));
    }

    /**
     * Converts a hexadecimal destination string to UTF-8 text.
     *
     * The string is cut into groups of four hex digits (any shorter remainder is
     * kept as its own group) and every group is converted with uchr().
     *
     * @param string $hex hexadecimal digits without the surrounding angle brackets
     *
     * @return string
     */
    private static function decodeHexCodeUnits($hex)
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr(hexdec($part));
        }

        return $text;
    }
238
239
    /**
     * Replaces the character translation table.
     *
     * Only the table itself is assigned; $this->tableSizes is left untouched.
     *
     * @param array $table map of numeric character code => text
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
246
247
    /**
     * Replaces every <hex> chunk of a string with the bytes it encodes.
     *
     * Text outside angle brackets is copied as is. Input containing '<?xml' is
     * returned unchanged.
     *
     * @param string $hexa       text that may contain <hex> chunks
     * @param bool   $add_braces when true, each decoded chunk is wrapped in
     *                           parentheses and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $output = '';

        foreach ($chunks as $chunk) {
            $isBracketed = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isBracketed) {
                $output .= $chunk;

                continue;
            }

            // Strip line breaks and the surrounding angle brackets, then
            // turn the hex digits into raw bytes.
            $digits = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $binary = pack('H*', $digits);

            if ($add_braces) {
                $output .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $output .= $binary;
            }
        }

        return $output;
    }
285
286
    /**
     * Replaces every backslash followed by three digits with the single byte
     * whose octal value those digits spell.
     *
     * @param string $text text that may contain \ddd escapes
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        $chunks = preg_split('/(\\\\\d{3})/s', $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            // Anything that is not exactly one \ddd escape is copied verbatim.
            if (!preg_match('/^\\\\\d{3}$/', $chunk)) {
                $decoded .= $chunk;

                continue;
            }

            $digits = trim($chunk, '\\');
            $decoded .= \chr(octdec($digits));
        }

        return $decoded;
    }
306
307
    /**
     * Decodes '#xx' escapes, where xx are two hexadecimal digits, into the
     * byte they denote.
     *
     * Both digits may be any hexadecimal digit in either case, so escapes such
     * as '#2F' or '#c3' are decoded as well as purely numeric ones like '#20'.
     *
     * @param string $text text that may contain #xx escapes
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        $decoded = preg_replace_callback(
            '/#([0-9a-f]{2})/i',
            static function ($match) {
                return \chr(hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; keep the input then.
        return null === $decoded ? $text : $decoded;
    }
327
328
    /**
     * Converts text that starts with the byte order marker FE FF to UTF-8.
     *
     * The bytes after the marker are read as 16-bit big-endian code units. A
     * high surrogate directly followed by a low surrogate is decoded as one
     * character; every other unit is converted on its own with uchr(). Text
     * without the marker is returned unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if (!preg_match('/^\xFE\xFF/i', $text)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = substr($text, 2);
        $length = \strlen($payload);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $unit = hexdec(bin2hex(substr($payload, $i, 2)));

            // A surrogate pair spans two code units and must be decoded together.
            if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 3 < $length) {
                $next = hexdec(bin2hex(substr($payload, $i + 2, 2)));

                if ($next >= 0xDC00 && $next <= 0xDFFF) {
                    $decoded .= mb_convert_encoding(substr($payload, $i, 4), 'UTF-8', 'UTF-16BE');
                    $i += 2;

                    continue;
                }
            }

            $decoded .= self::uchr($unit);
        }

        return $decoded;
    }
348
349
    /**
     * Returns the current value of the static $font_space setting.
     *
     * @return int
     */
    public static function getFontSpaceLimit()
    {
        $limit = self::$font_space;

        return $limit;
    }
356
357
    /**
     * Overwrites the static $font_space setting, which is shared by all fonts.
     *
     * @param int $font_space new value
     *
     * @return void
     */
    public static function setFontSpaceLimit($font_space)
    {
        self::$font_space = $font_space;
    }
366
367
    /**
     * Decodes a list of text commands into a single string.
     *
     * Each command is an array read at the PDFObject::TYPE and
     * PDFObject::COMMAND offsets:
     *  - type 'n': the command is read as a number; a value below the font
     *    space limit makes the following text start a new word;
     *  - type '<': the command is decoded as a hexadecimal string;
     *  - any other type: octal escapes in the command are decoded.
     *
     * Escaped characters are then replaced, each word is passed through
     * decodeContent(), and the words are joined with single spaces.
     *
     * @param array $commands
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $word_position = 0;
        $words = [];
        $font_space = self::getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    // An offset below the limit starts a new word; the command carries no text.
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }

                    continue 2;

                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');

                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $text
            );

            // add content to result string
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        // Decode word by word; indexing by key avoids leaving a dangling reference.
        foreach ($words as $index => $word) {
            $words[$index] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
419
420
    /**
     * Converts raw text of this font to UTF-8.
     *
     * The first matching strategy is applied:
     *  1. the font has a 'ToUnicode' entry: the text is translated code by code
     *     through the translation table;
     *  2. the 'Encoding' entry is an Encoding object: every character is
     *     translated through it;
     *  3. the 'Encoding' entry is an Element equal to 'MacRomanEncoding': the
     *     text is converted with iconv();
     *  4. otherwise, text that is not valid UTF-8 is converted from Windows-1252.
     *
     * @param string $text
     * @param bool   $unicode This parameter is deprecated and might be removed in a future release.
     *                        It is only written in case 2, where it receives the
     *                        result of the UTF-8 check on $text.
     *
     * @return string
     */
    public function decodeContent($text, &$unicode = null)
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeWithToUnicode($text);
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            return $this->decodeWithEncoding($text, $this->get('Encoding'), $unicode);
        }

        if ($this->get('Encoding') instanceof Element
            && $this->get('Encoding')->equals('MacRomanEncoding')) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            $converted = iconv('macintosh', 'UTF-8', $text);

            if (false !== $converted) {
                return $converted;
            }
            // iconv() failed: fall through to the generic conversion below so
            // that a string, never false, is returned.
        }

        // don't double-encode strings already in UTF-8
        if (!mb_check_encoding($text, 'UTF-8')) {
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }

    /**
     * Translates text code by code through the translation table.
     *
     * The text is cut into codes of $this->tableSizes['from'] bytes. When that
     * size is empty, the text is returned unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    private function decodeWithToUnicode($text)
    {
        $bytes = $this->tableSizes['from'];

        if (!$bytes) {
            return $text;
        }

        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            // NOTE(review): translateChar() as written in this class returns a
            // string on every path, so the two fallbacks below are only reached
            // when the table itself holds false - confirm whether this is intended.
            if (false !== $decoded) {
                $result .= $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $result .= $this->decodeWithDescendantFonts($char);
            } else {
                $result .= self::MISSING;
            }
        }

        return $result;
    }

    /**
     * Translates a single code through the first descendant font that maps it.
     *
     * @param string $char raw bytes of one character code
     *
     * @return string the descendant's translation, or $char itself, converted
     *                from Windows-1252 to UTF-8
     */
    private function decodeWithDescendantFonts($char)
    {
        $descendants = $this->get('DescendantFonts');

        if ($descendants instanceof PDFObject) {
            $fonts = $descendants->getHeader()->getElements();
        } else {
            $fonts = $descendants->getContent();
        }

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $decoded = $font->translateChar($char, false);

            if (false !== $decoded) {
                return mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
            }
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }

    /**
     * Translates every character of the text through an Encoding object.
     *
     * Valid UTF-8 text is split into UTF-8 characters, any other text into
     * single bytes. The bytes of each character are read as one hexadecimal
     * number, translated by the encoding and converted with uchr().
     *
     * @param string   $text
     * @param Encoding $encoding
     * @param bool     $unicode  receives whether $text is valid UTF-8
     *
     * @return string
     */
    private function decodeWithEncoding($text, Encoding $encoding, &$unicode)
    {
        $unicode = mb_check_encoding($text, 'UTF-8');

        if ($unicode) {
            $chars = preg_split('//su', $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        } else {
            // Empty text counts as valid UTF-8, so $text is never empty here.
            $chars = str_split($text);
        }

        $result = '';

        foreach ($chars as $char) {
            $dec_av = hexdec(bin2hex($char));
            $dec_ap = $encoding->translateChar($dec_av);
            $result .= self::uchr($dec_ap);
        }

        return $result;
    }
511
}
512