Passed
Push — master ( b32bb7...35c881 )
by Konrad
08:05
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 7

Size

Total Lines 84
Code Lines 54

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 37
CRAP Score 28.2734

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 54
c 5
b 0
f 1
nc 7
nop 2
dl 0
loc 84
ccs 37
cts 51
cp 0.7255
crap 28.2734
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive group of statements into its own well-named method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
use Smalot\PdfParser\Exception\EncodingNotFoundException;
35
36
/**
37
 * Class Font
38
 */
39
class Font extends PDFObject
40
{
41
    /**
     * Placeholder returned by translateChar() when a character cannot be mapped
     * and the caller asked for the default.
     */
    const MISSING = '?';

    /**
     * Translation table mapping integer character codes to decoded text.
     * Stays null until loadTranslateTable() or setTable() populates it.
     *
     * @var array|null
     */
    protected $table = null;

    /**
     * Sizes stored under the keys 'from' and 'to' by loadTranslateTable().
     * Stays null until loadTranslateTable() runs.
     *
     * @var array|null
     */
    protected $tableSizes = null;
52
53 31
    /**
     * Initialises the font by building its character translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
58
59
    /**
     * Returns the font name taken from the 'BaseFont' entry.
     *
     * @return string the 'BaseFont' value cast to string, or '[Unknown]' when the entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
66
67
    /**
     * Returns the font type as declared by the header's 'Subtype' entry.
     *
     * @return string the 'Subtype' value cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
74
75
    /**
     * Collects the font's name, type and encoding together with the details
     * reported by the parent class.
     *
     * Keys set here ('Name', 'Type', 'Encoding') take precedence over keys of
     * the same name coming from the parent, because the arrays are combined
     * with the `+` operator.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $own = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            // 'Ansi' is reported when the font declares no encoding.
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        return $own + parent::getDetails($deep);
    }
90
91
    /**
     * Translates one raw character code through the font's translation table.
     *
     * The bytes of $char are read as a big-endian integer and looked up in the
     * table. When no entry exists, the result depends on $use_default:
     * self::MISSING when true, otherwise a fallback that is either the raw
     * input or, for single-byte input on a WinAnsi-encoded font, the character
     * whose code point equals the byte value.
     *
     * @param string $char        raw bytes of the character to translate
     * @param bool   $use_default return self::MISSING instead of the fallback on a table miss
     *
     * @return string|bool the table entry, self::MISSING, or the fallback
     */
    public function translateChar($char, $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // Fallback for single-byte ANSI characters that are not in the lookup table.
        $fallback = $char;
        $isSingleByte = \strlen($char) < 2;

        if ($isSingleByte && $this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            try {
                $encodingClass = $this->get('Encoding')->__toString();

                if (WinAnsiEncoding::class === $encodingClass) {
                    $fallback = self::uchr($code);
                }
            } catch (EncodingNotFoundException $e) {
                // Encoding->getEncodingClass() throws EncodingNotFoundException when
                // BaseEncoding does not exist (see table 5.11 of the PDF 1.5 specs);
                // the raw character is kept as fallback in that case.
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
124
125
    /**
     * Converts a Unicode code point into its UTF-8 encoded character.
     *
     * @param int $code code point; cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        // Build a numeric HTML entity and let mbstring decode it:
        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities.
        $entity = '&#'.((int) $code).';';

        return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
136
137
    /**
     * Builds (once) and returns the character translation table from the
     * font's 'ToUnicode' CMap stream.
     *
     * The table maps integer source codes to UTF-8 text. Three CMap constructs
     * are read from the stream content:
     *  - codespacerange: the first section sets the 'from'/'to' sizes;
     *  - bfchar: one source code mapped to one destination string;
     *  - bfrange: either <from> <to> <start> (consecutive code points) or
     *    <from> <to> [<dst1> <dst2> ...] (explicit destination per code).
     *
     * When the font has no 'ToUnicode' entry the table is left empty and the
     * sizes default to 1. Subsequent calls return the cached table.
     *
     * @return array
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        // Code space range: only the first section is used to derive the sizes.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $blocks)) {
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $blocks['sections'][0],
                $ranges
            );

            // Two hex digits describe one byte.
            $this->tableSizes = [
                'from' => max(1, \strlen(current($ranges['from'])) / 2),
                'to' => max(1, \strlen(current($ranges['to'])) / 2),
            ];
        }

        // Support for multiple bfchar sections.
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $blocks)) {
            foreach ($blocks['sections'] as $section) {
                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

                $this->tableSizes['from'] = max(1, \strlen(current($pairs['from'])) / 2);

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::decodeHexCodeUnits($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections.
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $blocks)) {
            // Form 1: <srcCode1> <srcCode2> <dstString>
            $simpleRegexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';
            // Form 2: <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
            // Some PDF files put the destination strings on new lines, hence \r\n.
            $arrayRegexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';

            foreach ($blocks['sections'] as $section) {
                // Remove the array-form entries before looking for the simple form.
                // Otherwise three consecutive destination strings inside an array
                // ("[<a> <b> <c> <d>]") are themselves matched as a
                // <from> <to> <offset> triple and pollute the table.
                $simpleSection = preg_replace($arrayRegexp, '', $section);
                if (null === $simpleSection) {
                    // PCRE failure: fall back to the unfiltered section.
                    $simpleSection = $section;
                }

                preg_match_all($simpleRegexp, $simpleSection, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($ranges['to'][$key]);
                    $offset = hexdec($ranges['offset'][$key]);

                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                preg_match_all($arrayRegexp, $section, $lists);

                foreach ($lists['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                    // The n-th destination string belongs to source code $char_from + n.
                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::decodeHexCodeUnits($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Decodes a hexadecimal destination string into UTF-8 by converting each
     * group of four hex digits (and any shorter remainder) with uchr().
     *
     * @param string $hex hexadecimal digits without the surrounding angle brackets
     *
     * @return string
     */
    private static function decodeHexCodeUnits($hex)
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr(hexdec($part));
        }

        return $text;
    }
247
248
    /**
     * Replaces the translation table used by translateChar().
     *
     * @param array $table new translation table, stored as given
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
255
256
    /**
     * Replaces hexadecimal strings (<48656C6C6F>) found in the input with the
     * bytes they encode; everything else is copied through unchanged.
     *
     * Input containing an XML prolog ('<?xml') is returned untouched.
     *
     * @param string $hexa       text that may contain hexadecimal strings
     * @param bool   $add_braces wrap every decoded string in parentheses and
     *                           escape its backslashes
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $pieces = [];

        foreach ($chunks as $chunk) {
            $looksHex = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$looksHex) {
                $pieces[] = $chunk;
                continue;
            }

            // Strip line breaks and the surrounding angle brackets, then unpack.
            $digits = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $binary = pack('H*', $digits);

            if ($add_braces) {
                $pieces[] = '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $pieces[] = $binary;
            }
        }

        return implode('', $pieces);
    }
294
295
    /**
     * Replaces octal escape sequences (\ddd) with the byte they denote.
     *
     * A sequence may consist of one, two or three octal digits; overflow above
     * 255 is discarded. An escaped backslash (two consecutive backslashes) is
     * consumed as a unit and left untouched, so that digits following it are
     * not mistaken for an octal escape. All other text is copied through.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        $decoded = preg_replace_callback(
            // A backslash followed by either another backslash or 1-3 octal digits.
            '/\\\\(\\\\|[0-7]{1,3})/',
            static function (array $match) {
                if ('\\' === $match[1]) {
                    // Escaped backslash: keep it verbatim for the caller to unescape.
                    return $match[0];
                }

                // Ignore high-order overflow (e.g. \777).
                return \chr(((int) octdec($match[1])) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; keep the input then.
        return null === $decoded ? $text : $decoded;
    }
315
316
    /**
     * Replaces "#xx" sequences, where xx is a two-digit hexadecimal code, with
     * the byte they denote. All other text is copied through unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        $decoded = preg_replace_callback(
            // Both digits are hexadecimal, so A-F must be accepted as well as 0-9.
            '/#([0-9a-f]{2})/i',
            static function (array $match) {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; keep the input then.
        return null === $decoded ? $text : $decoded;
    }
336
337
    /**
     * Converts a UTF-16BE string that starts with a byte order mark (FE FF)
     * into UTF-8. Text without that marker is returned unchanged.
     *
     * Surrogate pairs are combined into a single code point before conversion,
     * so characters outside the Basic Multilingual Plane decode correctly.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        // Exact byte comparison; a case-insensitive regexp could match other
        // bytes depending on the active locale.
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = substr($text, 2);
        $length = \strlen($payload);
        $text = '';

        for ($i = 0; $i < $length; $i += 2) {
            $unit = hexdec(bin2hex(substr($payload, $i, 2)));

            // High surrogate followed by a complete low surrogate: merge both
            // 16-bit units into the code point they represent.
            if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 3 < $length) {
                $low = hexdec(bin2hex(substr($payload, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $unit = 0x10000 + (($unit - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2;
                }
            }

            $text .= self::uchr((int) $unit);
        }

        return $text;
    }
357
358
    /**
     * Returns the font space limit taken from the configuration object.
     *
     * @return int
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit()
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
367
368
    /**
     * Decodes a list of text commands into a single string.
     *
     * Each command is an array holding a type (PDFObject::TYPE) and a payload
     * (PDFObject::COMMAND):
     *  - 'n': a number; when it is below the font space limit the following
     *         text starts a new word;
     *  - '<': a hexadecimal string;
     *  - anything else: a literal string that may contain octal escapes.
     *
     * Text fragments are gathered per word, each word is run through
     * decodeContent(), and the words are joined with single spaces.
     *
     * @param array $commands
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $words = [];
        $word_position = 0;
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
                        // Point past the last word so the next fragment opens a new one.
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars.
            $text = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $text
            );

            // Append the fragment to the current word.
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        // Iterate by key rather than by reference to avoid leaving a dangling reference.
        foreach ($words as $position => $word) {
            $words[$position] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
419
420
    /**
     * Converts raw text drawn with this font into UTF-8.
     *
     * The strategy is chosen in this order:
     *  1. the font has a 'ToUnicode' entry: translate through the table built
     *     by loadTranslateTable();
     *  2. the font's 'Encoding' is an Encoding object: translate each character
     *     through that encoding;
     *  3. the font's 'Encoding' is the element 'MacRomanEncoding': convert with iconv();
     *  4. otherwise, text that is not valid UTF-8 is treated as Windows-1252.
     *
     * @param string $text
     * @param bool   $unicode This parameter is deprecated and might be removed in a future release.
     *                        It is only written in case 2, where it receives
     *                        whether $text was valid UTF-8.
     *
     * @return string
     */
    public function decodeContent($text, &$unicode = null)
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentWithTable($text);
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');
            $unicode = mb_check_encoding($text, 'UTF-8');

            return $this->decodeContentWithEncoding($text, $encoding, $unicode);
        }

        if ($this->get('Encoding') instanceof Element
            && $this->get('Encoding')->equals('MacRomanEncoding')
        ) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            return iconv('macintosh', 'UTF-8', $text);
        }

        if (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }

    /**
     * Translates $text through the ToUnicode table, reading it in steps of
     * $this->tableSizes['from'] bytes.
     *
     * @param string $text
     *
     * @return string the input unchanged when the step size is empty
     */
    private function decodeContentWithTable($text)
    {
        $bytes = $this->tableSizes['from'];

        if (!$bytes) {
            return $text;
        }

        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            if (false !== $decoded) {
                $char = $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $char = $this->decodeCharWithDescendantFonts($char);
            } else {
                $char = self::MISSING;
            }

            $result .= $char;
        }

        return $result;
    }

    /**
     * Tries to translate $char with each descendant font in turn; the first
     * successful translation wins. Both that translation and, failing it, the
     * raw character are converted from Windows-1252 to UTF-8.
     *
     * @param string $char
     *
     * @return string
     */
    private function decodeCharWithDescendantFonts($char)
    {
        $descendants = $this->get('DescendantFonts');

        if ($descendants instanceof PDFObject) {
            $fonts = $descendants->getHeader()->getElements();
        } else {
            $fonts = $descendants->getContent();
        }

        $decoded = false;

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $decoded = $font->translateChar($char, false);

            if (false !== $decoded) {
                $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                break;
            }
        }

        if (false !== $decoded) {
            return $decoded;
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }

    /**
     * Translates every character of $text through $encoding and converts the
     * resulting codes to UTF-8 with uchr().
     *
     * @param string   $text
     * @param Encoding $encoding
     * @param bool     $isUtf8   split $text into UTF-8 characters when true, single bytes otherwise
     *
     * @return string
     */
    private function decodeContentWithEncoding($text, $encoding, $isUtf8)
    {
        $result = '';

        if ($isUtf8) {
            $chars = preg_split('//su', $text, -1, \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY);

            foreach ($chars as $char) {
                // NOTE(review): for multi-byte characters this passes the UTF-8 byte
                // sequence read as an integer, not the code point - confirm intent.
                $result .= self::uchr($encoding->translateChar(hexdec(bin2hex($char))));
            }

            return $result;
        }

        $length = \strlen($text);

        for ($i = 0; $i < $length; ++$i) {
            $result .= self::uchr($encoding->translateChar(hexdec(bin2hex($text[$i]))));
        }

        return $result;
    }
511
}
512