Passed
Push — master ( 43e436...5667bd )
by Konrad
08:19
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 7

Size

Total Lines 84
Code Lines 54

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 37
CRAP Score 28.2734

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 54
c 5
b 0
f 1
nc 7
nop 2
dl 0
loc 84
ccs 37
cts 51
cp 0.7255
crap 28.2734
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
use Smalot\PdfParser\Exception\EncodingNotFoundException;
35
36
/**
37
 * Class Font
38
 */
39
class Font extends PDFObject
40
{
41
    const MISSING = '?';
42
43
    /**
44
     * @var array
45
     */
46
    protected $table = null;
47
48
    /**
49
     * @var array
50
     */
51
    protected $tableSizes = null;
52
53 30
    /**
     * Initialises the font by building its character translation table.
     *
     * The table itself is produced (and stored on the instance) by
     * loadTranslateTable(); its return value is not needed here.
     */
    public function init()
    {
        // Populate $this->table / $this->tableSizes up front.
        $this->loadTranslateTable();
    }
58
59 2
    /**
     * Returns the font name taken from the 'BaseFont' entry.
     *
     * @return string the 'BaseFont' value cast to string, or '[Unknown]' when the entry is absent
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
63
64 2
    /**
     * Returns the font type, i.e. the 'Subtype' header entry cast to string.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
68
69 1
    /**
     * Collects a summary of this font merged with the parent object's details.
     *
     * The 'Name', 'Type' and 'Encoding' keys set here take precedence over
     * same-named keys returned by the parent (array union keeps the left side).
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array font details; 'Encoding' defaults to 'Ansi' when no 'Encoding' entry exists
     */
    public function getDetails(bool $deep = true): array
    {
        $fontDetails = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        return $fontDetails + parent::getDetails($deep);
    }
81
82
    /**
     * Translates one raw character code through the font's lookup table.
     *
     * The bytes of $char are read as one big-endian number and looked up in
     * $this->table. When there is no entry, the result depends on $use_default:
     * either the MISSING placeholder, or a best-effort fallback (the input
     * itself, or its Unicode character for single-byte WinAnsi-encoded fonts).
     *
     * @param string $char        raw byte(s) of the character code
     * @param bool   $use_default true to return self::MISSING for unknown codes,
     *                            false to return the fallback decoding instead
     *
     * @return string|bool
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // Fallback for single-byte ANSI characters that are not in the lookup table.
        $fallback = $char;
        $isSingleByte = \strlen($char) < 2;

        if ($isSingleByte && $this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            try {
                $encodingClass = $this->get('Encoding')->__toString();

                if (WinAnsiEncoding::class === $encodingClass) {
                    $fallback = self::uchr($code);
                }
            } catch (EncodingNotFoundException $e) {
                // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exist.
                // See table 5.11 of the PDF 1.5 specification for more info.
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
112
113 29
    /**
     * Converts a numeric code point into its UTF-8 encoded character.
     *
     * @param int $code the code point to convert
     *
     * @return string the UTF-8 representation produced by mb_convert_encoding()
     */
    public static function uchr(int $code): string
    {
        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // so the numeric entity is decoded through mb_convert_encoding() instead.
        $entity = sprintf('&#%d;', $code);

        return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
119
120 30
    /**
     * Builds (once) and returns the character-code => UTF-8 text lookup table
     * from the font's 'ToUnicode' CMap stream.
     *
     * Side effects: sets $this->table and $this->tableSizes. Subsequent calls
     * return the already built table without re-parsing.
     *
     * Parsed CMap sections:
     *  - begincodespacerange: the first section fixes the source/destination byte widths;
     *  - beginbfchar: single code => UTF-16BE destination string;
     *  - beginbfrange: either "<from> <to> <start>" or "<from> <to> [<dst1> <dst2> ...]".
     *
     * @return array map of integer character code to decoded UTF-8 string
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = (string) $this->get('ToUnicode')->getContent();

        // Code space range: only the first section is used to derive the byte widths.
        if (preg_match('/begincodespacerange(?P<section>.*?)endcodespacerange/s', $content, $codeSpace)) {
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $codeSpace['section'],
                $ranges
            );

            $this->tableSizes = [
                'from' => self::hexByteWidth($ranges['from'][0] ?? ''),
                'to' => self::hexByteWidth($ranges['to'][0] ?? ''),
            ];
        }

        // Support for multiple bfchar sections.
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $charBlocks)) {
            foreach ($charBlocks['sections'] as $section) {
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $pairs
                );

                // The source width follows the first code of the section (1 when the section is empty).
                $this->tableSizes['from'] = self::hexByteWidth($pairs['from'][0] ?? '');

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[(int) hexdec($from)] = self::decodeUtf16HexString($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections.
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $rangeBlocks)) {
            foreach ($rangeBlocks['sections'] as $section) {
                // Form 1: <srcCode1> <srcCode2> <dstString>
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $triples
                );

                foreach ($triples['from'] as $key => $from) {
                    $charFrom = (int) hexdec($from);
                    $charTo = (int) hexdec($triples['to'][$key]);
                    $offset = (int) hexdec($triples['offset'][$key]);

                    for ($char = $charFrom; $char <= $charTo; ++$char) {
                        $this->table[$char] = self::uchr($char - $charFrom + $offset);
                    }
                }

                // Form 2: <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the 2-byte Unicode values on new lines, hence \r\n in the class.
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $lists
                );

                foreach ($lists['from'] as $key => $from) {
                    $charFrom = (int) hexdec($from);
                    $destinations = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $destinations);

                    foreach ($destinations['string'] as $position => $string) {
                        $this->table[$charFrom + $position] = self::decodeUtf16HexString($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Returns how many bytes a hexadecimal code occupies (at least 1).
     *
     * Odd digit counts are rounded up so the width is always a whole number of
     * bytes; a fractional width would otherwise leak into substr() lengths and
     * loop increments when text is decoded.
     */
    private static function hexByteWidth(string $hex): int
    {
        return max(1, (int) ceil(\strlen($hex) / 2));
    }

    /**
     * Decodes a hexadecimal UTF-16BE destination string of a CMap into UTF-8.
     *
     * The string is cut into 4-digit units (a shorter trailing remainder is
     * treated as a unit of its own). A high surrogate immediately followed by a
     * low surrogate is combined into a single code point.
     */
    private static function decodeUtf16HexString(string $hex): string
    {
        $units = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        ) ?: [];

        $text = '';
        $count = \count($units);

        for ($i = 0; $i < $count; ++$i) {
            $code = (int) hexdec($units[$i]);

            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 1 < $count) {
                $low = (int) hexdec($units[$i + 1]);

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    // Surrogate pair: merge both units and skip the low half.
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    ++$i;
                }
            }

            $text .= self::uchr($code);
        }

        return $text;
    }
227
228 1
    /**
     * Replaces the character translation table with the given one.
     *
     * @param array $table new lookup table; it is stored as-is, without validation
     */
    public function setTable(array $table)
    {
        // Overwrites whatever loadTranslateTable() may have built earlier.
        $this->table = $table;
    }
232
233 33
    /**
     * Replaces every "<hex digits>" chunk of the input by the bytes it encodes.
     *
     * Input containing "<?xml" is returned untouched. Text outside of angle
     * brackets is copied through unchanged.
     *
     * @param string $hexa       text possibly containing hexadecimal strings
     * @param bool   $add_braces when true, each decoded chunk is wrapped in
     *                           parentheses and its backslashes are doubled
     *
     * @return string the text with hexadecimal chunks decoded
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isBracketed = 1 === preg_match('/^<.*>$/s', $chunk);

            if (!$isBracketed || false !== stripos($chunk, '<?xml')) {
                // Plain text: keep as it is.
                $decoded .= $chunk;

                continue;
            }

            // Strip line breaks and the surrounding angle brackets, then turn hex digits into bytes.
            $digits = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $binary = pack('H*', $digits);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $decoded .= $binary;
            }
        }

        return $decoded;
    }
265
266 33
    /**
     * Replaces octal escape sequences (backslash followed by one to three
     * octal digits) by the byte they denote.
     *
     * An escaped backslash ("\\") is consumed as a unit and left untouched, so
     * digits following it are not mistaken for an octal escape. Values above
     * 255 keep only their low-order byte.
     *
     * @param string $text text possibly containing octal escapes
     *
     * @return string the text with octal escapes decoded; all other content is unchanged
     */
    public static function decodeOctal(string $text): string
    {
        $decoded = preg_replace_callback(
            // Alternative 1: an escaped backslash (kept verbatim).
            // Alternative 2: a backslash followed by 1-3 octal digits.
            '/\\\\\\\\|\\\\([0-7]{1,3})/',
            static function (array $match): string {
                if (!isset($match[1])) {
                    return $match[0];
                }

                return \chr(((int) octdec($match[1])) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; keep the input in that case.
        return $decoded ?? $text;
    }
281
282 47
    /**
     * Decodes "#xx" escapes, where xx are two hexadecimal digits, into the
     * corresponding byte.
     *
     * Both upper- and lower-case hex digits are accepted (e.g. "#20", "#2F",
     * "#c3"). Anything else is copied through unchanged.
     *
     * @param string $text text possibly containing "#xx" escapes
     *
     * @return string the text with the escapes replaced by their byte value
     */
    public static function decodeEntities(string $text): string
    {
        $decoded = preg_replace_callback(
            '/#([0-9a-f]{2})/i',
            static function (array $match): string {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; keep the input in that case.
        return $decoded ?? $text;
    }
297
298 33
    /**
     * Converts a UTF-16BE string (recognised by its leading FE FF byte order
     * mark) to UTF-8.
     *
     * The byte order mark is stripped. The whole payload is converted in one
     * go so that surrogate pairs (characters outside the Basic Multilingual
     * Plane) are decoded as a single character. Text without the byte order
     * mark is returned unchanged.
     *
     * @param string $text raw string bytes
     *
     * @return string UTF-8 text, or the untouched input when no byte order mark is present
     */
    public static function decodeUnicode(string $text): string
    {
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = (string) substr($text, 2);

        if ('' === $payload) {
            return '';
        }

        $decoded = mb_convert_encoding($payload, 'UTF-8', 'UTF-16BE');

        // Keep the original bytes should the conversion fail.
        return \is_string($decoded) ? $decoded : $text;
    }
313
314
    /**
     * Returns the font space limit taken from the configuration object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
321
322 17
    /**
     * Decodes a list of text commands into a single UTF-8 string.
     *
     * Each command is an array indexed by PDFObject::TYPE and PDFObject::COMMAND:
     *  - type 'n': a numeric adjustment; a value below the font space limit
     *    starts a new word, the command itself produces no text;
     *  - type '<': a hexadecimal string;
     *  - any other type: a literal string that may contain octal escapes.
     *
     * Text chunks are appended to the current word, every word is passed
     * through decodeContent(), and the words are joined with single spaces.
     *
     * @param array $commands list of command arrays as described above
     *
     * @return string the decoded text
     */
    public function decodeText(array $commands): string
    {
        // Escape sequences and their replacements. strtr() scans the text once and never
        // re-examines replaced output, so "\\n" (escaped backslash followed by "n") yields
        // a backslash and an "n" rather than cascading into a line feed.
        $escapes = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
                        // Large enough gap: following text belongs to a new word.
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars.
            $text = strtr($text, $escapes);

            // Add content to the current word.
            $words[$word_position] = ($words[$word_position] ?? '').$text;
        }

        // Translate every word through the font's character mapping.
        foreach ($words as $index => $word) {
            $words[$index] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
366
367
    /**
     * Converts raw string bytes shown with this font into UTF-8 text.
     *
     * Strategy, first match wins:
     *  1. a 'ToUnicode' entry exists: translate fixed-width character codes
     *     through the lookup table;
     *  2. 'Encoding' is an Encoding object: map each character through it;
     *  3. 'Encoding' is an Element equal to 'MacRomanEncoding': convert from
     *     the 'macintosh' charset;
     *  4. otherwise, text that is not valid UTF-8 is converted from Windows-1252.
     *
     * @param string $text    raw bytes to decode
     * @param bool   $unicode This parameter is deprecated and might be removed in a future release
     *
     * @return string the decoded text
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentWithToUnicode($text);
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            return $this->decodeContentWithEncoding($text, $this->get('Encoding'), $unicode);
        }

        $encoding = $this->get('Encoding');

        if ($encoding instanceof Element && $encoding->equals('MacRomanEncoding')) {
            // mb_convert_encoding does not support MacRoman/macintosh, so iconv() is used here.
            $converted = iconv('macintosh', 'UTF-8', $text);

            if (false !== $converted) {
                return $converted;
            }
            // iconv() failed: fall through to the generic handling below
            // instead of returning false from a string-typed method.
        }

        // Don't double-encode strings already in UTF-8.
        if (!mb_check_encoding($text, 'UTF-8')) {
            $converted = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');

            if (\is_string($converted)) {
                return $converted;
            }
        }

        return $text;
    }

    /**
     * Decodes text by cutting it into fixed-width character codes and
     * translating each one through the lookup table.
     */
    private function decodeContentWithToUnicode(string $text): string
    {
        // Width of one source character code in bytes; always a whole number.
        $bytes = (int) ceil($this->tableSizes['from'] ?? 0);

        if ($bytes < 1) {
            // No usable width (table sizes not loaded): leave the text untouched.
            return $text;
        }

        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            if (false !== $decoded) {
                $result .= $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $result .= $this->decodeCharWithDescendantFonts($char);
            } else {
                $result .= self::MISSING;
            }
        }

        return $result;
    }

    /**
     * Tries to translate one character code through the descendant fonts and
     * falls back to a Windows-1252 conversion of the raw bytes.
     */
    private function decodeCharWithDescendantFonts(string $char): string
    {
        $descendants = $this->get('DescendantFonts');

        if ($descendants instanceof PDFObject) {
            $fonts = $descendants->getHeader()->getElements();
        } else {
            $fonts = $descendants->getContent();
        }

        $decoded = false;

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $decoded = $font->translateChar($char, false);

            if (false !== $decoded) {
                $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                break;
            }
        }

        if (false !== $decoded) {
            return $decoded;
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }

    /**
     * Decodes text by mapping every character through the given Encoding.
     *
     * Valid UTF-8 input is processed character by character, anything else
     * byte by byte. $unicode receives the result of that UTF-8 check.
     */
    private function decodeContentWithEncoding(string $text, Encoding $encoding, ?bool &$unicode): string
    {
        $unicode = mb_check_encoding($text, 'UTF-8');

        if ($unicode) {
            $chars = preg_split('//su', $text, -1, \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY) ?: [];
        } else {
            $chars = [];
            $length = \strlen($text);

            for ($i = 0; $i < $length; ++$i) {
                $chars[] = $text[$i];
            }
        }

        $result = '';

        foreach ($chars as $char) {
            // The bytes of the character are read as one number before being mapped.
            $dec_av = hexdec(bin2hex($char));
            $dec_ap = $encoding->translateChar($dec_av);
            $result .= self::uchr($dec_ap);
        }

        return $result;
    }
455
}
456