Passed
Pull Request — master (#349)
by
unknown
01:53
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 7

Size

Total Lines 84
Code Lines 54

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 30
CRAP Score 47.933

Importance

Changes 6
Bugs 0 Features 1
Metric Value
cc 20
eloc 54
c 6
b 0
f 1
nc 7
nop 1
dl 0
loc 84
ccs 30
cts 51
cp 0.5881
crap 47.933
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
34
 * Class Font
35
 */
36
class Font extends PDFObject
37
{
38
    const MISSING = '?';
39
40
    /**
41
     * @var array
42
     */
43
    protected $table = null;
44
45
    /**
46
     * @var array
47
     */
48
    protected $tableSizes = null;
49
50 23
    /**
     * Prepares the font for decoding by building its character translation table.
     */
    public function init()
    {
        // loadTranslateTable() memoizes its result on the instance, so the
        // returned array is deliberately not used here.
        $this->loadTranslateTable();
    }
55
56
    /**
     * Returns the font name stored in the "BaseFont" entry.
     *
     * @return string the "BaseFont" value cast to string, or "[Unknown]" when the entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
63
64
    /**
     * Returns the font type as declared by the "Subtype" header entry.
     *
     * @return string the "Subtype" header value cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
71
72
    /**
     * Collects the font's descriptive details.
     *
     * The font specific keys ("Name", "Type", "Encoding") take precedence over
     * same-named keys returned by the parent implementation, because the
     * array union operator keeps the left-hand value.
     *
     * @param bool $deep forwarded unchanged to the parent implementation
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $fontDetails = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            // "Ansi" is reported when the font declares no encoding at all.
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        return $fontDetails + parent::getDetails($deep);
    }
87
88
    /**
     * Translates one raw character code through the font's translation table.
     *
     * The bytes of $char are interpreted as one big-endian number and looked
     * up in the table. When no entry exists, the result depends on
     * $use_default: the MISSING marker is returned when it is true, otherwise
     * a fallback string (see below).
     *
     * @param string $char        raw byte(s) of the character to translate
     * @param bool   $use_default whether to return self::MISSING for unmapped characters
     *
     * @return string|bool the table entry, self::MISSING or the fallback string;
     *                     callers compare against false, but no code path in this
     *                     method produces false unless the table itself holds it
     */
    public function translateChar($char, $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // Fallback for single-byte ANSI characters that are not in the lookup
        // table: with WinAnsiEncoding the byte value is used as the code point,
        // in every other case the input is handed back untouched.
        $fallback = $char;
        if (\strlen($char) < 2 && $this->has('Encoding')) {
            $encodingName = $this->get('Encoding')->__toString();
            if ('WinAnsiEncoding' === $encodingName) {
                // uchr() casts to int, so a float from hexdec() is harmless here.
                $fallback = self::uchr($code);
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
110
111
    /**
     * Converts a numeric code point into its UTF-8 encoded character.
     *
     * @param int $code code point; the value is cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        // html_entity_decode() will not work with UTF-16 or UTF-32 char
        // entities, so the numeric entity is decoded by mb_convert_encoding().
        $entity = '&#'.((int) $code).';';

        return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
122
123
    /**
     * Builds (once) and returns the character translation table of the font.
     *
     * The table is derived from the "ToUnicode" stream, when the font has one,
     * by reading its codespacerange, bfchar and bfrange sections. The result is
     * memoized in $this->table; $this->tableSizes is updated as a side effect.
     *
     * @return array map of source character code (int) => UTF-8 string
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if ($this->has('ToUnicode')) {
            $content = $this->get('ToUnicode')->getContent();

            // Order matters: bfchar sections may override the 'from' size set
            // by the codespacerange section.
            $this->loadCodeSpaceRange($content);
            $this->loadBfCharSections($content);
            $this->loadBfRangeSections($content);
        }

        return $this->table;
    }

    /**
     * Derives the source/destination code sizes (in bytes) from the first
     * codespacerange section; any further sections are ignored.
     *
     * @param string $content content of the ToUnicode stream
     */
    private function loadCodeSpaceRange($content)
    {
        $sections = [];
        if (!preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            return;
        }

        $ranges = [];
        $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';
        preg_match_all($regexp, $sections['sections'][0], $ranges);

        // Two hex digits per byte; an empty match list falls back to one byte.
        $this->tableSizes = [
            'from' => max(1, \strlen(current($ranges['from'])) / 2),
            'to' => max(1, \strlen(current($ranges['to'])) / 2),
        ];
    }

    /**
     * Loads every bfchar section: single "<srcCode> <dstString>" mappings.
     *
     * @param string $content content of the ToUnicode stream
     */
    private function loadBfCharSections($content)
    {
        $sections = [];
        if (!preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            return;
        }

        $regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

        foreach ($sections['sections'] as $section) {
            $pairs = [];
            preg_match_all($regexp, $section, $pairs);

            // The source code width of the section's first entry wins.
            $this->tableSizes['from'] = max(1, \strlen(current($pairs['from'])) / 2);

            foreach ($pairs['from'] as $key => $from) {
                $this->table[hexdec($from)] = self::hexStringToUtf8($pairs['to'][$key]);
            }
        }
    }

    /**
     * Loads every bfrange section, supporting both range notations.
     *
     * @param string $content content of the ToUnicode stream
     */
    private function loadBfRangeSections($content)
    {
        $sections = [];
        if (!preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            return;
        }

        foreach ($sections['sections'] as $section) {
            // Support for : <srcCode1> <srcCode2> <dstString>
            $ranges = [];
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';
            preg_match_all($regexp, $section, $ranges);

            foreach ($ranges['from'] as $key => $from) {
                $char_from = hexdec($from);
                $char_to = hexdec($ranges['to'][$key]);
                $offset = hexdec($ranges['offset'][$key]);

                for ($char = $char_from; $char <= $char_to; ++$char) {
                    $this->table[$char] = self::uchr($char - $char_from + $offset);
                }
            }

            // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
            // Some PDF file has 2-byte Unicode values on new lines > added \r\n
            $lists = [];
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';
            preg_match_all($regexp, $section, $lists);

            foreach ($lists['from'] as $key => $from) {
                $char_from = hexdec($from);
                $strings = [];

                preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                foreach ($strings['string'] as $position => $string) {
                    $this->table[$char_from + $position] = self::hexStringToUtf8($string);
                }
            }
        }
    }

    /**
     * Converts a hexadecimal destination string into UTF-8 text.
     *
     * The string is cut into groups of four hex digits; each group (and any
     * shorter remainder) is treated as one code point.
     *
     * @param string $hex hexadecimal digits without the surrounding angle brackets
     *
     * @return string
     */
    private static function hexStringToUtf8($hex)
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr(hexdec($part));
        }

        return $text;
    }
233
234
    /**
     * Replaces the character translation table.
     *
     * Once a non-null table is set, loadTranslateTable() returns it as is
     * instead of rebuilding it.
     *
     * @param array $table map of character code => replacement string
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
241
242
    /**
     * Replaces every hexadecimal string ("<48656C6C6F>") found in the input
     * by the bytes it encodes; everything else is copied through unchanged.
     *
     * @param string $hexa       text possibly containing hexadecimal strings
     * @param bool   $add_braces when true, each decoded string is wrapped in
     *                           parentheses and its backslashes are escaped
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content: returned untouched.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isHexString = preg_match('/^<.*>$/', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isHexString) {
                $decoded .= $chunk;
                continue;
            }

            $binary = pack('H*', trim($chunk, '<>'));

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $decoded .= $binary;
            }
        }

        return $decoded;
    }
278
279
    /**
     * Decodes the octal escapes ("\ddd") of a PDF literal string.
     *
     * A backslash followed by one to three octal digits (0-7) is replaced by
     * the byte with that value; high-order overflow is ignored. An escaped
     * backslash ("\\") is consumed as a unit and left as is, so that digits
     * following it are not mistaken for an octal escape; unescaping it is left
     * to the caller (decodeText() does so).
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        return preg_replace_callback(
            '/\\\\(?:\\\\|([0-7]{1,3}))/',
            function ($match) {
                if (!isset($match[1])) {
                    // Escaped backslash: keep both characters untouched.
                    return $match[0];
                }

                // Mask to one byte instead of relying on chr()'s wrap-around.
                return \chr(octdec($match[1]) & 0xFF);
            },
            $text
        );
    }
299
300
    /**
     * Decodes "#xx" escapes, where xx are two hexadecimal digits, into the
     * byte they denote (for instance "#20" becomes a space and "#2F" a slash).
     *
     * Digits A-F are accepted in either case. A "#" that is not followed by
     * two hexadecimal digits is left untouched.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        return preg_replace_callback(
            '/#([0-9A-F]{2})/i',
            function ($match) {
                // Two hex digits never exceed 255, so chr() gets a valid byte.
                return \chr(hexdec($match[1]));
            },
            $text
        );
    }
320
321
    /**
     * Converts a UTF-16BE string announced by a byte order mark into UTF-8.
     *
     * Input that does not start with the bytes FE FF is returned unchanged.
     * The conversion works on the whole string rather than on isolated 16-bit
     * units, so surrogate pairs (characters outside the Basic Multilingual
     * Plane) are decoded to a single character.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip the U+FEFF byte order mark before converting; the cast covers
        // PHP versions where substr() yields false for a BOM-only string.
        return mb_convert_encoding((string) substr($text, 2), 'UTF-8', 'UTF-16BE');
    }
341
342
    /**
     * Returns the threshold used by decodeText() to detect word breaks: a
     * numeric command whose value is below this limit starts a new word.
     *
     * @return int
     */
    protected function getFontSpaceLimit()
    {
        $limit = -50;

        return $limit;
    }
349
350
    /**
     * Decodes a list of text commands into a single string.
     *
     * String commands are decoded (hexadecimal or octal) and unescaped, then
     * appended to the current word. A numeric command ("n") below the font
     * space limit starts a new word. Finally every word is passed through
     * decodeContent() and the words are joined with single spaces.
     *
     * @param array $commands list of commands, each indexed by PDFObject::TYPE
     *                        and PDFObject::COMMAND
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    // A large negative offset is treated as a word separator.
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;

                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $text
            );

            // add content to result string
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        // Decode into a fresh list instead of iterating by reference, which
        // would leave a dangling reference to the last element behind.
        $decoded_words = [];
        foreach ($words as $word) {
            $decoded_words[] = $this->decodeContent($word);
        }

        return implode(' ', $decoded_words);
    }
402
403
    /**
     * Decodes raw text bytes into UTF-8 using the best source the font offers.
     *
     * The first matching strategy wins:
     *  1. the ToUnicode translation table,
     *  2. an Encoding object,
     *  3. the MacRomanEncoding name (converted with iconv),
     *  4. Windows-1252 to UTF-8 conversion, only when the text is not already
     *     valid UTF-8.
     *
     * @param string $text
     *
     * @return string
     */
    public function decodeContent($text)
    {
        if ($this->has('ToUnicode')) {
            return $this->decodeContentWithToUnicode($text);
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            return $this->decodeContentWithEncoding($text, $this->get('Encoding'));
        }

        if ($this->get('Encoding') instanceof Element
            && $this->get('Encoding')->equals('MacRomanEncoding')) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            return iconv('macintosh', 'UTF-8', $text);
        }

        if (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }

    /**
     * Decodes text by cutting it into fixed-size character codes and
     * translating each code through the ToUnicode table.
     *
     * @param string $text
     *
     * @return string the decoded text, or $text unchanged when the code size is falsy
     */
    private function decodeContentWithToUnicode($text)
    {
        $bytes = $this->tableSizes['from'];
        if (!$bytes) {
            return $text;
        }

        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            if (false !== $decoded) {
                $result .= $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $result .= $this->decodeCharWithDescendantFonts($char);
            } else {
                $result .= self::MISSING;
            }
        }

        return $result;
    }

    /**
     * Translates one character code through the descendant fonts; the first
     * font able to translate it wins. Without a match the raw bytes are
     * converted from Windows-1252.
     *
     * @param string $char raw byte(s) of the character
     *
     * @return string
     */
    private function decodeCharWithDescendantFonts($char)
    {
        if ($this->get('DescendantFonts') instanceof PDFObject) {
            $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
        } else {
            $fonts = $this->get('DescendantFonts')->getContent();
        }

        foreach ($fonts as $font) {
            if (!($font instanceof self)) {
                continue;
            }

            $decoded = $font->translateChar($char, false);
            if (false !== $decoded) {
                return mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
            }
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }

    /**
     * Decodes text character by character through an Encoding object.
     *
     * Valid UTF-8 input is split into UTF-8 characters, anything else into
     * single bytes. The bytes of each piece, read as one number, are handed
     * to the encoding's translateChar() and the result is turned into UTF-8.
     *
     * @param string   $text
     * @param Encoding $encoding
     *
     * @return string
     */
    private function decodeContentWithEncoding($text, $encoding)
    {
        if (mb_check_encoding($text, 'UTF-8')) {
            $chars = preg_split('//su', $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        } else {
            // Invalid UTF-8 is never empty, so str_split() yields real bytes only.
            $chars = str_split($text);
        }

        $result = '';
        foreach ($chars as $char) {
            $dec_av = hexdec(bin2hex($char));
            $dec_ap = $encoding->translateChar($dec_av);
            $result .= self::uchr($dec_ap);
        }

        return $result;
    }
493
}
494