Completed
Push — pr/238 ( 9ef5ec )
by Konrad
03:54
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 19

Size

Total Lines 102
Code Lines 61

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 31
CRAP Score 57.9524

Importance

Changes 4
Bugs 0 Features 1
Metric Value
cc 20
eloc 61
c 4
b 0
f 1
nc 19
nop 2
dl 0
loc 102
ccs 31
cts 57
cp 0.5439
crap 57.9524
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

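Commonly applied refactorings include Extract Method and, when the method has many parameters or temporary variables, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.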

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
34
 * Class Font
35
 */
36
class Font extends PDFObject
37
{
38
    // Placeholder returned by translateChar() for codes missing from the translation table.
    const MISSING = '?';
39
40
    /**
     * Translation table mapping numeric character codes to decoded strings.
     * Stays null until loadTranslateTable() or setTable() fills it.
     *
     * @var array|null
     */
    protected $table = null;
44
45
    /**
     * Byte widths of the source ('from') and destination ('to') codes of the
     * translation table. Stays null until loadTranslateTable() runs.
     *
     * @var array|null
     */
    protected $tableSizes = null;
49
50 17
    /**
     * Initializes the font by loading its translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
55
56
    /**
     * Returns the value of the 'BaseFont' entry, or '[Unknown]' when the
     * entry is absent.
     *
     * @return string
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
63
64
    /**
     * Returns the 'Subtype' entry of the font header, cast to a string.
     *
     * @return string
     */
    public function getType()
    {
        return (string) $this->header->get('Subtype');
    }
71
72
    /**
     * Returns the font's name, type and encoding, completed with the details
     * reported by the parent class. Keys set here take precedence over
     * identically named keys from the parent.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $details = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        // Array union keeps the left-hand value on key collisions.
        return $details + parent::getDetails($deep);
    }
87
88
    /**
     * Translates one raw character code through the translation table.
     *
     * The bytes of $char are read as a big-endian number and looked up in the
     * table. When the code is unknown (or no table has been loaded yet), the
     * result is either the MISSING placeholder or $char itself.
     *
     * @param string $char        raw bytes of the character code
     * @param bool   $use_default true to return self::MISSING for unknown codes,
     *                            false to return $char unchanged
     *
     * @return string|bool this implementation never returns false; callers
     *                     nevertheless test for it
     */
    public function translateChar($char, $use_default = true)
    {
        $dec = hexdec(bin2hex($char));

        // The table is null until loadTranslateTable()/setTable() has run;
        // array_key_exists() must not be handed a non-array.
        if (\is_array($this->table) && \array_key_exists($dec, $this->table)) {
            return $this->table[$dec];
        }

        return $use_default ? self::MISSING : $char;
    }
104
105
    /**
     * Converts a Unicode code point to its UTF-8 encoded character by decoding
     * the matching numeric HTML entity.
     *
     * @param int $code Unicode code point (cast to int before use)
     *
     * @return string
     */
    public static function uchr($code)
    {
        $entity = '&#'.((int) $code).';';

        return html_entity_decode($entity, ENT_NOQUOTES, 'UTF-8');
    }
114
115
    /**
     * Builds the translation table from the font's 'ToUnicode' CMap and
     * returns it. The table is built once; later calls return the cached copy.
     *
     * Three CMap constructs are read:
     *  - codespacerange: only the first section is used, to derive the byte
     *    widths stored in $tableSizes;
     *  - bfchar: single code => string mappings (also updates the 'from' width);
     *  - bfrange: both the "<from> <to> <start>" form and the
     *    "<from> <to> [<s1> <s2> ...]" form.
     *
     * Without a 'ToUnicode' entry the table is empty and both widths are 1.
     *
     * @return array
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();
        $sections = [];

        // Code space range: only the first section determines the code widths.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            $ranges = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $sections['sections'][0],
                $ranges
            );

            $this->tableSizes = [
                'from' => self::hexByteLength(isset($ranges['from'][0]) ? $ranges['from'][0] : ''),
                'to' => self::hexByteLength(isset($ranges['to'][0]) ? $ranges['to'][0] : ''),
            ];
        }

        // Support for multiple bfchar sections.
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                $pairs = [];
                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

                $this->tableSizes['from'] = self::hexByteLength(isset($pairs['from'][0]) ? $pairs['from'][0] : '');

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::hexToUnicodeString($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections.
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $ranges = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $ranges
                );

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($ranges['to'][$key]);
                    $offset = hexdec($ranges['offset'][$key]);

                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the destination strings on new lines, hence \r\n in the class.
                $lists = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $lists
                );

                foreach ($lists['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::hexToUnicodeString($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Returns the number of bytes needed to hold a hexadecimal code, never
     * less than 1. Odd digit counts are rounded up so the result is always an
     * integer usable as a substr() length and loop step.
     *
     * @param string $hex hexadecimal digits without the surrounding <>
     *
     * @return int
     */
    private static function hexByteLength($hex)
    {
        return max(1, (int) ceil(\strlen($hex) / 2));
    }

    /**
     * Converts a hexadecimal destination string into UTF-8 text: the string is
     * cut into groups of four hex digits (any shorter remainder forms its own
     * group) and each group is decoded as one code point.
     *
     * @param string $hex hexadecimal digits without the surrounding <>
     *
     * @return string
     */
    private static function hexToUnicodeString($hex)
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            // hexdec() may return a float; each group has at most 4 digits, so the cast is lossless.
            $text .= self::uchr((int) hexdec($part));
        }

        return $text;
    }
225
226
    /**
     * Replaces the translation table. The byte widths in $tableSizes are left
     * untouched.
     *
     * @param array $table map of numeric character code => decoded string
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
233
234
    /**
     * Decodes every hexadecimal string (<...>) found in the input into raw
     * bytes, leaving all other text untouched.
     *
     * Input containing '<?xml' is returned unchanged. Only segments made
     * exclusively of hex digits between < and > are decoded, so look-alikes
     * such as '<</Type /Font>>' are never passed to pack().
     *
     * @param string $hexa       text that may contain hexadecimal strings
     * @param bool   $add_braces true to wrap each decoded string in parentheses
     *                           and double its backslashes
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $text = '';
        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        foreach ($parts as $part) {
            if (!preg_match('/^<[a-f0-9]+>$/Di', $part)) {
                $text .= $part;
                continue;
            }

            $bytes = pack('H*', trim($part, '<>'));

            if ($add_braces) {
                // Escape backslashes so the result can be re-read as a literal string.
                $text .= '('.str_replace('\\', '\\\\', $bytes).')';
            } else {
                $text .= $bytes;
            }
        }

        return $text;
    }
270
271
    /**
     * Replaces every three-digit octal escape (backslash followed by three
     * digits 0-7) with the byte it denotes; all other text is kept as is.
     *
     * Sequences containing 8 or 9 are not octal escapes and are left alone.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        $parts = preg_split('/(\\\\[0-7]{3})/s', $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($parts as $part) {
            if (preg_match('/^\\\\[0-7]{3}$/D', $part)) {
                // octdec() may return a float; three octal digits always fit in an int.
                $decoded .= \chr((int) octdec(substr($part, 1)));
            } else {
                $decoded .= $part;
            }
        }

        return $decoded;
    }
291
292
    /**
     * Replaces every '#xx' escape (a number sign followed by two hexadecimal
     * digits) with the byte it denotes; all other text is kept as is.
     *
     * The value is decoded as hexadecimal, so the digits A-F (either case)
     * are accepted as well as 0-9.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        $parts = preg_split('/(#[0-9A-Fa-f]{2})/s', $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($parts as $part) {
            if (preg_match('/^#[0-9A-Fa-f]{2}$/D', $part)) {
                // hexdec() may return a float; two hex digits always fit in an int.
                $decoded .= \chr((int) hexdec(substr($part, 1)));
            } else {
                $decoded .= $part;
            }
        }

        return $decoded;
    }
312
313
    /**
     * Converts text starting with the byte order mark FE FF from UTF-16BE to
     * UTF-8. Text without that marker is returned unchanged.
     *
     * A high surrogate followed by a low surrogate is combined into a single
     * code point, so characters outside the Basic Multilingual Plane decode
     * correctly.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $units = (string) substr($text, 2);
        $length = \strlen($units);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $code = (int) hexdec(bin2hex(substr($units, $i, 2)));

            // High surrogate with a complete following unit: try to pair them.
            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 3 < $length) {
                $low = (int) hexdec(bin2hex(substr($units, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2;
                }
            }

            $decoded .= self::uchr($code);
        }

        return $decoded;
    }
333
334
    /**
     * Returns the threshold used by decodeText(): a numeric command whose
     * value is below this limit starts a new word.
     *
     * @return int
     */
    protected function getFontSpaceLimit()
    {
        return -50;
    }
341
342
    /**
     * Decodes a list of text commands into a string.
     *
     * Numeric ('n') commands carry no text: a value below the font space
     * limit starts a new word. Hexadecimal ('<') commands are hex-decoded, all
     * other commands are octal-decoded; escaped characters are then
     * unescaped. Each resulting word is passed through decodeContent() and the
     * words are joined with single spaces.
     *
     * @param array $commands commands indexed by PDFObject::TYPE and PDFObject::COMMAND
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $word_position = 0;
        $words = [];
        $unicode = false;
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;

                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');

                    if (mb_check_encoding($text, 'UTF-8')) {
                        $unicode = true;
                    }

                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars.
            $text = stripcslashes($text);

            // Append to the current word, creating it on first use.
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        // Index-based write-back avoids leaving a dangling reference variable.
        foreach ($words as $position => $word) {
            // decodeContent() may flip its flag; every word starts from the same state.
            $loop_unicode = $unicode;
            $words[$position] = $this->decodeContent($word, $loop_unicode);
        }

        return implode(' ', $words);
    }
396
397
    /**
     * Converts raw text bytes of this font into UTF-8.
     *
     * - With a 'ToUnicode' entry, the text is cut into codes of
     *   $tableSizes['from'] bytes and each code goes through the translation
     *   table; $unicode is then set to true.
     * - Otherwise, with an 'Encoding' entry that is an Encoding object, each
     *   character is mapped through that encoding. For non-unicode input with
     *   a 'MacRomanEncoding' base encoding the result is converted from 'Mac'
     *   and returned immediately.
     * - Finally, text still not flagged as unicode is converted to UTF-8 from
     *   'Mac' (when 'Encoding' is an Element equal to 'MacRomanEncoding') or
     *   from 'Windows-1252'.
     *
     * @param string $text    raw text to decode
     * @param bool   $unicode whether $text is already unicode; set to true by
     *                        reference when the ToUnicode table was applied
     *
     * @return string
     */
    public function decodeContent($text, &$unicode)
    {
        if ($this->has('ToUnicode')) {
            // The width is unset before loadTranslateTable() has run and must be
            // a whole number to serve as substr() length and loop step.
            $bytes = isset($this->tableSizes['from']) ? (int) ceil($this->tableSizes['from']) : 0;

            if ($bytes > 0) {
                $text = $this->decodeWithTranslateTable($text, $bytes);

                // By definition, this code generates unicode chars.
                $unicode = true;
            }
        } elseif ($this->has('Encoding')) {
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                if ($unicode) {
                    $text = $this->decodeUnicodeWithEncoding($text, $encoding);
                } else {
                    $text = $this->decodeBytesWithEncoding($text, $encoding);

                    if ($encoding->get('BaseEncoding')->equals('MacRomanEncoding')) {
                        return mb_convert_encoding($text, 'UTF-8', 'Mac');
                    }
                }
            }
        }

        // Convert to unicode if not already done.
        if (!$unicode) {
            $encoding = $this->get('Encoding');
            $is_mac = $encoding instanceof Element && $encoding->equals('MacRomanEncoding');

            $text = mb_convert_encoding($text, 'UTF-8', $is_mac ? 'Mac' : 'Windows-1252');
        }

        return $text;
    }

    /**
     * Translates text code by code through the translation table.
     *
     * @param string $text  raw text
     * @param int    $bytes width of one character code in bytes (at least 1)
     *
     * @return string
     */
    private function decodeWithTranslateTable($text, $bytes)
    {
        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            // NOTE(review): Font::translateChar() never returns false, so the
            // fallbacks below only run if an override does - confirm with subclasses.
            if (false !== $decoded) {
                $char = $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $char = $this->decodeWithDescendantFonts($char);
            } else {
                $char = self::MISSING;
            }

            $result .= $char;
        }

        return $result;
    }

    /**
     * Translates one character code using the first descendant font able to
     * do so, converting the outcome from Windows-1252 to UTF-8. When no
     * descendant font translates it, the code itself is converted.
     *
     * @param string $char raw bytes of the character code
     *
     * @return string
     */
    private function decodeWithDescendantFonts($char)
    {
        $descendants = $this->get('DescendantFonts');

        // getHeader() only exists on PDFObject; other elements expose their content directly.
        if ($descendants instanceof PDFObject) {
            $fonts = $descendants->getHeader()->getElements();
        } else {
            $fonts = $descendants->getContent();
        }

        foreach ($fonts as $font) {
            if (!($font instanceof self)) {
                continue;
            }

            $decoded = $font->translateChar($char, false);

            if (false !== $decoded) {
                return mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
            }
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }

    /**
     * Maps every UTF-8 character of the text through the encoding and emits
     * the mapped code points as UTF-8.
     *
     * @param string   $text     text flagged as unicode
     * @param Encoding $encoding encoding providing translateChar()
     *
     * @return string
     */
    private function decodeUnicodeWithEncoding($text, $encoding)
    {
        $chars = preg_split('//su', $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

        // preg_split() fails on invalid UTF-8; fall back to single bytes
        // instead of iterating over false and losing the whole text.
        if (false === $chars) {
            $chars = str_split($text);
        }

        $result = '';
        foreach ($chars as $char) {
            $dec_av = hexdec(bin2hex($char));
            $dec_ap = $encoding->translateChar($dec_av);
            $result .= self::uchr($dec_ap);
        }

        return $result;
    }

    /**
     * Maps every byte of the text through the encoding and emits the mapped
     * values as single bytes.
     *
     * @param string   $text     text not flagged as unicode
     * @param Encoding $encoding encoding providing translateChar()
     *
     * @return string
     */
    private function decodeBytesWithEncoding($text, $encoding)
    {
        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; ++$i) {
            $dec_av = hexdec(bin2hex($text[$i]));
            $dec_ap = $encoding->translateChar($dec_av);
            $result .= \chr($dec_ap);
        }

        return $result;
    }
506
}
507