Passed
Pull Request — master (#342)
by
unknown
01:50
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 19

Size

Total Lines 102
Code Lines 61

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 31
CRAP Score 57.9524

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 61
c 5
b 0
f 1
nc 19
nop 2
dl 0
loc 102
ccs 31
cts 57
cp 0.5439
crap 57.9524
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
34
 * Class Font
35
 */
36
class Font extends PDFObject
37
{
38
    const MISSING = '?';
39
40
    /**
41
     * @var array
42
     */
43
    protected $table = null;
44
45
    /**
46
     * @var array
47
     */
48
    protected $tableSizes = null;
49
50 20
    /**
     * Prepares the font for use by building its character translation table.
     */
    public function init()
    {
        // The return value is not needed here: the call is made for its side
        // effect of filling the table and table sizes of this font.
        $this->loadTranslateTable();
    }
55
56
    /**
     * Returns the name of the font as stored under the "BaseFont" entry.
     *
     * @return string the "BaseFont" value cast to string, or "[Unknown]" when the entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
63
64
    /**
     * Returns the font type, read from the "Subtype" entry of the header.
     *
     * @return string the "Subtype" value cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
71
72
    /**
     * Collects the descriptive details of the font.
     *
     * The font specific keys ("Name", "Type" and "Encoding") come first; the
     * details reported by the parent class are then added for any key that is
     * not already present.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $name = $this->getName();
        $type = $this->getType();

        // "Ansi" is reported when the font declares no encoding at all.
        if ($this->has('Encoding')) {
            $encoding = (string) $this->get('Encoding');
        } else {
            $encoding = 'Ansi';
        }

        $own = [
            'Name' => $name,
            'Type' => $type,
            'Encoding' => $encoding,
        ];

        // Array union: keys already set above win over the parent's ones.
        return $own + parent::getDetails($deep);
    }
87
88
    /**
     * Looks up a raw character code in the translation table.
     *
     * The bytes of $char are read as one big-endian number, which is the key
     * used in the table.
     *
     * @param string $char        raw bytes of the character code
     * @param bool   $use_default what to return when the code is unknown:
     *                            self::MISSING when true, $char itself when false
     *
     * @return string|bool
     */
    public function translateChar($char, $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $char;
    }
104
105
    /**
     * Converts a code point number into its UTF-8 encoded character.
     *
     * The conversion is delegated to html_entity_decode() by wrapping the
     * number into a decimal numeric character reference.
     *
     * @param int $code code point; cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        $entity = sprintf('&#%d;', (int) $code);

        return html_entity_decode($entity, ENT_NOQUOTES, 'UTF-8');
    }
114
115
    /**
116
     * @return array
117
     */
118 20
    public function loadTranslateTable()
119
    {
120 20
        if (null !== $this->table) {
121 1
            return $this->table;
122
        }
123
124 20
        $this->table = [];
125 20
        $this->tableSizes = [
126
            'from' => 1,
127
            'to' => 1,
128
        ];
129
130 20
        if ($this->has('ToUnicode')) {
131 19
            $content = $this->get('ToUnicode')->getContent();
132 19
            $matches = [];
133
134
            // Support for multiple spacerange sections
135 19
            if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $matches)) {
136 19
                foreach ($matches['sections'] as $section) {
137 19
                    $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';
138
139 19
                    preg_match_all($regexp, $section, $matches);
140
141 19
                    $this->tableSizes = [
142 19
                        'from' => max(1, \strlen(current($matches['from'])) / 2),
143 19
                        'to' => max(1, \strlen(current($matches['to'])) / 2),
144
                    ];
145
146 19
                    break;
147
                }
148
            }
149
150
            // Support for multiple bfchar sections
151 19
            if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $matches)) {
152 5
                foreach ($matches['sections'] as $section) {
153 5
                    $regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';
154
155 5
                    preg_match_all($regexp, $section, $matches);
156
157 5
                    $this->tableSizes['from'] = max(1, \strlen(current($matches['from'])) / 2);
158
159 5
                    foreach ($matches['from'] as $key => $from) {
160 5
                        $parts = preg_split(
161 5
                            '/([0-9A-F]{4})/i',
162 5
                            $matches['to'][$key],
163 5
                            0,
164 5
                            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
165
                        );
166 5
                        $text = '';
167 5
                        foreach ($parts as $part) {
168 5
                            $text .= self::uchr(hexdec($part));
0 ignored issues
show
Bug introduced by
It seems like hexdec($part) can also be of type double; however, parameter $code of Smalot\PdfParser\Font::uchr() does only seem to accept integer, maybe add an additional type check? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

168
                            $text .= self::uchr(/** @scrutinizer ignore-type */ hexdec($part));
Loading history...
169
                        }
170 5
                        $this->table[hexdec($from)] = $text;
171
                    }
172
                }
173
            }
174
175
            // Support for multiple bfrange sections
176 19
            if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $matches)) {
177 18
                foreach ($matches['sections'] as $section) {
178
                    // Support for : <srcCode1> <srcCode2> <dstString>
179 18
                    $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';
180
181 18
                    preg_match_all($regexp, $section, $matches);
182
183 18
                    foreach ($matches['from'] as $key => $from) {
184 18
                        $char_from = hexdec($from);
185 18
                        $char_to = hexdec($matches['to'][$key]);
186 18
                        $offset = hexdec($matches['offset'][$key]);
187
188 18
                        for ($char = $char_from; $char <= $char_to; ++$char) {
189 18
                            $this->table[$char] = self::uchr($char - $char_from + $offset);
190
                        }
191
                    }
192
193
                    // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
194
                    // Some PDF file has 2-byte Unicode values on new lines > added \r\n
195 18
                    $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';
196
197 18
                    preg_match_all($regexp, $section, $matches);
198
199 18
                    foreach ($matches['from'] as $key => $from) {
200 1
                        $char_from = hexdec($from);
201 1
                        $strings = [];
202
203 1
                        preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $matches['strings'][$key], $strings);
204
205 1
                        foreach ($strings['string'] as $position => $string) {
206 1
                            $parts = preg_split(
207 1
                                '/([0-9A-F]{4})/i',
208
                                $string,
209 1
                                0,
210 1
                                PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
211
                            );
212 1
                            $text = '';
213 1
                            foreach ($parts as $part) {
214 1
                                $text .= self::uchr(hexdec($part));
215
                            }
216 1
                            $this->table[$char_from + $position] = $text;
217
                        }
218
                    }
219
                }
220
            }
221
        }
222
223 20
        return $this->table;
224
    }
225
226
    /**
     * Replaces the translation table of the font with the given one.
     *
     * Only the table itself is replaced; the recorded table sizes are left
     * as they are.
     *
     * @param array $table character code => text
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
233
234
    /**
     * Replaces every hexadecimal string ("<48656C6C6F>") found in a text by
     * the bytes it stands for.
     *
     * Anything which is not a pair of angle brackets enclosing hexadecimal
     * digits only is copied as is. A text containing "<?xml" is returned
     * untouched.
     *
     * @param string $hexa
     * @param bool   $add_braces when true, each decoded string is wrapped in
     *                           parentheses and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $text = '';

        foreach ($parts as $part) {
            // Only decode what the split above recognised as a hexadecimal
            // string: other bracketed content (e.g. "<</A 1>>") is not valid
            // input for pack('H*') and has to go through unchanged.
            if (!preg_match('/^<[a-f0-9]+>\z/i', $part)) {
                $text .= $part;
                continue;
            }

            $bytes = pack('H*', substr($part, 1, -1));

            if ($add_braces) {
                // NOTE(review): only backslashes are escaped here, parentheses
                // inside the decoded bytes are not - confirm with the callers.
                $text .= '('.str_replace('\\', '\\\\', $bytes).')';
            } else {
                $text .= $bytes;
            }
        }

        return $text;
    }
270
271
    /**
     * Replaces every octal escape sequence (a backslash followed by three
     * octal digits, e.g. "\101") by the byte it stands for.
     *
     * Only the digits 0 to 7 are accepted: a backslash followed by digits
     * such as "189" is not an octal escape and is left untouched.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        return preg_replace_callback(
            '/\\\\([0-7]{3})/',
            static function ($match) {
                // Three octal digits can exceed 255: only the low byte is kept.
                return \chr(((int) octdec($match[1])) & 0xFF);
            },
            $text
        );
    }
291
292
    /**
     * Replaces every "#xx" sequence, where xx are two hexadecimal digits, by
     * the byte having that hexadecimal value (e.g. "#2F" becomes "/").
     *
     * Both digits may be any hexadecimal digit, in upper or lower case, since
     * the value is read with hexdec().
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        return preg_replace_callback(
            '/#([0-9a-f]{2})/i',
            static function ($match) {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );
    }
312
313
    /**
     * Converts a text starting with the byte order marker FE FF into UTF-8.
     *
     * What follows the marker is read two bytes at a time, each pair being a
     * big-endian 16-bit unit. A high surrogate (D800-DBFF) directly followed
     * by a low surrogate (DC00-DFFF) is combined into the single code point
     * the pair stands for, so that characters outside of the basic plane are
     * decoded too. A text without the marker is returned as is.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = substr($text, 2);
        $length = \strlen($payload);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $code = (int) hexdec(bin2hex(substr($payload, $i, 2)));

            // A surrogate pair needs the two bytes of the next unit as well.
            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 3 < $length) {
                $low = (int) hexdec(bin2hex(substr($payload, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    // The low surrogate has been consumed along with the high one.
                    $i += 2;
                }
            }

            $decoded .= self::uchr($code);
        }

        return $decoded;
    }
333
334
    /**
     * Gives the threshold below which a numeric command met by decodeText()
     * starts a new word.
     *
     * @return int
     */
    protected function getFontSpaceLimit()
    {
        $limit = -50;

        return $limit;
    }
341
342
    /**
     * Turns a list of text commands into a readable string.
     *
     * Commands are gathered into words: a numeric command ('n') whose value
     * is below the font space limit starts a new word, any other command adds
     * its decoded text to the current word. Each word is then passed to
     * decodeContent() and the words are joined with a single space.
     *
     * @param array $commands each command is indexed by PDFObject::TYPE and PDFObject::COMMAND
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $space_limit = $this->getFontSpaceLimit();
        $words = [];
        $current = 0;
        $unicode = false;
        $text = '';

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    // A spacing value: it never carries text, it can only
                    // open a new word.
                    if ((float) trim($command[PDFObject::COMMAND]) < $space_limit) {
                        $current = \count($words);
                    }
                    continue 2;

                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');

                    if (mb_check_encoding($text, 'UTF-8')) {
                        $unicode = true;
                    }

                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $text
            );

            // add content to result string
            if (!isset($words[$current])) {
                $words[$current] = '';
            }
            $words[$current] .= $text;
        }

        foreach (array_keys($words) as $index) {
            // decodeContent() may change the flag it is given: hand it a copy
            // so that every word starts from the same value.
            $loop_unicode = $unicode;
            $words[$index] = $this->decodeContent($words[$index], $loop_unicode);
        }

        return implode(' ', $words);
    }
400
401
    /**
     * Decodes a raw text into UTF-8, using what the font knows about its
     * own character codes.
     *
     * In order of preference:
     *  - with a "ToUnicode" entry, the text is cut into codes of
     *    tableSizes['from'] bytes, each one looked up in the translation table;
     *  - otherwise, with an "Encoding" entry being an Encoding object, each
     *    character is translated through that object;
     *  - finally, a text still not flagged as unicode is converted from
     *    ISO-8859-1 (when the encoding equals "MacRomanEncoding") or from
     *    Windows-1252.
     *
     * @param string $text
     * @param bool   $unicode tells whether $text is already unicode; set to
     *                        true when the "ToUnicode" table has been applied
     *
     * @return string
     */
    public function decodeContent($text, &$unicode)
    {
        if ($this->has('ToUnicode')) {
            $bytes = $this->tableSizes['from'];

            if ($bytes) {
                $text = $this->decodeContentByTable($text, $bytes);

                // By definition, this code generates unicode chars.
                $unicode = true;
            }
        } elseif ($this->has('Encoding')) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                if ($unicode) {
                    $text = $this->translateCharsByEncoding($text, $encoding);
                } else {
                    $text = $this->translateBytesByEncoding($text, $encoding);

                    if ($encoding->get('BaseEncoding')->equals('MacRomanEncoding')) {
                        return mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
                    }
                }
            }
        }

        // Convert to unicode if not already done.
        if (!$unicode) {
            if ($this->get('Encoding') instanceof Element &&
                $this->get('Encoding')->equals('MacRomanEncoding')
            ) {
                $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
            } else {
                $text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
            }
        }

        return $text;
    }

    /**
     * Decodes a text code by code with the translation table of the font.
     *
     * @param string $text
     * @param int    $bytes size of one code, in bytes
     *
     * @return string
     */
    private function decodeContentByTable($text, $bytes)
    {
        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);

            // NOTE(review): translateChar($char, false), as defined in this
            // class, gives $char back when the code is unknown and so never
            // returns false; the two fallbacks below look unreachable unless
            // a subclass overrides translateChar() - to be confirmed.
            if (false !== ($decoded = $this->translateChar($char, false))) {
                $char = $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $char = $this->decodeCharByDescendantFonts($char);
            } else {
                $char = self::MISSING;
            }

            $result .= $char;
        }

        return $result;
    }

    /**
     * Decodes one code with the first descendant font able to translate it.
     *
     * @param string $char raw bytes of the code
     *
     * @return string the translation of the first descendant font knowing the
     *                code, or the code itself, converted in both cases from
     *                Windows-1252 to UTF-8
     */
    private function decodeCharByDescendantFonts($char)
    {
        if ($this->get('DescendantFonts') instanceof PDFObject) {
            $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
        } else {
            $fonts = $this->get('DescendantFonts')->getContent();
        }

        foreach ($fonts as $font) {
            if ($font instanceof self) {
                if (false !== ($decoded = $font->translateChar($char, false))) {
                    return mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                }
            }
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }

    /**
     * Translates a unicode text character by character through an encoding.
     *
     * The text is split into UTF-8 characters; the bytes of each character,
     * read as one number, are given to the encoding and the code it answers
     * is converted into a UTF-8 character.
     *
     * @param string   $text
     * @param Encoding $encoding
     *
     * @return string
     */
    private function translateCharsByEncoding($text, $encoding)
    {
        $chars = preg_split(
            '//su',
            $text,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );
        $result = '';

        foreach ($chars as $char) {
            $dec_av = hexdec(bin2hex($char));
            $dec_ap = $encoding->translateChar($dec_av);
            $result .= self::uchr($dec_ap);
        }

        return $result;
    }

    /**
     * Translates a text byte by byte through an encoding.
     *
     * The value of each byte is given to the encoding and the code it answers
     * is written back as a single byte.
     *
     * @param string   $text
     * @param Encoding $encoding
     *
     * @return string
     */
    private function translateBytesByEncoding($text, $encoding)
    {
        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; ++$i) {
            $dec_av = hexdec(bin2hex($text[$i]));
            $dec_ap = $encoding->translateChar($dec_av);
            $result .= \chr($dec_ap);
        }

        return $result;
    }
510
}
511