Test Failed
Pull Request — master (#344)
by
unknown
02:39
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 19

Size

Total Lines 102
Code Lines 61

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 31
CRAP Score 57.9524

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 61
c 5
b 0
f 1
nc 19
nop 2
dl 0
loc 102
ccs 31
cts 57
cp 0.5439
crap 57.9524
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
 * @file
 *          This file is part of the PdfParser library.
 *
 * @author  Sébastien MALOT <[email protected]>
 * @date    2017-01-03
 *
 * @license LGPLv3
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
34
 * Class Font
35
 */
36
class Font extends PDFObject
37
{
38
    const MISSING = '?';
39
40
    /**
41
     * @var array
42
     */
43
    protected $table = null;
44
45
    /**
46
     * @var array
47
     */
48
    protected $tableSizes = null;
49
50 19
    /**
     * Initialises the font by building its character translation table.
     */
    public function init()
    {
        // Load translate table.
        $this->loadTranslateTable();
    }
55
56
    /**
     * Returns the font name read from the `BaseFont` entry.
     *
     * @return string the `BaseFont` value cast to string, or '[Unknown]' when the entry is absent
     */
    public function getName()
    {
        return $this->has('BaseFont') ? (string) $this->get('BaseFont') : '[Unknown]';
    }
63
64
    /**
     * Returns the font type read from the `Subtype` entry of the object header.
     *
     * @return string the `Subtype` value cast to string
     */
    public function getType()
    {
        return (string) $this->header->get('Subtype');
    }
71
72
    /**
     * Describes the font as an array of details.
     *
     * The `Name`, `Type` and `Encoding` entries are computed here and then
     * completed with whatever the parent implementation reports.
     *
     * @param bool $deep forwarded unchanged to the parent implementation
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $details = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            // 'Ansi' is the label reported when the font has no `Encoding` entry.
            'Encoding' => ($this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi'),
        ];

        // Array union: the three entries above win over parent entries with the same key.
        $details += parent::getDetails($deep);

        return $details;
    }
87
88
    /**
     * Translates one raw character code through the font's translation table.
     *
     * The bytes of $char are read as one big-endian number, which is the key
     * looked up in the table.
     *
     * @param string $char        raw bytes of a single character code
     * @param bool   $use_default when the code is not in the table: true returns
     *                            self::MISSING, false returns $char unchanged
     *
     * @return string the mapped text, self::MISSING, or $char itself
     */
    public function translateChar($char, $use_default = true)
    {
        // The table is normally built by init(); build it on demand so that a
        // lookup never runs against a null table.
        if (null === $this->table) {
            $this->loadTranslateTable();
        }

        $dec = hexdec(bin2hex($char));

        if (\array_key_exists($dec, $this->table)) {
            return $this->table[$dec];
        }

        return $use_default ? self::MISSING : $char;
    }
104
105
    /**
     * Returns the UTF-8 string for a code point.
     *
     * The conversion decodes the numeric HTML entity `&#<code>;`.
     *
     * @param int $code code point; cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        // ENT_QUOTES is required: with ENT_NOQUOTES html_entity_decode() leaves
        // &#39; and &#34; undecoded, so apostrophes and double quotes would be
        // returned as entity text instead of the characters themselves.
        return html_entity_decode('&#'.((int) $code).';', ENT_QUOTES, 'UTF-8');
    }
114
115
    /**
     * Builds the character translation table once and returns it.
     *
     * Without a `ToUnicode` entry the table stays empty and both code sizes
     * stay at 1. Otherwise the `ToUnicode` content is scanned for:
     *  - `codespacerange` sections: the first pair of the first section sets
     *    the byte sizes stored in $this->tableSizes;
     *  - `bfchar` sections: one source code mapped to one destination string;
     *  - `bfrange` sections: a range of source codes mapped either to
     *    consecutive code points or to an explicit list of strings.
     *
     * @return array map of integer source code => UTF-8 string
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();
        $sections = [];

        // Only the first codespacerange section is used to size the codes.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            $pairs = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $sections['sections'][0],
                $pairs
            );

            $this->tableSizes = [
                'from' => self::hexCodeByteLength($pairs['from']),
                'to' => self::hexCodeByteLength($pairs['to']),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                $pairs = [];
                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

                // The source code size is re-derived for every bfchar section,
                // so the last section decides.
                $this->tableSizes['from'] = self::hexCodeByteLength($pairs['from']);

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[(int) hexdec($from)] = self::hexStringToUtf8($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                $ranges = [];

                // Support for : <srcCode1> <srcCode2> <dstString>
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $ranges
                );

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = (int) hexdec($from);
                    $char_to = (int) hexdec($ranges['to'][$key]);
                    $offset = (int) hexdec($ranges['offset'][$key]);

                    // Each source code in the range maps to the destination
                    // code point shifted by the same distance.
                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the 2-byte Unicode values on new lines, hence \r\n in the class.
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $ranges
                );

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = (int) hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $ranges['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::hexStringToUtf8($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Returns the size in bytes of the first hexadecimal code of a list.
     *
     * @param array $codes hexadecimal strings as captured by preg_match_all
     *
     * @return int at least 1; an odd number of hex digits is rounded up so the
     *             result is always a whole number of bytes
     */
    private static function hexCodeByteLength($codes)
    {
        $first = isset($codes[0]) ? $codes[0] : '';

        return max(1, (int) ceil(\strlen($first) / 2));
    }

    /**
     * Converts a hexadecimal destination string to UTF-8.
     *
     * The string is cut into groups of four hex digits; every group, and any
     * shorter remainder, is turned into one character with uchr().
     *
     * @param string $hex hexadecimal digits without the surrounding <>
     *
     * @return string
     */
    private static function hexStringToUtf8($hex)
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );
        $text = '';

        foreach ($parts as $part) {
            // hexdec() may return a float; uchr() expects an integer code.
            $text .= self::uchr((int) hexdec($part));
        }

        return $text;
    }
225
226
    /**
     * Replaces the translation table used by translateChar().
     *
     * @param array $table map of integer source code => replacement string
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
233
234
    /**
     * Replaces every `<hex digits>` token of a string by the bytes it encodes.
     *
     * Text outside such tokens is copied unchanged. A string containing
     * `<?xml` anywhere is returned untouched.
     *
     * @param string $hexa       text that may contain `<...>` hexadecimal tokens
     * @param bool   $add_braces when true, each decoded token is wrapped in
     *                           parentheses and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $text = '';
        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        foreach ($parts as $part) {
            // Decode only real hexadecimal tokens. A looser "starts with < and
            // ends with >" test would also send text such as `<</Type /Font>>`
            // to pack(), which cannot decode it.
            if (!preg_match('/^<[a-f0-9]+>$/Di', $part)) {
                $text .= $part;
                continue;
            }

            $bytes = pack('H*', trim($part, '<>'));

            if ($add_braces) {
                $text .= '('.preg_replace('/\\\/s', '\\\\\\', $bytes).')';
            } else {
                $text .= $bytes;
            }
        }

        return $text;
    }
272
273
    /**
     * Replaces every backslash followed by three octal digits by the byte it encodes.
     *
     * Only the digits 0-7 are accepted, so a sequence such as `\189` is left
     * as it is instead of being handed to octdec().
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        return preg_replace_callback(
            '/\\\\([0-7]{3})/',
            static function ($match) {
                // octdec() may return a float; chr() expects an integer.
                return \chr((int) octdec($match[1]));
            },
            $text
        );
    }
293
294
    /**
     * Replaces every `#xx` escape by the byte it encodes.
     *
     * The two characters after `#` are read with hexdec(), so both decimal
     * digits and the letters A-F (either case) are accepted.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        return preg_replace_callback(
            '/#([0-9a-f]{2})/i',
            static function ($match) {
                // hexdec() may return a float; chr() expects an integer.
                return \chr((int) hexdec($match[1]));
            },
            $text
        );
    }
314
315
    /**
     * Converts a string that starts with the bytes FE FF to UTF-8.
     *
     * After the two marker bytes the text is read as 16-bit big-endian units.
     * A high surrogate followed by a low surrogate is combined into one code
     * point before conversion. Strings without the marker are returned as is.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if ("\xFE\xFF" !== substr($text, 0, 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $decode = substr($text, 2);
        $length = \strlen($decode);
        $text = '';

        for ($i = 0; $i < $length; $i += 2) {
            $code = (int) hexdec(bin2hex(substr($decode, $i, 2)));

            // Surrogate pair: merge both units into the code point they encode.
            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 3 < $length) {
                $low = (int) hexdec(bin2hex(substr($decode, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2;
                }
            }

            $text .= self::uchr($code);
        }

        return $text;
    }
335
336
    /**
     * Returns the threshold used by decodeText() to split words: a numeric
     * command whose value is below this limit starts a new word.
     *
     * @return int
     */
    protected function getFontSpaceLimit()
    {
        return -50;
    }
343
344
    /**
     * Decodes a list of text commands into a single string.
     *
     * Commands of type 'n' carry a number: when it is below the font space
     * limit, the following text starts a new word. Commands of type '<' are
     * hexadecimal strings; every other command goes through octal decoding.
     * Each word is then passed to decodeContent() and the words are joined
     * with a single space.
     *
     * @param array $commands commands indexed by PDFObject::TYPE and PDFObject::COMMAND
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $word_position = 0;
        $words = [];
        $unicode = false;
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }
                    // Numeric commands contribute no text of their own.
                    continue 2;

                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');

                    if (mb_check_encoding($text, 'UTF-8')) {
                        $unicode = true;
                    }

                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = stripcslashes($text);

            // add content to result string
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        // Assign by key instead of iterating by reference, so no reference to
        // the last element is left behind after the loop.
        foreach ($words as $position => $word) {
            // decodeContent() takes the flag by reference; hand it a copy so
            // one word cannot change the flag seen by the next one.
            $loop_unicode = $unicode;
            $words[$position] = $this->decodeContent($word, $loop_unicode);
        }

        return implode(' ', $words);
    }
398
399
    /**
     * Decodes raw text of this font to UTF-8.
     *
     * Three paths exist:
     *  - the font has `ToUnicode`: the text is cut into codes of
     *    $this->tableSizes['from'] bytes and each code goes through
     *    translateChar(); $unicode is then set to true;
     *  - otherwise, the font has an `Encoding` that is an Encoding object:
     *    each character is translated through that object;
     *  - finally, while $unicode is still false, the text is converted to
     *    UTF-8 from ISO-8859-1 (MacRomanEncoding) or Windows-1252.
     *
     * @param string $text    raw text to decode
     * @param bool   $unicode in: whether $text is already unicode;
     *                        out: set to true by the `ToUnicode` path
     *
     * @return string
     */
    public function decodeContent($text, &$unicode)
    {
        if ($this->has('ToUnicode')) {
            // Cast so the loop below always advances by a whole number of bytes.
            $bytes = (int) $this->tableSizes['from'];

            if ($bytes) {
                $result = '';
                $length = \strlen($text);

                for ($i = 0; $i < $length; $i += $bytes) {
                    $char = substr($text, $i, $bytes);

                    // NOTE(review): translateChar() as defined in this class never
                    // returns false when its second argument is false, so the two
                    // fallback branches below look unreachable unless a subclass
                    // overrides it. Behaviour is kept as it was.
                    if (false !== ($decoded = $this->translateChar($char, false))) {
                        $char = $decoded;
                    } elseif ($this->has('DescendantFonts')) {
                        $descendants = $this->get('DescendantFonts');

                        if ($descendants instanceof PDFObject) {
                            $fonts = $descendants->getHeader()->getElements();
                        } else {
                            $fonts = $descendants->getContent();
                        }
                        $decoded = false;

                        // The first descendant font that can translate the code wins.
                        foreach ($fonts as $font) {
                            if ($font instanceof self) {
                                if (false !== ($decoded = $font->translateChar($char, false))) {
                                    $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                                    break;
                                }
                            }
                        }

                        if (false !== $decoded) {
                            $char = $decoded;
                        } else {
                            $char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
                        }
                    } else {
                        $char = self::MISSING;
                    }

                    $result .= $char;
                }

                $text = $result;

                // By definition, this code generates unicode chars.
                $unicode = true;
            }
        } elseif ($this->has('Encoding')) {
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                if ($unicode) {
                    // Split on character boundaries (the `u` modifier treats $text as UTF-8).
                    $chars = preg_split(
                        '//su',
                        $text,
                        -1,
                        PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
                    );
                    $result = '';

                    foreach ($chars as $char) {
                        // NOTE(review): for a multi-byte character this is the numeric
                        // value of its bytes, not its code point - confirm this is
                        // what Encoding::translateChar() expects.
                        $dec_av = hexdec(bin2hex($char));
                        $dec_ap = $encoding->translateChar($dec_av);
                        $result .= self::uchr($dec_ap);
                    }

                    $text = $result;
                } else {
                    $result = '';
                    $length = \strlen($text);

                    for ($i = 0; $i < $length; ++$i) {
                        $dec_av = hexdec(bin2hex($text[$i]));
                        $dec_ap = $encoding->translateChar($dec_av);
                        $result .= \chr($dec_ap);
                    }

                    $text = $result;

                    if ($encoding->get('BaseEncoding')->equals('MacRomanEncoding')) {
                        $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');

                        // Return here so the generic conversion below does not
                        // run a second time on the converted text.
                        return $text;
                    }
                }
            }
        }

        // Convert to unicode if not already done.
        if (!$unicode) {
            if ($this->get('Encoding') instanceof Element &&
                $this->get('Encoding')->equals('MacRomanEncoding')
            ) {
                $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
            } else {
                $text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
            }
        }

        return $text;
    }
508
}
509