Passed
Pull Request — master (#346)
by
unknown
03:00
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 19

Size

Total Lines 102
Code Lines 61

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 31
CRAP Score 57.9524

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 61
c 5
b 0
f 1
nc 19
nop 2
dl 0
loc 102
ccs 31
cts 57
cp 0.5439
crap 57.9524
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the method body into its own, well-named method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
34
 * Class Font
35
 */
36
class Font extends PDFObject
37
{
38
    const MISSING = '?';
39
40
    /**
41
     * @var array
42
     */
43
    protected $table = null;
44
45
    /**
46
     * @var array
47
     */
48
    protected $tableSizes = null;
49
50 21
    /**
     * Initialises the font by building its character translation table.
     */
    public function init()
    {
        // The table is stored on the instance, so the returned array is not needed here.
        $this->loadTranslateTable();
    }
55
56
    /**
     * Returns the font name taken from the `BaseFont` entry.
     *
     * @return string the `BaseFont` value cast to string, or '[Unknown]' when the entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
63
64
    /**
     * Returns the font subtype as declared in the object header.
     *
     * @return string the header's `Subtype` entry cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
71
72
    /**
     * Collects the descriptive details of this font.
     *
     * @param bool $deep forwarded unchanged to the parent implementation
     *
     * @return array 'Name', 'Type' and 'Encoding' entries, completed by the parent's details
     */
    public function getDetails($deep = true)
    {
        $details = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            // Default used when the font declares no `Encoding` entry.
            'Encoding' => 'Ansi',
        ];

        if ($this->has('Encoding')) {
            $details['Encoding'] = (string) $this->get('Encoding');
        }

        // Array union: the keys set above take precedence over the parent's entries.
        return $details + parent::getDetails($deep);
    }
87
88
    /**
     * Translates a raw character code into its mapped text using the translation table.
     *
     * The bytes of $char are read as one big-endian number, which is the key
     * looked up in the translation table.
     *
     * @param string $char        raw byte sequence of one character code
     * @param bool   $use_default when true, an unmapped code yields self::MISSING;
     *                            when false, $char is returned unchanged
     *
     * @return string the table entry for the code, self::MISSING, or the untouched $char
     *                (entries injected through setTable() are returned as-is)
     */
    public function translateChar($char, $use_default = true)
    {
        // The table is null until init()/loadTranslateTable() has run. Build it on
        // demand so that array_key_exists() never receives a non-array.
        if (null === $this->table) {
            $this->loadTranslateTable();
        }

        $dec = hexdec(bin2hex($char));

        if (\array_key_exists($dec, $this->table)) {
            return $this->table[$dec];
        }

        return $use_default ? self::MISSING : $char;
    }
104
105
    /**
     * Converts a Unicode code point into its UTF-8 encoded character.
     *
     * The conversion goes through a decimal numeric HTML entity ("&#NNN;")
     * that is handed to html_entity_decode().
     *
     * @param int $code code point; cast to int before use
     *
     * @return string result of decoding the numeric entity as UTF-8
     */
    public static function uchr($code)
    {
        $entity = '&#'.((int) $code).';';

        return html_entity_decode($entity, ENT_NOQUOTES, 'UTF-8');
    }
114
115
    /**
     * Builds (once) and returns the character translation table of this font.
     *
     * When the font has a `ToUnicode` entry, its content is scanned for
     * `codespacerange`, `bfchar` and `bfrange` sections:
     *  - the first codespacerange section sets the source/destination code sizes (in bytes);
     *  - bfchar entries map one source code to one text;
     *  - bfrange entries map a run of consecutive source codes.
     * Without `ToUnicode` the table stays empty and both sizes default to 1.
     *
     * @return array map of source code => UTF-8 text
     */
    public function loadTranslateTable()
    {
        // Already built: reuse the cached table.
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if ($this->has('ToUnicode')) {
            $content = $this->get('ToUnicode')->getContent();

            // Order matters: later sections overwrite earlier table entries.
            $this->loadCodeSpaceRangeSizes($content);
            $this->loadBfCharSections($content);
            $this->loadBfRangeSections($content);
        }

        return $this->table;
    }

    /**
     * Derives the code sizes from the first codespacerange section of a ToUnicode content.
     *
     * @param string $content ToUnicode content
     */
    private function loadCodeSpaceRangeSizes($content)
    {
        $sections = [];

        if (!preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            return;
        }

        // Only the first section is taken into account.
        $ranges = [];
        preg_match_all(
            '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
            $sections['sections'][0],
            $ranges
        );

        $from = isset($ranges['from'][0]) ? $ranges['from'][0] : '';
        $to = isset($ranges['to'][0]) ? $ranges['to'][0] : '';

        // Two hexadecimal digits per byte; never less than one byte.
        $this->tableSizes = [
            'from' => max(1, \strlen($from) / 2),
            'to' => max(1, \strlen($to) / 2),
        ];
    }

    /**
     * Loads every bfchar section (`<srcCode> <dstString>` entries) into the table.
     *
     * @param string $content ToUnicode content
     */
    private function loadBfCharSections($content)
    {
        $sections = [];

        if (!preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            return;
        }

        foreach ($sections['sections'] as $section) {
            $entries = [];
            preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $entries);

            // A section without any parsable entry must not reset the source
            // code size found so far back to one byte.
            if (empty($entries['from'])) {
                continue;
            }

            $this->tableSizes['from'] = max(1, \strlen($entries['from'][0]) / 2);

            foreach ($entries['from'] as $key => $from) {
                $this->table[hexdec($from)] = self::hexToUnicodeText($entries['to'][$key]);
            }
        }
    }

    /**
     * Loads every bfrange section into the table, in both of its notations.
     *
     * @param string $content ToUnicode content
     */
    private function loadBfRangeSections($content)
    {
        $sections = [];

        if (!preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            return;
        }

        foreach ($sections['sections'] as $section) {
            // Support for : <srcCode1> <srcCode2> <dstString>
            $ranges = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                $section,
                $ranges
            );

            foreach ($ranges['from'] as $key => $from) {
                $char_from = hexdec($from);
                $char_to = hexdec($ranges['to'][$key]);
                $offset = hexdec($ranges['offset'][$key]);

                // Consecutive source codes map to consecutive code points starting at $offset.
                for ($char = $char_from; $char <= $char_to; ++$char) {
                    $this->table[$char] = self::uchr($char - $char_from + $offset);
                }
            }

            // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
            // Some PDF files put the destination strings on new lines, hence \r\n in the class.
            $ranges = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                $section,
                $ranges
            );

            foreach ($ranges['from'] as $key => $from) {
                $char_from = hexdec($from);
                $strings = [];

                preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $ranges['strings'][$key], $strings);

                // The n-th destination string belongs to source code $char_from + n.
                foreach ($strings['string'] as $position => $string) {
                    $this->table[$char_from + $position] = self::hexToUnicodeText($string);
                }
            }
        }
    }

    /**
     * Converts a hexadecimal destination string into UTF-8 text.
     *
     * The string is cut into groups of four hexadecimal digits (a shorter
     * remainder forms its own group); each group is converted with uchr().
     *
     * @param string $hex hexadecimal digits, without the surrounding angle brackets
     *
     * @return string
     */
    private static function hexToUnicodeText($hex)
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr(hexdec($part));
        }

        return $text;
    }
225
226
    /**
     * Replaces the translation table with the given one.
     *
     * @param array $table translation table, stored as-is
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
233
234
    /**
     * Decodes every hexadecimal string (`<...>`) found in the given content into raw bytes.
     *
     * Content containing an XML prolog (`<?xml`) is returned untouched. Text
     * outside of hexadecimal strings is copied verbatim. Whitespace between
     * the digits of a hexadecimal string is ignored.
     *
     * @param string $hexa       content that may contain hexadecimal strings
     * @param bool   $add_braces when true, each decoded string is wrapped in parentheses
     *                           and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        // Hexadecimal digits, possibly separated by whitespace or line breaks.
        $hex_string = '<[a-f0-9\s]+>';

        $text = '';
        $parts = preg_split('/('.$hex_string.')/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        foreach ($parts as $part) {
            // Only genuine hexadecimal strings are unpacked. Anything else that merely
            // starts with '<' and ends with '>' (e.g. '<< ... >>') is kept as-is.
            if (!preg_match('/^'.$hex_string.'$/siD', $part)) {
                $text .= $part;

                continue;
            }

            // Drop the angle brackets and any whitespace, keeping the digits only.
            $digits = preg_replace('/[^a-f0-9]/i', '', $part);
            $bytes = pack('H*', $digits);

            if ($add_braces) {
                // Double the backslashes so they stay literal inside the parentheses.
                $text .= '('.str_replace('\\', '\\\\', $bytes).')';
            } else {
                $text .= $bytes;
            }
        }

        return $text;
    }
272
273
    /**
     * Replaces three-digit octal escape sequences (a backslash followed by
     * three digits 0-7) by the byte they denote.
     *
     * Everything else is copied verbatim.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        // Only the digits 0-7 form an octal escape; sequences such as '\189' are left alone.
        $parts = preg_split('/(\\\\[0-7]{3})/s', $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $text = '';

        foreach ($parts as $part) {
            if (preg_match('/^\\\\[0-7]{3}$/', $part)) {
                // Skip the leading backslash; chr() receives an explicit integer.
                $text .= \chr((int) octdec(substr($part, 1)));
            } else {
                $text .= $part;
            }
        }

        return $text;
    }
293
294
    /**
     * Replaces `#dd` sequences (a number sign followed by two decimal digits)
     * by the byte whose hexadecimal code is `dd`.
     *
     * NOTE(review): only decimal digits are matched although the pair is read as
     * hexadecimal, so a sequence such as `#2F` is left untouched — confirm this is intended.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        $decoded = '';
        $chunks = preg_split('/(#\d{2})/s', $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        foreach ($chunks as $chunk) {
            $is_entity = 1 === preg_match('/^#\d{2}$/', $chunk);

            $decoded .= $is_entity ? \chr((int) hexdec(trim($chunk, '#'))) : $chunk;
        }

        return $decoded;
    }
314
315
    /**
     * Converts text that starts with the UTF-16BE byte order mark (0xFE 0xFF) to UTF-8.
     *
     * Text without that marker is returned unchanged. Surrogate pairs are
     * combined so that characters outside the Basic Multilingual Plane are
     * converted as a single code point.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        // Exact byte comparison: a case-insensitive regex on raw bytes depends on the locale.
        if ("\xFE\xFF" !== substr($text, 0, 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $decode = substr($text, 2);
        $text = '';
        $length = \strlen($decode);

        for ($i = 0; $i < $length; $i += 2) {
            $code = hexdec(bin2hex(substr($decode, $i, 2)));

            // High surrogate followed by a complete low surrogate: merge both units.
            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 3 < $length) {
                $low = hexdec(bin2hex(substr($decode, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2;
                }
            }

            $text .= self::uchr($code);
        }

        return $text;
    }
335
336
    /**
     * Returns the threshold used by decodeText() to detect a word break.
     *
     * A numeric command whose value is lower than this limit starts a new word.
     *
     * @return int
     */
    protected function getFontSpaceLimit()
    {
        return -50;
    }
343
344
    /**
     * Decodes a list of text commands into a single string.
     *
     * Each command is an array read through the PDFObject::TYPE and
     * PDFObject::COMMAND keys:
     *  - type 'n' carries a number; a value below the font space limit starts a new word;
     *  - type '<' carries a hexadecimal string;
     *  - any other type is treated as text with octal escapes.
     * The collected words are decoded with decodeContent() and joined with spaces.
     *
     * @param array $commands
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $words = [];
        $word_position = 0;
        $unicode = false;
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            $type = $command[PDFObject::TYPE];

            // Loose comparisons on purpose: they mirror the semantics of a switch statement.
            if ('n' == $type) {
                if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                    $word_position = \count($words);
                }

                continue;
            }

            if ('<' == $type) {
                // Decode hexadecimal.
                $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');

                if (mb_check_encoding($text, 'UTF-8')) {
                    $unicode = true;
                }
            } else {
                // Decode octal (if necessary).
                $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $text
            );

            // add content to result string
            if (!isset($words[$word_position])) {
                $words[$word_position] = '';
            }
            $words[$word_position] .= $text;
        }

        foreach ($words as $index => $word) {
            // decodeContent() may alter the flag it receives; give each word its own copy.
            $loop_unicode = $unicode;
            $words[$index] = $this->decodeContent($word, $loop_unicode);
        }

        return implode(' ', $words);
    }
402
403
    /**
     * Decodes a raw text chunk into UTF-8.
     *
     * Strategy, in order:
     *  1. font with a `ToUnicode` entry: every source code goes through the translation table;
     *  2. otherwise, font whose `Encoding` entry is an Encoding object: every character
     *     goes through that encoding;
     *  3. text still not flagged as unicode is converted from ISO-8859-1 (when the
     *     `Encoding` entry is an Element equal to 'MacRomanEncoding') or from Windows-1252.
     *
     * @param string $text    raw text
     * @param bool   $unicode passed by reference; tells whether $text is already unicode
     *                        and is set to true after a ToUnicode translation
     *
     * @return string
     */
    public function decodeContent($text, &$unicode)
    {
        if ($this->has('ToUnicode')) {
            $bytes = $this->tableSizes['from'];

            // A falsy source code size leaves the text untouched.
            if ($bytes) {
                $text = $this->decodeContentByToUnicode($text, $bytes);

                // By definition, this code generates unicode chars.
                $unicode = true;
            }
        } elseif ($this->has('Encoding')) {
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                if ($unicode) {
                    $text = $this->decodeUnicodeContentByEncoding($text, $encoding);
                } else {
                    $text = $this->decodeByteContentByEncoding($text, $encoding);

                    // MacRoman based encodings are converted here and returned right away.
                    if ($encoding->get('BaseEncoding')->equals('MacRomanEncoding')) {
                        return mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
                    }
                }
            }
        }

        // Convert to unicode if not already done.
        if (!$unicode) {
            $declared = $this->get('Encoding');
            $is_mac_roman = $declared instanceof Element && $declared->equals('MacRomanEncoding');

            $text = mb_convert_encoding($text, 'UTF-8', $is_mac_roman ? 'ISO-8859-1' : 'Windows-1252');
        }

        return $text;
    }

    /**
     * Translates text through the translation table, $bytes bytes at a time.
     *
     * @param string $text  raw text
     * @param int    $bytes size of one source code, in bytes
     *
     * @return string
     */
    private function decodeContentByToUnicode($text, $bytes)
    {
        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);

            // NOTE(review): translateChar() is called with $use_default = false and, as
            // written in this class, returns $char itself for unmapped codes rather than
            // false, so the two fallbacks below look unreachable — confirm against subclasses.
            if (false !== ($decoded = $this->translateChar($char, false))) {
                $char = $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $char = $this->decodeCharByDescendantFonts($char);
            } else {
                $char = self::MISSING;
            }

            $result .= $char;
        }

        return $result;
    }

    /**
     * Translates one source code with the first descendant font able to do so.
     *
     * @param string $char raw byte sequence of one character code
     *
     * @return string the translation (or $char itself when no descendant font
     *                translated it), converted from Windows-1252 to UTF-8
     */
    private function decodeCharByDescendantFonts($char)
    {
        $descendants = $this->get('DescendantFonts');

        if ($descendants instanceof PDFObject) {
            $fonts = $descendants->getHeader()->getElements();
        } else {
            $fonts = $descendants->getContent();
        }

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $decoded = $font->translateChar($char, false);

            if (false !== $decoded) {
                return mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
            }
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }

    /**
     * Translates already-unicode text through an Encoding object, one UTF-8 character at a time.
     *
     * @param string   $text     UTF-8 text
     * @param Encoding $encoding
     *
     * @return string
     */
    private function decodeUnicodeContentByEncoding($text, Encoding $encoding)
    {
        // The empty pattern with the 'u' modifier splits the text into UTF-8 characters.
        $chars = preg_split(
            '//su',
            $text,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );
        $result = '';

        foreach ($chars as $char) {
            // The bytes of the character, read as one number, are what the encoding receives.
            $dec_av = hexdec(bin2hex($char));
            $dec_ap = $encoding->translateChar($dec_av);
            $result .= self::uchr($dec_ap);
        }

        return $result;
    }

    /**
     * Translates single-byte text through an Encoding object, one byte at a time.
     *
     * @param string   $text     raw text
     * @param Encoding $encoding
     *
     * @return string one byte per input byte
     */
    private function decodeByteContentByEncoding($text, Encoding $encoding)
    {
        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; ++$i) {
            $dec_ap = $encoding->translateChar(\ord($text[$i]));
            $result .= \chr($dec_ap);
        }

        return $result;
    }
512
}
513