Completed
Pull Request — master (#318)
by
unknown
05:32
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 19

Size

Total Lines 102
Code Lines 61

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 31
CRAP Score 57.9524

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 61
c 5
b 0
f 1
nc 19
nop 2
dl 0
loc 102
ccs 31
cts 57
cp 0.5439
crap 57.9524
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: moving a coherent fragment of a long method into its own, well-named method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
34
 * Class Font
35
 */
36
class Font extends PDFObject
37
{
38
    const MISSING = '?';
39
40
    /**
41
     * @var array|null translation table (character code => decoded text); null until it is loaded or set
42
     */
43
    protected $table = null;
44
45
    /**
46
     * @var array|null byte widths under the keys 'from' and 'to'; null until the translation table is loaded
47
     */
48
    protected $tableSizes = null;
49
50 22
    /**
     * Initialises the font by building its character translation table.
     */
    public function init()
    {
        // The returned table is not needed here; the call only fills the cache.
        $this->loadTranslateTable();
    }
55
56
    /**
     * Returns the font name taken from the "BaseFont" entry.
     *
     * @return string the "BaseFont" value cast to string, or "[Unknown]"
     *                when the font has no such entry
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
63
64
    /**
     * Returns the font type, i.e. the "Subtype" entry of the object header.
     *
     * @return string the "Subtype" header value cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
71
72
    /**
     * Returns the font details: its name, type and encoding, completed with
     * the details reported by the parent class.
     *
     * The "Encoding" detail is resolved as follows:
     *  - no "Encoding" entry: "Ansi";
     *  - entry is an Encoding object: its "BaseEncoding" detail, or "Ansi"
     *    when that detail is missing or empty;
     *  - any other entry: the entry cast to string.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $details = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => 'Ansi',
        ];

        if ($this->has('Encoding')) {
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                // Explicit check instead of the "@" operator, so a missing
                // "BaseEncoding" key is handled without hiding real errors.
                $encodingDetails = $encoding->getDetails();

                if (!empty($encodingDetails['BaseEncoding'])) {
                    $details['Encoding'] = $encodingDetails['BaseEncoding'];
                }
            } else {
                $details['Encoding'] = (string) $encoding;
            }
        }

        // Keys set above take precedence over the parent's values.
        return $details + parent::getDetails($deep);
    }
95
96
    /**
     * Looks up a raw character in the translation table.
     *
     * The bytes of $char are read as one big-endian number, which is the key
     * used in the table.
     *
     * @param string $char        raw character (one or more bytes)
     * @param bool   $use_default what to return for an unknown character:
     *                            self::MISSING when true, $char itself when false
     *
     * @return string|bool the table entry when the code is known, otherwise
     *                     the fallback selected by $use_default
     */
    public function translateChar($char, $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (!\array_key_exists($code, $this->table)) {
            if ($use_default) {
                return self::MISSING;
            }

            return $char;
        }

        return $this->table[$code];
    }
112
113
    /**
     * Converts a numeric character code into its UTF-8 string by decoding the
     * matching numeric HTML entity.
     *
     * @param int $code character code; it is cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        $entity = sprintf('&#%d;', (int) $code);

        return html_entity_decode($entity, ENT_NOQUOTES, 'UTF-8');
    }
122
123
    /**
     * Builds (once) and returns the character translation table described by
     * the font's "ToUnicode" content.
     *
     * The table maps a source character code (int) to the decoded UTF-8 text.
     * The byte widths of the codes are stored in $this->tableSizes. When the
     * table has already been built or set, it is returned as is.
     *
     * Three kinds of sections are read:
     *  - begincodespacerange: gives the byte widths (first section only);
     *  - beginbfchar:  <srcCode> <dstString>
     *  - beginbfrange: <srcCode1> <srcCode2> <dstString>
     *                  <srcCode1> <srcCodeN> [<dstString1> ... <dstStringN>]
     *
     * @return array
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        // Code space range: only the first section is taken into account.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $spaceSections)) {
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

            preg_match_all($regexp, $spaceSections['sections'][0], $ranges);

            $this->tableSizes = [
                'from' => self::hexByteLength(current($ranges['from'])),
                'to' => self::hexByteLength(current($ranges['to'])),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $charSections)) {
            foreach ($charSections['sections'] as $section) {
                $regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

                preg_match_all($regexp, $section, $pairs);

                // A section without any entry must not reset the width
                // that was established before.
                if (empty($pairs['from'])) {
                    continue;
                }

                $this->tableSizes['from'] = self::hexByteLength(current($pairs['from']));

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::hexCodesToText($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $rangeSections)) {
            foreach ($rangeSections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';

                preg_match_all($regexp, $section, $linear);

                foreach ($linear['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($linear['to'][$key]);
                    $offset = hexdec($linear['offset'][$key]);

                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF file has 2-byte Unicode values on new lines > added \r\n
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';

                preg_match_all($regexp, $section, $listed);

                foreach ($listed['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $listed['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::hexCodesToText($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Returns the number of bytes encoded by a string of hexadecimal digits.
     *
     * The result is always an integer of at least 1; an odd number of digits
     * is rounded up.
     *
     * @param string|bool $hex hexadecimal digits, or false when no code was found
     *
     * @return int
     */
    private static function hexByteLength($hex)
    {
        return max(1, (int) ceil(\strlen((string) $hex) / 2));
    }

    /**
     * Converts a destination string made of hexadecimal digits into UTF-8 text.
     *
     * The digits are read in groups of four, each group being one character
     * code; any remaining shorter fragment is converted as a code of its own.
     *
     * @param string $hex hexadecimal digits
     *
     * @return string
     */
    private static function hexCodesToText($hex)
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr(hexdec($part));
        }

        return $text;
    }
233
234
    /**
     * Replaces the translation table used by translateChar().
     *
     * @param array $table new table, indexed by character code
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
241
242
    /**
     * Replaces every hexadecimal string (<48656C6C6F>) found in the input by
     * the bytes it encodes; everything else is copied unchanged.
     *
     * Only fragments made exclusively of hexadecimal digits between "<" and
     * ">" are decoded. Other bracketed fragments, such as a "<< ... >>"
     * dictionary, are left untouched. Input containing "<?xml" is returned
     * as is.
     *
     * @param string $hexa       text that may contain hexadecimal strings
     * @param bool   $add_braces when true, each decoded string is wrapped in
     *                           parentheses and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $text = '';
        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        foreach ($parts as $part) {
            // Same character class as the splitting pattern: anything else
            // is not a hexadecimal string and must not go through pack().
            if (!preg_match('/^<[a-f0-9]+>$/Di', $part)) {
                $text .= $part;

                continue;
            }

            $binary = pack('H*', trim($part, '<>'));

            if ($add_braces) {
                $text .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $text .= $binary;
            }
        }

        return $text;
    }
278
279
    /**
     * Replaces octal escape sequences (a backslash followed by one to three
     * octal digits) by the byte they encode.
     *
     * An escaped backslash ("\\") is copied unchanged and the digits that
     * follow it are not treated as an escape. Sequences containing the digits
     * 8 or 9 are not octal and are left untouched. Only the low 8 bits of the
     * decoded value are kept.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        return preg_replace_callback(
            '/\\\\(?:\\\\|([0-7]{1,3}))/',
            static function ($match) {
                // Group 1 is only set for an octal escape; otherwise the
                // match is an escaped backslash, which is kept as is.
                if (!isset($match[1])) {
                    return $match[0];
                }

                return \chr(octdec($match[1]) & 0xFF);
            },
            $text
        );
    }
299
300
    /**
     * Replaces every "#xx" sequence, where xx are two hexadecimal digits
     * (case-insensitive), by the byte with that value.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        return preg_replace_callback(
            '/#([0-9a-f]{2})/i',
            static function ($match) {
                return \chr(hexdec($match[1]));
            },
            $text
        );
    }
320
321
    /**
     * Decodes a string that starts with the byte order mark FE FF: the mark
     * is stripped and the remaining bytes are read as 2-byte big-endian code
     * units and converted to UTF-8.
     *
     * A high surrogate followed by a low surrogate is combined into a single
     * code point. Text without the leading mark is returned unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if (!preg_match('/^\xFE\xFF/i', $text)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $decode = substr($text, 2);
        $length = \strlen($decode);
        $text = '';

        for ($i = 0; $i < $length; $i += 2) {
            $code = hexdec(bin2hex(substr($decode, $i, 2)));

            // A surrogate pair spans two code units (four bytes): combine
            // both halves instead of converting each half on its own.
            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 3 < $length) {
                $low = hexdec(bin2hex(substr($decode, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2;
                }
            }

            $text .= self::uchr($code);
        }

        return $text;
    }
341
342
    /**
     * Returns the threshold used by decodeText(): a numeric command whose
     * value is below this limit starts a new word.
     *
     * @return int
     */
    protected function getFontSpaceLimit()
    {
        $limit = -50;

        return $limit;
    }
349
350
    /**
     * Decodes a list of text commands into a string.
     *
     * Commands are grouped into words: a numeric command ("n") whose value is
     * below getFontSpaceLimit() starts a new word, a hexadecimal command
     * ("<") is decoded with decodeHexadecimal() and any other command with
     * decodeOctal(). Each word is then passed to decodeContent() and the
     * words are joined with a single space.
     *
     * @param array $commands commands indexed by PDFObject::TYPE and
     *                        PDFObject::COMMAND
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $words = [];
        $current = 0;
        $isUnicode = false;
        $limit = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            $type = $command[PDFObject::TYPE];
            $payload = $command[PDFObject::COMMAND];

            switch ($type) {
                case 'n':
                    // Numbers carry no text; they may only open a new word.
                    if ((float) (trim($payload)) < $limit) {
                        $current = \count($words);
                    }

                    continue 2;

                case '<':
                    // Decode hexadecimal.
                    $piece = self::decodeHexadecimal('<'.$payload.'>');

                    if (mb_check_encoding($piece, 'UTF-8')) {
                        $isUnicode = true;
                    }

                    break;

                default:
                    // Decode octal (if necessary).
                    $piece = self::decodeOctal($payload);
            }

            // replace escaped chars
            $piece = stripcslashes($piece);

            // add content to result string
            if (!isset($words[$current])) {
                $words[$current] = '';
            }
            $words[$current] .= $piece;
        }

        $decoded = [];
        foreach ($words as $index => $word) {
            // decodeContent() takes the flag by reference: give it a copy so
            // that one word cannot change the flag seen by the next one.
            $flag = $isUnicode;
            $decoded[$index] = $this->decodeContent($word, $flag);
        }

        return implode(' ', $decoded);
    }
404
405
    /**
     * Converts raw text shown with this font into UTF-8.
     *
     * The conversion depends on the entries of the font:
     *  - "ToUnicode": the text is cut into codes of $this->tableSizes['from']
     *    bytes, each one translated with the translation table; $unicode is
     *    then set to true;
     *  - otherwise, an "Encoding" entry that is an Encoding object: every
     *    character (when $unicode is true) or every byte (when it is false)
     *    is remapped through Encoding::translateChar();
     *  - finally, when $unicode is still false, the text is converted to
     *    UTF-8 from ISO-8859-1 (for "MacRomanEncoding") or Windows-1252.
     *
     * @param string $text    raw text
     * @param bool   $unicode whether $text is already unicode; passed by
     *                        reference and set to true when the "ToUnicode"
     *                        table has been applied
     *
     * @return string
     */
    public function decodeContent($text, &$unicode)
    {
        if ($this->has('ToUnicode')) {
            // The width is used as loop step and substr() length, so it has
            // to be a whole number of bytes.
            $bytes = isset($this->tableSizes['from']) ? (int) ceil($this->tableSizes['from']) : 0;

            if ($bytes > 0) {
                $text = $this->decodeWithTranslateTable($text, $bytes);

                // By definition, this code generates unicode chars.
                $unicode = true;
            }
        } elseif ($this->has('Encoding')) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                if ($unicode) {
                    $text = self::remapCharsWithEncoding($text, $encoding);
                } else {
                    $text = self::remapBytesWithEncoding($text, $encoding);

                    if ($encoding->get('BaseEncoding')->equals('MacRomanEncoding')) {
                        return mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
                    }
                }
            }
        }

        // Convert to unicode if not already done.
        if (!$unicode) {
            $declared = $this->get('Encoding');

            if ($declared instanceof Element && $declared->equals('MacRomanEncoding')) {
                $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
            } else {
                $text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
            }
        }

        return $text;
    }

    /**
     * Translates the text, $bytes bytes at a time, with the translation table.
     *
     * NOTE(review): translateChar() as defined in this class returns the
     * character itself, not false, for an unknown code when called with
     * $use_default = false, so the two fallback branches below are only
     * reached if the table holds false or a subclass overrides
     * translateChar() - confirm whether that is intended.
     *
     * @param string $text  raw text
     * @param int    $bytes width of one character code, in bytes
     *
     * @return string
     */
    private function decodeWithTranslateTable($text, $bytes)
    {
        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            if (false !== $decoded) {
                $result .= $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $result .= $this->translateWithDescendantFonts($char);
            } else {
                $result .= self::MISSING;
            }
        }

        return $result;
    }

    /**
     * Translates one character with the first descendant font that knows it.
     *
     * The translation, or the character itself when no descendant font
     * translates it, is converted from Windows-1252 to UTF-8.
     *
     * @param string $char raw character
     *
     * @return string
     */
    private function translateWithDescendantFonts($char)
    {
        $descendants = $this->get('DescendantFonts');

        if ($descendants instanceof PDFObject) {
            $fonts = $descendants->getHeader()->getElements();
        } else {
            $fonts = $descendants->getContent();
        }

        $decoded = false;

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $decoded = $font->translateChar($char, false);

            if (false !== $decoded) {
                $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');

                break;
            }
        }

        if (false === $decoded) {
            $decoded = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
        }

        return $decoded;
    }

    /**
     * Remaps every UTF-8 character of the text through the encoding and
     * converts each resulting code with uchr().
     *
     * @param string   $text
     * @param Encoding $encoding
     *
     * @return string
     */
    private static function remapCharsWithEncoding($text, Encoding $encoding)
    {
        $chars = preg_split(
            '//su',
            $text,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        $result = '';

        foreach ($chars as $char) {
            $result .= self::uchr($encoding->translateChar(hexdec(bin2hex($char))));
        }

        return $result;
    }

    /**
     * Remaps every byte of the text through the encoding and converts each
     * resulting code with chr().
     *
     * @param string   $text
     * @param Encoding $encoding
     *
     * @return string
     */
    private static function remapBytesWithEncoding($text, Encoding $encoding)
    {
        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; ++$i) {
            $result .= \chr($encoding->translateChar(\ord($text[$i])));
        }

        return $result;
    }
514
}
515