Passed
Pull Request — master (#318)
by
unknown
03:16
created

Font::decodeContent()   D

Complexity

Conditions 20
Paths 19

Size

Total Lines 102
Code Lines 61

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 31
CRAP Score 57.9524

Importance

Changes 5
Bugs 0 Features 1
Metric Value
cc 20
eloc 61
c 5
b 0
f 1
nc 19
nop 2
dl 0
loc 102
ccs 31
cts 57
cp 0.5439
crap 57.9524
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
34
 * Class Font
35
 */
36
class Font extends PDFObject
37
{
38
    const MISSING = '?';
39
40
    /**
41
     * @var array|null decimal source character code => decoded text; null until the table is populated
42
     */
43
    protected $table = null;
44
45
    /**
46
     * @var array|null character widths in bytes under the 'from' and 'to' keys; null until the table is loaded
47
     */
48
    protected $tableSizes = null;
49
50 22
    /**
     * Prepare the font for decoding by building its character translation table.
     */
    public function init()
    {
        // loadTranslateTable() stores its result on the instance, so this is a one-off cost.
        $this->loadTranslateTable();
    }
55
56
    /**
     * Return the font name stored under the 'BaseFont' entry.
     *
     * @return string the 'BaseFont' value cast to string, or '[Unknown]' when the entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
63
64
    /**
     * Return the font subtype as declared in the object header.
     *
     * @return string the header's 'Subtype' entry cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
71
72
    /**
     * Collect descriptive details about the font.
     *
     * The 'Name', 'Type' and 'Encoding' keys are set here; every other key comes
     * from the parent implementation and never overrides those three.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $details = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => 'Ansi',
        ];

        if ($this->has('Encoding')) {
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                // Read the base encoding explicitly rather than through "@":
                // a missing or empty 'BaseEncoding' keeps the 'Ansi' default,
                // while genuine errors are no longer silenced.
                $encodingDetails = $encoding->getDetails();
                if (\is_array($encodingDetails) && !empty($encodingDetails['BaseEncoding'])) {
                    $details['Encoding'] = $encodingDetails['BaseEncoding'];
                }
            } else {
                $details['Encoding'] = (string) $encoding;
            }
        }

        // Array union: keys already present in $details win over the parent's.
        $details += parent::getDetails($deep);

        return $details;
    }
94
95
    /**
     * Translate one source character code through the font's translation table.
     *
     * The raw bytes of $char are converted to a number (bin2hex() then hexdec())
     * and that number is the lookup key.
     *
     * @param string $char        raw byte sequence of the character code
     * @param bool   $use_default when true an unmapped code yields self::MISSING,
     *                            otherwise $char is returned untouched
     *
     * @return string the mapped text, self::MISSING, or the original $char;
     *                this implementation never returns false
     */
    public function translateChar($char, $use_default = true)
    {
        // Build the table on demand so a call made before init() does not
        // hand null to array_key_exists().
        $table = null === $this->table ? $this->loadTranslateTable() : $this->table;
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $table)) {
            return $table[$code];
        }

        return $use_default ? self::MISSING : $char;
    }
111
112
    /**
     * Convert a numeric code point to UTF-8 text by decoding the matching
     * numeric HTML entity ("&#<code>;").
     *
     * @param int $code code point; cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        $entity = sprintf('&#%d;', (int) $code);

        return html_entity_decode($entity, ENT_NOQUOTES, 'UTF-8');
    }
121
122
    /**
     * Build (once) and return the character translation table from the
     * font's 'ToUnicode' stream.
     *
     * Three kinds of sections are read from the stream content:
     *  - begincodespacerange/endcodespacerange: only the first section is used,
     *    to derive the source and target character widths in bytes;
     *  - beginbfchar/endbfchar: one-to-one mappings;
     *  - beginbfrange/endbfrange: ranges, either with a single start value or
     *    with an explicit list of destination strings.
     *
     * Without a 'ToUnicode' entry the table is empty and both widths are 1.
     *
     * @return array decimal source character code => decoded text
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();
        $sections = [];

        // Code space range: only the first section determines the widths.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            $ranges = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $sections['sections'][0],
                $ranges
            );

            // Two hex digits per byte; current() yields false on an empty list, which gives width 1.
            $this->tableSizes = [
                'from' => max(1, \strlen(current($ranges['from'])) / 2),
                'to' => max(1, \strlen(current($ranges['to'])) / 2),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                $pairs = [];
                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

                // The source width follows the most recently parsed bfchar section.
                $this->tableSizes['from'] = max(1, \strlen(current($pairs['from'])) / 2);

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::decodeHexCodeUnits($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $ranges = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $ranges
                );

                foreach ($ranges['from'] as $key => $from) {
                    $first = hexdec($from);
                    $last = hexdec($ranges['to'][$key]);
                    $target = hexdec($ranges['offset'][$key]);

                    for ($code = $first; $code <= $last; ++$code) {
                        $this->table[$code] = self::uchr($code - $first + $target);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the destination values on separate lines, hence \r\n in the class.
                $lists = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $lists
                );

                foreach ($lists['from'] as $key => $from) {
                    $first = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$first + $position] = self::decodeHexCodeUnits($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Decode a hexadecimal destination string: it is cut into groups of four
     * hex digits (a shorter remainder is kept as its own group) and each
     * group is converted with uchr().
     *
     * @param string $hex hexadecimal digits without the surrounding angle brackets
     *
     * @return string
     */
    private static function decodeHexCodeUnits($hex)
    {
        $units = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($units as $unit) {
            $text .= self::uchr(hexdec($unit));
        }

        return $text;
    }
232
233
    /**
     * Replace the translation table wholesale.
     *
     * @param array $table table stored as-is; translateChar() looks codes up in it
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
240
241
    /**
     * Replace every "<hex digits>" token in the input with the bytes it encodes.
     *
     * Text outside such tokens is copied through untouched, and input that
     * contains "<?xml" anywhere is returned unchanged.
     *
     * @param string $hexa       text possibly containing "<...>" hexadecimal tokens
     * @param bool   $add_braces when true each decoded token is wrapped in
     *                           parentheses and its backslashes are escaped
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // XML content is passed through as-is.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isHexToken = preg_match('/^<.*>$/', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isHexToken) {
                $decoded .= $chunk;
                continue;
            }

            $binary = pack('H*', trim($chunk, '<>'));

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $decoded .= $binary;
            }
        }

        return $decoded;
    }
277
278
    /**
     * Replace three-digit octal escapes (a backslash followed by three octal
     * digits) with the byte they denote.
     *
     * Only the digits 0-7 are accepted. A backslash followed by digits such as
     * "389" is not an octal escape and is left untouched instead of being
     * handed to octdec().
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        return preg_replace_callback(
            '/\\\\([0-7]{3})/',
            static function (array $match) {
                return \chr(octdec($match[1]));
            },
            $text
        );
    }
298
299
    /**
     * Replace "#xx" escapes, where xx is two hexadecimal digits, with the
     * byte they denote.
     *
     * Both digits may be 0-9 or A-F in either case, so "#2F" is decoded just
     * like "#20". The decoded output is not scanned again.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        return preg_replace_callback(
            '/#([0-9A-Fa-f]{2})/',
            static function (array $match) {
                return \chr(hexdec($match[1]));
            },
            $text
        );
    }
319
320
    /**
     * Decode text that starts with the byte order marker FE FF: the bytes
     * after the marker are read two at a time as big-endian 16-bit units and
     * converted with uchr().
     *
     * A high surrogate followed by a low surrogate is combined into a single
     * code point. Text without the marker is returned unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        // Exact byte comparison: the marker has no "case", so no /i matching.
        if ("\xFE\xFF" !== substr($text, 0, 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = substr($text, 2);
        $length = \strlen($payload);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $code = hexdec(bin2hex(substr($payload, $i, 2)));

            // A high surrogate needs two more complete bytes to form a pair.
            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 3 < $length) {
                $low = hexdec(bin2hex(substr($payload, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2; // the low surrogate has been consumed
                }
            }

            $decoded .= self::uchr($code);
        }

        return $decoded;
    }
340
341
    /**
     * Threshold used by decodeText(): a numeric command whose value is below
     * this limit starts a new word.
     *
     * @return int
     */
    protected function getFontSpaceLimit()
    {
        return -50;
    }
348
349
    /**
     * Turn a list of text commands into a single decoded string.
     *
     * Commands of type 'n' carry no text: when their numeric value is below
     * getFontSpaceLimit() the following text starts a new word. Commands of
     * type '<' are hexadecimal strings, and every other type goes through
     * decodeOctal(). Each word is then passed to decodeContent() and the
     * words are joined with single spaces.
     *
     * @param array $commands entries indexed by PDFObject::TYPE and PDFObject::COMMAND
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $words = [];
        $slot = 0;
        $unicode = false;
        $spaceLimit = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            $type = $command[PDFObject::TYPE];

            // Loose comparisons on purpose: they mirror switch/case matching.
            if ('n' == $type) {
                if ((float) (trim($command[PDFObject::COMMAND])) < $spaceLimit) {
                    $slot = \count($words);
                }

                continue;
            }

            if ('<' == $type) {
                // Decode hexadecimal.
                $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');

                if (mb_check_encoding($text, 'UTF-8')) {
                    $unicode = true;
                }
            } else {
                // Decode octal (if necessary).
                $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars, then append to the current word.
            $text = stripcslashes($text);
            $words[$slot] = (isset($words[$slot]) ? $words[$slot] : '').$text;
        }

        $decoded = [];
        foreach ($words as $index => $word) {
            // decodeContent() may flip the flag, so every word gets its own copy.
            $wordUnicode = $unicode;
            $decoded[$index] = $this->decodeContent($word, $wordUnicode);
        }

        return implode(' ', $decoded);
    }
403
404
    /**
     * Decode raw text into UTF-8 using whichever mapping the font provides.
     *
     *  - With a 'ToUnicode' entry the text is cut into characters of
     *    $this->tableSizes['from'] bytes, each translated through the
     *    translation table; $unicode is then set to true.
     *  - Otherwise, with an 'Encoding' entry that is an Encoding object, each
     *    character (when $unicode is true) or each byte (when it is false) is
     *    remapped through that object.
     *  - Finally, text still not flagged as unicode is converted to UTF-8 from
     *    ISO-8859-1 (MacRomanEncoding) or Windows-1252 (everything else).
     *
     * @param string $text    raw text to decode
     * @param bool   $unicode in: whether $text is already unicode;
     *                        out: set to true by the 'ToUnicode' path
     *
     * @return string
     */
    public function decodeContent($text, &$unicode)
    {
        if ($this->has('ToUnicode')) {
            $bytes = $this->tableSizes['from'];

            if ($bytes) {
                $text = $this->decodeWithTranslateTable($text, $bytes);

                // By definition, this code generates unicode chars.
                $unicode = true;
            }
        } elseif ($this->has('Encoding')) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                if ($unicode) {
                    $text = self::remapCharsThroughEncoding($text, $encoding);
                } else {
                    $text = self::remapBytesThroughEncoding($text, $encoding);

                    // MacRoman text is converted here and returned straight away.
                    if ($encoding->get('BaseEncoding')->equals('MacRomanEncoding')) {
                        return mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
                    }
                }
            }
        }

        // Convert to unicode if not already done.
        if (!$unicode) {
            $encoding = $this->get('Encoding');
            $isMacRoman = $encoding instanceof Element && $encoding->equals('MacRomanEncoding');

            $text = mb_convert_encoding($text, 'UTF-8', $isMacRoman ? 'ISO-8859-1' : 'Windows-1252');
        }

        return $text;
    }

    /**
     * Translate $text character by character through the translation table.
     *
     * The fallback branches only run when translateChar() returns false. The
     * implementation in this class never does, so they matter only if a
     * subclass overrides translateChar().
     *
     * @param string    $text
     * @param int|float $bytes width of one source character in bytes
     *
     * @return string
     */
    private function decodeWithTranslateTable($text, $bytes)
    {
        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; $i += $bytes) {
            $char = substr($text, $i, $bytes);
            $decoded = $this->translateChar($char, false);

            if (false !== $decoded) {
                $result .= $decoded;
            } elseif ($this->has('DescendantFonts')) {
                $result .= $this->decodeWithDescendantFonts($char);
            } else {
                $result .= self::MISSING;
            }
        }

        return $result;
    }

    /**
     * Look a character up in the descendant fonts. The first font that
     * translates it wins; if none does, the raw character is used. Either
     * way the value is converted from Windows-1252 to UTF-8.
     *
     * @param string $char
     *
     * @return string
     */
    private function decodeWithDescendantFonts($char)
    {
        $descendants = $this->get('DescendantFonts');

        if ($descendants instanceof PDFObject) {
            $fonts = $descendants->getHeader()->getElements();
        } else {
            $fonts = $descendants->getContent();
        }

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $decoded = $font->translateChar($char, false);
            if (false !== $decoded) {
                return mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
            }
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }

    /**
     * Remap every UTF-8 character of $text through $encoding and rebuild the
     * text from the resulting code points.
     *
     * @param string   $text
     * @param Encoding $encoding
     *
     * @return string
     */
    private static function remapCharsThroughEncoding($text, Encoding $encoding)
    {
        $chars = preg_split(
            '//su',
            $text,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );
        $result = '';

        foreach ($chars as $char) {
            $result .= self::uchr($encoding->translateChar(hexdec(bin2hex($char))));
        }

        return $result;
    }

    /**
     * Remap every byte of $text through $encoding, producing one byte per
     * input byte.
     *
     * @param string   $text
     * @param Encoding $encoding
     *
     * @return string
     */
    private static function remapBytesThroughEncoding($text, Encoding $encoding)
    {
        $result = '';
        $length = \strlen($text);

        for ($i = 0; $i < $length; ++$i) {
            $result .= \chr($encoding->translateChar(hexdec(bin2hex($text[$i]))));
        }

        return $result;
    }
513
}
514