Passed
Pull Request — master (#358)
by butschster
02:09
created

Font::decodeOctal()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 14
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 3

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 8
dl 0
loc 14
rs 10
c 1
b 0
f 0
ccs 8
cts 8
cp 1
cc 3
nc 3
nop 1
crap 3
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
34
 * Class Font
35
 */
36
class Font extends PDFObject
37
{
38
    /**
     * Placeholder used for characters that cannot be translated.
     */
    const MISSING = '?';

    /**
     * Value handed out by getFontSpaceLimit(); decodeText() starts a new word
     * whenever a numeric command is lower than it.
     *
     * @var int
     */
    public static $font_space = -50;

    /**
     * Translate table keyed by numeric character code. Stays null until
     * loadTranslateTable() or setTable() provides one.
     *
     * @var array|null
     */
    protected $table = null;

    /**
     * Sizes in bytes, under the keys 'from' and 'to', of the codes found in
     * the translate table. Stays null until loadTranslateTable() has run.
     *
     * @var array|null
     */
    protected $tableSizes = null;
54
55 24
    /**
     * Initialises the font by building its character translate table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
60
61
    /**
     * Gives the font name stored under "BaseFont".
     *
     * @return string the BaseFont entry cast to string, or "[Unknown]" when
     *                the font has no such entry
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
68
69
    /**
     * Gives the font type as recorded in the header's "Subtype" entry.
     *
     * @return string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
76
77
    /**
     * Describes the font: name, type and encoding, completed with whatever
     * the parent implementation reports.
     *
     * The arrays are combined with the "+" operator, so the three entries
     * computed here take precedence over same-named parent entries.
     *
     * @param bool $deep passed through unchanged to parent::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $own = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            // "Ansi" is reported when the font declares no encoding.
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        return $own + parent::getDetails($deep);
    }
92
93
    /**
     * Translates one raw character code into decoded text.
     *
     * The bytes of $char are read as one big-endian number which is looked up
     * in the translate table. When the code is not in the table the result
     * depends on $use_default:
     *  - true:  self::MISSING is returned;
     *  - false: a single-byte code of a font whose Encoding is
     *           "WinAnsiEncoding" is converted with uchr(); any other code is
     *           returned untouched.
     *
     * @param string $char        raw bytes of a single character code
     * @param bool   $use_default whether unknown codes yield self::MISSING
     *
     * @return string|bool the table entry when one exists, otherwise the
     *                     default or fallback string described above
     */
    public function translateChar($char, $use_default = true)
    {
        // The table is normally built by init(). Build it on demand so that a
        // lookup on a font that was never initialised does not pass null to
        // array_key_exists().
        if (null === $this->table) {
            $this->loadTranslateTable();
        }

        $dec = hexdec(bin2hex($char));

        if (\array_key_exists($dec, $this->table)) {
            return $this->table[$dec];
        }

        // Nothing else to compute when the caller only wants the placeholder.
        if ($use_default) {
            return self::MISSING;
        }

        // Fallback for decoding single-byte ANSI characters that are not in
        // the lookup table.
        if (\strlen($char) < 2 && $this->has('Encoding') && 'WinAnsiEncoding' === $this->get('Encoding')->__toString()) {
            return self::uchr($dec);
        }

        return $char;
    }
115
116
    /**
     * Builds the UTF-8 string for a numeric character code.
     *
     * @param int $code character code; it is cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        $entity = '&#'.((int) $code).';';

        // mb_convert_encoding() is preferred here because html_entity_decode()
        // does not work with UTF-16 or UTF-32 char entities.
        return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
127
128
    /**
     * Builds, once, the table that maps character codes to UTF-8 text from
     * the content of the font's "ToUnicode" entry.
     *
     * Three kinds of sections are read from that content:
     *  - begincodespacerange/endcodespacerange: only the first section is
     *    used, to record the byte sizes of the codes in $this->tableSizes;
     *  - beginbfchar/endbfchar: one source code mapped to one destination;
     *  - beginbfrange/endbfrange: a range of source codes mapped either to
     *    consecutive values starting at one destination, or to an explicit
     *    list of destinations.
     *
     * A table that is already loaded (or was injected with setTable()) is
     * returned as is. Without a "ToUnicode" entry the table stays empty and
     * both sizes stay at 1.
     *
     * @return array the translate table, keyed by numeric character code
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        // Code space range: only the first section is considered.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $blocks)) {
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

            preg_match_all($regexp, $blocks['sections'][0], $pairs);

            $this->tableSizes = [
                'from' => self::hexByteLength(current($pairs['from'])),
                'to' => self::hexByteLength(current($pairs['to'])),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $blocks)) {
            foreach ($blocks['sections'] as $section) {
                $regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

                preg_match_all($regexp, $section, $pairs);

                // The source size is taken from the first pair of each
                // section, so the last section read decides the final value.
                $this->tableSizes['from'] = self::hexByteLength(current($pairs['from']));

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::decodeHexCodeUnits($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $blocks)) {
            foreach ($blocks['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';

                preg_match_all($regexp, $section, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($ranges['to'][$key]);
                    // NOTE(review): the destination is read as one number, so
                    // a destination longer than four hex digits is not split
                    // into code units here - confirm whether that is intended.
                    $offset = hexdec($ranges['offset'][$key]);

                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF file has 2-byte Unicode values on new lines > added \r\n
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';

                preg_match_all($regexp, $section, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $ranges['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::decodeHexCodeUnits($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Number of bytes written as the given hexadecimal digits: at least 1,
     * and always an integer (an odd digit count is rounded up).
     *
     * @param string|bool $hex hexadecimal digits, or false when no code was found
     *
     * @return int
     */
    private static function hexByteLength($hex)
    {
        return max(1, (int) ceil(\strlen((string) $hex) / 2));
    }

    /**
     * Converts hexadecimal digits to UTF-8 by passing every group of four
     * digits (and any shorter remainder) through uchr().
     *
     * @param string $hex hexadecimal digits of a destination string
     *
     * @return string
     */
    private static function decodeHexCodeUnits($hex)
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr(hexdec($part));
        }

        return $text;
    }
238
239
    /**
     * Replaces the translate table consulted by translateChar().
     *
     * @param array $table replacement table, keyed by numeric character code
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
246
247
    /**
     * Replaces every "<hex digits>" group of the text by the bytes it encodes.
     *
     * Text containing "<?xml" is returned untouched. White-space inside a
     * group is ignored. A "<...>" part whose content is not made of
     * hexadecimal digits only (for example "<</Length 5>>") is copied through
     * unchanged instead of being handed to pack().
     *
     * @param string $hexa       text that may contain hexadecimal groups
     * @param bool   $add_braces when true, each decoded group is wrapped in
     *                           parentheses and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $text = '';
        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        foreach ($parts as $part) {
            $digits = null;

            if (preg_match('/^<.*>$/s', $part)) {
                // Drop line breaks and any other white-space before looking
                // at the digits.
                $candidate = trim(preg_replace('/\s+/', '', $part), '<>');

                if (preg_match('/\A[0-9a-f]*\z/i', $candidate)) {
                    $digits = $candidate;
                }
            }

            if (null === $digits) {
                $text .= $part;

                continue;
            }

            $bytes = pack('H*', $digits);

            if ($add_braces) {
                $text .= '('.str_replace('\\', '\\\\', $bytes).')';
            } else {
                $text .= $bytes;
            }
        }

        return $text;
    }
285
286
    /**
     * Replaces octal escapes (a backslash followed by one to three octal
     * digits) by the byte they designate.
     *
     * Only the digits 0-7 are accepted and values above 255 are reduced to
     * their low byte. An escaped backslash (two backslashes in a row) is left
     * exactly as it is, so that its second backslash cannot be mistaken for
     * the start of an octal escape.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        $decoded = preg_replace_callback(
            '/\\\\(\\\\|[0-7]{1,3})/',
            static function (array $match) {
                // Escaped backslash: keep the pair untouched.
                if ('\\' === $match[1]) {
                    return $match[0];
                }

                return \chr(octdec($match[1]) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on failure; keep the input then.
        return null === $decoded ? $text : $decoded;
    }
306
307
    /**
     * Replaces every "#" followed by two decimal digits by the byte whose
     * hexadecimal value is given by those two digits.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        $segments = preg_split('/(#\d{2})/s', $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $decoded = [];

        foreach ($segments as $segment) {
            $isEntity = preg_match('/^#\d{2}$/', $segment);

            // Skip the leading "#" and read the two digits as hexadecimal.
            $decoded[] = $isEntity ? \chr(hexdec(substr($segment, 1))) : $segment;
        }

        return implode('', $decoded);
    }
327
328
    /**
     * Converts text that starts with the byte order mark FE FF from UTF-16BE
     * to UTF-8; any other text is returned unchanged.
     *
     * The whole string is converted in one go, so that surrogate pairs yield
     * the character they stand for instead of being handled one 16-bit unit
     * at a time. Malformed input (a lone surrogate, a dangling last byte) is
     * treated according to mbstring's substitution rules.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        // Exact byte comparison: the mark has no upper or lower case.
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = (string) substr($text, 2);

        return mb_convert_encoding($payload, 'UTF-8', 'UTF-16BE');
    }
348
349
    /**
     * Gives the current value of the static $font_space setting.
     *
     * @return int
     */
    public static function getFontSpaceLimit()
    {
        $limit = self::$font_space;

        return $limit;
    }
356
357
    /**
     * Changes the static $font_space setting returned by getFontSpaceLimit().
     *
     * @param int $font_space new value
     *
     * @return void
     */
    public static function setFontSpaceLimit($font_space)
    {
        self::$font_space = $font_space;
    }
366
367
    /**
     * Turns a list of text commands into a decoded string.
     *
     * Commands are handled according to their PDFObject::TYPE entry:
     *  - 'n': carries no text; when its value is lower than the font space
     *    limit, the text that follows starts a new word;
     *  - '<': the command is decoded as a hexadecimal string;
     *  - anything else: octal escapes in the command are decoded.
     *
     * Backslash escapes are then resolved, the pieces belonging to the same
     * word are concatenated, every word goes through decodeContent() and the
     * words are joined with a single space.
     *
     * @param array $commands commands indexed by PDFObject::TYPE and PDFObject::COMMAND
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $word_position = 0;
        $words = [];
        $font_space = self::getFontSpaceLimit();

        // strtr() scans the text once and never re-reads what it has just
        // written, so an escaped backslash followed by "n" stays a backslash
        // and an "n" instead of turning into a line feed.
        $escapes = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;

                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = strtr($text, $escapes);

            // add content to result string
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        foreach ($words as $index => $word) {
            $words[$index] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
419
420
    /**
     * Converts raw text drawn with this font to UTF-8.
     *
     * The first matching rule wins:
     *  1. the font has a "ToUnicode" entry: the text is cut into codes of
     *     $this->tableSizes['from'] bytes and each one is translated with
     *     translateChar(), then with the descendant fonts, then replaced by
     *     self::MISSING;
     *  2. the font's "Encoding" entry is an Encoding object: each character
     *     (or each byte when the text is not valid UTF-8) is translated by
     *     that object and converted with uchr();
     *  3. the "Encoding" entry is an Element equal to "MacRomanEncoding": the
     *     text is converted with iconv(); if iconv() fails the text is left
     *     as it is;
     *  4. the text is not valid UTF-8: it is converted from Windows-1252.
     *
     * @param string $text
     * @param bool   $unicode This parameter is deprecated and might be removed in a future release
     *                        (it is only written, by rule 2)
     *
     * @return string
     */
    public function decodeContent($text, &$unicode = null)
    {
        if ($this->has('ToUnicode')) {
            $bytes = $this->tableSizes['from'];

            if (!$bytes) {
                return $text;
            }

            $result = '';
            $length = \strlen($text);

            for ($i = 0; $i < $length; $i += $bytes) {
                $char = substr($text, $i, $bytes);

                if (false !== ($decoded = $this->translateChar($char, false))) {
                    $char = $decoded;
                } elseif ($this->has('DescendantFonts')) {
                    $char = $this->decodeWithDescendantFonts($char);
                } else {
                    $char = self::MISSING;
                }

                $result .= $char;
            }

            return $result;
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');
            $unicode = mb_check_encoding($text, 'UTF-8');

            // Valid UTF-8 is walked character by character, anything else
            // byte by byte. An empty string is valid UTF-8, so str_split()
            // never receives one.
            $chars = $unicode
                ? preg_split('//su', $text, -1, PREG_SPLIT_NO_EMPTY)
                : str_split($text);

            $result = '';
            foreach ($chars as $char) {
                $dec_av = hexdec(bin2hex($char));
                $dec_ap = $encoding->translateChar($dec_av);
                $result .= self::uchr($dec_ap);
            }

            return $result;
        }

        if ($this->get('Encoding') instanceof Element &&
            $this->get('Encoding')->equals('MacRomanEncoding')) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            $converted = iconv('macintosh', 'UTF-8', $text);

            // iconv() returns false on failure; never hand that to callers.
            return false === $converted ? $text : $converted;
        }

        if (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }

    /**
     * Translates a code with the first descendant font that knows it; the
     * result, or the code itself when no font knows it, is converted from
     * Windows-1252 to UTF-8.
     *
     * @param string $char raw bytes of a single character code
     *
     * @return string
     */
    private function decodeWithDescendantFonts($char)
    {
        $descendants = $this->get('DescendantFonts');

        if ($descendants instanceof PDFObject) {
            $fonts = $descendants->getHeader()->getElements();
        } else {
            $fonts = $descendants->getContent();
        }

        foreach ($fonts as $font) {
            if (!$font instanceof self) {
                continue;
            }

            $decoded = $font->translateChar($char, false);

            if (false !== $decoded) {
                return mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
            }
        }

        return mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
    }
511
}
512