Passed
Pull Request — master (#433)
by
unknown
02:12
created

Font::decodeText()   B

Complexity

Conditions 7
Paths 14

Size

Total Lines 45
Code Lines 27

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 25
CRAP Score 7

Importance

Changes 5
Bugs 1 Features 0
Metric Value
cc 7
eloc 27
nc 14
nop 1
dl 0
loc 45
rs 8.5546
c 5
b 1
f 0
ccs 25
cts 25
cp 1
crap 7
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
35
/**
36
 * Class Font
37
 */
38
class Font extends PDFObject
39
{
40
    const MISSING = '?';
41
42
    /**
43
     * @var array
44
     */
45
    protected $table = null;
46
47
    /**
48
     * @var array
49
     */
50
    protected $tableSizes = null;
51
52 31
    /**
     * Loads the character translation table for this font.
     */
    public function init()
    {
        // Build the lookup table used by translateChar().
        $this->loadTranslateTable();
    }
57
58
    /**
     * Returns the font name taken from the "BaseFont" entry.
     *
     * @return string the "BaseFont" value cast to string, or "[Unknown]" when that entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
65
66
    /**
     * Returns the font type taken from the header's "Subtype" entry.
     *
     * @return string the "Subtype" value cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
73
74
    /**
     * Collects descriptive details about this font.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array "Name", "Type" and "Encoding" entries, completed with the parent's details
     */
    public function getDetails($deep = true)
    {
        $details = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
        ];

        if ($this->has('Encoding')) {
            $details['Encoding'] = (string) $this->get('Encoding');
        } else {
            // No explicit encoding entry: report the "Ansi" default.
            $details['Encoding'] = 'Ansi';
        }

        // Array union: entries set above win over identically named parent entries.
        return $details + parent::getDetails($deep);
    }
89
90
    /**
     * Translates one raw character through the font's lookup table.
     *
     * The bytes of $char are read as one big-endian number and looked up in the
     * translation table. When no entry exists, the result depends on $use_default:
     * either the MISSING placeholder, or a best-effort fallback (the character itself,
     * or its UTF-8 form for single-byte characters of a WinAnsi-encoded font).
     *
     * @param string $char        raw byte sequence of the character to translate
     * @param bool   $use_default true to get self::MISSING for unknown characters,
     *                            false to get the fallback decoding instead
     *
     * @return string|bool
     */
    public function translateChar($char, $use_default = true)
    {
        // The table property starts out as null; build it on demand so that
        // array_key_exists() is never handed a null table.
        if (null === $this->table) {
            $this->loadTranslateTable();
        }

        $dec = hexdec(bin2hex($char));

        if (\array_key_exists($dec, $this->table)) {
            return $this->table[$dec];
        }

        // The fallback below is only ever returned when no default is wanted,
        // so skip computing it otherwise.
        if ($use_default) {
            return self::MISSING;
        }

        // fallback for decoding single-byte ANSI characters that are not in the lookup table
        if (\strlen($char) < 2 && $this->has('Encoding')) {
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                try {
                    if (WinAnsiEncoding::class === $encoding->__toString()) {
                        return self::uchr($dec);
                    }
                } catch (EncodingNotFoundException $e) {
                    // Encoding->getEncodingClass() throws an exception when BaseEncoding doesn't exists
                    // See table 5.11 on PDF 1.5 specs for more info
                    // NOTE(review): no "use" statement for EncodingNotFoundException is visible in
                    // this file, so the name resolves inside the current namespace - confirm that
                    // this is the class actually thrown.
                }
            }
        }

        return $char;
    }
123
124
    /**
     * Converts a Unicode code point into its UTF-8 representation.
     *
     * @param int $code code point; it is cast to int before being used
     *
     * @return string
     */
    public static function uchr($code)
    {
        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore, we use mb_convert_encoding() instead
        $entity = sprintf('&#%d;', (int) $code);

        return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
135
136
    /**
     * Builds (once) and returns the table mapping character codes to UTF-8 text.
     *
     * When the font has a "ToUnicode" entry, its content is scanned for
     * codespacerange, bfchar and bfrange sections. The result is kept in
     * $this->table, so later calls return it without parsing again.
     *
     * @return array map of character code => UTF-8 string
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if ($this->has('ToUnicode')) {
            $content = $this->get('ToUnicode')->getContent();

            // Order matters: bfchar sections may overwrite the "from" size set by the
            // codespacerange, and bfrange entries may overwrite bfchar entries.
            $this->applyCodeSpaceRange($content);
            $this->applyBfCharSections($content);
            $this->applyBfRangeSections($content);
        }

        return $this->table;
    }

    /**
     * Sets the source/target code sizes (in bytes) from the first codespacerange section.
     *
     * @param string $content content of the ToUnicode entry
     */
    private function applyCodeSpaceRange($content)
    {
        if (!preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $found)) {
            return;
        }

        // Only the first section is taken into account; any further one is ignored.
        $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';
        preg_match_all($regexp, $found['sections'][0], $pairs);

        // Two hexadecimal digits make one byte; sizes never drop below 1.
        $this->tableSizes = [
            'from' => max(1, \strlen(current($pairs['from'])) / 2),
            'to' => max(1, \strlen(current($pairs['to'])) / 2),
        ];
    }

    /**
     * Adds every "<src> <dst>" pair of every bfchar section to the table.
     *
     * @param string $content content of the ToUnicode entry
     */
    private function applyBfCharSections($content)
    {
        if (!preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $found)) {
            return;
        }

        $regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

        foreach ($found['sections'] as $section) {
            preg_match_all($regexp, $section, $pairs);

            // The source code size follows the first pair of the section being read.
            $this->tableSizes['from'] = max(1, \strlen(current($pairs['from'])) / 2);

            foreach ($pairs['from'] as $key => $from) {
                $this->table[hexdec($from)] = self::hexToUtf8($pairs['to'][$key]);
            }
        }
    }

    /**
     * Adds the mappings described by every bfrange section to the table.
     *
     * @param string $content content of the ToUnicode entry
     */
    private function applyBfRangeSections($content)
    {
        if (!preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $found)) {
            return;
        }

        foreach ($found['sections'] as $section) {
            // Support for : <srcCode1> <srcCode2> <dstString>
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';
            preg_match_all($regexp, $section, $ranges);

            foreach ($ranges['from'] as $key => $from) {
                $char_from = hexdec($from);
                $char_to = hexdec($ranges['to'][$key]);
                $offset = hexdec($ranges['offset'][$key]);

                // Consecutive source codes map to consecutive code points starting at $offset.
                for ($char = $char_from; $char <= $char_to; ++$char) {
                    $this->table[$char] = self::uchr($char - $char_from + $offset);
                }
            }

            // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
            // Some PDF file has 2-byte Unicode values on new lines > added \r\n
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';
            preg_match_all($regexp, $section, $lists);

            foreach ($lists['from'] as $key => $from) {
                $char_from = hexdec($from);
                $strings = [];

                preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                // The n-th listed string belongs to source code $char_from + n.
                foreach ($strings['string'] as $position => $string) {
                    $this->table[$char_from + $position] = self::hexToUtf8($string);
                }
            }
        }
    }

    /**
     * Converts a hexadecimal string into UTF-8 text, one code point per group of 4 digits.
     *
     * A trailing group shorter than 4 digits is converted as a code point of its own.
     *
     * @param string $hex hexadecimal digits without the surrounding angle brackets
     *
     * @return string
     */
    private static function hexToUtf8($hex)
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr(hexdec($part));
        }

        return $text;
    }
246
247
    /**
     * Replaces the translation table used by this font.
     *
     * @param array $table map of character code => replacement text
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
254
255
    /**
     * Replaces every "<hex digits>" group found in a string by the bytes it encodes.
     *
     * Text outside such groups is copied unchanged. A string containing "<?xml"
     * is returned untouched.
     *
     * @param string $hexa       text that may contain hexadecimal groups
     * @param bool   $add_braces when true, each decoded group is wrapped in parentheses
     *                           and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            // Anything not enclosed in angle brackets is plain text. No chunk can contain
            // "<?xml" at this point: the whole input was checked for it above.
            if (!preg_match('/^<.*>$/s', $chunk)) {
                $decoded .= $chunk;
                continue;
            }

            // strip line breaks, then the surrounding brackets
            $digits = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $bytes = pack('H*', $digits);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $bytes).')';
            } else {
                $decoded .= $bytes;
            }
        }

        return $decoded;
    }
293
294
    /**
     * Replaces octal escapes (a backslash followed by 1 to 3 octal digits) by the byte they encode.
     *
     * An escaped backslash ("\\") is left untouched and shields the digits after it,
     * so "\\101" stays as it is instead of being read as the escape "\101".
     * Values above 255 keep only their low-order 8 bits.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        $decoded = preg_replace_callback(
            // A backslash followed by either another backslash or 1-3 octal digits.
            '/\\\\(\\\\|[0-7]{1,3})/',
            static function (array $match) {
                if ('\\' === $match[1]) {
                    // Escaped backslash: consume the pair so it cannot start an octal escape.
                    return $match[0];
                }

                return \chr(((int) octdec($match[1])) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; keep the input in that case.
        return null === $decoded ? $text : $decoded;
    }
314
315
    /**
     * Replaces "#xx" escapes, where xx are two hexadecimal digits, by the byte they encode.
     *
     * Both digits are read as hexadecimal, so letters (A-F, either case) are accepted
     * as well as 0-9. A "#" not followed by two hexadecimal digits is left unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        $decoded = preg_replace_callback(
            '/#([0-9A-Fa-f]{2})/',
            static function (array $match) {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; keep the input in that case.
        return null === $decoded ? $text : $decoded;
    }
335
336
    /**
     * Converts a string that starts with the byte order mark FE FF from 2-byte units to UTF-8.
     *
     * The text after the mark is read two bytes at a time, each pair being one
     * big-endian code unit. A high surrogate directly followed by a low surrogate
     * is merged into the single code point the pair stands for. A string without
     * the leading mark is returned unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        // Exact byte comparison; a case-insensitive regex could accept other bytes
        // depending on the active locale.
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $decode = substr($text, 2);
        $length = \strlen($decode);
        $text = '';

        for ($i = 0; $i < $length; $i += 2) {
            $code = hexdec(bin2hex(substr($decode, $i, 2)));

            // High surrogate with a complete unit after it: try to merge the pair.
            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 3 < $length) {
                $low = hexdec(bin2hex(substr($decode, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    // The low surrogate has been consumed as well.
                    $i += 2;
                }
            }

            $text .= self::uchr($code);
        }

        return $text;
    }
356
357
    /**
     * Returns the font space limit held by the configuration object.
     *
     * @return int
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit()
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
366
367
    /**
     * Decodes a list of text commands into a single string.
     *
     * Each command is an array read through the PDFObject::TYPE and PDFObject::COMMAND
     * keys. Commands of type "n" carry a number: when it is below the font space
     * limit, the following text starts a new word. Commands of type "<" carry
     * hexadecimal text; any other type is treated as text with octal escapes.
     * Words are decoded with decodeContent() and joined with single spaces.
     *
     * @param array $commands
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        // Escape sequences and their replacements. They are applied with strtr(), which
        // scans the text once: the backslash produced by "\\" can therefore not be
        // combined with the next character into a second escape (e.g. "\\n").
        $unescape = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
                        // Start a new word after the ones collected so far.
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = strtr($text, $unescape);

            // add content to result string
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        // Write back by key rather than by reference, so no reference outlives the loop.
        foreach ($words as $index => $word) {
            $words[$index] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
418
419
    /**
     * Decodes raw text to UTF-8 according to what the font provides.
     *
     * The first applicable rule wins:
     *  - a "ToUnicode" entry: the text is cut into chunks of the table's source size and
     *    each chunk goes through translateChar(), with the descendant fonts as backup;
     *  - an "Encoding" entry that is an Encoding object: each character is translated
     *    by that object;
     *  - an "Encoding" element equal to "MacRomanEncoding": conversion with iconv();
     *  - text that is not valid UTF-8: conversion from Windows-1252;
     *  - otherwise the text is returned as it is.
     *
     * @param string $text
     * @param bool   $unicode This parameter is deprecated and might be removed in a future release.
     *                        It is only written to when the Encoding object rule applies.
     *
     * @return string
     */
    public function decodeContent($text, &$unicode = null)
    {
        if ($this->has('ToUnicode')) {
            $step = $this->tableSizes['from'];

            if (!$step) {
                return $text;
            }

            $output = '';
            $total = \strlen($text);

            for ($pos = 0; $pos < $total; $pos += $step) {
                $raw = substr($text, $pos, $step);
                $translated = $this->translateChar($raw, false);

                if (false !== $translated) {
                    $output .= $translated;
                    continue;
                }

                if (!$this->has('DescendantFonts')) {
                    $output .= self::MISSING;
                    continue;
                }

                // Ask the descendant fonts, in order, until one translates the chunk.
                $descendants = $this->get('DescendantFonts');
                if ($descendants instanceof PDFObject) {
                    $fonts = $descendants->getHeader()->getElements();
                } else {
                    $fonts = $descendants->getContent();
                }

                $fromDescendant = false;
                foreach ($fonts as $font) {
                    if (!($font instanceof self)) {
                        continue;
                    }

                    $fromDescendant = $font->translateChar($raw, false);
                    if (false !== $fromDescendant) {
                        $fromDescendant = mb_convert_encoding($fromDescendant, 'UTF-8', 'Windows-1252');
                        break;
                    }
                }

                if (false === $fromDescendant) {
                    // No descendant could help: convert the raw chunk itself.
                    $fromDescendant = mb_convert_encoding($raw, 'UTF-8', 'Windows-1252');
                }

                $output .= $fromDescendant;
            }

            return $output;
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');
            $unicode = mb_check_encoding($text, 'UTF-8');

            if ($unicode) {
                // Valid UTF-8: split into characters.
                $chars = preg_split(
                    '//su',
                    $text,
                    -1,
                    \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY
                );
            } else {
                // Otherwise: split into single bytes.
                $chars = [];
                $total = \strlen($text);
                for ($pos = 0; $pos < $total; ++$pos) {
                    $chars[] = $text[$pos];
                }
            }

            $output = '';
            foreach ($chars as $char) {
                $dec_av = hexdec(bin2hex($char));
                $dec_ap = $encoding->translateChar($dec_av);
                $output .= self::uchr($dec_ap);
            }

            return $output;
        }

        if ($this->get('Encoding') instanceof Element
            && $this->get('Encoding')->equals('MacRomanEncoding')
        ) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            return iconv('macintosh', 'UTF-8', $text);
        }

        if (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }
510
}
511