Completed
Pull Request — master (#349)
by
unknown
05:52
created

Font::decodeText()   B

Complexity

Conditions 7
Paths 14

Size

Total Lines 46
Code Lines 27

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 24
CRAP Score 7.0031

Importance

Changes 6
Bugs 1 Features 0
Metric Value
cc 7
eloc 27
c 6
b 1
f 0
nc 14
nop 1
dl 0
loc 46
ccs 24
cts 25
cp 0.96
crap 7.0031
rs 8.5546
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
34
 * Class Font
35
 */
36
class Font extends PDFObject
37
{
38
    /**
     * Placeholder returned by translateChar() when a character cannot be
     * translated and the caller asked for the default.
     */
    const MISSING = '?';

    /**
     * Lookup table mapping a numeric character code to its decoded string.
     * Stays null until loadTranslateTable() or setTable() fills it.
     *
     * @var array|null
     */
    protected $table = null;

    /**
     * Widths, in bytes, of the source ('from') and destination ('to') codes
     * found in the ToUnicode stream. Stays null until loadTranslateTable() runs.
     *
     * @var array|null
     */
    protected $tableSizes = null;
49
50 11
    /**
     * Prepares the font for use by building its translation table.
     */
    public function init()
    {
        // The table is cached on the instance, so the return value is not needed here.
        $this->loadTranslateTable();
    }
55
56
    /**
     * Returns the font name taken from the 'BaseFont' entry.
     *
     * @return string the 'BaseFont' value cast to string, or '[Unknown]' when the entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
63
64
    /**
     * Returns the font type as declared by the 'Subtype' header entry.
     *
     * @return string the 'Subtype' header value cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
71
72
    /**
     * Collects the font's descriptive details.
     *
     * The font-specific keys ('Name', 'Type', 'Encoding') take precedence over
     * keys of the same name returned by the parent implementation.
     *
     * @param bool $deep forwarded unchanged to the parent implementation
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $fontDetails = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
        ];

        // 'Ansi' is reported when the font declares no encoding at all.
        if ($this->has('Encoding')) {
            $fontDetails['Encoding'] = (string) $this->get('Encoding');
        } else {
            $fontDetails['Encoding'] = 'Ansi';
        }

        // Array union: keys already present in $fontDetails are kept.
        return $fontDetails + parent::getDetails($deep);
    }
87
88
    /**
     * Translates one raw character (one or more bytes) through the lookup table.
     *
     * The bytes of $char are read as a single big-endian number, which is the
     * key looked up in the table.
     *
     * @param string $char        raw bytes of the character to translate
     * @param bool   $use_default when true, an unmapped character yields self::MISSING;
     *                            when false, it yields the fallback described below
     *
     * @return string|bool the table entry when the code is mapped; otherwise self::MISSING
     *                     or the fallback (the input itself, or its code point converted
     *                     to UTF-8 for single-byte characters of a WinAnsiEncoding font)
     */
    public function translateChar($char, $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // Fallback for single-byte ANSI characters that are missing from the lookup table.
        $isSingleByte = \strlen($char) < 2;
        $fallback = $char;
        if ($isSingleByte && $this->has('Encoding') && 'WinAnsiEncoding' === $this->get('Encoding')->__toString()) {
            $fallback = self::uchr($code);
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
110
111
    /**
     * Converts a numeric code point into its UTF-8 string.
     *
     * @param int $code code point; the value is cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // so the numeric entity is converted with mb_convert_encoding() instead.
        $entity = sprintf('&#%d;', (int) $code);

        return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
122
123
    /**
     * Builds (once) and returns the character translation table from the
     * font's 'ToUnicode' stream.
     *
     * Three kinds of sections are read from the stream content:
     *  - codespacerange: only used to derive the byte widths in $tableSizes
     *    (first section only);
     *  - bfchar: single source code => destination string;
     *  - bfrange: either <from> <to> <firstDestination> or
     *    <from> <to> [<dst1> <dst2> ...].
     *
     * When the table has already been built or set, it is returned as-is.
     * Without a 'ToUnicode' entry the table stays empty and both byte widths are 1.
     *
     * @return array map of numeric character code => decoded string
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();
        $sections = [];

        // Code space ranges: the first section alone determines the byte widths.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            $ranges = [];
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';
            preg_match_all($regexp, $sections['sections'][0], $ranges);

            $this->tableSizes = [
                'from' => self::hexByteLength(reset($ranges['from'])),
                'to' => self::hexByteLength(reset($ranges['to'])),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                $pairs = [];
                $regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';
                preg_match_all($regexp, $section, $pairs);

                // The source width follows the first code of the section
                // (1 when the section holds no pair).
                $this->tableSizes['from'] = self::hexByteLength(reset($pairs['from']));

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::hexStringToText($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $simple = [];
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';
                preg_match_all($regexp, $section, $simple);

                foreach ($simple['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($simple['to'][$key]);
                    $offset = hexdec($simple['offset'][$key]);

                    // Consecutive source codes map to consecutive destination code points.
                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the 2-byte Unicode values on new lines, hence \r\n in the class.
                $listed = [];
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';
                preg_match_all($regexp, $section, $listed);

                foreach ($listed['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $listed['strings'][$key], $strings);

                    // The n-th listed destination belongs to source code $char_from + n.
                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::hexStringToText($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Returns the number of bytes a hexadecimal code occupies, never less than 1.
     *
     * The result is always an integer: an odd number of hex digits is rounded
     * up, so the width can safely be used as a substr() length and loop step.
     *
     * @param string|false $hex hex digits, or false when no code was matched
     *
     * @return int
     */
    private static function hexByteLength($hex)
    {
        return max(1, (int) ceil(\strlen((string) $hex) / 2));
    }

    /**
     * Decodes a destination hex string by converting every group of four hex
     * digits (and any shorter remainder) to the UTF-8 string of that code point.
     *
     * @param string $hex hex digits without the surrounding angle brackets
     *
     * @return string
     */
    private static function hexStringToText($hex)
    {
        $parts = preg_split(
            '/([0-9A-F]{4})/i',
            $hex,
            0,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );

        $text = '';
        foreach ($parts as $part) {
            $text .= self::uchr(hexdec($part));
        }

        return $text;
    }
233
234
    /**
     * Replaces the translation table with the given one.
     *
     * @param array $table new lookup table, stored as-is
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
241
242
    /**
     * Replaces every <hex> token found in the input by the bytes it encodes.
     *
     * Input containing '<?xml' (case-insensitive) is returned untouched.
     *
     * @param string $hexa       text that may contain hexadecimal tokens such as <48656C6C6F>
     * @param bool   $add_braces when true, each decoded token is wrapped in parentheses
     *                           and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isBracketed = preg_match('/^<.*>$/', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isBracketed) {
                // Plain text between tokens is copied verbatim.
                $decoded .= $chunk;
                continue;
            }

            $binary = pack('H*', trim($chunk, '<>'));

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $decoded .= $binary;
            }
        }

        return $decoded;
    }
278
279
    /**
     * Replaces octal escape sequences (backslash followed by one to three
     * octal digits) by the byte they denote.
     *
     * Only the digits 0-7 are accepted. An escaped backslash (two consecutive
     * backslashes) is skipped over and left untouched, so digits that follow it
     * are not mistaken for an octal escape; unescaping the backslash itself is
     * left to the caller. Values above 255 keep only their low-order byte.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        $decoded = preg_replace_callback(
            '/\\\\(?:\\\\|([0-7]{1,3}))/',
            static function (array $match) {
                // No captured digits: the match is an escaped backslash, keep it verbatim.
                if (!isset($match[1])) {
                    return $match[0];
                }

                return \chr(octdec($match[1]) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE failure; fall back to the input.
        return null === $decoded ? $text : $decoded;
    }
299
300
    /**
     * Replaces every '#xx' escape, where xx are two hexadecimal digits, by the
     * byte with that value.
     *
     * Upper- and lower-case hex digits are accepted (e.g. '#20', '#2F', '#4a').
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        $decoded = preg_replace_callback(
            '/#([0-9A-Fa-f]{2})/',
            static function (array $match) {
                return \chr(hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE failure; fall back to the input.
        return null === $decoded ? $text : $decoded;
    }
320
321
    /**
     * Converts a string that starts with the byte order mark FE FF into UTF-8.
     *
     * The bytes after the mark are read two at a time as big-endian 16-bit
     * units. A high surrogate followed by a low surrogate is combined into a
     * single code point, so characters outside the Basic Multilingual Plane
     * are decoded as one character. A trailing odd byte is decoded on its own.
     * Text without the mark is returned unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if ("\xFE\xFF" !== substr((string) $text, 0, 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = substr($text, 2);
        $length = \strlen($payload);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $unit = hexdec(bin2hex(substr($payload, $i, 2)));

            // A high surrogate needs the next complete 16-bit unit to form a code point.
            if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 3 < $length) {
                $next = hexdec(bin2hex(substr($payload, $i + 2, 2)));

                if ($next >= 0xDC00 && $next <= 0xDFFF) {
                    $unit = 0x10000 + (($unit - 0xD800) << 10) + ($next - 0xDC00);
                    $i += 2; // the low surrogate has been consumed as well
                }
            }

            $decoded .= self::uchr($unit);
        }

        return $decoded;
    }
341
342
    /**
     * Returns the threshold used by decodeText(): a numeric command whose
     * value is below this limit starts a new word.
     *
     * @return int
     */
    protected function getFontSpaceLimit()
    {
        return -50;
    }
349
350
    /**
     * Decodes a list of text commands into a single string.
     *
     * Each command is an array indexed by PDFObject::TYPE and PDFObject::COMMAND:
     *  - type 'n': a number; a value below getFontSpaceLimit() starts a new word;
     *  - type '<': a hexadecimal string, decoded with decodeHexadecimal();
     *  - any other type: a string whose octal escapes are decoded with decodeOctal().
     *
     * Backslash escapes are then resolved, the pieces are grouped into words,
     * every word goes through decodeContent() and the words are joined with a
     * single space.
     *
     * @param array $commands
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $word_position = 0;
        $words = [];
        // decodeContent() takes this argument by reference; its value is not used afterwards.
        $unicode = false;
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;

                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars in a single left-to-right pass, so that the
            // backslash produced by an escaped backslash is never combined with
            // the character that follows it.
            $text = strtr($text, [
                '\\\\' => '\\',
                '\(' => '(',
                '\)' => ')',
                '\n' => "\n",
                '\r' => "\r",
                '\t' => "\t",
                '\f' => "\f",
                '\ ' => ' ',
            ]);

            // Add content to the current word.
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        foreach ($words as $position => $word) {
            $words[$position] = $this->decodeContent($word, $unicode);
        }

        return implode(' ', $words);
    }
402
403
    /**
     * Decodes raw text bytes into UTF-8 according to the font's settings.
     *
     * The first matching strategy is used:
     *  1. a 'ToUnicode' entry: the text is cut into chunks of $tableSizes['from']
     *     bytes and every chunk is translated with translateChar();
     *  2. an 'Encoding' entry that is an Encoding object: every character is
     *     mapped through that object and converted with uchr();
     *  3. an 'Encoding' element equal to 'MacRomanEncoding': conversion with iconv();
     *  4. otherwise, text that is not valid UTF-8 is converted from Windows-1252.
     *
     * @param string    $text
     * @param bool|null $unicode optional, by reference; only written in strategy 2, where it
     *                           receives whether $text is valid UTF-8
     *
     * @deprecated Usage of second parameter $unicode is deprecated. It might be removed in a future release.
     *
     * @return string
     */
    public function decodeContent($text, &$unicode = null)
    {
        if ($this->has('ToUnicode')) {
            $bytes = $this->tableSizes['from'];

            if ($bytes) {
                $result = '';
                $length = \strlen($text);

                for ($i = 0; $i < $length; $i += $bytes) {
                    $char = substr($text, $i, $bytes);

                    // NOTE(review): translateChar() in this class has no explicit `return false`,
                    // so the fallbacks below appear to run only when the table itself
                    // maps the code to false - confirm the intent.
                    if (false !== ($decoded = $this->translateChar($char, false))) {
                        $char = $decoded;
                    } elseif ($this->has('DescendantFonts')) {
                        if ($this->get('DescendantFonts') instanceof PDFObject) {
                            $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
                        } else {
                            $fonts = $this->get('DescendantFonts')->getContent();
                        }
                        $decoded = false;

                        // The first descendant font able to translate the chunk wins.
                        foreach ($fonts as $font) {
                            if ($font instanceof self) {
                                if (false !== ($decoded = $font->translateChar($char, false))) {
                                    $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                                    break;
                                }
                            }
                        }

                        if (false !== $decoded) {
                            $char = $decoded;
                        } else {
                            $char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
                        }
                    } else {
                        $char = self::MISSING;
                    }

                    $result .= $char;
                }

                $text = $result;
            }
        } elseif ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');
            $unicode = mb_check_encoding($text, 'UTF-8');
            $result = '';
            if ($unicode) {
                // Valid UTF-8: split into characters rather than bytes.
                $chars = preg_split(
                    '//su',
                    $text,
                    -1,
                    PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
                );

                foreach ($chars as $char) {
                    $dec_av = hexdec(bin2hex($char));
                    $dec_ap = $encoding->translateChar($dec_av);
                    $result .= self::uchr($dec_ap);
                }
            } else {
                $length = \strlen($text);

                for ($i = 0; $i < $length; ++$i) {
                    $dec_av = hexdec(bin2hex($text[$i]));
                    $dec_ap = $encoding->translateChar($dec_av);
                    $result .= self::uchr($dec_ap);
                }
            }
            $text = $result;
        } elseif ($this->get('Encoding') instanceof Element &&
                  $this->get('Encoding')->equals('MacRomanEncoding')) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            $text = iconv('macintosh', 'UTF-8', $text);
        } elseif (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            $text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }
495
}
496