Completed
Pull Request — master (#349)
by
unknown
04:33 queued 02:38
created

Font::uchr()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
c 2
b 0
f 0
nc 1
nop 1
dl 0
loc 5
ccs 2
cts 2
cp 1
crap 1
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
34
 * Class Font
35
 */
36
class Font extends PDFObject
37
{
38
    const MISSING = '?';
39
40
    /**
41
     * @var array
42
     */
43
    protected $table = null;
44
45
    /**
46
     * @var array
47
     */
48
    protected $tableSizes = null;
49
50 23
    /**
     * Prepares the font for use by building its character translation table.
     */
    public function init()
    {
        // Called for its side effect only: loadTranslateTable() fills
        // $this->table and $this->tableSizes; the returned table is not needed here.
        $this->loadTranslateTable();
    }
55
56
    /**
     * Returns the font name stored under the "BaseFont" entry.
     *
     * @return string the "BaseFont" value cast to string, or "[Unknown]" when the entry is missing
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
63
64
    /**
     * Returns the font type as declared by the "Subtype" entry of the object header.
     *
     * @return string the "Subtype" header value cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
71
72
    /**
     * Collects descriptive details about this font.
     *
     * The font-specific keys "Name", "Type" and "Encoding" come first; the details
     * reported by the parent class are appended without overriding those keys.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $name = $this->getName();
        $type = $this->getType();

        // "Ansi" is reported when the font carries no explicit Encoding entry.
        $encoding = 'Ansi';
        if ($this->has('Encoding')) {
            $encoding = (string) $this->get('Encoding');
        }

        $own = [
            'Name' => $name,
            'Type' => $type,
            'Encoding' => $encoding,
        ];

        // Array union keeps the left-hand value for keys present on both sides.
        return $own + parent::getDetails($deep);
    }
87
88
    /**
     * Translates one raw character code into its text representation.
     *
     * The bytes of $char are read as one big-endian number and looked up in the
     * translation table. When the code is not in the table:
     *  - with $use_default = true, the MISSING placeholder is returned;
     *  - otherwise a single-byte code of a font whose Encoding is "WinAnsiEncoding"
     *    is converted with uchr(), and any other input is returned unchanged.
     *
     * @param string $char        raw character bytes
     * @param bool   $use_default whether to return self::MISSING for unknown codes
     *
     * @return string|bool this implementation always returns a string
     */
    public function translateChar($char, $use_default = true)
    {
        // hexdec() may hand back a float; the table keys and uchr() both expect an integer.
        $dec = (int) hexdec(bin2hex($char));

        // The table is null until loadTranslateTable() or setTable() has run;
        // array_key_exists() must not be given null.
        if (null !== $this->table && \array_key_exists($dec, $this->table)) {
            return $this->table[$dec];
        }

        if ($use_default) {
            return self::MISSING;
        }

        // fallback for decoding single-byte ANSI characters that are not in the lookup table
        if (\strlen($char) < 2 && $this->has('Encoding') && 'WinAnsiEncoding' === $this->get('Encoding')->__toString()) {
            return self::uchr($dec);
        }

        return $char;
    }
110
111
    /**
     * Converts a numeric code point into its UTF-8 encoded character.
     *
     * @param int $code code point; the value is cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        // Build a numeric character reference such as "&#8364;" ...
        $entity = sprintf('&#%d;', (int) $code);

        // ... and let mbstring resolve it: html_entity_decode() will not work with
        // UTF-16 or UTF-32 char entities, therefore mb_convert_encoding() is used.
        // NOTE(review): the 'HTML-ENTITIES' pseudo-encoding is deprecated as of PHP 8.2 - confirm target versions.
        return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
122
123
    /**
     * Builds (once) and returns the table that maps character codes to text.
     *
     * The table is parsed from the content of the "ToUnicode" entry when the font
     * has one. Three kinds of sections are read:
     *  - begincodespacerange/endcodespacerange: only the first section is used, to
     *    derive the byte width of source ("from") and target ("to") codes;
     *  - beginbfchar/endbfchar: one-to-one mappings;
     *  - beginbfrange/endbfrange: both "<from> <to> <start>" and
     *    "<from> <to> [<s1> <s2> ...]" forms.
     *
     * When the table already exists (built earlier or injected through setTable())
     * it is returned untouched.
     *
     * @return array
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();
        $sections = [];

        // Code space ranges: only the first section determines the code widths.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            $ranges = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $sections['sections'][0],
                $ranges
            );

            $this->tableSizes = [
                'from' => self::hexByteWidth(isset($ranges['from'][0]) ? $ranges['from'][0] : ''),
                'to' => self::hexByteWidth(isset($ranges['to'][0]) ? $ranges['to'][0] : ''),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                $pairs = [];
                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

                // The source width is re-derived from the first entry of every bfchar section.
                $this->tableSizes['from'] = self::hexByteWidth(isset($pairs['from'][0]) ? $pairs['from'][0] : '');

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[(int) hexdec($from)] = self::hexToText($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $ranges = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $ranges
                );

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = (int) hexdec($from);
                    $char_to = (int) hexdec($ranges['to'][$key]);
                    $offset = (int) hexdec($ranges['offset'][$key]);

                    // Consecutive source codes map to consecutive code points starting at $offset.
                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF file has 2-byte Unicode values on new lines > added \r\n
                $lists = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $lists
                );

                foreach ($lists['from'] as $key => $from) {
                    $char_from = (int) hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                    // The n-th listed string belongs to source code $char_from + n.
                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::hexToText($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Returns the number of bytes a hexadecimal code occupies (never less than 1).
     *
     * @param string $hex hexadecimal digits without the surrounding angle brackets
     *
     * @return int
     */
    private static function hexByteWidth($hex)
    {
        // Round up so that an odd digit count never yields a fractional width.
        return max(1, (int) ceil(\strlen($hex) / 2));
    }

    /**
     * Decodes a hexadecimal destination string into text, one code point per group of four digits.
     *
     * @param string $hex hexadecimal digits without the surrounding angle brackets
     *
     * @return string
     */
    private static function hexToText($hex)
    {
        $parts = preg_split('/([0-9A-F]{4})/i', $hex, 0, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $text = '';

        foreach ($parts as $part) {
            $text .= self::uchr((int) hexdec($part));
        }

        return $text;
    }
233
234
    /**
     * Replaces the character translation table with the given one.
     *
     * @param array $table new translation table; stored as-is
     */
    public function setTable($table)
    {
        // A non-null table makes loadTranslateTable() return it instead of parsing again.
        $this->table = $table;
    }
241
242
    /**
     * Replaces every hexadecimal string of the form <48656C6C6F> by the bytes it encodes.
     *
     * Text outside of such strings is copied through untouched. Input that contains
     * an XML prolog marker ("<?xml") is returned unchanged.
     *
     * @param string $hexa       text that may contain hexadecimal strings
     * @param bool   $add_braces when true, each decoded string is wrapped in parentheses
     *                           and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        // Split while keeping the <...> delimiters as chunks of their own.
        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isBracketed = preg_match('/^<.*>$/', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isBracketed) {
                $decoded .= $chunk;
                continue;
            }

            $binary = pack('H*', trim($chunk, '<>'));

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $decoded .= $binary;
            }
        }

        return $decoded;
    }
278
279
    /**
     * Decodes octal escape sequences (a backslash followed by one to three octal digits).
     *
     * Differences from a plain "backslash + three digits" replacement:
     *  - only the digits 0-7 are accepted, so "\189" decodes "\1" and keeps "89";
     *  - short forms such as "\0" or "\12" are decoded as well;
     *  - an escaped backslash ("\\") is skipped as a unit and left unchanged, so the
     *    digits that follow it (as in "\\101") are not mistaken for an octal escape.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        $decoded = preg_replace_callback(
            '/\\\\(\\\\|[0-7]{1,3})/',
            static function ($match) {
                if ('\\' === $match[1]) {
                    // Escaped backslash: consume the pair but leave it for later unescaping.
                    return $match[0];
                }

                // Values above 0377 keep only their low byte.
                return \chr(((int) octdec($match[1])) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE failure; fall back to the input.
        return null === $decoded ? $text : $decoded;
    }
299
300
    /**
     * Decodes "#xx" escapes, where xx are two hexadecimal digits, into the byte they denote.
     *
     * Both decimal-looking ("#20") and letter-containing ("#2F", "#c3") escapes are
     * handled; the digits are always interpreted as hexadecimal.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        $decoded = preg_replace_callback(
            '/#([0-9a-f]{2})/i',
            static function ($match) {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE failure; fall back to the input.
        return null === $decoded ? $text : $decoded;
    }
320
321
    /**
     * Converts text that starts with the U+FEFF byte order marker (bytes FE FF)
     * from UTF-16BE to UTF-8. Text without that marker is returned unchanged.
     *
     * The payload is converted as a whole so that surrogate pairs (characters
     * outside the Basic Multilingual Plane) are combined correctly.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = (string) substr($text, 2);

        // A dangling final byte is treated as the low byte of a code unit,
        // i.e. it decodes to the code point equal to its byte value.
        if (1 === \strlen($payload) % 2) {
            $payload = substr($payload, 0, -1)."\x00".substr($payload, -1);
        }

        return mb_convert_encoding($payload, 'UTF-8', 'UTF-16BE');
    }
341
342
    /**
     * Returns the threshold used by decodeText() to detect a word break: a numeric
     * command whose value is below this limit starts a new word.
     *
     * @return int
     */
    protected function getFontSpaceLimit()
    {
        $limit = -50;

        return $limit;
    }
349
350
    /**
     * Decodes a list of text commands into a plain string.
     *
     * Each command is an array read through the PDFObject::TYPE and PDFObject::COMMAND keys:
     *  - type "n": a number; a value below getFontSpaceLimit() starts a new word;
     *  - type "<": a hexadecimal string, decoded with decodeHexadecimal();
     *  - any other type: a literal string, decoded with decodeOctal().
     *
     * Escaped characters are then resolved, the pieces of each word are concatenated,
     * every word goes through decodeContent(), and the words are joined with spaces.
     *
     * @param array $commands
     *
     * @return string
     */
    public function decodeText($commands)
    {
        // Backslash escapes resolved in the decoded pieces.
        $escapes = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
                        // Subsequent text goes into a new word slot.
                        $word_position = \count($words);
                    }
                    // Numbers carry no text: move on to the next command.
                    continue 2;

                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            // strtr() scans the text once and never re-reads its own output, so an
            // escaped backslash followed by "n" stays a backslash and an "n"
            // instead of collapsing into a newline.
            $text = strtr($text, $escapes);

            // add content to result string
            $words[$word_position] = (isset($words[$word_position]) ? $words[$word_position] : '').$text;
        }

        foreach ($words as $index => $word) {
            $words[$index] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
402
403
    /**
     * Converts raw string bytes to UTF-8 text according to what the font provides.
     *
     * The first matching rule wins:
     *  1. a "ToUnicode" entry: the text is cut into codes of tableSizes['from'] bytes
     *     and each code is translated with translateChar();
     *  2. an "Encoding" entry that is an Encoding object: every character is mapped
     *     through that object and turned into UTF-8 with uchr();
     *  3. an "Encoding" element equal to "MacRomanEncoding": conversion with iconv();
     *  4. text that is not valid UTF-8: conversion from Windows-1252.
     * Anything else is returned unchanged.
     *
     * @param string $text
     * @param mixed  $unicode only written in case 2, where it receives the result of
     *                        the UTF-8 validity check of $text
     *
     * @deprecated Usage of second parameter $unicode is deprecated. It might be removed in a future release.
     *
     * @return string
     */
    public function decodeContent($text, &$unicode = null)
    {
        if ($this->has('ToUnicode')) {
            $bytes = $this->tableSizes['from'];

            if ($bytes) {
                $result = '';
                $length = \strlen($text);

                for ($i = 0; $i < $length; $i += $bytes) {
                    $char = substr($text, $i, $bytes);

                    // NOTE(review): translateChar() as defined in this class always returns a
                    // string, so the two fallback branches below look unreachable unless a
                    // subclass overrides it - confirm.
                    if (false !== ($decoded = $this->translateChar($char, false))) {
                        $char = $decoded;
                    } elseif ($this->has('DescendantFonts')) {
                        if ($this->get('DescendantFonts') instanceof PDFObject) {
                            $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
                        } else {
                            $fonts = $this->get('DescendantFonts')->getContent();
                        }
                        $decoded = false;

                        // Use the first descendant font that can translate the code.
                        foreach ($fonts as $font) {
                            if ($font instanceof self) {
                                if (false !== ($decoded = $font->translateChar($char, false))) {
                                    $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                                    break;
                                }
                            }
                        }

                        if (false !== $decoded) {
                            $char = $decoded;
                        } else {
                            $char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
                        }
                    } else {
                        $char = self::MISSING;
                    }

                    $result .= $char;
                }

                $text = $result;
            }
        } elseif ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');
            $unicode = mb_check_encoding($text, 'UTF-8');
            $result = '';
            if ($unicode) {
                // Valid UTF-8: split into characters rather than bytes.
                $chars = preg_split(
                    '//su',
                    $text,
                    -1,
                    PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
                );

                foreach ($chars as $char) {
                    $dec_av = hexdec(bin2hex($char));
                    $dec_ap = $encoding->translateChar($dec_av);
                    $result .= self::uchr($dec_ap);
                }
            } else {
                // Not valid UTF-8: handle the text byte by byte.
                $length = \strlen($text);

                for ($i = 0; $i < $length; ++$i) {
                    $dec_av = hexdec(bin2hex($text[$i]));
                    $dec_ap = $encoding->translateChar($dec_av);
                    $result .= self::uchr($dec_ap);
                }
            }
            $text = $result;
        } elseif ($this->get('Encoding') instanceof Element &&
                  $this->get('Encoding')->equals('MacRomanEncoding')) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            $converted = iconv('macintosh', 'UTF-8', $text);

            // iconv() returns false on failure; keep the original text in that
            // case so that the method still returns a string.
            if (false !== $converted) {
                $text = $converted;
            }
        } elseif (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            $text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }
495
}
496