Passed
Pull Request — master (#378)
by Konrad
02:01
created

Font::getName()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
ccs 2
cts 2
cp 1
cc 2
nc 2
nop 0
crap 2
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
35
/**
36
 * Class Font
37
 */
38
class Font extends PDFObject
39
{
40
    const MISSING = '?';
41
42
    /**
43
     * @var array
44
     */
45
    protected $table = null;
46
47
    /**
48
     * @var array
49
     */
50
    protected $tableSizes = null;
51
52 26
    /**
     * Prepares the font for use by building its character translation table.
     */
    public function init()
    {
        // The return value is ignored on purpose: the call is made for its
        // effect of filling $this->table and $this->tableSizes.
        $this->loadTranslateTable();
    }
57
58
    /**
     * Returns the font name stored under the "BaseFont" entry.
     *
     * @return string the "BaseFont" value cast to string, or "[Unknown]" when the entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
65
66
    /**
     * Returns the font type, read from the "Subtype" entry of the object header.
     *
     * @return string the "Subtype" header value cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
73
74
    /**
     * Collects the font's name, type and encoding together with the details
     * reported by the parent implementation.
     *
     * @param bool $deep passed through unchanged to parent::getDetails()
     *
     * @return array "Name", "Type" and "Encoding" entries, followed by the parent's details
     */
    public function getDetails($deep = true)
    {
        $fontDetails = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            // "Ansi" is reported when the font declares no encoding.
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        // Array union: the three keys set above win over same-named parent keys.
        return $fontDetails + parent::getDetails($deep);
    }
89
90
    /**
     * Translates one raw character code through the font's lookup table.
     *
     * The bytes of $char are read as one big-endian number, which is the key
     * looked up in the table. When the key is missing, the result depends on
     * $use_default: either the MISSING placeholder, or a fallback which is the
     * raw $char itself, except for single-byte characters of a font whose
     * encoding string equals the WinAnsiEncoding class name, where the code is
     * converted with uchr().
     *
     * @param string $char        raw bytes of the character to translate
     * @param bool   $use_default when true, unknown characters yield self::MISSING
     *
     * @return string
     */
    public function translateChar($char, $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // The fallback is evaluated even when $use_default is true, so the
        // encoding lookup happens on every table miss.
        $isSingleByteWinAnsi = \strlen($char) < 2
            && $this->has('Encoding')
            && WinAnsiEncoding::class === $this->get('Encoding')->__toString();

        $fallback = $isSingleByteWinAnsi ? self::uchr($code) : $char;

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
116
117
    /**
     * Converts a numeric character code to its UTF-8 representation.
     *
     * @param int $code character code; it is cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        // The code is written as a numeric HTML entity and decoded with
        // mb_convert_encoding(), because html_entity_decode() will not work
        // with UTF-16 or UTF-32 char entities.
        $entity = sprintf('&#%d;', (int) $code);

        return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
128
129
    /**
     * Builds the character translation table from the font's "ToUnicode"
     * content and returns it.
     *
     * The table is built on the first call only; later calls return the
     * stored array. Keys are source character codes (decimal), values are the
     * UTF-8 strings they map to. $this->tableSizes is filled with the byte
     * width of source ("from") and target ("to") codes, defaulting to 1.
     *
     * Three kinds of sections are read: "codespacerange" (code widths, first
     * section only), "bfchar" (single mappings) and "bfrange" (range mappings,
     * both the offset form and the array form).
     *
     * @return array
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        // Number of bytes a hex code occupies, never less than 1. The result
        // is always an integer: decodeContent() uses it as a substr() length
        // and loop step, so a fractional width (odd number of hex digits)
        // must be rounded up rather than kept as a float.
        $byteLength = static function ($hex) {
            return max(1, (int) ceil(\strlen((string) $hex) / 2));
        };

        // Converts a hex string to text, one 4-digit group at a time; a
        // trailing group shorter than 4 digits is converted on its own.
        $hexToText = static function ($hex) {
            $text = '';
            $parts = preg_split(
                '/([0-9A-F]{4})/i',
                $hex,
                0,
                PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
            );

            foreach ($parts as $part) {
                $text .= self::uchr((int) hexdec($part));
            }

            return $text;
        };

        // Code widths: only the first codespacerange section is considered.
        $spaceSections = [];
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $spaceSections)) {
            $ranges = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $spaceSections['sections'][0],
                $ranges
            );

            $this->tableSizes = [
                'from' => $byteLength(current($ranges['from'])),
                'to' => $byteLength(current($ranges['to'])),
            ];
        }

        // Support for multiple bfchar sections
        $charSections = [];
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $charSections)) {
            foreach ($charSections['sections'] as $section) {
                $pairs = [];
                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

                // The source width is taken from the first entry of the section.
                $this->tableSizes['from'] = $byteLength(current($pairs['from']));

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = $hexToText($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        $rangeSections = [];
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $rangeSections)) {
            foreach ($rangeSections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $offsetRanges = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $offsetRanges
                );

                foreach ($offsetRanges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($offsetRanges['to'][$key]);
                    $offset = hexdec($offsetRanges['offset'][$key]);

                    // NOTE(review): the loop length comes straight from the
                    // file content; a very wide range means a very long loop.
                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF file has 2-byte Unicode values on new lines > added \r\n
                $arrayRanges = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $arrayRanges
                );

                foreach ($arrayRanges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $arrayRanges['strings'][$key], $strings);

                    // The n-th listed string maps to source code ($char_from + n).
                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = $hexToText($string);
                    }
                }
            }
        }

        return $this->table;
    }
239
240
    /**
     * Replaces the translation table consulted by translateChar().
     *
     * No validation or copy is performed; the given value is stored as is.
     *
     * @param array $table lookup table to use from now on
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
247
248
    /**
     * Decodes hexadecimal strings written between angle brackets (<48656C6C6F>)
     * into their binary value, leaving the surrounding text untouched.
     *
     * Input containing "<?xml" anywhere is returned unchanged.
     *
     * @param string $hexa       text that may contain <...> hexadecimal strings
     * @param bool   $add_braces when true, each decoded string is wrapped in
     *                           parentheses and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isBracketed = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isBracketed) {
                // Plain text between hexadecimal strings is copied verbatim.
                $decoded .= $chunk;

                continue;
            }

            // Drop line breaks, then the enclosing angle brackets.
            $digits = trim(preg_replace("/[\r\n]/", '', $chunk), '<>');
            $bytes = pack('H*', $digits);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $bytes).')';
            } else {
                $decoded .= $bytes;
            }
        }

        return $decoded;
    }
286
287
    /**
     * Replaces three-digit octal escapes (a backslash followed by three octal
     * digits, e.g. \101) with the byte they denote.
     *
     * Only the digits 0-7 are accepted. A backslash followed by digits that
     * include 8 or 9 is not an octal escape and is left untouched instead of
     * being handed to octdec() with invalid digits.
     *
     * @param string $text text that may contain octal escapes
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        $chunks = preg_split('/(\\\\[0-7]{3})/s', $text, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            if (preg_match('/^\\\\[0-7]{3}$/', $chunk)) {
                // Skip the leading backslash; octdec() may return a float in
                // general, so the value is cast before being given to chr().
                $decoded .= \chr((int) octdec(substr($chunk, 1)));
            } else {
                $decoded .= $chunk;
            }
        }

        return $decoded;
    }
307
308
    /**
     * Replaces every "#" followed by two decimal digits with the byte whose
     * hexadecimal value those two digits spell (e.g. "#20" becomes a space).
     *
     * Sequences containing the hex letters A-F are not matched by the pattern
     * and are therefore left unchanged.
     *
     * @param string $text text that may contain #NN sequences
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        return preg_replace_callback(
            '/#\d{2}/s',
            static function (array $match) {
                // $match[0] is "#NN"; the two digits are read as hexadecimal.
                return \chr(hexdec(substr($match[0], 1)));
            },
            $text
        );
    }
328
329
    /**
     * Converts text that starts with the bytes FE FF (a big-endian byte order
     * mark) from 2-byte code units to UTF-8. Any other text is returned as is.
     *
     * A high surrogate (D800-DBFF) immediately followed by a low surrogate
     * (DC00-DFFF) is decoded as one character, so characters outside the
     * Basic Multilingual Plane are preserved instead of being split in two.
     *
     * @param string $text raw string, possibly prefixed with FE FF
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if (!preg_match('/^\xFE\xFF/i', $text)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = substr($text, 2);
        $length = \strlen($payload);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $unit = (int) hexdec(bin2hex(substr($payload, $i, 2)));

            // A surrogate pair needs the two bytes at $i + 2 and $i + 3.
            if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 3 < $length) {
                $next = (int) hexdec(bin2hex(substr($payload, $i + 2, 2)));

                if ($next >= 0xDC00 && $next <= 0xDFFF) {
                    $decoded .= mb_convert_encoding(substr($payload, $i, 4), 'UTF-8', 'UTF-16BE');
                    // Skip the low surrogate that was just consumed.
                    $i += 2;

                    continue;
                }
            }

            $decoded .= self::uchr($unit);
        }

        return $decoded;
    }
349
350
    /**
     * Returns the threshold used by decodeText() to detect a word break:
     * a numeric command whose value is below this limit starts a new word.
     *
     * @return int
     */
    protected function getFontSpaceLimit()
    {
        $limit = -50;

        return $limit;
    }
357
358
    /**
     * Decodes a list of text commands into a single string.
     *
     * Each command is an array read through the PDFObject::TYPE and
     * PDFObject::COMMAND keys:
     *  - type "n": a number; a value below getFontSpaceLimit() starts a new word,
     *  - type "<": a hexadecimal string, decoded with decodeHexadecimal(),
     *  - any other type: a string whose octal escapes are decoded with decodeOctal().
     *
     * Backslash escapes are then resolved, the pieces are grouped into words,
     * every word is passed through decodeContent() and the words are joined
     * with single spaces.
     *
     * @param array $commands
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        // Escape sequences and their replacements. They are applied with
        // strtr() so the text is scanned once: an escaped backslash followed
        // by "n" stays a backslash and an "n" instead of being unescaped a
        // second time into a line feed.
        $unescape = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
                        $word_position = \count($words);
                    }
                    // Numbers carry no text of their own.
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = strtr($text, $unescape);

            // add content to result string
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        // Indexed assignment avoids leaving a dangling reference behind.
        foreach ($words as $index => $word) {
            $words[$index] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
409
410
    /**
     * Converts raw text drawn with this font to UTF-8.
     *
     * The first applicable strategy is used:
     *  1. the font has a "ToUnicode" entry: the text is cut into chunks of
     *     $this->tableSizes['from'] bytes and each chunk goes through
     *     translateChar(), with the descendant fonts as a fallback;
     *  2. the "Encoding" entry is an Encoding object: every character is
     *     mapped with its translateChar() and converted with uchr();
     *  3. the "Encoding" entry is an Element equal to "MacRomanEncoding":
     *     iconv() converts from "macintosh";
     *  4. the text is not valid UTF-8: it is converted from Windows-1252;
     *  5. otherwise the text is returned unchanged.
     *
     * @param string $text
     * @param bool   $unicode This parameter is deprecated and might be removed in a future release.
     *                        It is only written to in case 2, where it receives the result of
     *                        the UTF-8 validity check of $text.
     *
     * @return string
     */
    public function decodeContent($text, &$unicode = null)
    {
        if ($this->has('ToUnicode')) {
            $width = $this->tableSizes['from'];

            if (!$width) {
                return $text;
            }

            $total = \strlen($text);
            $output = '';

            for ($offset = 0; $offset < $total; $offset += $width) {
                $char = substr($text, $offset, $width);
                $decoded = $this->translateChar($char, false);

                if (false !== $decoded) {
                    $output .= $decoded;

                    continue;
                }

                if (!$this->has('DescendantFonts')) {
                    $output .= self::MISSING;

                    continue;
                }

                if ($this->get('DescendantFonts') instanceof PDFObject) {
                    $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
                } else {
                    $fonts = $this->get('DescendantFonts')->getContent();
                }

                // Ask each descendant font in turn; the first hit wins.
                $decoded = false;

                foreach ($fonts as $font) {
                    if (!($font instanceof self)) {
                        continue;
                    }

                    $decoded = $font->translateChar($char, false);

                    if (false !== $decoded) {
                        $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                        break;
                    }
                }

                if (false !== $decoded) {
                    $output .= $decoded;
                } else {
                    $output .= mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
                }
            }

            return $output;
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');
            $unicode = mb_check_encoding($text, 'UTF-8');

            if ($unicode) {
                // Valid UTF-8: split into characters.
                $units = preg_split(
                    '//su',
                    $text,
                    -1,
                    PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
                );
            } else {
                // Not valid UTF-8 (hence not empty): split into single bytes.
                $units = str_split($text);
            }

            $output = '';

            foreach ($units as $unit) {
                $dec_av = hexdec(bin2hex($unit));
                $dec_ap = $encoding->translateChar($dec_av);
                $output .= self::uchr($dec_ap);
            }

            return $output;
        }

        if ($this->get('Encoding') instanceof Element &&
            $this->get('Encoding')->equals('MacRomanEncoding')) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            return iconv('macintosh', 'UTF-8', $text);
        }

        if (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }
501
}
502