Passed
Push — master ( 416ff0...c2c117 )
by Konrad
04:12 queued 02:09
created

Font::setTable()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
c 1
b 0
f 0
nc 1
nop 1
dl 0
loc 3
ccs 2
cts 2
cp 1
crap 1
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
use Smalot\PdfParser\Exception\EncodingNotFoundException;
35
36
/**
37
 * Class Font
38
 */
39
class Font extends PDFObject
40
{
41
    const MISSING = '?';
42
43
    /**
44
     * @var array
45
     */
46
    protected $table = null;
47
48
    /**
49
     * @var array
50
     */
51
    protected $tableSizes = null;
52
53
    /**
54
     * Caches results from uchr.
55
     *
56
     * @var array
57
     */
58
    private static $uchrCache = [];
59
60 35
    /**
     * Initialises the font by building its translation table.
     *
     * Delegates to loadTranslateTable(); the table it returns is not used here.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
65
66 2
    /**
     * Returns the name of the font.
     *
     * @return string the 'BaseFont' entry cast to string, or '[Unknown]' when that entry is absent
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
70
71 2
    /**
     * Returns the font type.
     *
     * @return string the 'Subtype' entry of the object header, cast to string
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
75
76 1
    /**
     * Collects the font's descriptive details.
     *
     * The font-specific keys ('Name', 'Type', 'Encoding') come first and take
     * precedence; keys from the parent's details are only added when they are
     * not already present.
     *
     * @param bool $deep forwarded unchanged to the parent implementation
     *
     * @return array font details merged with the parent's details
     */
    public function getDetails(bool $deep = true): array
    {
        $name = $this->getName();
        $type = $this->getType();

        // 'Ansi' is reported when the font declares no encoding.
        $encoding = 'Ansi';
        if ($this->has('Encoding')) {
            $encoding = (string) $this->get('Encoding');
        }

        $fontDetails = [
            'Name' => $name,
            'Type' => $type,
            'Encoding' => $encoding,
        ];

        // Array union: entries already set above win over the parent's.
        return $fontDetails + parent::getDetails($deep);
    }
88
89
    /**
     * Translates one raw character code through the font's lookup table.
     *
     * The bytes of $char are read as a big-endian number and looked up in the
     * translation table. When there is no entry, the result depends on
     * $use_default: self::MISSING when true, otherwise a fallback that is either
     * $char itself or, for single-byte codes of a WinAnsi-encoded font, the
     * Unicode character with that code.
     *
     * @param string $char        raw bytes of the character code
     * @param bool   $use_default whether to return self::MISSING for unknown codes
     *
     * @return string|bool
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // Fallback for single-byte ANSI characters that are missing from the lookup table.
        $fallback = $char;
        $isSingleByte = \strlen($char) < 2;

        if ($isSingleByte && $this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            try {
                $encodingClass = $this->get('Encoding')->__toString();
                if (WinAnsiEncoding::class === $encodingClass) {
                    $fallback = self::uchr($code);
                }
            } catch (EncodingNotFoundException $e) {
                // Encoding->getEncodingClass() throws EncodingNotFoundException when BaseEncoding doesn't exist.
                // See table 5.11 of the PDF 1.5 specification for more information.
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
119
120 34
    /**
     * Returns the UTF-8 string for a numeric character code, memoising results.
     *
     * @param int $code numeric character code
     *
     * @return string UTF-8 encoded character
     */
    public static function uchr(int $code): string
    {
        if (isset(self::$uchrCache[$code])) {
            return self::$uchrCache[$code];
        }

        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // so the numeric entity is converted with mb_convert_encoding() instead.
        $character = mb_convert_encoding("&#{$code};", 'UTF-8', 'HTML-ENTITIES');
        self::$uchrCache[$code] = $character;

        return self::$uchrCache[$code];
    }
130
131 35
    /**
     * Builds (once) and returns the table mapping character codes to UTF-8 text.
     *
     * The table is parsed from the font's 'ToUnicode' stream, when present:
     *  - the first "codespacerange" section determines the byte width of codes,
     *  - every "bfchar" section contributes single code => string mappings,
     *  - every "bfrange" section contributes either an offset range
     *    (<from> <to> <dst>) or an explicit list (<from> <to> [<dst1> <dst2> ...]).
     *
     * Subsequent calls return the table built (or set) earlier.
     *
     * @return array map of integer character code => UTF-8 string
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = (string) $this->get('ToUnicode')->getContent();

        // Only the first codespacerange section (and its first range) sizes the codes.
        if (preg_match('/begincodespacerange(?P<section>.*?)endcodespacerange/s', $content, $spaceSection)) {
            $rangeRegexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

            if (preg_match($rangeRegexp, $spaceSection['section'], $range)) {
                $this->tableSizes = [
                    'from' => self::hexByteLength($range['from']),
                    'to' => self::hexByteLength($range['to']),
                ];
            }
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $charSections)) {
            $pairRegexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

            foreach ($charSections['sections'] as $section) {
                // A section without any parsable pair must not reset the code size.
                if (!preg_match_all($pairRegexp, $section, $pairs)) {
                    continue;
                }

                $this->tableSizes['from'] = self::hexByteLength($pairs['from'][0]);

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[(int) hexdec($from)] = self::decodeUtf16Hex($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $rangeSections)) {
            // One pattern captures both entry forms, so that tokens inside an
            // array-form entry can never be mistaken for an offset-form entry.
            // Some PDF files put 2-byte Unicode values on new lines, hence \r\n in the list.
            $entryRegexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *'
                .'(?P<dest><[0-9A-F]+>|\[[\r\n<>0-9A-F ]+\])[ \r\n]+/is';

            foreach ($rangeSections['sections'] as $section) {
                if (!preg_match_all($entryRegexp, $section, $entries)) {
                    continue;
                }

                foreach ($entries['from'] as $key => $from) {
                    $charFrom = (int) hexdec($from);
                    $dest = $entries['dest'][$key];

                    if ('[' !== $dest[0]) {
                        // Support for : <srcCode1> <srcCode2> <dstString>
                        $charTo = (int) hexdec($entries['to'][$key]);
                        $offset = (int) hexdec(trim($dest, '<>'));

                        for ($char = $charFrom; $char <= $charTo; ++$char) {
                            $this->table[$char] = self::uchr($char - $charFrom + $offset);
                        }

                        continue;
                    }

                    // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                    $strings = [];
                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $dest, $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$charFrom + $position] = self::decodeUtf16Hex($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Returns the number of whole bytes encoded by a hex string, at least 1.
     */
    private static function hexByteLength(string $hex): int
    {
        return max(1, intdiv(\strlen($hex), 2));
    }

    /**
     * Decodes a hex string of 4-digit UTF-16 code units into UTF-8.
     *
     * A high surrogate directly followed by a low surrogate is combined into
     * one code point; any other unit (including a shorter trailing remainder)
     * is converted on its own.
     */
    private static function decodeUtf16Hex(string $hex): string
    {
        $units = preg_split('/([0-9A-F]{4})/i', $hex, 0, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $count = \count($units);
        $text = '';

        for ($i = 0; $i < $count; ++$i) {
            $code = (int) hexdec($units[$i]);

            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 1 < $count) {
                $low = (int) hexdec($units[$i + 1]);

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    ++$i;
                }
            }

            $text .= self::uchr($code);
        }

        return $text;
    }
238
239 1
    /**
     * Replaces the font's translation table.
     *
     * @param array $table new translation table; it is stored as given
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
243
244 38
    /**
     * Decodes <hex> strings embedded in a text into their raw bytes.
     *
     * Text containing "<?xml" (case-insensitive) is returned untouched. Everything
     * outside of angle-bracketed chunks is copied verbatim.
     *
     * @param string $hexa       text that may contain <hex> strings
     * @param bool   $add_braces when true, each decoded chunk is wrapped in
     *                           parentheses and its backslashes are doubled
     *
     * @return string the text with hex chunks replaced by their decoded bytes
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $chunks = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($chunks as $chunk) {
            $isBracketed = preg_match('/^<.*>$/s', $chunk) && false === stripos($chunk, '<?xml');

            if (!$isBracketed) {
                $decoded .= $chunk;

                continue;
            }

            // Strip line breaks, then the surrounding angle brackets.
            $digits = trim(str_replace(["\r", "\n"], '', $chunk), '<>');
            $bytes = pack('H*', $digits);

            if ($add_braces) {
                $decoded .= '('.str_replace('\\', '\\\\', $bytes).')';
            } else {
                $decoded .= $bytes;
            }
        }

        return $decoded;
    }
276
277 38
    /**
     * Replaces octal escape sequences (backslash + 1 to 3 octal digits) by the
     * byte they denote.
     *
     * An escaped backslash (two consecutive backslashes) is left untouched and
     * consumed as a unit, so digits following it are not mistaken for an octal
     * escape. Values above 255 wrap around modulo 256.
     *
     * @param string $text text that may contain octal escapes
     *
     * @return string the text with octal escapes decoded
     */
    public static function decodeOctal(string $text): string
    {
        $decoded = preg_replace_callback(
            '/\\\\(\\\\|[0-7]{1,3})/',
            static function (array $match): string {
                // Escaped backslash: keep as is, it is unescaped elsewhere.
                if ('\\' === $match[1]) {
                    return $match[0];
                }

                return \chr(((int) octdec($match[1])) % 256);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; fall back to the input.
        return null === $decoded ? $text : $decoded;
    }
292
293 52
    /**
     * Decodes "#xx" escapes, where xx are two hexadecimal digits, into the
     * byte they denote.
     *
     * Both upper- and lower-case hex digits are accepted. A '#' that is not
     * followed by two hex digits is left unchanged.
     *
     * @param string $text text that may contain #xx escapes
     *
     * @return string the text with the escapes decoded
     */
    public static function decodeEntities(string $text): string
    {
        $decoded = preg_replace_callback(
            '/#([0-9a-f]{2})/i',
            static function (array $match): string {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; fall back to the input.
        return null === $decoded ? $text : $decoded;
    }
308
309 38
    /**
     * Converts a UTF-16BE string that starts with the byte order marker
     * (0xFE 0xFF) into UTF-8.
     *
     * Text without the marker is returned unchanged. Surrogate pairs are
     * decoded as a single code point. A dangling final byte is treated as the
     * low byte of a code unit whose high byte is zero.
     *
     * @param string $text possibly BOM-prefixed UTF-16BE text
     *
     * @return string UTF-8 text, or the input when it carries no marker
     */
    public static function decodeUnicode(string $text): string
    {
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = (string) substr($text, 2);

        if ('' === $payload) {
            return '';
        }

        if (1 === \strlen($payload) % 2) {
            // Widen the trailing single byte into a full 16-bit code unit.
            $payload = substr($payload, 0, -1)."\x00".substr($payload, -1);
        }

        return mb_convert_encoding($payload, 'UTF-8', 'UTF-16BE');
    }
324
325
    /**
     * Returns the font space limit from the configuration.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
332
333 19
    /**
     * Decodes a list of text commands into a single string.
     *
     * Commands are processed in order:
     *  - type 'n' (number): when its value is below the font space limit, the
     *    following text starts a new word; the command itself yields no text;
     *  - type '<': the operand is decoded as a hexadecimal string;
     *  - any other type: octal escapes in the operand are decoded.
     * Backslash escapes are then resolved, the pieces are grouped into words,
     * each word is passed through decodeContent() and the words are joined
     * with single spaces.
     *
     * @param array $commands commands indexed by PDFObject::TYPE and PDFObject::COMMAND
     *
     * @return string the decoded text
     */
    public function decodeText(array $commands): string
    {
        // Applied with strtr() so that all escapes are resolved in a single pass:
        // the output of one replacement is never re-read as the start of another
        // escape (an escaped backslash followed by "n" stays backslash + "n").
        $escapes = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars.
            $text = strtr($text, $escapes);

            // Add content to the current word.
            $words[$word_position] = ($words[$word_position] ?? '').$text;
        }

        // Assign by key rather than by reference, so no reference outlives the loop.
        foreach ($words as $index => $word) {
            $words[$index] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
377
378
    /**
     * Decodes raw text of this font into UTF-8.
     *
     * The strategy depends on what the font declares:
     *  1. 'ToUnicode': the text is cut into codes of the table's byte width and
     *     each code is translated with translateChar(); if that returns false,
     *     descendant fonts are tried, then a Windows-1252 conversion;
     *  2. an Encoding object: each character is mapped through the encoding;
     *  3. the 'MacRomanEncoding' name: the text is converted with iconv();
     *  4. otherwise: text that is not valid UTF-8 is converted from Windows-1252.
     *
     * @param string    $text    raw text to decode
     * @param bool|null $unicode This parameter is deprecated and might be removed in a future release.
     *                           It is only written in case 2, with the result of the UTF-8 check.
     *
     * @return string the decoded text
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            // Whole bytes only: a fractional width would make the loop below skip bytes.
            $bytes = (int) $this->tableSizes['from'];

            if ($bytes > 0) {
                $result = '';
                $length = \strlen($text);

                for ($i = 0; $i < $length; $i += $bytes) {
                    $char = substr($text, $i, $bytes);

                    if (false !== ($decoded = $this->translateChar($char, false))) {
                        $char = $decoded;
                    } elseif ($this->has('DescendantFonts')) {
                        $descendants = $this->get('DescendantFonts');

                        if ($descendants instanceof PDFObject) {
                            $fonts = $descendants->getHeader()->getElements();
                        } else {
                            $fonts = $descendants->getContent();
                        }

                        $decoded = false;

                        foreach ($fonts as $font) {
                            if ($font instanceof self) {
                                if (false !== ($decoded = $font->translateChar($char, false))) {
                                    $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                                    break;
                                }
                            }
                        }

                        if (false !== $decoded) {
                            $char = $decoded;
                        } else {
                            $char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
                        }
                    } else {
                        $char = self::MISSING;
                    }

                    $result .= $char;
                }

                $text = $result;
            }
        } elseif ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');
            $unicode = mb_check_encoding($text, 'UTF-8');
            $result = '';

            if ($unicode) {
                // Valid UTF-8: split into characters rather than bytes.
                // NOTE(review): for multi-byte characters the value passed on is the
                // number formed by the UTF-8 bytes, not the code point - confirm intent.
                $chars = preg_split('//su', $text, -1, \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY);

                foreach ($chars as $char) {
                    $dec_av = hexdec(bin2hex($char));
                    $dec_ap = $encoding->translateChar($dec_av);
                    $result .= self::uchr($dec_ap ?? $dec_av);
                }
            } else {
                $length = \strlen($text);

                for ($i = 0; $i < $length; ++$i) {
                    $dec_av = hexdec(bin2hex($text[$i]));
                    $dec_ap = $encoding->translateChar($dec_av);
                    $result .= self::uchr($dec_ap ?? $dec_av);
                }
            }

            $text = $result;
        } elseif ($this->get('Encoding') instanceof Element &&
                  $this->get('Encoding')->equals('MacRomanEncoding')) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            $converted = iconv('macintosh', 'UTF-8', $text);

            // iconv() returns false on failure; keep the original text in that case
            // instead of letting false be coerced to an empty string.
            if (false !== $converted) {
                $text = $converted;
            }
        } elseif (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            $text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }
466
}
467