Passed
Pull Request — master (#440)
by
unknown
02:15
created

Font::translateChar()   B

Complexity

Conditions 8
Paths 11

Size

Total Lines 26
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 8

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 8
eloc 13
c 2
b 0
f 0
nc 11
nop 2
dl 0
loc 26
ccs 12
cts 12
cp 1
crap 8
rs 8.4444
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
use Smalot\PdfParser\Exception\EncodingNotFoundException;
35
36
/**
37
 * Class Font
38
 */
39
class Font extends PDFObject
40
{
41
    const MISSING = '?';
42
43
    /**
44
     * @var array
45
     */
46
    protected $table = null;
47
48
    /**
49
     * @var array
50
     */
51
    protected $tableSizes = null;
52
53 31
    /**
     * Initialises the font by building its character translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
58
59 2
    /**
     * Returns the value of the `BaseFont` entry as a string, or `[Unknown]`
     * when the font has no such entry.
     */
    public function getName(): string
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
63
64 2
    /**
     * Returns the `Subtype` entry of the font header, cast to a string.
     */
    public function getType(): string
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
68
69 1
    /**
     * Describes the font as an associative array.
     *
     * The `Name`, `Type` and `Encoding` keys are filled in here (the encoding
     * defaults to `Ansi` when the font has no `Encoding` entry); every other
     * key comes from the parent implementation and never overrides those three.
     *
     * @param bool $deep forwarded unchanged to the parent implementation
     */
    public function getDetails(bool $deep = true): array
    {
        $own = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => $this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi',
        ];

        // Array union keeps the left-hand value on key collisions.
        return $own + parent::getDetails($deep);
    }
81
82
    /**
     * Translates one raw character code into decoded text.
     *
     * The code is first looked up in the translation table. On a miss the
     * result depends on $use_default: the MISSING placeholder when true,
     * otherwise a best-effort fallback, which is the WinAnsi decoding for a
     * single-byte code when the font declares that encoding, and the raw
     * input in every other case.
     *
     * @param string $char        raw bytes of the character code
     * @param bool   $use_default whether a table miss yields self::MISSING
     *
     * @return string|bool
     */
    public function translateChar(string $char, bool $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // fallback for decoding single-byte ANSI characters that are not in the lookup table
        $fallback = $char;

        if (\strlen($char) < 2 && $this->has('Encoding')) {
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                try {
                    if (WinAnsiEncoding::class === $encoding->__toString()) {
                        $fallback = self::uchr($code);
                    }
                } catch (EncodingNotFoundException $e) {
                    // Encoding->getEncodingClass() throws EncodingNotFoundException
                    // when BaseEncoding doesn't exist; the raw character is kept.
                    // See table 5.11 on PDF 1.5 specs for more info
                }
            }
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
112
113 30
    /**
     * Converts a Unicode code point into its UTF-8 encoded character.
     *
     * @param int $code code point, rendered as a decimal numeric HTML entity
     *                  before conversion
     */
    public static function uchr(int $code): string
    {
        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore, we use mb_convert_encoding() instead
        $entity = sprintf('&#%d;', $code);

        return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
119
120 31
    /**
     * Builds (once) and returns the table mapping character codes to UTF-8 text.
     *
     * When the font has a `ToUnicode` entry, its content is parsed:
     *  - the first `begincodespacerange` section sets the byte sizes of the
     *    source ("from") and destination ("to") codes;
     *  - every `beginbfchar` section maps single codes to destination strings
     *    and updates the "from" size;
     *  - every `beginbfrange` section maps code ranges, either from a start
     *    value or from an explicit array of destination strings.
     * Without a `ToUnicode` entry the table stays empty and both sizes are 1.
     * A table that is already set (by an earlier call or through setTable())
     * is returned untouched.
     *
     * @return array character code => decoded UTF-8 text
     */
    public function loadTranslateTable(): array
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        // Size in whole bytes of the first hex code of a match list (two hex
        // digits per byte, rounded up, never below 1). Plain division would
        // give a fractional float for an odd digit count, and that value is
        // used later as a substr() length and as a loop step.
        $byteSize = static function (array $hexCodes): int {
            $first = (string) ($hexCodes[0] ?? '');

            return max(1, (int) ceil(\strlen($first) / 2));
        };

        // Decodes a hex string made of 4-digit units (a shorter trailing unit
        // is taken as is) into UTF-8, one code point per unit.
        $hexToText = static function (string $hex): string {
            $units = preg_split(
                '/([0-9A-F]{4})/i',
                $hex,
                0,
                \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
            );
            $text = '';
            foreach ($units as $unit) {
                $text .= self::uchr(hexdec($unit));
            }

            return $text;
        };

        // Support for multiple spacerange sections
        // (only the first section is used to determine the code sizes).
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $found)) {
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $found['sections'][0],
                $codes
            );

            $this->tableSizes = [
                'from' => $byteSize($codes['from']),
                'to' => $byteSize($codes['to']),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $found)) {
            foreach ($found['sections'] as $section) {
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $pairs
                );

                $this->tableSizes['from'] = $byteSize($pairs['from']);

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = $hexToText($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $found)) {
            foreach ($found['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                // NOTE(review): this pattern can also match three consecutive
                // destination strings inside a wrapped array-form entry;
                // confirm whether such input occurs in practice.
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $section,
                    $ranges
                );

                foreach ($ranges['from'] as $key => $from) {
                    $first = hexdec($from);
                    $last = hexdec($ranges['to'][$key]);
                    $target = hexdec($ranges['offset'][$key]);

                    for ($code = $first; $code <= $last; ++$code) {
                        $this->table[$code] = self::uchr($code - $first + $target);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF file has 2-byte Unicode values on new lines > added \r\n
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $section,
                    $lists
                );

                foreach ($lists['from'] as $key => $from) {
                    $first = hexdec($from);

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                    // The n-th destination string belongs to code $first + n.
                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$first + $position] = $hexToText($string);
                    }
                }
            }
        }

        return $this->table;
    }
227
228 1
    /**
     * Replaces the character translation table.
     *
     * @param array $table character code => decoded text
     */
    public function setTable(array $table)
    {
        $this->table = $table;
    }
232
233 34
    /**
     * Decodes every PDF hexadecimal string (`<48656C6C6F>`) found in the input
     * into its raw bytes, leaving all other text untouched.
     *
     * Whitespace inside a hex string is ignored, and `<>` decodes to an empty
     * string. Angle-bracketed content that is not purely hexadecimal (for
     * example a `<< ... >>` dictionary) is left as is instead of being fed to
     * pack(). Input containing `<?xml` is returned unchanged.
     *
     * @param string $hexa       text possibly containing hex strings
     * @param bool   $add_braces when true, each decoded string is wrapped in
     *                           parentheses and its backslashes are doubled
     */
    public static function decodeHexadecimal(string $hexa, bool $add_braces = false): string
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $decoded = preg_replace_callback(
            '/<[a-f0-9\s]*>/i',
            static function (array $match) use ($add_braces): string {
                // Drop the delimiters and any whitespace, keep the hex digits.
                $digits = preg_replace('/[^a-f0-9]/i', '', $match[0]);
                $bytes = pack('H*', $digits);

                if (!$add_braces) {
                    return $bytes;
                }

                // Escape backslashes so the result is a valid literal string.
                return '('.str_replace('\\', '\\\\', $bytes).')';
            },
            $hexa
        );

        // preg_replace_callback() yields null on a PCRE error; keep the input then.
        return $decoded ?? $hexa;
    }
265
266 34
    /**
     * Replaces PDF octal escapes (a backslash followed by one to three octal
     * digits) with the byte they denote.
     *
     * An escaped backslash (`\\`) is consumed as a unit and kept verbatim, so
     * digits following it are not mistaken for an octal escape; unescaping it
     * is left to the caller. Values above 255 keep only their low-order byte.
     *
     * @param string $text literal string content, still escaped
     */
    public static function decodeOctal(string $text): string
    {
        $decoded = preg_replace_callback(
            '/\\\\(\\\\|[0-7]{1,3})/',
            static function (array $match): string {
                if ('\\' === $match[1]) {
                    // Escaped backslash: not the start of an octal escape.
                    return $match[0];
                }

                return \chr(octdec($match[1]) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; keep the input then.
        return $decoded ?? $text;
    }
281
282 48
    /**
     * Replaces every `#` followed by two decimal digits with the byte whose
     * hexadecimal value those two digits spell (`#20` becomes a space).
     *
     * NOTE(review): only the digits 0-9 are recognised, so escapes such as
     * `#2F` are left untouched; confirm whether that restriction is intended.
     *
     * @param string $text text possibly containing `#dd` escapes
     */
    public static function decodeEntities(string $text): string
    {
        return (string) preg_replace_callback(
            '/#\d{2}/',
            static function (array $match): string {
                return \chr(hexdec(substr($match[0], 1)));
            },
            $text
        );
    }
297
298 34
    /**
     * Decodes a string that starts with the bytes FE FF from big-endian
     * UTF-16 to UTF-8; any other input is returned unchanged.
     *
     * The text after the marker is read two bytes at a time. A high surrogate
     * immediately followed by a low surrogate is combined into the single
     * supplementary code point it encodes, so characters outside the Basic
     * Multilingual Plane are decoded as one character instead of two
     * separate code units.
     *
     * @param string $text raw string, possibly prefixed with FE FF
     */
    public static function decodeUnicode(string $text): string
    {
        if (!preg_match('/^\xFE\xFF/i', $text)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $utf16 = (string) substr($text, 2);
        $length = \strlen($utf16);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $unit = hexdec(bin2hex(substr($utf16, $i, 2)));

            // High surrogate with a complete following code unit available.
            if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 3 < $length) {
                $next = hexdec(bin2hex(substr($utf16, $i + 2, 2)));

                if ($next >= 0xDC00 && $next <= 0xDFFF) {
                    $unit = 0x10000 + (($unit - 0xD800) << 10) + ($next - 0xDC00);
                    $i += 2; // the low surrogate has been consumed as well
                }
            }

            $decoded .= self::uchr($unit);
        }

        return $decoded;
    }
313
314
    /**
     * Returns the font space limit taken from the configuration object.
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit(): int
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
321
322 18
    /**
     * Decodes a list of text commands into one string.
     *
     * Text commands are appended to the current word. A numeric command
     * (type `n`) whose value is below the font space limit starts a new
     * word. Each word is then passed through decodeContent() and the words
     * are joined with single spaces.
     *
     * @param array $commands commands indexed by PDFObject::TYPE and PDFObject::COMMAND
     */
    public function decodeText(array $commands): string
    {
        $spaceLimit = $this->getFontSpaceLimit();
        $chunks = [];
        $current = 0;

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    // Positioning value: below the limit, the next text starts a new word.
                    if ((float) (trim($command[PDFObject::COMMAND])) < $spaceLimit) {
                        $current = \count($chunks);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $raw = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $raw = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $unescaped = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $raw
            );

            // add content to result string
            $chunks[$current] = ($chunks[$current] ?? '').$unescaped;
        }

        $decoded = array_map(
            function (string $chunk): string {
                return $this->decodeContent($chunk);
            },
            $chunks
        );

        return implode(' ', $decoded);
    }
366
367 20
    /**
     * Converts raw text bytes into UTF-8, using the first strategy that applies:
     *
     *  1. the font has a `ToUnicode` entry: the text is cut into codes of
     *     the table's "from" size and each code goes through translateChar();
     *  2. the `Encoding` entry is an Encoding object: each character is mapped
     *     through that encoding;
     *  3. the `Encoding` entry equals `MacRomanEncoding`: iconv() from
     *     `macintosh`;
     *  4. otherwise, text that is not valid UTF-8 is converted from
     *     Windows-1252.
     *
     * @param string    $text    raw text
     * @param bool|null $unicode set, in case 2 only, to whether $text was valid UTF-8
     */
    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
        if ($this->has('ToUnicode')) {
            $bytes = $this->tableSizes['from'];

            if (!$bytes) {
                return $text;
            }

            $result = '';
            $length = \strlen($text);

            for ($i = 0; $i < $length; $i += $bytes) {
                $char = substr($text, $i, $bytes);

                if (false !== ($decoded = $this->translateChar($char, false))) {
                    $char = $decoded;
                } elseif ($this->has('DescendantFonts')) {
                    // NOTE(review): translateChar() as written in this class
                    // always returns a string, so this branch looks reachable
                    // only through an overriding implementation; confirm.
                    if ($this->get('DescendantFonts') instanceof PDFObject) {
                        $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
                    } else {
                        $fonts = $this->get('DescendantFonts')->getContent();
                    }
                    $decoded = false;

                    foreach ($fonts as $font) {
                        if ($font instanceof self) {
                            if (false !== ($decoded = $font->translateChar($char, false))) {
                                $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                                break;
                            }
                        }
                    }

                    if (false !== $decoded) {
                        $char = $decoded;
                    } else {
                        $char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
                    }
                } else {
                    $char = self::MISSING;
                }

                $result .= $char;
            }

            return $result;
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');
            $unicode = mb_check_encoding($text, 'UTF-8');

            // Valid UTF-8 is split into characters, anything else into bytes.
            if ($unicode) {
                $chars = preg_split(
                    '//su',
                    $text,
                    -1,
                    \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY
                );
            } else {
                $chars = str_split($text);
            }

            $result = '';
            foreach ($chars as $char) {
                $dec_av = hexdec(bin2hex($char));
                $dec_ap = $encoding->translateChar($dec_av);
                $result .= self::uchr($dec_ap);
            }

            return $result;
        }

        if ($this->get('Encoding') instanceof Element
            && $this->get('Encoding')->equals('MacRomanEncoding')
        ) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so we use iconv() here
            $converted = iconv('macintosh', 'UTF-8', $text);

            // iconv() returns false on failure; keep the original text rather
            // than letting false be coerced into an empty string.
            return false !== $converted ? $converted : $text;
        }

        if (!mb_check_encoding($text, 'UTF-8')) {
            // don't double-encode strings already in UTF-8
            $text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }
452
}
453