Test Failed
Push — php8 ( cf16a0...fa58a5 )
by Konrad
04:36 queued 03:06
created

Font::getFontSpaceLimit()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 1
c 0
b 0
f 0
nc 1
nop 0
dl 0
loc 3
ccs 1
cts 1
cp 1
crap 1
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Encoding\WinAnsiEncoding;
34
35
/**
36
 * Class Font
37
 */
38
class Font extends PDFObject
39
{
40
    const MISSING = '?';
41
42
    /**
43
     * @var array
44
     */
45
    protected $table = null;
46
47
    /**
48
     * @var array
49
     */
50
    protected $tableSizes = null;
51
52 27
    /**
     * Prepares the font for use by building its character translation table.
     */
    public function init()
    {
        // The return value is not needed here: the call fills $this->table
        // and $this->tableSizes as a side effect.
        $this->loadTranslateTable();
    }
57
58
    /**
     * Returns the font name taken from the "BaseFont" entry.
     *
     * @return string the "BaseFont" value cast to string, or "[Unknown]" when the entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
65
66
    /**
     * Returns the font type as declared by the "Subtype" header entry.
     *
     * @return string the "Subtype" header value cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
73
74
    /**
     * Collects the font's descriptive details.
     *
     * The font specific keys ("Name", "Type", "Encoding") take precedence over
     * same-named keys returned by the parent implementation, because the arrays
     * are combined with the union operator.
     *
     * @param bool $deep forwarded unchanged to the parent implementation
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $name = $this->getName();
        $type = $this->getType();

        // "Ansi" is reported when the font carries no explicit Encoding entry.
        $encoding = 'Ansi';
        if ($this->has('Encoding')) {
            $encoding = (string) $this->get('Encoding');
        }

        $fontDetails = [
            'Name' => $name,
            'Type' => $type,
            'Encoding' => $encoding,
        ];

        return $fontDetails + parent::getDetails($deep);
    }
89
90
    /**
     * Translates one character code through the font's translation table.
     *
     * The lookup key is the decimal value of the raw bytes of $char. When the
     * table has no entry, the result depends on $use_default:
     *  - true:  the MISSING placeholder is returned;
     *  - false: the character itself is returned, except for a single-byte
     *           character of a font whose Encoding resolves to WinAnsiEncoding,
     *           which is converted from its byte value with uchr().
     *
     * @param string $char        raw bytes of the character to translate
     * @param bool   $use_default whether to return the MISSING placeholder for unmapped characters
     *
     * @return string|bool the table entry, the placeholder or the fallback text
     */
    public function translateChar($char, $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (\array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        // Fallback for decoding single-byte ANSI characters that are not in the lookup table.
        // It is evaluated even when the placeholder ends up being returned, so the
        // Encoding lookups always run for unmapped characters.
        $isSingleByteWinAnsi = \strlen($char) < 2
            && $this->has('Encoding')
            && $this->get('Encoding') instanceof Encoding
            && WinAnsiEncoding::class === $this->get('Encoding')->__toString();

        $fallback = $isSingleByteWinAnsi ? self::uchr($code) : $char;

        if ($use_default) {
            return self::MISSING;
        }

        return $fallback;
    }
117
118
    /**
     * Converts a numeric code point into its UTF-8 representation.
     *
     * @param int $code code point; it is cast to int before conversion
     *
     * @return string
     */
    public static function uchr($code)
    {
        // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
        // therefore the numeric entity is decoded with mb_convert_encoding() instead.
        $entity = '&#'.((int) $code).';';

        return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
    }
129
130
    /**
     * Builds (once) the translation table from the font's "ToUnicode" content.
     *
     * The table maps the decimal value of a source character code to the UTF-8
     * text it stands for. The byte widths of the source and destination codes
     * are recorded in $this->tableSizes. When the table has already been set,
     * it is returned untouched.
     *
     * @return array the translation table
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = ['from' => 1, 'to' => 1];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $cmap = $this->get('ToUnicode')->getContent();

        // Decodes a destination hex string: it is cut into groups of four hex
        // digits (a shorter remainder forms its own group) and every group is
        // converted with uchr().
        $hexToText = static function ($hex) {
            $groups = preg_split(
                '/([0-9A-F]{4})/i',
                $hex,
                0,
                \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
            );

            $decoded = '';
            foreach ($groups as $group) {
                $decoded .= self::uchr(hexdec($group));
            }

            return $decoded;
        };

        $blocks = [];

        // Code space range: only the first section is used to derive the code widths.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $cmap, $blocks)) {
            $ranges = [];
            preg_match_all(
                '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is',
                $blocks['sections'][0],
                $ranges
            );

            // Two hex digits make one byte; a width of at least one byte is enforced.
            $this->tableSizes = [
                'from' => max(1, \strlen(current($ranges['from'])) / 2),
                'to' => max(1, \strlen(current($ranges['to'])) / 2),
            ];
        }

        // bfchar sections: individual <source> <destination> pairs.
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $cmap, $blocks)) {
            foreach ($blocks['sections'] as $body) {
                $pairs = [];
                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $body, $pairs);

                // The source width follows the first pair of the section; later sections override earlier ones.
                $this->tableSizes['from'] = max(1, \strlen(current($pairs['from'])) / 2);

                foreach ($pairs['from'] as $index => $sourceHex) {
                    $mapped = $hexToText($pairs['to'][$index]);
                    $this->table[hexdec($sourceHex)] = $mapped;
                }
            }
        }

        // bfrange sections: ranges of consecutive source codes.
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $cmap, $blocks)) {
            foreach ($blocks['sections'] as $body) {
                // Form: <srcCode1> <srcCode2> <dstString>
                // The destination is read as one number; each source code maps to
                // that number plus its distance from the start of the range.
                $triples = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is',
                    $body,
                    $triples
                );

                foreach ($triples['from'] as $index => $sourceHex) {
                    $first = hexdec($sourceHex);
                    $last = hexdec($triples['to'][$index]);
                    $target = hexdec($triples['offset'][$index]);

                    for ($code = $first; $code <= $last; ++$code) {
                        $this->table[$code] = self::uchr($code - $first + $target);
                    }
                }

                // Form: <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the 2-byte Unicode values on new lines, hence \r\n in the class.
                $lists = [];
                preg_match_all(
                    '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is',
                    $body,
                    $lists
                );

                foreach ($lists['from'] as $index => $sourceHex) {
                    $first = hexdec($sourceHex);

                    $targets = [];
                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$index], $targets);

                    // The n-th destination string belongs to the n-th code of the range.
                    foreach ($targets['string'] as $position => $targetHex) {
                        $mapped = $hexToText($targetHex);
                        $this->table[$first + $position] = $mapped;
                    }
                }
            }
        }

        return $this->table;
    }
240
241
    /**
     * Replaces the translation table with the given one.
     *
     * Once a non-null table is set, loadTranslateTable() returns it as is
     * instead of rebuilding it.
     *
     * @param array $table translation table, looked up by translateChar() using the decimal value of a character's bytes
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
248
249
    /**
     * Replaces every "<hex digits>" token of a string with the bytes it encodes.
     *
     * Anything outside such tokens is copied verbatim. Input containing an XML
     * declaration marker ("<?xml", case-insensitive) is returned untouched.
     *
     * @param string $hexa       text that may contain hexadecimal tokens
     * @param bool   $add_braces when true, each decoded token is wrapped in parentheses
     *                           and its backslashes are escaped
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $tokens = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
        $decoded = '';

        foreach ($tokens as $token) {
            // The whole input is known to be free of "<?xml" at this point,
            // so only the bracket shape decides whether a token gets decoded.
            if (!preg_match('/^<.*>$/s', $token)) {
                $decoded .= $token;

                continue;
            }

            // Strip line breaks and the surrounding angle brackets, then turn hex digits into bytes.
            $digits = trim(preg_replace("/[\r\n]/", '', $token), '<>');
            $binary = pack('H*', $digits);

            if ($add_braces) {
                $decoded .= '('.preg_replace('/\\\/s', '\\\\\\', $binary).')';
            } else {
                $decoded .= $binary;
            }
        }

        return $decoded;
    }
287
288
    /**
     * Replaces three-digit octal escapes (a backslash followed by three octal
     * digits, e.g. "\101") with the byte they encode.
     *
     * Only the digits 0-7 are accepted. A backslash followed by three digits
     * that include 8 or 9 is not an octal escape and is left untouched.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        $decoded = preg_replace_callback(
            '/\\\\([0-7]{3})/',
            static function (array $match) {
                // Values above 0377 are reduced to a single byte by keeping the low 8 bits.
                return \chr(((int) octdec($match[1])) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE failure: fall back to the input.
        return null === $decoded ? $text : $decoded;
    }
308
309
    /**
     * Replaces "#xx" escapes, where xx are two hexadecimal digits, with the
     * byte they encode (e.g. "#20" becomes a space, "#2F" a slash).
     *
     * Hex digits are accepted in either case. A "#" that is not followed by
     * two hex digits is left untouched.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        $decoded = preg_replace_callback(
            '/#([0-9A-Fa-f]{2})/',
            static function (array $match) {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE failure: fall back to the input.
        return null === $decoded ? $text : $decoded;
    }
329
330
    /**
     * Converts a UTF-16BE string introduced by the byte order mark FE FF into UTF-8.
     *
     * Text that does not start with that mark is returned unchanged. A high
     * surrogate immediately followed by a low surrogate is combined into one
     * code point, so characters above U+FFFF are decoded as a single character.
     * Unpaired surrogates and a trailing odd byte are passed to uchr() as they are.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        // Plain byte comparison: the mark is binary data, not case-sensitive text.
        if ("\xFE\xFF" !== substr($text, 0, 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $utf16 = (string) substr($text, 2);
        $length = \strlen($utf16);
        $decoded = '';

        for ($i = 0; $i < $length; $i += 2) {
            $unit = (int) hexdec(bin2hex(substr($utf16, $i, 2)));

            // High surrogate with a complete 16-bit unit after it: try to pair them.
            if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 3 < $length) {
                $low = (int) hexdec(bin2hex(substr($utf16, $i + 2, 2)));

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $unit = 0x10000 + (($unit - 0xD800) << 10) + ($low - 0xDC00);
                    // The low surrogate has been consumed as well.
                    $i += 2;
                }
            }

            $decoded .= self::uchr($unit);
        }

        return $decoded;
    }
350
351
    /**
     * Returns the font space limit held by the configuration object.
     *
     * @return int
     *
     * @todo Deprecated, use $this->config->getFontSpaceLimit() instead.
     */
    protected function getFontSpaceLimit()
    {
        $limit = $this->config->getFontSpaceLimit();

        return $limit;
    }
360
361
    /**
     * Decodes a list of text commands into a single string.
     *
     * String commands are unescaped and appended to the current word. A numeric
     * command ("n") whose value is below the font space limit starts a new
     * word. Each word is then passed through decodeContent() and the words are
     * joined with single spaces.
     *
     * @param array $commands commands indexed by PDFObject::TYPE and PDFObject::COMMAND
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $word_position = 0;
        $words = [];
        $font_space = $this->getFontSpaceLimit();

        // Escape sequences and their replacements. They are applied with strtr(),
        // which scans the text once and never re-examines replaced output, so an
        // escaped backslash followed by "n" stays a backslash plus "n" instead of
        // turning into a line feed.
        $escapes = [
            '\\\\' => '\\',
            '\(' => '(',
            '\)' => ')',
            '\n' => "\n",
            '\r' => "\r",
            '\t' => "\t",
            '\f' => "\f",
            '\ ' => ' ',
        ];

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;
                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');
                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars.
            $text = strtr($text, $escapes);

            // Add content to the current word.
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        // Assign by key rather than by reference so no dangling reference is left behind.
        foreach ($words as $index => $word) {
            $words[$index] = $this->decodeContent($word);
        }

        return implode(' ', $words);
    }
412
413
    /**
     * Converts raw string content of this font into UTF-8 text.
     *
     * The first applicable strategy wins:
     *  1. a "ToUnicode" entry exists: the text is cut into codes of
     *     $this->tableSizes['from'] bytes and each code is translated through
     *     the translation table (nothing is done when that size is falsy);
     *  2. the "Encoding" entry is an Encoding object: every character is mapped
     *     through that encoding and converted with uchr();
     *  3. the "Encoding" entry is an Element equal to "MacRomanEncoding": the
     *     text is converted with iconv();
     *  4. the text is not valid UTF-8: it is converted from Windows-1252.
     * Otherwise the text is returned unchanged.
     *
     * @param string $text
     * @param bool   $unicode This parameter is deprecated and might be removed in a future release.
     *                        It is only written to by strategy 2, which stores whether $text is valid UTF-8.
     *
     * @return string
     */
    public function decodeContent($text, &$unicode = null)
    {
        if ($this->has('ToUnicode')) {
            $width = $this->tableSizes['from'];
            if (!$width) {
                return $text;
            }

            $converted = '';
            $total = \strlen($text);

            for ($offset = 0; $offset < $total; $offset += $width) {
                $code = substr($text, $offset, $width);
                $mapped = $this->translateChar($code, false);

                if (false !== $mapped) {
                    $converted .= $mapped;

                    continue;
                }

                if (!$this->has('DescendantFonts')) {
                    $converted .= self::MISSING;

                    continue;
                }

                // Ask the descendant fonts, in order, for a translation.
                if ($this->get('DescendantFonts') instanceof PDFObject) {
                    $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
                } else {
                    $fonts = $this->get('DescendantFonts')->getContent();
                }

                $fromDescendant = false;
                foreach ($fonts as $font) {
                    if (!($font instanceof self)) {
                        continue;
                    }

                    $fromDescendant = $font->translateChar($code, false);
                    if (false !== $fromDescendant) {
                        $fromDescendant = mb_convert_encoding($fromDescendant, 'UTF-8', 'Windows-1252');
                        break;
                    }
                }

                if (false !== $fromDescendant) {
                    $converted .= $fromDescendant;
                } else {
                    $converted .= mb_convert_encoding($code, 'UTF-8', 'Windows-1252');
                }
            }

            return $converted;
        }

        if ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');
            $unicode = mb_check_encoding($text, 'UTF-8');

            if ($unicode) {
                // Valid UTF-8: split into whole characters.
                $units = preg_split('//su', $text, -1, \PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY);
            } else {
                // Not valid UTF-8 (hence never empty): handle the text byte by byte.
                $units = str_split($text);
            }

            $converted = '';
            foreach ($units as $unit) {
                $before = hexdec(bin2hex($unit));
                $after = $encoding->translateChar($before);
                $converted .= self::uchr($after);
            }

            return $converted;
        }

        if (
            $this->get('Encoding') instanceof Element
            && $this->get('Encoding')->equals('MacRomanEncoding')
        ) {
            // mb_convert_encoding does not support MacRoman/macintosh,
            // so iconv() is used here.
            return iconv('macintosh', 'UTF-8', $text);
        }

        // Don't double-encode strings that are already valid UTF-8.
        if (!mb_check_encoding($text, 'UTF-8')) {
            return mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
        }

        return $text;
    }
504
}
505