Passed
Pull Request — master (#318)
by
unknown
03:16
created

Font::setTable()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 1
dl 0
loc 3
rs 10
c 1
b 0
f 0
ccs 0
cts 2
cp 0
cc 1
nc 1
nop 1
crap 2
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
34
 * Class Font
35
 */
36
class Font extends PDFObject
37
{
38
    /**
     * Placeholder returned by translateChar() when a code has no entry in the
     * translation table and the caller asked for the default.
     */
    const MISSING = '?';

    /**
     * Translation table: source character code => decoded text.
     * Stays null until loadTranslateTable() or setTable() assigns it.
     *
     * @var array|null
     */
    protected $table;

    /**
     * Byte widths, under the keys 'from' and 'to', of the codes declared in
     * the ToUnicode data. Stays null until loadTranslateTable() has run.
     *
     * @var array|null
     */
    protected $tableSizes;
49
50 22
    /**
     * Prepares the font by building its character translation table.
     */
    public function init()
    {
        // The return value is ignored on purpose: the call is made for its
        // side effect of filling $this->table and $this->tableSizes.
        $this->loadTranslateTable();
    }
55
56
    /**
     * Returns the font's BaseFont entry as a string.
     *
     * @return string the BaseFont value, or '[Unknown]' when the entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string) $this->get('BaseFont');
    }
63
64
    /**
     * Returns the Subtype entry of the font header, cast to a string.
     *
     * @return string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string) $subtype;
    }
71
72
    /**
     * Collects descriptive details about the font.
     *
     * The font-specific keys (Name, Type, Encoding) win over same-named keys
     * returned by the parent implementation, because the arrays are combined
     * with the union operator.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $details = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => 'Ansi',
        ];

        if ($this->has('Encoding')) {
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                // Test for the key explicitly rather than silencing a possible
                // "undefined index" notice with the @ operator, which would
                // also hide any unrelated error raised inside getDetails().
                $encodingDetails = $encoding->getDetails();

                if (!empty($encodingDetails['BaseEncoding'])) {
                    $details['Encoding'] = $encodingDetails['BaseEncoding'];
                }
            } else {
                $details['Encoding'] = (string) $encoding;
            }
        }

        return $details + parent::getDetails($deep);
    }
94
95
    /**
     * Looks a raw character code up in the translation table.
     *
     * The bytes of $char are read as one big-endian number, which is the key
     * used in $this->table.
     *
     * Note: no path in this method returns false; callers that compare the
     * result against false only see it if a subclass overrides this method.
     *
     * @param string $char        raw bytes of the character code
     * @param bool   $use_default when no entry exists, return self::MISSING
     *                            (true) or $char itself unchanged (false)
     *
     * @return string|bool the table entry, self::MISSING, or $char
     */
    public function translateChar($char, $use_default = true)
    {
        $code = hexdec(bin2hex($char));

        if (!\array_key_exists($code, $this->table)) {
            if ($use_default) {
                return self::MISSING;
            }

            return $char;
        }

        return $this->table[$code];
    }
111
112
    /**
     * Converts a numeric code into UTF-8 text by decoding the matching
     * numeric HTML entity.
     *
     * @param int $code value cast to int before the entity is built
     *
     * @return string result of html_entity_decode() on "&#<code>;"
     */
    public static function uchr($code)
    {
        $entity = sprintf('&#%d;', (int) $code);

        return html_entity_decode($entity, ENT_NOQUOTES, 'UTF-8');
    }
121
122
    /**
     * Builds (once) and returns the character translation table described by
     * the font's ToUnicode content.
     *
     * When the table has already been assigned it is returned as is. Otherwise
     * the table and the code widths are reset, and, if a ToUnicode entry
     * exists, three kinds of sections are parsed from its content:
     *  - begincodespacerange: only the first section is used, to derive the
     *    'from' and 'to' byte widths;
     *  - beginbfchar: one-to-one mappings; each section also overwrites the
     *    'from' byte width;
     *  - beginbfrange: ranges mapped either to consecutive codes starting at
     *    an offset, or to an explicit list of destination strings.
     *
     * Byte widths are always whole numbers: an odd number of hex digits is
     * rounded up, so that the width can safely be used as a substr() length
     * and loop step.
     *
     * @return array map of source character code => decoded text
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();

        // Code space range: only the first section, and its first pair, are
        // used to size the codes.
        $codeSpace = [];
        if (preg_match('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $codeSpace)) {
            $pair = [];
            preg_match('/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $codeSpace['sections'], $pair);

            $this->tableSizes = [
                'from' => self::hexByteLength(isset($pair['from']) ? $pair['from'] : ''),
                'to' => self::hexByteLength(isset($pair['to']) ? $pair['to'] : ''),
            ];
        }

        // Support for multiple bfchar sections.
        $bfChar = [];
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $bfChar)) {
            foreach ($bfChar['sections'] as $section) {
                $pairs = [];
                preg_match_all('/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is', $section, $pairs);

                // Each section re-derives the source width from its first code.
                $firstCode = isset($pairs['from'][0]) ? $pairs['from'][0] : '';
                $this->tableSizes['from'] = self::hexByteLength($firstCode);

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::decodeHexCodeUnits($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections.
        $bfRange = [];
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $bfRange)) {
            foreach ($bfRange['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $ranges = [];
                preg_match_all('/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is', $section, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $first = hexdec($from);
                    $last = hexdec($ranges['to'][$key]);
                    $target = hexdec($ranges['offset'][$key]);

                    for ($code = $first; $code <= $last; ++$code) {
                        $this->table[$code] = self::uchr($code - $first + $target);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the destination values on new lines, hence \r\n in the class.
                $lists = [];
                preg_match_all('/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is', $section, $lists);

                foreach ($lists['from'] as $key => $from) {
                    $first = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$first + $position] = self::decodeHexCodeUnits($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Returns how many whole bytes a hex-digit string occupies, never less than 1.
     *
     * @param string $hex hex digits without the surrounding angle brackets
     *
     * @return int
     */
    private static function hexByteLength($hex)
    {
        return max(1, (int) ceil(\strlen($hex) / 2));
    }

    /**
     * Decodes a hex-digit string by converting each group of four digits
     * (and any shorter remainder) with uchr() and concatenating the results.
     *
     * @param string $hex hex digits without the surrounding angle brackets
     *
     * @return string
     */
    private static function decodeHexCodeUnits($hex)
    {
        $units = preg_split('/([0-9A-F]{4})/i', $hex, 0, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        $text = '';

        foreach ($units as $unit) {
            $text .= self::uchr(hexdec($unit));
        }

        return $text;
    }
232
233
    /**
     * Replaces the translation table wholesale.
     *
     * Only $this->table is assigned here; $this->tableSizes is left as it is.
     *
     * @param array $table map of source character code => decoded text
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
240
241
    /**
     * Replaces every hexadecimal string of the form <hexdigits> found in the
     * input with the bytes it encodes.
     *
     * Input containing "<?xml" anywhere is returned untouched. Text that is
     * merely wrapped in angle brackets but is not made of hex digits only
     * (for example "<</A 1>>") is copied through unchanged instead of being
     * handed to pack(), which would emit an "illegal hex digit" warning and
     * produce garbage.
     *
     * @param string $hexa       text possibly containing <hexdigits> strings
     * @param bool   $add_braces wrap each decoded string in parentheses and
     *                           double every backslash inside it
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $text = '';
        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        foreach ($parts as $part) {
            // Only genuine hex strings are decoded. "*" keeps the empty
            // string "<>" decoding to nothing; the D modifier stops "$" from
            // matching before a trailing newline.
            if (!preg_match('/^<[a-f0-9]*>$/Di', $part)) {
                $text .= $part;
                continue;
            }

            $decoded = pack('H*', trim($part, '<>'));

            if ($add_braces) {
                $text .= '('.str_replace('\\', '\\\\', $decoded).')';
            } else {
                $text .= $decoded;
            }
        }

        return $text;
    }
277
278
    /**
     * Replaces three-digit octal escapes (a backslash followed by three octal
     * digits) with the byte they denote.
     *
     * Two cases are deliberately left untouched:
     *  - a backslash followed by digits that include 8 or 9, which is not an
     *    octal number;
     *  - an escaped backslash (two backslashes in a row): the pair is copied
     *    through as is, so digits that follow it are not mistaken for an
     *    octal escape.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        $decoded = preg_replace_callback(
            // First alternative consumes an escaped backslash pair; the second
            // captures the three octal digits of a real escape.
            '/\\\\\\\\|\\\\([0-7]{3})/',
            function ($match) {
                if (!isset($match[1])) {
                    // Escaped backslash: keep both characters unchanged.
                    return $match[0];
                }

                // Values above 255 keep only their low-order byte.
                return \chr(octdec($match[1]) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE failure; fall back to
        // the untouched input rather than returning a non-string.
        return null === $decoded ? $text : $decoded;
    }
298
299
    /**
     * Replaces every "#" followed by two decimal digits with the byte whose
     * value is those two digits read as a hexadecimal number.
     *
     * NOTE(review): the pattern only matches the digits 0-9, so sequences
     * such as "#2F" are left as they are - confirm whether hex letters were
     * meant to be accepted as well.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        return preg_replace_callback(
            '/#(\d{2})/s',
            function ($match) {
                return \chr(hexdec($match[1]));
            },
            $text
        );
    }
319
320
    /**
     * Decodes text that starts with the bytes FE FF.
     *
     * After the two-byte marker is removed, the rest is consumed two bytes at
     * a time; each chunk is read as a big-endian number and converted with
     * uchr(). Text without the marker is returned unchanged.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if (!preg_match('/^\xFE\xFF/i', $text)) {
            return $text;
        }

        // Strip U+FEFF byte order marker.
        $payload = substr($text, 2);
        $decoded = '';

        for ($offset = 0, $size = \strlen($payload); $offset < $size; $offset += 2) {
            $unit = substr($payload, $offset, 2);
            $decoded .= self::uchr(hexdec(bin2hex($unit)));
        }

        return $decoded;
    }
340
341
    /**
     * Threshold used by decodeText(): a numeric command strictly below this
     * value starts a new word.
     *
     * @return int
     */
    protected function getFontSpaceLimit()
    {
        $limit = -50;

        return $limit;
    }
348
349
    /**
     * Turns a list of text commands into a single decoded string.
     *
     * Commands are handled by their PDFObject::TYPE entry:
     *  - 'n': a number; when it is below getFontSpaceLimit() the following
     *    text starts a new word, otherwise nothing happens;
     *  - '<': a hexadecimal string, decoded with decodeHexadecimal();
     *  - anything else: passed through decodeOctal().
     * Each piece of text is then run through stripcslashes() and appended to
     * the current word. Finally every word is passed to decodeContent() and
     * the words are joined with single spaces.
     *
     * @param array $commands each item is indexed by PDFObject::TYPE and
     *                        PDFObject::COMMAND
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $spaceLimit = $this->getFontSpaceLimit();
        $isUnicode = false;
        $chunks = [];
        $slot = 0;

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    // A large enough negative adjustment opens a new word.
                    if ((float) trim($command[PDFObject::COMMAND]) < $spaceLimit) {
                        $slot = \count($chunks);
                    }

                    continue 2;

                case '<':
                    // Decode hexadecimal.
                    $piece = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');

                    if (mb_check_encoding($piece, 'UTF-8')) {
                        $isUnicode = true;
                    }

                    break;

                default:
                    // Decode octal (if necessary).
                    $piece = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Replace escaped chars.
            $piece = stripcslashes($piece);

            // Append to the current word, creating it on first use.
            if (!isset($chunks[$slot])) {
                $chunks[$slot] = '';
            }

            $chunks[$slot] .= $piece;
        }

        $decodedWords = [];

        foreach ($chunks as $key => $chunk) {
            // decodeContent() may modify the flag, so every word gets a copy.
            $wordIsUnicode = $isUnicode;
            $decodedWords[$key] = $this->decodeContent($chunk, $wordIsUnicode);
        }

        return implode(' ', $decodedWords);
    }
403
404
    /**
     * Decodes raw text according to the font's ToUnicode or Encoding entry.
     *
     * - With a ToUnicode entry, the text is cut into codes of
     *   $this->tableSizes['from'] bytes, each one is translated through
     *   translateChar(), and $unicode is set to true.
     * - Otherwise, with an Encoding entry that is an Encoding object, every
     *   character is mapped through that object's translateChar().
     * - Finally, text that is still not flagged as unicode is converted to
     *   UTF-8 from ISO-8859-1 (when the encoding equals 'MacRomanEncoding')
     *   or from Windows-1252.
     *
     * @param string $text    text to decode
     * @param bool   $unicode passed by reference; tells whether $text is
     *                        already unicode and is set to true by the
     *                        ToUnicode path
     *
     * @return string
     */
    public function decodeContent($text, &$unicode)
    {
        if ($this->has('ToUnicode')) {
            $width = $this->tableSizes['from'];

            if ($width) {
                $buffer = '';
                $total = \strlen($text);

                for ($pos = 0; $pos < $total; $pos += $width) {
                    $code = substr($text, $pos, $width);
                    $mapped = $this->translateChar($code, false);

                    if (false !== $mapped) {
                        $code = $mapped;
                    } elseif ($this->has('DescendantFonts')) {
                        // NOTE(review): translateChar() as defined in this
                        // class never returns false, so this fallback looks
                        // reachable only through a subclass override - confirm.
                        $descendants = $this->get('DescendantFonts');

                        if ($descendants instanceof PDFObject) {
                            $fonts = $descendants->getHeader()->getElements();
                        } else {
                            $fonts = $descendants->getContent();
                        }

                        $mapped = false;

                        foreach ($fonts as $font) {
                            if (!($font instanceof self)) {
                                continue;
                            }

                            $mapped = $font->translateChar($code, false);

                            if (false !== $mapped) {
                                $mapped = mb_convert_encoding($mapped, 'UTF-8', 'Windows-1252');
                                break;
                            }
                        }

                        if (false === $mapped) {
                            $code = mb_convert_encoding($code, 'UTF-8', 'Windows-1252');
                        } else {
                            $code = $mapped;
                        }
                    } else {
                        $code = self::MISSING;
                    }

                    $buffer .= $code;
                }

                $text = $buffer;

                // By definition, this code generates unicode chars.
                $unicode = true;
            }
        } elseif ($this->has('Encoding')) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                $buffer = '';

                if ($unicode) {
                    // Split into UTF-8 characters.
                    $chars = preg_split('//su', $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

                    foreach ($chars as $char) {
                        $before = hexdec(bin2hex($char));
                        $after = $encoding->translateChar($before);
                        $buffer .= self::uchr($after);
                    }

                    $text = $buffer;
                } else {
                    $total = \strlen($text);

                    for ($pos = 0; $pos < $total; ++$pos) {
                        // Single byte: its numeric value is the code to map.
                        $before = \ord($text[$pos]);
                        $after = $encoding->translateChar($before);
                        $buffer .= \chr($after);
                    }

                    $text = $buffer;

                    if ($encoding->get('BaseEncoding')->equals('MacRomanEncoding')) {
                        return mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
                    }
                }
            }
        }

        if ($unicode) {
            return $text;
        }

        // Convert to unicode if not already done.
        $declared = $this->get('Encoding');
        $isMacRoman = $declared instanceof Element && $declared->equals('MacRomanEncoding');

        return mb_convert_encoding($text, 'UTF-8', $isMacRoman ? 'ISO-8859-1' : 'Windows-1252');
    }
513
}
514