Passed
Branch master (f7fac8)
by Sebastien
02:47
created

Font   F

Complexity

Total Complexity 74

Size/Duplication

Total Lines 479
Duplicated Lines 0 %

Importance

Changes 12
Bugs 2 Features 1
Metric Value
eloc 205
c 12
b 2
f 1
dl 0
loc 479
rs 2.48
wmc 74

15 Methods

Rating   Name   Duplication   Size   Complexity  
A decodeOctal() 0 14 3
A getFontSpaceLimit() 0 3 1
A setTable() 0 3 1
A translateChar() 0 11 3
B decodeText() 0 47 8
B decodeHexadecimal() 0 29 8
A uchr() 0 3 1
A decodeUnicode() 0 14 3
A decodeEntities() 0 14 3
A getName() 0 3 2
D decodeContent() 0 105 21
C loadTranslateTable() 0 106 16
A getType() 0 3 1
A init() 0 4 1
A getDetails() 0 11 2

How to fix   Complexity   

Complex Class

Complex classes like Font often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Font, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 * @license LGPLv3
10
 * @url     <https://github.com/smalot/pdfparser>
11
 *
12
 *  PdfParser is a pdf library written in PHP, extraction oriented.
13
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
14
 *
15
 *  This program is free software: you can redistribute it and/or modify
16
 *  it under the terms of the GNU Lesser General Public License as published by
17
 *  the Free Software Foundation, either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  This program is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU Lesser General Public License for more details.
24
 *
25
 *  You should have received a copy of the GNU Lesser General Public License
26
 *  along with this program.
27
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
28
 *
29
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
34
 * Class Font
35
 *
36
 * @package Smalot\PdfParser
37
 */
38
class Font extends PDFObject
39
{
40
    /**
     * Placeholder character used when a character code has no entry in the
     * translation table.
     */
    const MISSING = '?';

    /**
     * Translation table mapping a numeric character code to its decoded text.
     *
     * Stays null until loadTranslateTable() has built it or setTable() has
     * injected one, which is why the type is nullable.
     *
     * @var array|null
     */
    protected $table = null;

    /**
     * Byte widths stored under the keys 'from' and 'to'.
     *
     * Stays null until loadTranslateTable() has run.
     *
     * @var array|null
     */
    protected $tableSizes = null;
54
55
    /**
     * Initialises the font by building its character translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }
63
64
    /**
     * Returns the font name taken from the "BaseFont" entry.
     *
     * @return string the "BaseFont" value cast to string, or "[Unknown]" when
     *                the entry is absent
     */
    public function getName()
    {
        if (!$this->has('BaseFont')) {
            return '[Unknown]';
        }

        return (string)$this->get('BaseFont');
    }
71
72
    /**
     * Returns the font type, read from the "Subtype" entry of the header.
     *
     * @return string the "Subtype" value cast to string
     */
    public function getType()
    {
        $subtype = $this->header->get('Subtype');

        return (string)$subtype;
    }
79
80
    /**
     * Collects descriptive details about the font.
     *
     * The font-specific keys "Name", "Type" and "Encoding" are merged with the
     * details reported by the parent class.
     *
     * @param bool $deep forwarded unchanged to parent::getDetails()
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $fontDetails = array(
            'Name'     => $this->getName(),
            'Type'     => $this->getType(),
            // "Ansi" is reported when the font declares no encoding.
            'Encoding' => ($this->has('Encoding') ? (string)$this->get('Encoding') : 'Ansi'),
        );

        // Array union: keys set above take precedence over the parent's keys.
        return $fontDetails + parent::getDetails($deep);
    }
95
96
    /**
     * Looks up a raw character code in the translation table.
     *
     * @param string $char        raw bytes of the character code
     * @param bool   $use_default when true, an unknown code yields self::MISSING;
     *                            when false, the input is returned unchanged
     *
     * @return string the table entry, or the fallback described above
     */
    public function translateChar($char, $use_default = true)
    {
        // Read the raw bytes as a single hexadecimal number to get the table key.
        $code = hexdec(bin2hex($char));

        if (array_key_exists($code, $this->table)) {
            return $this->table[$code];
        }

        if ($use_default) {
            return self::MISSING;
        }

        return $char;
    }
114
115
    /**
     * Converts a numeric code point into its UTF-8 string.
     *
     * The conversion is done by decoding the matching numeric HTML entity.
     *
     * @param int $code code point; cast to int before use
     *
     * @return string
     */
    public static function uchr($code)
    {
        $entity = '&#' . ((int)$code) . ';';

        return html_entity_decode($entity, ENT_NOQUOTES, 'UTF-8');
    }
124
125
    /**
     * Builds (once) and returns the character translation table from the
     * font's "ToUnicode" stream.
     *
     * The table is cached in $this->table; later calls return the cached
     * value. Three kinds of sections are read from the stream content:
     *  - codespacerange: only the first section is used, to set the byte
     *    widths in $this->tableSizes;
     *  - bfchar: one source code mapped to one destination string;
     *  - bfrange: a range of source codes mapped either to consecutive code
     *    points starting at an offset, or to an explicit list of strings.
     *
     * When the font has no "ToUnicode" entry the table stays empty and both
     * widths stay at 1.
     *
     * @return array character code (int) => decoded text (string)
     */
    public function loadTranslateTable()
    {
        if (!is_null($this->table)) {
            return $this->table;
        }

        $this->table      = array();
        $this->tableSizes = array(
            'from' => 1,
            'to'   => 1,
        );

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content  = $this->get('ToUnicode')->getContent();
        $sections = array();

        // Code space range: only the first section is taken into account.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $sections)) {
            $entries = array();
            $regexp  = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

            preg_match_all($regexp, $sections['sections'][0], $entries);

            $this->tableSizes = array(
                'from' => self::hexByteLength(current($entries['from'])),
                'to'   => self::hexByteLength(current($entries['to'])),
            );
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                $entries = array();
                $regexp  = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

                preg_match_all($regexp, $section, $entries);

                // An empty section carries no width information; skipping it
                // keeps the width learned so far instead of resetting it to 1.
                if (empty($entries['from'])) {
                    continue;
                }

                $this->tableSizes['from'] = self::hexByteLength(current($entries['from']));

                foreach ($entries['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::hexToText($entries['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $sections)) {
            foreach ($sections['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $entries = array();
                $regexp  = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';

                preg_match_all($regexp, $section, $entries);

                foreach ($entries['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to   = hexdec($entries['to'][$key]);
                    $offset    = hexdec($entries['offset'][$key]);

                    for ($char = $char_from; $char <= $char_to; $char++) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the destination strings on separate lines,
                // hence \r\n is accepted inside the brackets.
                $entries = array();
                $regexp  = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';

                preg_match_all($regexp, $section, $entries);

                foreach ($entries['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings   = array();

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $entries['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::hexToText($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Returns the number of bytes a hexadecimal string represents (at least 1).
     *
     * An odd number of hex digits is rounded up, so the result is always a
     * whole number of bytes.
     *
     * @param string|false $hex hexadecimal digits, or false when there was no match
     *
     * @return int
     */
    private static function hexByteLength($hex)
    {
        return max(1, (int)ceil(strlen((string)$hex) / 2));
    }

    /**
     * Decodes a hexadecimal destination string into text.
     *
     * The string is cut into groups of four hex digits (the last group may be
     * shorter) and each group is converted with uchr().
     *
     * @param string $hex non-empty string of hexadecimal digits
     *
     * @return string
     */
    private static function hexToText($hex)
    {
        $text = '';

        foreach (str_split($hex, 4) as $unit) {
            $text .= self::uchr(hexdec($unit));
        }

        return $text;
    }
235
236
    /**
     * Replaces the translation table with the given one.
     *
     * @param array $table new translation table, stored as-is
     */
    public function setTable($table)
    {
        $this->table = $table;
    }
243
244
    /**
     * Replaces every hexadecimal string (<...>) found in the input with the
     * bytes it encodes.
     *
     * Input containing "<?xml" is returned untouched. Any fragment that is not
     * a pure <hex digits> token is copied through unchanged; in particular,
     * things like "<< /Type >>" are no longer fed to pack().
     *
     * @param string $hexa       text that may contain hexadecimal strings
     * @param bool   $add_braces when true, each decoded string is wrapped in
     *                           parentheses and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (stripos($hexa, '<?xml') !== false) {
            return $hexa;
        }

        $text  = '';
        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        foreach ($parts as $part) {
            // Only tokens matching the split delimiter are hexadecimal strings.
            if (!preg_match('/^<[a-f0-9]+>$/Di', $part)) {
                $text .= $part;
                continue;
            }

            // Drop the surrounding angle brackets, then turn hex digits into bytes.
            $bytes = pack('H*', substr($part, 1, -1));

            if ($add_braces) {
                $text .= '(' . str_replace('\\', '\\\\', $bytes) . ')';
            } else {
                $text .= $bytes;
            }
        }

        return $text;
    }
280
281
    /**
     * Replaces every three-digit octal escape (a backslash followed by three
     * octal digits) with the byte it denotes.
     *
     * Only the digits 0-7 are accepted: a sequence such as \189 is not an
     * octal escape and is left untouched. Values above 255 keep only their
     * low-order byte.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        return preg_replace_callback(
            '/\\\\([0-7]{3})/',
            function ($match) {
                // octdec() may return a float; chr() expects an integer.
                return chr(((int)octdec($match[1])) & 0xFF);
            },
            (string)$text
        );
    }
301
302
    /**
     * Replaces every "#XX" escape, where XX are two hexadecimal digits, with
     * the byte it denotes.
     *
     * Digits A-F are accepted in either case, so "#2F" decodes to "/" just as
     * "#20" decodes to a space.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        return preg_replace_callback(
            '/#([0-9A-F]{2})/i',
            function ($match) {
                // hexdec() may return a float; chr() expects an integer.
                return chr((int)hexdec($match[1]));
            },
            (string)$text
        );
    }
322
323
    /**
     * Converts text that starts with the byte order mark FE FF from UTF-16BE
     * to UTF-8; any other text is returned unchanged.
     *
     * The conversion works on the whole string, so characters encoded as a
     * surrogate pair are decoded as one character.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if (strncmp((string)$text, "\xFE\xFF", 2) !== 0) {
            return $text;
        }

        // Strip the byte order mark before converting.
        $payload = (string)substr($text, 2);

        return mb_convert_encoding($payload, 'UTF-8', 'UTF-16BE');
    }
343
344
    /**
     * Returns the numeric threshold used by decodeText(): a numeric command
     * whose value is below this limit starts a new word.
     *
     * @return int
     */
    protected function getFontSpaceLimit()
    {
        $limit = -50;

        return $limit;
    }
351
352
    /**
     * Decodes a list of text commands into a single string.
     *
     * Commands of type 'n' carry a number: when it is below
     * getFontSpaceLimit(), the following text starts a new word. Commands of
     * type '<' are hex-decoded, every other type is octal-decoded. Escaped
     * characters are then resolved with stripcslashes() and the text is
     * appended to the current word. Finally each word goes through
     * decodeContent() and the words are joined with a single space.
     *
     * @param array $commands entries indexed by PDFObject::TYPE and PDFObject::COMMAND
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $spaceLimit = $this->getFontSpaceLimit();
        $isUnicode  = false;
        $slot       = 0;
        $chunks     = array();

        foreach ($commands as $command) {
            $type = $command[PDFObject::TYPE];

            // Loose comparisons are intentional: they mirror switch semantics.
            if ($type == 'n') {
                if (floatval(trim($command[PDFObject::COMMAND])) < $spaceLimit) {
                    $slot = count($chunks);
                }

                continue;
            }

            if ($type == '<') {
                // Decode hexadecimal.
                $piece = self::decodeHexadecimal('<' . $command[PDFObject::COMMAND] . '>');

                if (mb_check_encoding($piece, "UTF-8")) {
                    $isUnicode = true;
                }
            } else {
                // Decode octal (if necessary).
                $piece = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // Resolve escaped characters, then append to the current word.
            $piece         = stripcslashes($piece);
            $chunks[$slot] = (isset($chunks[$slot]) ? $chunks[$slot] : '') . $piece;
        }

        $decoded = array();

        foreach ($chunks as $chunk) {
            // decodeContent() takes the flag by reference; give each word its own copy.
            $flag      = $isUnicode;
            $decoded[] = $this->decodeContent($chunk, $flag);
        }

        return implode(' ', $decoded);
    }
405
406
    /**
     * Decodes one chunk of raw text according to the font's mapping data.
     *
     * - With a "ToUnicode" entry, the text is cut into codes of
     *   $this->tableSizes['from'] bytes and each code is translated through
     *   translateChar(); $unicode is then set to true.
     * - Otherwise, with an "Encoding" entry that is an Encoding object, each
     *   character is translated through that object.
     * - Text that is still not flagged as unicode at the end is converted to
     *   UTF-8 from 'Mac' (when the encoding is MacRomanEncoding) or from
     *   'Windows-1252'.
     *
     * @param string $text    raw text to decode
     * @param bool   $unicode whether $text is already unicode; passed by
     *                        reference and set to true by the "ToUnicode" path
     *
     * @return string
     */
    protected function decodeContent($text, &$unicode)
    {
        if ($this->has('ToUnicode')) {
            $width = $this->tableSizes['from'];

            if ($width) {
                $total  = strlen($text);
                $buffer = '';

                for ($offset = 0; $offset < $total; $offset += $width) {
                    $code       = substr($text, $offset, $width);
                    $translated = $this->translateChar($code, false);

                    if ($translated !== false) {
                        $buffer .= $translated;
                        continue;
                    }

                    if (!$this->has('DescendantFonts')) {
                        $buffer .= self::MISSING;
                        continue;
                    }

                    // getHeader() is only called on PDFObject instances; any
                    // other element is asked for its content instead.
                    $descendants = $this->get('DescendantFonts');
                    $fonts       = ($descendants instanceof PDFObject)
                        ? $descendants->getHeader()->getElements()
                        : $descendants->getContent();

                    $translated = false;

                    foreach ($fonts as $font) {
                        if (!($font instanceof Font)) {
                            continue;
                        }

                        $translated = $font->translateChar($code, false);

                        if ($translated !== false) {
                            $translated = mb_convert_encoding($translated, 'UTF-8', 'Windows-1252');
                            break;
                        }
                    }

                    // No descendant font produced a value: convert the raw code itself.
                    if ($translated === false) {
                        $translated = mb_convert_encoding($code, 'UTF-8', 'Windows-1252');
                    }

                    $buffer .= $translated;
                }

                $text = $buffer;

                // By definition, this code generates unicode chars.
                $unicode = true;
            }
        } elseif ($this->has('Encoding')) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                $buffer = '';

                if ($unicode) {
                    // Split into individual characters ('u' flag: UTF-8 aware).
                    $glyphs = preg_split(
                        '//s' . ($unicode ? 'u' : ''),
                        $text,
                        -1,
                        PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
                    );

                    foreach ($glyphs as $glyph) {
                        $source  = hexdec(bin2hex($glyph));
                        $target  = $encoding->translateChar($source);
                        $buffer .= self::uchr($target);
                    }

                    $text = $buffer;
                } else {
                    $total = strlen($text);

                    for ($offset = 0; $offset < $total; $offset++) {
                        $source  = hexdec(bin2hex($text[$offset]));
                        $target  = $encoding->translateChar($source);
                        $buffer .= chr($target);
                    }

                    $text = $buffer;

                    if ($encoding->get('BaseEncoding')->equals('MacRomanEncoding')) {
                        return mb_convert_encoding($text, 'UTF-8', 'Mac');
                    }
                }
            }
        }

        // Already unicode: nothing left to convert.
        if ($unicode) {
            return $text;
        }

        $declared   = $this->get('Encoding');
        $isMacRoman = $declared instanceof Element && $declared->equals('MacRomanEncoding');

        return mb_convert_encoding($text, 'UTF-8', $isMacRoman ? 'Mac' : 'Windows-1252');
    }
518
}
519