Passed
Pull Request — master (#346)
by
unknown
02:30
created

Font   F

Complexity

Total Complexity 73

Size/Duplication

Total Lines 475
Duplicated Lines 0 %

Test Coverage

Coverage 86.12%

Importance

Changes 13
Bugs 2 Features 1
Metric Value
wmc 73
eloc 208
c 13
b 2
f 1
dl 0
loc 475
ccs 180
cts 209
cp 0.8612
rs 2.56

15 Methods

Rating   Name   Duplication   Size   Complexity  
A decodeOctal() 0 14 3
A getFontSpaceLimit() 0 3 1
A decodeUnicode() 0 14 3
A decodeEntities() 0 14 3
A getName() 0 3 2
A getType() 0 3 1
A init() 0 4 1
A getDetails() 0 11 2
A setTable() 0 3 1
A translateChar() 0 9 3
A uchr() 0 3 1
C loadTranslateTable() 0 106 16
D decodeContent() 0 102 20
B decodeHexadecimal() 0 31 8
B decodeText() 0 52 8

How to fix   Complexity   

Complex Class

Complex classes like Font often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Font, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
/**
 * Font object of a PDF document.
 *
 * Turns the raw strings drawn with this font into UTF-8 text, using the
 * font's ToUnicode CMap when there is one and its Encoding entry otherwise.
 */
class Font extends PDFObject
{
    /**
     * Placeholder used for a character code that has no known mapping.
     */
    const MISSING = '?';

    /**
     * Character code => UTF-8 string, built from the ToUnicode CMap.
     * Stays null until loadTranslateTable() or setTable() has been called.
     *
     * @var array|null
     */
    protected $table = null;

    /**
     * Width in bytes of the source ('from') and destination ('to') codes of
     * the ToUnicode CMap. Stays null until loadTranslateTable() has been called.
     *
     * @var array|null
     */
    protected $tableSizes = null;

    /**
     * Loads the translation table.
     */
    public function init()
    {
        $this->loadTranslateTable();
    }

    /**
     * @return string the BaseFont entry, or '[Unknown]' when the font has none
     */
    public function getName()
    {
        if ($this->has('BaseFont')) {
            return (string) $this->get('BaseFont');
        }

        return '[Unknown]';
    }

    /**
     * @return string the Subtype entry of the object header
     */
    public function getType()
    {
        return (string) $this->header->get('Subtype');
    }

    /**
     * Returns the font's Name, Type and Encoding followed by the parent's details.
     *
     * The array union keeps the three entries computed here when the parent
     * reports a key with the same name.
     *
     * @param bool $deep forwarded unchanged to the parent implementation
     *
     * @return array
     */
    public function getDetails($deep = true)
    {
        $details = [
            'Name' => $this->getName(),
            'Type' => $this->getType(),
            'Encoding' => ($this->has('Encoding') ? (string) $this->get('Encoding') : 'Ansi'),
        ];

        $details += parent::getDetails($deep);

        return $details;
    }

    /**
     * Looks a raw character code up in the translation table.
     *
     * The bytes of $char are read as one big-endian number, which is the key
     * used in the table.
     *
     * @param string $char        raw bytes of one character code
     * @param bool   $use_default what to return when the code is unknown:
     *                            self::MISSING when true, $char itself when false
     *
     * @return string|bool the stored table entry, self::MISSING or $char; note that
     *                     an unknown code never yields false
     */
    public function translateChar($char, $use_default = true)
    {
        $dec = hexdec(bin2hex($char));

        if (\array_key_exists($dec, $this->table)) {
            return $this->table[$dec];
        }

        return $use_default ? self::MISSING : $char;
    }

    /**
     * Converts a Unicode code point to its UTF-8 representation.
     *
     * @param int $code
     *
     * @return string
     */
    public static function uchr($code)
    {
        return html_entity_decode('&#'.((int) $code).';', ENT_NOQUOTES, 'UTF-8');
    }

    /**
     * Builds, once, the code => text table described by the ToUnicode CMap.
     *
     * Later calls return the table already in memory (including one injected
     * through setTable()). Without a ToUnicode entry the table stays empty and
     * both code widths stay at 1 byte.
     *
     * @return array
     */
    public function loadTranslateTable()
    {
        if (null !== $this->table) {
            return $this->table;
        }

        $this->table = [];
        $this->tableSizes = [
            'from' => 1,
            'to' => 1,
        ];

        if (!$this->has('ToUnicode')) {
            return $this->table;
        }

        $content = $this->get('ToUnicode')->getContent();
        $found = [];

        // Only the first codespacerange section (and its first entry) decides the code widths.
        if (preg_match_all('/begincodespacerange(?P<sections>.*?)endcodespacerange/s', $content, $found)) {
            $codes = [];
            $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

            preg_match_all($regexp, $found['sections'][0], $codes);

            $this->tableSizes = [
                'from' => self::hexByteLength(isset($codes['from'][0]) ? $codes['from'][0] : ''),
                'to' => self::hexByteLength(isset($codes['to'][0]) ? $codes['to'][0] : ''),
            ];
        }

        // Support for multiple bfchar sections
        if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $found)) {
            foreach ($found['sections'] as $section) {
                $pairs = [];
                $regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

                preg_match_all($regexp, $section, $pairs);

                // The first source code of each section overrides the source width.
                $this->tableSizes['from'] = self::hexByteLength(isset($pairs['from'][0]) ? $pairs['from'][0] : '');

                foreach ($pairs['from'] as $key => $from) {
                    $this->table[hexdec($from)] = self::decodeUtf16BeHex($pairs['to'][$key]);
                }
            }
        }

        // Support for multiple bfrange sections
        if (preg_match_all('/beginbfrange(?P<sections>.*?)endbfrange/s', $content, $found)) {
            foreach ($found['sections'] as $section) {
                // Support for : <srcCode1> <srcCode2> <dstString>
                $ranges = [];
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *<(?P<offset>[0-9A-F]+)>[ \r\n]+/is';

                preg_match_all($regexp, $section, $ranges);

                foreach ($ranges['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $char_to = hexdec($ranges['to'][$key]);
                    $offset = hexdec($ranges['offset'][$key]);

                    // NOTE(review): the destination is read as one single code point, so a
                    // multi-unit destination string is not supported by this form.
                    for ($char = $char_from; $char <= $char_to; ++$char) {
                        $this->table[$char] = self::uchr($char - $char_from + $offset);
                    }
                }

                // Support for : <srcCode1> <srcCodeN> [<dstString1> <dstString2> ... <dstStringN>]
                // Some PDF files put the 2-byte Unicode values on new lines, hence \r\n in the class.
                $lists = [];
                $regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)> *\[(?P<strings>[\r\n<>0-9A-F ]+)\][ \r\n]+/is';

                preg_match_all($regexp, $section, $lists);

                foreach ($lists['from'] as $key => $from) {
                    $char_from = hexdec($from);
                    $strings = [];

                    preg_match_all('/<(?P<string>[0-9A-F]+)> */is', $lists['strings'][$key], $strings);

                    foreach ($strings['string'] as $position => $string) {
                        $this->table[$char_from + $position] = self::decodeUtf16BeHex($string);
                    }
                }
            }
        }

        return $this->table;
    }

    /**
     * Replaces the translation table.
     *
     * @param array $table character code => replacement text
     */
    public function setTable($table)
    {
        $this->table = $table;
    }

    /**
     * Replaces every hexadecimal string (<48656C6C6F>) found in $hexa by the bytes it encodes.
     *
     * Input containing '<?xml' is returned untouched. White-space inside a
     * hexadecimal string is ignored; anything that is not a hexadecimal string,
     * including other text wrapped in angle brackets, is copied as is.
     *
     * @param string $hexa
     * @param bool   $add_braces when true each decoded string is wrapped in parentheses
     *                           and its backslashes are doubled
     *
     * @return string
     */
    public static function decodeHexadecimal($hexa, $add_braces = false)
    {
        // Special shortcut for XML content.
        if (false !== stripos($hexa, '<?xml')) {
            return $hexa;
        }

        $text = '';
        $parts = preg_split('/(<[a-f0-9\s]+>)/i', $hexa, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        foreach ($parts as $part) {
            // Only genuine hexadecimal strings may reach pack(); other <...> spans are not hex data.
            if (!preg_match('/^<[a-f0-9\s]+>$/iD', $part)) {
                $text .= $part;

                continue;
            }

            $hex = preg_replace('/\s+/', '', trim($part, '<>'));
            $decoded = pack('H*', $hex);

            if ($add_braces) {
                $text .= '('.str_replace('\\', '\\\\', $decoded).')';
            } else {
                $text .= $decoded;
            }
        }

        return $text;
    }

    /**
     * Replaces octal escapes (a backslash followed by one to three octal digits) by the byte they denote.
     *
     * An escaped backslash (two backslashes in a row) is left untouched, so the
     * digits following it are not mistaken for an octal escape.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeOctal($text)
    {
        $decoded = preg_replace_callback(
            '/\\\\\\\\|\\\\([0-7]{1,3})/',
            function ($match) {
                if (!isset($match[1])) {
                    // Escaped backslash: keep both characters for the later unescaping step.
                    return $match[0];
                }

                // Values above 0377 overflow one byte; only the low-order byte is kept.
                return \chr(((int) octdec($match[1])) & 0xFF);
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE failure: fall back to the input.
        return null === $decoded ? $text : $decoded;
    }

    /**
     * Replaces #XX sequences, XX being two hexadecimal digits, by the byte they denote.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeEntities($text)
    {
        $decoded = preg_replace_callback(
            '/#([0-9A-F]{2})/i',
            function ($match) {
                return \chr((int) hexdec($match[1]));
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE failure: fall back to the input.
        return null === $decoded ? $text : $decoded;
    }

    /**
     * Converts a string starting with the FE FF byte order marker from UTF-16BE to UTF-8.
     *
     * Any other string is returned untouched.
     *
     * @param string $text
     *
     * @return string
     */
    public static function decodeUnicode($text)
    {
        if (0 !== strncmp($text, "\xFE\xFF", 2)) {
            return $text;
        }

        // Strip U+FEFF byte order marker, then decode the remaining 2-byte units.
        return self::decodeUtf16BeHex(bin2hex((string) substr($text, 2)));
    }

    /**
     * @return int threshold compared with numeric commands in decodeText(); a value
     *             below it starts a new word
     */
    protected function getFontSpaceLimit()
    {
        return -50;
    }

    /**
     * Decodes a list of text commands into a UTF-8 string.
     *
     * Numeric commands ('n') produce no text; one whose value is below
     * getFontSpaceLimit() starts a new word. '<' commands are hexadecimal
     * strings, every other command goes through the octal decoder. Words are
     * decoded with decodeContent() and joined with a single space.
     *
     * @param array $commands
     *
     * @return string
     */
    public function decodeText($commands)
    {
        $text = '';
        $word_position = 0;
        $words = [];
        $unicode = false;
        $font_space = $this->getFontSpaceLimit();

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
                    if ((float) (trim($command[PDFObject::COMMAND])) < $font_space) {
                        $word_position = \count($words);
                    }
                    continue 2;

                case '<':
                    // Decode hexadecimal.
                    $text = self::decodeHexadecimal('<'.$command[PDFObject::COMMAND].'>');

                    if (mb_check_encoding($text, 'UTF-8')) {
                        $unicode = true;
                    }

                    break;

                default:
                    // Decode octal (if necessary).
                    $text = self::decodeOctal($command[PDFObject::COMMAND]);
            }

            // replace escaped chars
            $text = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
                $text
            );

            // add content to result string
            if (isset($words[$word_position])) {
                $words[$word_position] .= $text;
            } else {
                $words[$word_position] = $text;
            }
        }

        // Written back by index: no reference is left dangling after the loop.
        foreach ($words as $index => $word) {
            // decodeContent() may switch its flag on; each word starts from the shared value.
            $loop_unicode = $unicode;
            $words[$index] = $this->decodeContent($word, $loop_unicode);
        }

        return implode(' ', $words);
    }

    /**
     * Decodes one chunk of text with the ToUnicode table or the Encoding entry.
     *
     * @param string $text
     * @param bool   $unicode whether $text is already considered unicode; set to true
     *                        when the ToUnicode table has been applied
     *
     * @return string
     */
    public function decodeContent($text, &$unicode)
    {
        if ($this->has('ToUnicode')) {
            // The widths are missing when the table was injected with setTable() only.
            $bytes = isset($this->tableSizes['from']) ? $this->tableSizes['from'] : 0;

            if ($bytes) {
                $result = '';
                $length = \strlen($text);

                for ($i = 0; $i < $length; $i += $bytes) {
                    $char = substr($text, $i, $bytes);

                    // NOTE(review): translateChar() as defined in this class hands $char back
                    // for an unknown code and never false, so the two fallbacks below look
                    // unreachable unless a subclass overrides it. Left as is on purpose.
                    if (false !== ($decoded = $this->translateChar($char, false))) {
                        $char = $decoded;
                    } elseif ($this->has('DescendantFonts')) {
                        if ($this->get('DescendantFonts') instanceof PDFObject) {
                            $fonts = $this->get('DescendantFonts')->getHeader()->getElements();
                        } else {
                            $fonts = $this->get('DescendantFonts')->getContent();
                        }
                        $decoded = false;

                        foreach ($fonts as $font) {
                            if ($font instanceof self) {
                                if (false !== ($decoded = $font->translateChar($char, false))) {
                                    $decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
                                    break;
                                }
                            }
                        }

                        if (false !== $decoded) {
                            $char = $decoded;
                        } else {
                            $char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
                        }
                    } else {
                        $char = self::MISSING;
                    }

                    $result .= $char;
                }

                $text = $result;

                // By definition, this code generates unicode chars.
                $unicode = true;
            }
        } elseif ($this->has('Encoding')) {
            /** @var Encoding $encoding */
            $encoding = $this->get('Encoding');

            if ($encoding instanceof Encoding) {
                if ($unicode) {
                    // Split into UTF-8 characters and translate each one.
                    $chars = preg_split(
                        '//su',
                        $text,
                        -1,
                        PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
                    );
                    $result = '';

                    foreach ($chars as $char) {
                        $dec_av = hexdec(bin2hex($char));
                        $dec_ap = $encoding->translateChar($dec_av);
                        $result .= self::uchr($dec_ap);
                    }

                    $text = $result;
                } else {
                    // Translate byte by byte.
                    $result = '';
                    $length = \strlen($text);

                    for ($i = 0; $i < $length; ++$i) {
                        $dec_av = hexdec(bin2hex($text[$i]));
                        $dec_ap = $encoding->translateChar($dec_av);
                        $result .= \chr($dec_ap);
                    }

                    $text = $result;

                    if ($encoding->get('BaseEncoding')->equals('MacRomanEncoding')) {
                        $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');

                        return $text;
                    }
                }
            }
        }

        // Convert to unicode if not already done.
        if (!$unicode) {
            if ($this->get('Encoding') instanceof Element &&
                $this->get('Encoding')->equals('MacRomanEncoding')
            ) {
                $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');
            } else {
                $text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
            }
        }

        return $text;
    }

    /**
     * Number of bytes encoded by a string of hexadecimal digits, never less than 1.
     *
     * An odd number of digits is rounded up, so the result is always an integer
     * that can be used as a substr() length and as a loop step.
     *
     * @param string $hex
     *
     * @return int
     */
    private static function hexByteLength($hex)
    {
        return max(1, (int) ceil(\strlen($hex) / 2));
    }

    /**
     * Decodes hexadecimal digits holding UTF-16BE code units into UTF-8.
     *
     * The digits are read four at a time (one 16-bit unit); a shorter trailing
     * group is decoded as a code point of its own. A high surrogate directly
     * followed by a low surrogate is merged into one supplementary code point.
     *
     * @param string $hex hexadecimal digits only
     *
     * @return string
     */
    private static function decodeUtf16BeHex($hex)
    {
        if ('' === $hex) {
            return '';
        }

        $units = str_split($hex, 4);
        $count = \count($units);
        $text = '';

        for ($i = 0; $i < $count; ++$i) {
            $code = (int) hexdec($units[$i]);

            if ($code >= 0xD800 && $code <= 0xDBFF && $i + 1 < $count) {
                $low = (int) hexdec($units[$i + 1]);

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $code = 0x10000 + (($code - 0xD800) << 10) + ($low - 0xDC00);
                    ++$i;
                }
            }

            $text .= self::uchr($code);
        }

        return $text;
    }
}
513