Passed
Pull Request — master (#682)
by Konrad
03:15
created

Document::buildDetails()   B

Complexity

Conditions 10
Paths 63

Size

Total Lines 68
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 25
CRAP Score 10.1228

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 10
eloc 29
c 1
b 1
f 0
nc 63
nop 0
dl 0
loc 68
ccs 25
cts 28
cp 0.8929
crap 10.1228
rs 7.6666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <sebastien@malot.fr>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <sebastien@malot.fr>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Encoding\PDFDocEncoding;
36
37
/**
38
 * Technical references :
39
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
40
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
41
 * - http://www.php.net/manual/en/ref.pdf.php#74211
42
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
43
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
44
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
45
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
46
 *
47
 * Class Document
48
 */
49
class Document
50
{
51
    /**
52
     * @var PDFObject[]
53
     */
54
    protected $objects = [];
55
56
    /**
57
     * @var array
58
     */
59
    protected $dictionary = [];
60
61
    /**
62
     * @var Header
63
     */
64
    protected $trailer;
65
66
    /**
67
     * @var array<mixed>
68
     */
69
    protected $metadata = [];
70
71
    /**
72
     * @var array
73
     */
74
    protected $details;
75
76 98
    /**
     * Create a document whose trailer starts out as an empty Header bound
     * to this document.
     */
    public function __construct()
    {
        $emptyTrailer = new Header([], $this);

        $this->trailer = $emptyTrailer;
    }
80
81 70
    /**
     * (Re)build the derived state of the document.
     *
     * Rebuilds the type dictionary and the details array, then initialises
     * every registered object: first its header, then the object itself.
     */
    public function init()
    {
        $this->buildDictionary();
        $this->buildDetails();

        // Cascade the initialisation down to each registered object.
        foreach ($this->objects as $pdfObject) {
            $header = $pdfObject->getHeader();
            $header->init();

            $pdfObject->init();
        }
    }
93
94
    /**
     * Index the registered objects by their "Type" and "Subtype" header values.
     *
     * The dictionary ends up shaped as
     * [type => ['all' => [id => object], 'subtype' => [subtype => [id => object]]]].
     * Objects whose type is loosely equal to null are not indexed at all;
     * objects whose subtype is loosely equal to null only appear under 'all'.
     */
    protected function buildDictionary()
    {
        // Start from scratch on every call.
        $this->dictionary = [];

        foreach ($this->objects as $objectId => $pdfObject) {
            $type = $pdfObject->getHeader()->get('Type')->getContent();
            if (null == $type) {
                // Loose comparison on purpose: any "empty" type is skipped.
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = ['all' => [], 'subtype' => []];
            }
            $this->dictionary[$type]['all'][$objectId] = $pdfObject;

            $subtype = $pdfObject->getHeader()->get('Subtype')->getContent();
            if (null == $subtype) {
                continue;
            }

            // The per-subtype list is created implicitly by this assignment.
            $this->dictionary[$type]['subtype'][$subtype][$objectId] = $pdfObject;
        }
    }
126
127
    /**
     * Build the details array returned by getDetails().
     *
     * The array is assembled from, in this order:
     *  - the details of the trailer's "Info" object, when there is one;
     *  - the page count, stored under the "Pages" key (0 when the pages
     *    cannot be listed);
     *  - the collected XMP metadata, merged last with array_merge().
     *
     * String values of the first two sources are decoded/repaired before the
     * metadata is merged in; metadata values are taken as they are.
     */
    protected function buildDetails()
    {
        $details = $this->readInfoDetails();
        $details['Pages'] = $this->countPagesOrZero();

        foreach ($details as $key => $value) {
            if (\is_string($value)) {
                $details[$key] = $this->repairDetailString($value);
            }
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Read the raw document properties from the trailer's "Info" entry.
     *
     * @return array the Info header details, or an empty array when the
     *               trailer has no Info entry or the entry has no header
     */
    private function readInfoDetails()
    {
        if (!$this->trailer->has('Info')) {
            return [];
        }

        /** @var PDFObject $info */
        $info = $this->trailer->get('Info');

        // The entry could be an ElementMissing object, which has no
        // getHeader() method, so check for the method before calling it.
        if (null !== $info && method_exists($info, 'getHeader')) {
            return $info->getHeader()->getDetails();
        }

        return [];
    }

    /**
     * Count the pages of the document, treating a failure to list them as 0.
     *
     * @return int number of pages, or 0 when getPages() throws an \Exception
     */
    private function countPagesOrZero()
    {
        try {
            return \count($this->getPages());
        } catch (\Exception $e) {
            return 0;
        }
    }

    /**
     * Decode and repair one encoded document property.
     *
     * @param string $value raw property value
     *
     * @return string the repaired value
     */
    private function repairDetailString(string $value)
    {
        if (!mb_check_encoding($value, 'UTF-8')) {
            // Not valid UTF-8: treat the string as PDFDocEncoding, strip the
            // backslash + carriage-return pairs and decode the whole thing.
            $value = str_replace("\\\r", '', $value);

            return PDFDocEncoding::convertPDFDoc2UTF8($value);
        }

        // The string is already UTF-8, so only the line breaks inserted
        // every ~127 characters (apparently without regard for multi-byte
        // characters) have to be repaired.

        // Remove literal backslash + carriage return (0x5C 0x0D).
        $value = str_replace("\x5c\x0d", '', $value);

        // Remove backslash plus bytes written into the high part of a
        // multibyte unicode character. The matched sequence is replaced
        // literally so its bytes are never interpreted as regex syntax.
        while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
            $diff = (\ord($match[1]) - 182) * 64;
            $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove bytes written into the low part of a multibyte unicode
        // character (same literal replacement as above).
        while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
            $diff = \ord($match[2]) - 181;
            $newbyte = \chr(\ord($match[1]) + $diff);
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove this byte string that Adobe occasionally adds between two
        // single byte characters in a unicode string.
        return str_replace("\xe5\xb0\x8d", '', $value);
    }
199
200
    /**
     * Parse one XMP packet and merge its properties into the document metadata.
     *
     * The packet is only used when it parses successfully and its
     * "dc:format" property is "application/pdf". Values from this packet
     * are merged over the ones already collected (array_merge), so with
     * several packets the last one wins for a given property.
     *
     * @param string $content raw XML of the XMP packet
     */
    public function extractXMPMetadata(string $content): void
    {
        $parser = xml_parser_create();
        xml_parser_set_option($parser, \XML_OPTION_SKIP_WHITE, 1);

        $parsed = xml_parse_into_struct($parser, $content, $nodes);

        if (1 === $parsed) {
            /*
             * xml_parse_into_struct() yields a flat list of nodes. To turn
             * it into a nested array, $metadata always refers to the array
             * currently being filled and $parents is a stack of references
             * to the enclosing arrays. An "open" node pushes the current
             * array onto the stack and descends into a fresh child array; a
             * "close" node pops the stack and so climbs back to the parent.
             */
            $metadata = [];
            $parents = [];

            foreach ($nodes as $node) {
                // Tag names are compared in lowercase.
                $tag = strtolower($node['tag']);
                $isListItem = 'rdf:li' === $tag;

                // Structural x: and rdf: elements are ignored, except for
                // rdf:li which carries the list items.
                if (0 === strpos($tag, 'x:') || (0 === strpos($tag, 'rdf:') && !$isListItem)) {
                    continue;
                }

                $nodeType = $node['type'];

                if ('open' === $nodeType) {
                    // List items are appended, named properties are keyed
                    // by their tag.
                    if ($isListItem) {
                        $metadata[] = [];
                        $childKey = \count($metadata) - 1;
                    } else {
                        $metadata[$tag] = [];
                        $childKey = $tag;
                    }

                    // Remember the parent, then descend into the new child.
                    $parents[\count($parents)] = &$metadata;
                    $metadata = &$metadata[$childKey];
                } elseif ('complete' === $nodeType) {
                    // Nodes without a value leave no trace.
                    if (isset($node['value'])) {
                        if ($isListItem) {
                            $metadata[] = $node['value'];
                        } else {
                            $metadata[$tag] = $node['value'];
                        }
                    }
                } elseif ('close' === $nodeType) {
                    if (\is_array($metadata)) {
                        $size = \count($metadata);

                        if (1 === $size && isset($metadata[0]) && \is_string($metadata[0])) {
                            // A list holding a single string collapses to
                            // that string.
                            $metadata = $metadata[0];
                        } elseif (0 === $size) {
                            // An empty element becomes the empty string.
                            $metadata = '';
                        }
                    }

                    // Climb back to the parent and drop it from the stack.
                    $top = \count($parents) - 1;
                    $metadata = &$parents[$top];
                    unset($parents[$top]);
                }
            }

            // Only keep packets that describe a PDF. The XMP specification
            // leaves conflict resolution between several packets about the
            // same resource out of scope (section 6.1), so later packets
            // are simply merged over the values found so far.
            if (isset($metadata['dc:format']) && 'application/pdf' == $metadata['dc:format']) {
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }

        xml_parser_free($parser);
    }
303
304 1
    /**
     * Return the index of objects built by buildDictionary().
     *
     * @return array objects grouped by type; each type holds an 'all' list
     *               and a 'subtype' list
     */
    public function getDictionary(): array
    {
        $dictionary = $this->dictionary;

        return $dictionary;
    }
308
309
    /**
     * Replace the objects of the document and re-initialise it.
     *
     * The argument is cast to an array before being stored, then init() is
     * run so the dictionary and details reflect the new objects.
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $asArray = (array) $objects;
        $this->objects = $asArray;

        $this->init();
    }
318
319
    /**
     * Return every object registered in the document, keyed as they were set.
     *
     * @return PDFObject[]
     */
    public function getObjects()
    {
        $objects = $this->objects;

        return $objects;
    }
326
327
    /**
     * Look up a registered object by its id.
     *
     * @param string $id key of the object in the objects list
     *
     * @return PDFObject|Font|Page|Element|null the object, or null when no
     *                                          object is stored under that id
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }
338
339 71
    /**
     * Tell whether at least one object matches the given type (and subtype).
     *
     * @param string      $type    value of the "Type" header to look for
     * @param string|null $subtype optional "Subtype" to narrow the search
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        $matches = $this->getObjectsByType($type, $subtype);

        return [] !== $matches;
    }
343
344 75
    /**
     * Return the objects of a given type, optionally narrowed to a subtype.
     *
     * A subtype of null or the empty string means "no subtype filter" and
     * yields every object of the type.
     *
     * @param string      $type    value of the "Type" header to look for
     * @param string|null $subtype optional "Subtype" to narrow the search
     *
     * @return array matching objects keyed by object id; empty when the
     *               type or the subtype is unknown
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        $entry = $this->dictionary[$type] ?? null;
        if (null === $entry) {
            return [];
        }

        // Spelled out instead of a loose comparison against null: both null
        // and '' select the whole type.
        if (null === $subtype || '' === $subtype) {
            return $entry['all'];
        }

        return $entry['subtype'][$subtype] ?? [];
    }
360
361
    /**
     * Return every object whose type is "Font".
     *
     * @return Font[] fonts keyed by object id
     */
    public function getFonts()
    {
        $fonts = $this->getObjectsByType('Font');

        return $fonts;
    }
368
369 45
    /**
     * Return the first font of the document.
     *
     * @return Font|null the first entry of getFonts(), or null when the
     *                   document has no font
     */
    public function getFirstFont(): ?Font
    {
        // Returning from inside the loop yields the first entry, if any.
        foreach ($this->getFonts() as $font) {
            return $font;
        }

        return null;
    }
378
379
    /**
     * List the pages of the document.
     *
     * Three sources are tried in turn:
     *  1. the "Pages" entry of the first Catalog object, when that entry
     *     offers a getPages() method;
     *  2. every object of type "Pages", whose pages are merged together;
     *  3. every object of type "Page", in dictionary order and re-indexed
     *     from 0.
     *
     * @return Page[]
     *
     * @throws \Exception when none of the three sources is available
     */
    public function getPages()
    {
        // 1. Follow the catalog to the page tree.
        $catalogs = $this->getObjectsByType('Catalog');
        if (0 < \count($catalogs)) {
            $firstCatalog = reset($catalogs);

            /** @var Pages $pageTree */
            $pageTree = $firstCatalog->get('Pages');
            if (method_exists($pageTree, 'getPages')) {
                return $pageTree->getPages(true);
            }
        }

        // 2. No usable catalog: collect the kids of every Pages object.
        /** @var Pages[] $pageTrees */
        $pageTrees = $this->getObjectsByType('Pages');
        if (0 < \count($pageTrees)) {
            $collected = [];
            foreach ($pageTrees as $tree) {
                $collected = array_merge($collected, $tree->getPages(true));
            }

            return $collected;
        }

        // 3. Last resort: the standalone Page objects (unordered pages).
        $loosePages = $this->getObjectsByType('Page');
        if (0 < \count($loosePages)) {
            return array_values($loosePages);
        }

        throw new \Exception('Missing catalog.');
    }
420
421 23
    /**
     * Return the text of the document, page by page.
     *
     * Each page's text is trimmed; pages that are null or whose trimmed
     * text is empty are left out, and the remaining texts are joined with a
     * blank line ("\n\n").
     *
     * @param int|null $pageLimit when a positive integer, only the first
     *                            $pageLimit pages are read; otherwise all
     *                            pages are read
     *
     * @throws \Exception when the pages cannot be listed (see getPages())
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X pages if $pageLimit is a positive integer.
        if (\is_int($pageLimit) && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the page may be null.
            if (null === $page) {
                continue;
            }

            $text = trim($page->getText());

            // Compare against '' rather than testing truthiness, so a page
            // whose whole text is "0" is not mistaken for an empty page.
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }
445
446
    /**
     * Return the trailer header currently attached to the document.
     */
    public function getTrailer(): Header
    {
        $trailer = $this->trailer;

        return $trailer;
    }
450
451 62
    /**
     * Attach a trailer header to the document, replacing the current one.
     *
     * Only the property is updated; the details array is not rebuilt here.
     *
     * @param Header $trailer the new trailer
     */
    public function setTrailer(Header $trailer)
    {
        $newTrailer = $trailer;

        $this->trailer = $newTrailer;
    }
455
456 16
    /**
     * Return the details array assembled by buildDetails().
     *
     * The details property has no initial value, so an empty array is
     * returned when nothing has been built yet; without that fallback the
     * declared array return type would be violated by a null.
     *
     * @return array document properties; empty when not built yet
     */
    public function getDetails(): array
    {
        return $this->details ?? [];
    }
460
}
461