Passed
Pull Request — master (#716)
by Konrad
03:03
created

Document::buildDetails()   B

Complexity

Conditions 10
Paths 63

Size

Total Lines 68
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 22
CRAP Score 10.9841

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 10
eloc 29
c 1
b 1
f 0
nc 63
nop 0
dl 0
loc 68
ccs 22
cts 28
cp 0.7856
crap 10.9841
rs 7.6666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

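Commonly applied refactorings include Extract Method and, when a method carries many parameters or temporary variables, Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.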

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Encoding\PDFDocEncoding;
36
37
/**
38
 * Technical references :
39
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
40
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
41
 * - http://www.php.net/manual/en/ref.pdf.php#74211
42
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
43
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
44
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
45
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
46
 *
47
 * Class Document
48
 */
49
class Document
50
{
51
    /**
52
     * @var PDFObject[]
53
     */
54
    protected $objects = [];
55
56
    /**
57
     * @var array
58
     */
59
    protected $dictionary = [];
60
61
    /**
62
     * @var Header
63
     */
64
    protected $trailer;
65
66
    /**
67
     * @var array<mixed>
68
     */
69
    protected $metadata = [];
70
71
    /**
72
     * @var array
73
     */
74
    protected $details;
75
76 102
    /**
     * Creates the document with a trailer Header constructed from an empty
     * array and this document instance.
     */
    public function __construct()
    {
        $emptyTrailer = new Header([], $this);

        $this->trailer = $emptyTrailer;
    }
80
81 73
    /**
     * Rebuilds the type dictionary and the details array, then initialises
     * every registered object: its header first, then the object itself.
     */
    public function init()
    {
        $this->buildDictionary();
        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $pdfObject) {
            $header = $pdfObject->getHeader();
            $header->init();

            $pdfObject->init();
        }
    }
93
94
    /**
     * Build dictionary based on type header field.
     *
     * Every object in $this->objects whose "Type" header content is not
     * loosely equal to null is stored under
     * $this->dictionary[<type>]['all'][<id>]. When its "Subtype" header
     * content is not loosely equal to null either, it is also stored under
     * $this->dictionary[<type>]['subtype'][<subtype>][<id>].
     */
    protected function buildDictionary()
    {
        // Start from scratch on every call.
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            // Cache objects by type and subtype
            $type = $object->getHeader()->get('Type')->getContent();
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = ['all' => [], 'subtype' => []];
            }
            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();
            if (null == $subtype) {
                continue;
            }

            // The nested assignment creates the per-subtype bucket on first use.
            $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
        }
    }
126
127
    /**
     * Build details array.
     *
     * Steps, in order:
     *  1. Take the details of the trailer's "Info" header, when the trailer
     *     has such an entry and it exposes getHeader().
     *  2. Store the page count under "Pages" (0 when getPages() throws an
     *     \Exception).
     *  3. Decode / repair every string value.
     *  4. Merge $this->metadata on top (its keys win) and store the result in
     *     $this->details.
     */
    protected function buildDetails()
    {
        // Build details array.
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        // Decode and repair encoded document properties
        foreach ($details as $key => $value) {
            if (\is_string($value)) {
                $details[$key] = $this->decodeDetailString($value);
            }
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Decodes or repairs a single string property of the details array.
     *
     * Valid UTF-8 input only gets the Adobe line-break artefacts removed;
     * anything else is treated as PDFDocEncoding and converted to UTF-8.
     *
     * @param string $value raw property value
     *
     * @return string cleaned-up value
     */
    private function decodeDetailString(string $value): string
    {
        if (!mb_check_encoding($value, 'UTF-8')) {
            // If the string is just PDFDocEncoding, remove any line-feeds
            // and decode the whole thing.
            return PDFDocEncoding::convertPDFDoc2UTF8(str_replace("\\\r", '', $value));
        }

        // If the string is already UTF-8 encoded, that means we only need to
        // repair Adobe's ham-fisted insertion of line-feeds every ~127
        // characters, which doesn't seem to be multi-byte safe.

        // Remove literal backslash + carriage return
        $value = str_replace("\x5c\x0d", '', $value);

        // Remove backslash plus bytes written into high part of multibyte
        // unicode character.
        while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
            $diff = (\ord($match[1]) - 182) * 64;
            $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
            // $match[0] is the exact byte sequence that matched. Replacing it
            // literally avoids building a second regex out of unescaped
            // bytes, which broke whenever a byte was a regex metacharacter
            // or the "/" delimiter.
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove bytes written into low part of multibyte unicode character
        while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
            $diff = \ord($match[2]) - 181;
            $newbyte = \chr(\ord($match[1]) + $diff);
            // Literal replacement for the same reason as above.
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove this byte string that Adobe occasionally adds between two
        // single byte characters in a unicode string
        return str_replace("\xe5\xb0\x8d", '', $value);
    }
199
200
    /**
     * Extract XMP Metadata
     *
     * Parses $content with the XML parser. When parsing succeeds and the
     * resulting structure has a "dc:format" entry equal to
     * "application/pdf", its values are merged over $this->metadata.
     * Nothing happens when parsing fails or the format does not match.
     */
    public function extractXMPMetadata(string $content): void
    {
        $parser = xml_parser_create();
        xml_parser_set_option($parser, \XML_OPTION_SKIP_WHITE, 1);
        $parsedOk = 1 === xml_parse_into_struct($parser, $content, $values, $index);
        // $values is a plain array, so the parser is no longer needed.
        xml_parser_free($parser);

        if (!$parsedOk) {
            return;
        }

        /*
         * short overview about the following code parts:
         *
         * xml_parse_into_struct yields a flat list (= $values). $stack is a
         * last-in, first-out list of references to positions in $metadata
         * which is used to turn that flat list into a nested array. On an
         * "open" tag the current $metadata context is pushed onto $stack and
         * a fresh child array becomes the current context. On a "close" tag
         * the most recently pushed context (the parent of the current
         * element) becomes the current context again.
         */
        $metadata = [];
        $stack = [];
        foreach ($values as $node) {
            // Standardize to lowercase
            $tag = strtolower($node['tag']);
            $isListItem = 'rdf:li' === $tag;

            // Ignore structural x: and rdf: XML elements (rdf:li is kept)
            if (0 === strpos($tag, 'x:') || (!$isListItem && 0 === strpos($tag, 'rdf:'))) {
                continue;
            }

            $nodeType = $node['type'];

            if ('open' === $nodeType) {
                if ($isListItem) {
                    // Create an array of list items
                    $metadata[] = [];
                    $childKey = \count($metadata) - 1;
                } else {
                    // Else create an array of named values
                    $metadata[$tag] = [];
                    $childKey = $tag;
                }

                // Move up one level in the stack
                $stack[\count($stack)] = &$metadata;
                $metadata = &$metadata[$childKey];
            } elseif ('complete' === $nodeType) {
                if (isset($node['value'])) {
                    if ($isListItem) {
                        // Assign a value to this list item
                        $metadata[] = $node['value'];
                    } else {
                        // Else assign a value to this property
                        $metadata[$tag] = $node['value'];
                    }
                }
            } elseif ('close' === $nodeType) {
                // If the value of this property is an array
                if (\is_array($metadata)) {
                    if ([] === $metadata) {
                        // An empty array becomes the empty string
                        $metadata = '';
                    } elseif (1 === \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) {
                        // A single string list item becomes the value of
                        // this property
                        $metadata = $metadata[0];
                    }
                }

                // Move down one level in the stack
                $metadata = &$stack[\count($stack) - 1];
                unset($stack[\count($stack) - 1]);
            }
        }

        // Only use this metadata if it's referring to a PDF
        if (!isset($metadata['dc:format']) || 'application/pdf' != $metadata['dc:format']) {
            return;
        }

        // According to the XMP specifications: 'Conflict resolution
        // for separate packets that describe the same resource is
        // beyond the scope of this document.' - Section 6.1
        // Source: https://www.adobe.com/devnet/xmp.html
        // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
        // So if there are multiple XMP blocks, just merge the values
        // of each found block over top of the existing values
        $this->metadata = array_merge($this->metadata, $metadata);
    }
303
304 1
    /**
     * Returns the dictionary filled by buildDictionary(): objects grouped by
     * type, each with an 'all' and a 'subtype' list.
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }
308
309
    /**
     * Replaces the document's objects (cast to array) and re-runs init().
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $objectList = (array) $objects;
        $this->objects = $objectList;

        $this->init();
    }
318
319
    /**
     * Returns the objects currently stored in this document.
     *
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }
326
327
    /**
     * Looks up a single object by its id.
     *
     * @return PDFObject|Font|Page|Element|null the stored object, or null
     *                                          when no object is set for $id
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }
338
339 74
    /**
     * Tells whether getObjectsByType() returns at least one object for the
     * given type (and subtype).
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        $matches = $this->getObjectsByType($type, $subtype);

        return [] !== $matches;
    }
343
344 78
    /**
     * Returns the dictionary entries for a type, optionally narrowed down to
     * one subtype.
     *
     * A $subtype of null or '' means "no subtype filter": all objects of the
     * type are returned. An unknown type or unknown subtype yields [].
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        $entry = $this->dictionary[$type];

        // Explicit form of the loose null comparison: both null and the
        // empty string select the unfiltered list.
        if (null === $subtype || '' === $subtype) {
            return $entry['all'];
        }

        return $entry['subtype'][$subtype] ?? [];
    }
360
361
    /**
     * Returns every object registered under the type "Font".
     *
     * @return Font[]
     */
    public function getFonts()
    {
        $fonts = $this->getObjectsByType('Font');

        return $fonts;
    }
368
369 47
    /**
     * Returns the first entry of getFonts(), or null when there are no fonts.
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();

        return [] === $fonts ? null : reset($fonts);
    }
378
379
    /**
     * Returns the pages of the document, trying three sources in order:
     * the "Pages" entry of the first "Catalog" object, all "Pages" objects,
     * and finally the plain "Page" objects (re-indexed, unordered).
     *
     * @return Page[]
     *
     * @throws \Exception when none of the three sources is available
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogs = $this->getObjectsByType('Catalog');
            $firstCatalog = reset($catalogs);

            /** @var Pages $pagesNode */
            $pagesNode = $firstCatalog->get('Pages');
            if (method_exists($pagesNode, 'getPages')) {
                return $pagesNode->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $collected = [];

            /** @var Pages[] $pagesNodes */
            $pagesNodes = $this->getObjectsByType('Pages');
            foreach ($pagesNodes as $pagesNode) {
                $kids = $pagesNode->getPages(true);
                $collected = array_merge($collected, $kids);
            }

            return $collected;
        }

        if ($this->hasObjectsByType('Page')) {
            // Search for 'page' (unordered pages).
            return array_values($this->getObjectsByType('Page'));
        }

        throw new \Exception('Missing catalog.');
    }
420
421 24
    /**
     * Returns the trimmed text of the document's pages, joined by a blank
     * line.
     *
     * @param int|null $pageLimit when a positive integer, only the first
     *                            $pageLimit pages are used; null, zero or a
     *                            negative value means all pages
     *
     * @return string page texts separated by "\n\n"; pages whose trimmed
     *                text is empty are skipped
     *
     * @throws \Exception propagated from getPages()
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set and positive.
        if (\is_int($pageLimit) && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            $text = trim($page->getText());
            // Compare against '' explicitly: a truthiness test would also
            // discard a page whose whole text is "0".
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }
445
446
    /**
     * Returns the trailer header currently assigned to this document.
     */
    public function getTrailer(): Header
    {
        return $this->trailer;
    }
450
451 65
    /**
     * Replaces the trailer header of this document.
     *
     * The details array is not rebuilt here; that only happens in init().
     */
    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }
455
456 17
    /**
     * Returns the details array produced by buildDetails().
     *
     * @return array the document details, or an empty array when
     *               buildDetails() has not run yet (the property has no
     *               default, so returning it directly would violate the
     *               array return type)
     */
    public function getDetails(): array
    {
        return $this->details ?? [];
    }
460
}
461