Passed
Pull Request — master (#615)
by Jeffrey
02:30
created

Document::buildDetails()   B

Complexity

Conditions 10
Paths 63

Size

Total Lines 68
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 25
CRAP Score 10.1228

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 10
eloc 29
c 1
b 1
f 0
nc 63
nop 0
dl 0
loc 68
ccs 25
cts 28
cp 0.8929
crap 10.1228
rs 7.6666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Encoding\PDFDocEncoding;
36
37
/**
 * In-memory model of a parsed PDF document.
 *
 * Holds the document objects, an index of those objects by their 'Type' and
 * 'Subtype' header entries, the trailer header, any XMP metadata that was
 * extracted, and the merged "details" array built from the trailer's Info
 * object, the page count and the XMP metadata.
 *
 * Technical references :
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
 *
 * Class Document
 */
class Document
{
    /**
     * Objects of the document, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Index of the objects by header 'Type', then by header 'Subtype':
     * [type => ['all' => [id => object], 'subtype' => [subtype => [id => object]]]].
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * Values collected by extractXMPMetadata().
     *
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * Merged document details; rebuilt by buildDetails().
     *
     * Defaults to an empty array so that getDetails() honours its `array`
     * return type even when it is called before init()/setObjects().
     *
     * @var array
     */
    protected $details = [];

    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }

    /**
     * Rebuild the type dictionary and the details array, then propagate the
     * initialisation to every object and to its header.
     */
    public function init()
    {
        $this->buildDictionary();

        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $object) {
            $object->getHeader()->init();
            $object->init();
        }
    }

    /**
     * Build dictionary based on type header field.
     *
     * Objects whose 'Type' content loosely equals null are left out of the
     * dictionary; objects whose 'Subtype' content loosely equals null are
     * only registered under 'all'.
     */
    protected function buildDictionary()
    {
        // Start from scratch so repeated calls do not accumulate entries.
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            // Cache objects by type and subtype
            $type = $object->getHeader()->get('Type')->getContent();

            // The loose comparison is kept on purpose: the concrete return
            // type of getContent() is not visible from this class.
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = [
                    'all' => [],
                    'subtype' => [],
                ];
            }

            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();
            if (null != $subtype) {
                if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                    $this->dictionary[$type]['subtype'][$subtype] = [];
                }
                $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
            }
        }
    }

    /**
     * Build details array.
     *
     * The details are, in order of increasing precedence: the details of the
     * trailer's 'Info' object (when present), the 'Pages' count, and the XMP
     * metadata gathered so far. String values coming from the first two
     * sources are decoded/repaired before the metadata is merged on top.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count; a document without any page source
        // (getPages() throws) is reported as having zero pages.
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        // Decode and repair encoded document properties
        foreach ($details as $key => $value) {
            if (\is_string($value)) {
                $details[$key] = $this->decodeDetailString($value);
            }
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Decode or repair a single string property of the details array.
     *
     * Strings that are not valid UTF-8 are treated as PDFDocEncoding and
     * converted as a whole. Strings that already are valid UTF-8 only get the
     * byte sequences removed/rewritten that the original comments attribute
     * to Adobe's insertion of line breaks into long values.
     *
     * @param string $value raw property value
     *
     * @return string the repaired value
     */
    private function decodeDetailString(string $value)
    {
        if (!mb_check_encoding($value, 'UTF-8')) {
            // If the string is just PDFDocEncoding, remove any backslash +
            // carriage-return pairs and decode the whole thing.
            return PDFDocEncoding::convertPDFDoc2UTF8(str_replace("\\\r", '', $value));
        }

        // Remove literal backslash + carriage return ("\" followed by 0x0D)
        $value = str_replace("\x5c\x0d", '', $value);

        // Remove backslash plus bytes written into high part of
        // multibyte unicode character
        while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
            $diff = (\ord($match[1]) - 182) * 64;
            $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
            // $match[0] is the exact byte sequence that matched; replacing it
            // literally avoids building a second pattern from raw bytes.
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove bytes written into low part of multibyte unicode
        // character
        while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
            $diff = \ord($match[2]) - 181;
            $newbyte = \chr(\ord($match[1]) + $diff);
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove this byte string that Adobe occasionally adds
        // between two single byte characters in a unicode string
        return str_replace("\xe5\xb0\x8d", '', $value);
    }

    /**
     * Extract XMP Metadata
     *
     * Parses $content as XML and, when the packet declares
     * dc:format = application/pdf, merges its values over the metadata
     * collected so far. Content that fails to parse is ignored.
     */
    public function extractXMPMetadata(string $content): void
    {
        $xml = xml_parser_create();
        xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);

        if (1 === xml_parse_into_struct($xml, $content, $values, $index)) {
            /*
             * short overview about the following code parts:
             *
             * The output of xml_parse_into_struct is a single dimensional array (= $values), and the $stack is a last-on,
             * first-off array of pointers to positions in $metadata, while iterating through it, that potentially turn the
             * results into a more intuitive multi-dimensional array. When an "open" XML tag is encountered,
             * we save the current $metadata context in the $stack, then create a child array of $metadata and
             * make that the current $metadata context. When a "close" XML tag is encountered, the operations are
             * reversed: the most recently added $metadata context from $stack (IOW, the parent of the current
             * element) is set as the current $metadata context.
             */
            $metadata = [];
            $stack = [];
            foreach ($values as $val) {
                // Standardize to lowercase
                $val['tag'] = strtolower($val['tag']);

                // Ignore structural x: and rdf: XML elements
                if (0 === strpos($val['tag'], 'x:')) {
                    continue;
                } elseif (0 === strpos($val['tag'], 'rdf:') && 'rdf:li' !== $val['tag']) {
                    continue;
                }

                switch ($val['type']) {
                    case 'open':
                        // Create an array of list items
                        if ('rdf:li' === $val['tag']) {
                            $metadata[] = [];

                            // Move up one level in the stack
                            $stack[\count($stack)] = &$metadata;
                            $metadata = &$metadata[\count($metadata) - 1];
                        } else {
                            // Else create an array of named values
                            $metadata[$val['tag']] = [];

                            // Move up one level in the stack
                            $stack[\count($stack)] = &$metadata;
                            $metadata = &$metadata[$val['tag']];
                        }
                        break;

                    case 'complete':
                        if (isset($val['value'])) {
                            if ('rdf:li' === $val['tag']) {
                                // Assign a value to this list item
                                $metadata[] = $val['value'];
                            } else {
                                // Else assign a value to this property
                                $metadata[$val['tag']] = $val['value'];
                            }
                        }
                        break;

                    case 'close':
                        // If the value of this property is a single-
                        // element array where the element is of type
                        // string, use the value of the first list item
                        // as the value for this property
                        if (\is_array($metadata) && isset($metadata[0]) && 1 === \count($metadata) && \is_string($metadata[0])) {
                            $metadata = $metadata[0];
                        }

                        // Move down one level in the stack
                        $metadata = &$stack[\count($stack) - 1];
                        unset($stack[\count($stack) - 1]);
                        break;
                }
            }

            // Only use this metadata if it's referring to a PDF
            if (isset($metadata['dc:format']) && 'application/pdf' === $metadata['dc:format']) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // Source: https://www.adobe.com/devnet/xmp.html
                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
                // So if there are multiple XMP blocks, just merge the values
                // of each found block over top of the existing values
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }
        xml_parser_free($xml);
    }

    /**
     * @return array the type/subtype index built by buildDictionary()
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }

    /**
     * Replace the document objects and re-run init().
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        $this->init();
    }

    /**
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }

    /**
     * @return PDFObject|Font|Page|Element|null the object stored under $id, or null when there is none
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }

    /**
     * Tell whether getObjectsByType() would return at least one object.
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return [] !== $this->getObjectsByType($type, $subtype);
    }

    /**
     * Return the objects registered for a type, optionally narrowed down to
     * one subtype.
     *
     * A null or empty $subtype means "no subtype filter" and yields every
     * object of the type; any other value yields only the objects registered
     * under exactly that subtype.
     *
     * @return array objects keyed by object id; empty when nothing matches
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // Spelled out instead of `null != $subtype`: for a ?string both forms
        // agree, but this one makes the empty-string case explicit.
        if (null !== $subtype && '' !== $subtype) {
            return $this->dictionary[$type]['subtype'][$subtype] ?? [];
        }

        return $this->dictionary[$type]['all'];
    }

    /**
     * @return Font[]
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }

    /**
     * @return Font|null the first registered font, or null when the document has none
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();
        if ([] === $fonts) {
            return null;
        }

        return reset($fonts);
    }

    /**
     * List the pages of the document.
     *
     * Sources are tried in this order: the 'Pages' entry of the first
     * 'Catalog' object, then every 'Pages' object, then the bare 'Page'
     * objects (unordered).
     *
     * @return Page[]
     *
     * @throws \Exception when none of the sources is available
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogues = $this->getObjectsByType('Catalog');
            $catalogue = reset($catalogues);

            /** @var Pages $object */
            $object = $catalogue->get('Pages');
            if (method_exists($object, 'getPages')) {
                return $object->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $pages = [];

            /** @var Pages[] $objects */
            $objects = $this->getObjectsByType('Pages');
            foreach ($objects as $object) {
                $pages = array_merge($pages, $object->getPages(true));
            }

            return $pages;
        }

        if ($this->hasObjectsByType('Page')) {
            // Search for 'page' (unordered pages).
            return array_values($this->getObjectsByType('Page'));
        }

        throw new \Exception('Missing catalog.');
    }

    /**
     * Return the text of the pages, trimmed per page and joined by a blank
     * line. Pages without any text are left out.
     *
     * @param int|null $pageLimit when a positive integer, only the first $pageLimit pages are read
     *
     * @throws \Exception when getPages() cannot find any page source
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set and positive.
        if (\is_int($pageLimit) && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            $text = trim($page->getText());
            // Compare against '' rather than testing truthiness: a page whose
            // whole text is "0" is falsy in PHP but must still be kept.
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }

    public function getTrailer(): Header
    {
        return $this->trailer;
    }

    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }

    /**
     * @return array the details built by the last buildDetails() run; empty before the first run
     */
    public function getDetails(): array
    {
        return $this->details;
    }
}
453