Document::buildDetails()   B
last analyzed

Complexity

Conditions 10
Paths 63

Size

Total Lines 68
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 25
CRAP Score 10.1228

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 10
eloc 29
c 1
b 1
f 0
nc 63
nop 0
dl 0
loc 68
ccs 25
cts 28
cp 0.8929
crap 10.1228
rs 7.6666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

- Extract Method

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Encoding\PDFDocEncoding;
36
use Smalot\PdfParser\Exception\MissingCatalogException;
37
38
/**
39
 * Technical references :
40
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
41
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
42
 * - http://www.php.net/manual/en/ref.pdf.php#74211
43
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
44
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
45
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
46
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
47
 *
48
 * Class Document
49
 */
50
class Document
51
{
52
    /**
53
     * @var PDFObject[]
54
     */
55
    protected $objects = [];
56
57
    /**
58
     * @var array
59
     */
60
    protected $dictionary = [];
61
62
    /**
63
     * @var Header
64
     */
65
    protected $trailer;
66
67
    /**
68
     * @var array<mixed>
69
     */
70
    protected $metadata = [];
71
72
    /**
73
     * @var array
74
     */
75
    protected $details;
76
77 104
    /**
     * Create a document whose trailer starts out as an empty Header bound
     * to this document.
     */
    public function __construct()
    {
        $emptyTrailer = new Header([], $this);

        $this->trailer = $emptyTrailer;
    }
81
82 75
    /**
     * (Re)build the type dictionary and the details array, then initialise
     * every registered object: its header first, then the object itself.
     */
    public function init()
    {
        $this->buildDictionary();
        $this->buildDetails();

        // Cascade the initialisation down to each object.
        foreach ($this->objects as $pdfObject) {
            $pdfObject->getHeader()->init();
            $pdfObject->init();
        }
    }
94
95
    /**
     * Index the registered objects by the 'Type' and 'Subtype' entries of
     * their headers.
     *
     * For each type the dictionary holds an 'all' map (object id => object)
     * and a 'subtype' map (subtype => object id => object). Objects whose
     * type content loosely equals null are not indexed at all.
     */
    protected function buildDictionary()
    {
        // Start from scratch on every call.
        $this->dictionary = [];

        foreach ($this->objects as $objectId => $pdfObject) {
            $type = $pdfObject->getHeader()->get('Type')->getContent();

            // Loose comparison kept on purpose: any content that loosely
            // equals null is treated as "no type".
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = ['all' => [], 'subtype' => []];
            }
            $this->dictionary[$type]['all'][$objectId] = $pdfObject;

            $subtype = $pdfObject->getHeader()->get('Subtype')->getContent();
            if (null == $subtype) {
                continue;
            }

            if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                $this->dictionary[$type]['subtype'][$subtype] = [];
            }
            $this->dictionary[$type]['subtype'][$subtype][$objectId] = $pdfObject;
        }
    }
127
128
    /**
     * Build the details array.
     *
     * Sources, in order: the header details of the trailer's 'Info' object
     * (when present and exposing getHeader()), the page count stored under
     * 'Pages' (0 when getPages() throws), and finally the collected XMP
     * metadata, which is merged over the other entries.
     * Every string entry is repaired/decoded by decodeDetailString().
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        // Decode and repair encoded document properties
        foreach ($details as $key => $value) {
            if (\is_string($value)) {
                $details[$key] = $this->decodeDetailString($value);
            }
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Repair and decode a single string property of the details array.
     *
     * Valid UTF-8 input only gets the stray bytes removed that show up
     * around multibyte characters; anything else is treated as
     * PDFDocEncoding and converted to UTF-8.
     *
     * @param string $value raw property value
     *
     * @return string repaired / decoded value
     */
    private function decodeDetailString(string $value)
    {
        if (!mb_check_encoding($value, 'UTF-8')) {
            // If the string is just PDFDocEncoding, remove any backslash +
            // carriage-return pairs and decode the whole thing.
            return PDFDocEncoding::convertPDFDoc2UTF8(str_replace("\\\r", '', $value));
        }

        // The string is already UTF-8 encoded: only repair the bytes that
        // get inserted every ~127 characters, which doesn't seem to be
        // multi-byte safe.

        // Remove literal backslash + carriage return
        $value = str_replace("\x5c\x0d", '', $value);

        // Remove backslash plus bytes written into high part of multibyte
        // unicode character.
        // The matched text is replaced literally (str_replace) rather than
        // re-injected into a regular expression: the captured bytes are
        // arbitrary and may be regex metacharacters.
        while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
            $diff = (\ord($match[1]) - 182) * 64;
            $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove bytes written into low part of multibyte unicode character
        while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
            $diff = \ord($match[2]) - 181;
            $newbyte = \chr(\ord($match[1]) + $diff);
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove this byte string that Adobe occasionally adds between two
        // single byte characters in a unicode string
        return str_replace("\xe5\xb0\x8d", '', $value);
    }
200
201
    /**
     * Parse an XMP packet and merge its properties into the document metadata.
     *
     * The flat list produced by xml_parse_into_struct() is folded into a
     * nested array: $metadata always references the element currently being
     * filled and $stack holds references to its ancestors. An 'open' tag
     * pushes the current context and descends into a new child array, a
     * 'close' tag collapses the child (single string item => that string,
     * empty array => '') and pops back to the parent.
     *
     * Tags starting with 'x:' and all 'rdf:' tags except 'rdf:li' are
     * skipped. Content that does not parse is ignored. The result is only
     * merged when it has no 'dc:format' or when that format is
     * 'application/pdf'; later packets overwrite earlier values.
     *
     * @param string $content raw XMP (XML) content
     */
    public function extractXMPMetadata(string $content): void
    {
        $parser = xml_parser_create();
        xml_parser_set_option($parser, \XML_OPTION_SKIP_WHITE, 1);

        $parsed = xml_parse_into_struct($parser, $content, $values, $index);
        if (1 === $parsed) {
            $metadata = [];
            $stack = [];

            foreach ($values as $val) {
                // Standardize to lowercase
                $tag = strtolower($val['tag']);

                // Ignore structural x: and rdf: XML elements
                if (0 === strpos($tag, 'x:') || (0 === strpos($tag, 'rdf:') && 'rdf:li' !== $tag)) {
                    continue;
                }

                $type = $val['type'];

                if ('open' === $type) {
                    if ('rdf:li' === $tag) {
                        // List item: append an anonymous child array
                        $metadata[] = [];
                        $childKey = \count($metadata) - 1;
                    } else {
                        // Named property: create a child array under the tag name
                        $metadata[$tag] = [];
                        $childKey = $tag;
                    }

                    // Remember the parent, then descend into the child
                    $stack[\count($stack)] = &$metadata;
                    $metadata = &$metadata[$childKey];
                } elseif ('complete' === $type) {
                    if (isset($val['value'])) {
                        if ('rdf:li' === $tag) {
                            // Value of a list item
                            $metadata[] = $val['value'];
                        } else {
                            // Value of a named property
                            $metadata[$tag] = $val['value'];
                        }
                    }
                } elseif ('close' === $type) {
                    if (\is_array($metadata)) {
                        $itemCount = \count($metadata);
                        if (1 === $itemCount && isset($metadata[0]) && \is_string($metadata[0])) {
                            // A single string list item stands for the
                            // whole property
                            $metadata = $metadata[0];
                        } elseif (0 === $itemCount) {
                            // An empty element becomes the empty string
                            $metadata = '';
                        }
                    }

                    // Return to the parent context
                    $metadata = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                }
            }

            // Only use this metadata if it's referring to a PDF
            $describesPdf = !isset($metadata['dc:format']) || 'application/pdf' === $metadata['dc:format'];
            if ($describesPdf) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // Source: https://www.adobe.com/devnet/xmp.html
                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
                // So if there are multiple XMP blocks, just merge the values
                // of each found block over top of the existing values
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }

        // TODO: remove this if-clause and its content when dropping PHP 7 support
        if (version_compare(PHP_VERSION, '8.0.0', '<')) {
            // ref: https://www.php.net/manual/en/function.xml-parser-free.php
            xml_parser_free($parser);

            // to avoid memory leaks; documentation said:
            // > it was necessary to also explicitly unset the reference to parser to avoid memory leaks
            unset($parser);
        }
    }
313 74
314
    /**
     * Return the dictionary of objects indexed by type and subtype.
     *
     * @return array the structure filled in by buildDictionary()
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }
318
319
    /**
     * Replace the document's objects and re-run init() on the new set.
     *
     * @param PDFObject[] $objects cast to array before being stored
     */
    public function setObjects($objects = [])
    {
        $asArray = (array) $objects;
        $this->objects = $asArray;

        $this->init();
    }
328
329
    /**
     * Return every object registered on this document.
     *
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }
336
337 3
    /**
     * Look up a single object by its id.
     *
     * @return PDFObject|Font|Page|Element|null the object, or null when no
     *                                           object is set under that id
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }
348 15
349
    /**
     * Tell whether getObjectsByType() yields at least one object for the
     * given type (and optional subtype).
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        $matches = $this->getObjectsByType($type, $subtype);

        return [] !== $matches;
    }
353
354
    /**
     * Return the objects indexed under a type, optionally narrowed to a subtype.
     *
     * @param string      $type    type key of the dictionary
     * @param string|null $subtype subtype to filter on; null and the empty
     *                             string both mean "no subtype filter"
     *
     * @return array object id => object; empty when the type or the
     *               requested subtype is not in the dictionary
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        $entry = $this->dictionary[$type];

        // Strict checks spell out what the former loose `null != $subtype`
        // did implicitly: an empty string is handled like null.
        if (null === $subtype || '' === $subtype) {
            return $entry['all'];
        }

        return $entry['subtype'][$subtype] ?? [];
    }
370 49
371
    /**
     * Return the objects indexed under the 'Font' type.
     *
     * @return Font[]
     */
    public function getFonts()
    {
        $fonts = $this->getObjectsByType('Font');

        return $fonts;
    }
378
379
    /**
     * Return the first font of getFonts(), or null when there is none.
     */
    public function getFirstFont(): ?Font
    {
        // Leaving the loop on the first iteration yields the first element.
        foreach ($this->getFonts() as $font) {
            return $font;
        }

        return null;
    }
388
389 67
    /**
     * Collect the pages of the document.
     *
     * Three strategies are tried in order:
     *  1. the 'Pages' entry of the first 'Catalog' object, when it exposes
     *     getPages();
     *  2. the merged getPages(true) results of every 'Pages' object;
     *  3. all 'Page' objects, re-indexed (unordered pages).
     *
     * @return Page[]
     *
     * @throws MissingCatalogException when none of the strategies applies
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Go through the catalog to list the pages.
            $catalogues = $this->getObjectsByType('Catalog');
            $firstCatalogue = reset($catalogues);

            /** @var Pages $pageTree */
            $pageTree = $firstCatalogue->get('Pages');
            if (method_exists($pageTree, 'getPages')) {
                return $pageTree->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Gather the kids of every 'Pages' object.
            $collected = [];

            /** @var Pages[] $pageNodes */
            $pageNodes = $this->getObjectsByType('Pages');
            foreach ($pageNodes as $pageNode) {
                $collected = array_merge($collected, $pageNode->getPages(true));
            }

            return $collected;
        }

        if ($this->hasObjectsByType('Page')) {
            // Fall back on the bare 'Page' objects (unordered pages).
            return array_values($this->getObjectsByType('Page'));
        }

        throw new MissingCatalogException('Missing catalog.');
    }
430
431
    /**
     * Return the text of the document, pages separated by a blank line.
     *
     * @param int|null $pageLimit when a positive integer, only the first
     *                            $pageLimit pages are read
     *
     * @return string trimmed page texts joined with "\n\n"; pages that are
     *                null or whose trimmed text is empty are skipped
     *
     * @throws MissingCatalogException propagated from getPages()
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set and positive.
        if (\is_int($pageLimit) && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            $text = trim($page->getText());
            // Compare against '' explicitly: a truthiness test would also
            // discard a page whose whole text is "0".
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }
455
456
    /**
     * Return the trailer header of the document.
     */
    public function getTrailer(): Header
    {
        return $this->trailer;
    }
460
461
    /**
     * Replace the trailer header of the document.
     *
     * Only the property is updated; the details array is not rebuilt here.
     */
    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }
465
466
    /**
     * Return the details array built by buildDetails().
     *
     * @return array the document details; an empty array when they have not
     *               been built yet (the property has no default value, and
     *               returning it as-is would violate the array return type)
     */
    public function getDetails(): array
    {
        return $this->details ?? [];
    }
470
}
471