Document::buildDetails()   B
last analyzed

Complexity

Conditions 10
Paths 63

Size

Total Lines 68
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 25
CRAP Score 10.1228

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 10
eloc 29
c 1
b 1
f 0
nc 63
nop 0
dl 0
loc 68
ccs 25
cts 28
cp 0.8929
crap 10.1228
rs 7.6666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive fragment of the method body into its own, well-named method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Encoding\PDFDocEncoding;
36
use Smalot\PdfParser\Exception\MissingCatalogException;
37
38
/**
39
 * Technical references :
40
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
41
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
42
 * - http://www.php.net/manual/en/ref.pdf.php#74211
43
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
44
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
45
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
46
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
47
 *
48
 * Class Document
49
 */
50
class Document
51
{
52
    /**
     * Objects registered on the document, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Objects indexed by their "Type" header value. Each entry holds an
     * 'all' map and a 'subtype' map, both keyed by object id.
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * XMP properties merged from every packet given to extractXMPMetadata().
     *
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * Document properties assembled by buildDetails(). Initialised to an
     * empty array so the property is never null, even before init() has run.
     *
     * @var array
     */
    protected $details = [];
76
77 104
    /**
     * Creates a document whose trailer is an empty Header bound to this
     * document.
     */
    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }
81
82 75
    /**
     * Rebuilds the type dictionary and the details array, then initialises
     * every registered object (its header first, then the object itself).
     */
    public function init()
    {
        $this->buildDictionary();
        $this->buildDetails();

        // Cascade the initialisation down to each object.
        foreach ($this->objects as $pdfObject) {
            $pdfObject->getHeader()->init();
            $pdfObject->init();
        }
    }
94
95
    /**
     * Index the registered objects by their "Type" and "Subtype" header
     * values.
     *
     * The dictionary is rebuilt from scratch on every call. Each type entry
     * contains an 'all' map (every object of that type) and a 'subtype' map
     * (objects grouped by subtype); both are keyed by object id.
     */
    protected function buildDictionary()
    {
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            $type = $object->getHeader()->get('Type')->getContent();

            // Objects without a usable type are left out of the index.
            // The loose comparison is intentional: it also skips ''.
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = ['all' => [], 'subtype' => []];
            }
            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();
            if (null == $subtype) {
                continue;
            }

            if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                $this->dictionary[$type]['subtype'][$subtype] = [];
            }
            $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
        }
    }
127
128
    /**
     * Build details array.
     *
     * Starts from the details of the trailer's "Info" object (when present),
     * adds the page count under the 'Pages' key (0 when the pages cannot be
     * resolved), repairs/decodes every string value, and finally merges the
     * XMP metadata on top before storing the result in $this->details.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count; a failure to resolve pages counts as zero.
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        // Decode and repair encoded document properties
        foreach ($details as $key => $value) {
            if (\is_string($value)) {
                $details[$key] = $this->repairDetailString($value);
            }
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Repair or decode a single string property of the details array.
     *
     * Strings that are valid UTF-8 only get the inserted line breaks and
     * stray bytes removed; anything else is treated as PDFDocEncoding and
     * converted with PDFDocEncoding::convertPDFDoc2UTF8().
     *
     * @param string $value raw property value
     *
     * @return mixed the repaired value (whatever convertPDFDoc2UTF8() yields
     *               for non-UTF-8 input)
     */
    private function repairDetailString(string $value)
    {
        if (!mb_check_encoding($value, 'UTF-8')) {
            // If the string is just PDFDocEncoding, remove any
            // backslash + carriage-return pairs and decode the whole thing.
            $value = str_replace("\\\r", '', $value);

            return PDFDocEncoding::convertPDFDoc2UTF8($value);
        }

        // The string is already UTF-8 encoded, so we only need to repair
        // the line breaks inserted every ~127 characters, which do not seem
        // to be multi-byte safe.

        // Remove literal backslash + carriage return.
        $value = str_replace("\x5c\x0d", '', $value);

        // Remove backslash plus bytes written into high part of
        // multibyte unicode character.
        while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
            $diff = (\ord($match[1]) - 182) * 64;
            $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
            // Replace the matched bytes literally. Building a second regex
            // from the captured bytes breaks when they contain a regex
            // metacharacter or the delimiter.
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove bytes written into low part of multibyte unicode
        // character.
        while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
            $diff = \ord($match[2]) - 181;
            $newbyte = \chr(\ord($match[1]) + $diff);
            // Literal replacement for the same reason as above.
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove this byte string that Adobe occasionally adds
        // between two single byte characters in a unicode string.
        return str_replace("\xe5\xb0\x8d", '', $value);
    }
200
201
    /**
     * Extract XMP Metadata
     *
     * Parses one XMP packet and merges the properties found in it into
     * $this->metadata. Nothing is merged when the XML parser does not
     * report success, or when the packet declares a dc:format other than
     * application/pdf.
     *
     * @param string $content XML of the XMP packet
     */
    public function extractXMPMetadata(string $content): void
    {
        $parser = xml_parser_create();
        xml_parser_set_option($parser, \XML_OPTION_SKIP_WHITE, 1);

        if (1 === xml_parse_into_struct($parser, $content, $values)) {
            /*
             * xml_parse_into_struct() produces a flat list of nodes. While
             * walking it, $metadata is a reference to the array currently
             * being filled and $stack holds references to its ancestors.
             * An "open" node pushes the current context and descends into a
             * new child array; a "close" node collapses the child if
             * possible and pops back to the parent.
             *
             * The stack is indexed with count($stack) on purpose: after
             * unset(), "$stack[] =" would not reuse the freed index.
             */
            $metadata = [];
            $stack = [];

            foreach ($values as $node) {
                // Tag names are handled in lowercase.
                $tag = strtolower($node['tag']);
                $isListItem = 'rdf:li' === $tag;

                // Skip structural x: and rdf: elements; only rdf:li is kept.
                if (0 === strpos($tag, 'x:') || (!$isListItem && 0 === strpos($tag, 'rdf:'))) {
                    continue;
                }

                $type = $node['type'];

                if ('open' === $type) {
                    // List items are appended, any other tag is stored by name.
                    if ($isListItem) {
                        $metadata[] = [];
                        $childKey = \count($metadata) - 1;
                    } else {
                        $metadata[$tag] = [];
                        $childKey = $tag;
                    }

                    // Remember the parent, then descend into the new child.
                    $stack[\count($stack)] = &$metadata;
                    $metadata = &$metadata[$childKey];
                } elseif ('complete' === $type) {
                    if (isset($node['value'])) {
                        if ($isListItem) {
                            // Value of a list item.
                            $metadata[] = $node['value'];
                        } else {
                            // Value of a named property.
                            $metadata[$tag] = $node['value'];
                        }
                    }
                } elseif ('close' === $type) {
                    if (\is_array($metadata)) {
                        $size = \count($metadata);
                        if (1 == $size && isset($metadata[0]) && \is_string($metadata[0])) {
                            // A single string list item becomes the
                            // property value itself.
                            $metadata = $metadata[0];
                        } elseif (0 == $size) {
                            // An empty container becomes the empty string.
                            $metadata = '';
                        }
                    }

                    // Return to the parent context.
                    $metadata = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                }
            }

            // Only use this metadata if it's referring to a PDF
            if (!isset($metadata['dc:format']) || 'application/pdf' == $metadata['dc:format']) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // Source: https://www.adobe.com/devnet/xmp.html
                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
                // So if there are multiple XMP blocks, just merge the values
                // of each found block over top of the existing values
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }

        xml_parser_free($parser);
    }
304
305 1
    /**
     * Returns the type dictionary built by buildDictionary().
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }
309
310
    /**
     * Replaces the document's objects (the argument is cast to array) and
     * re-runs init() so the dictionary and details reflect them.
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;
        $this->init();
    }
319
320
    /**
     * Returns every object registered on the document.
     *
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }
327
328
    /**
     * Looks up a registered object by its id.
     *
     * @return PDFObject|Font|Page|Element|null the object, or null when no
     *                                          object is set for that id
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }
339
340 76
    /**
     * Tells whether getObjectsByType() yields at least one object for the
     * given type (and optional subtype).
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return [] !== $this->getObjectsByType($type, $subtype);
    }
344
345 80
    /**
     * Returns the indexed objects of a type, optionally narrowed to one
     * subtype.
     *
     * @param string      $type    value of the "Type" header to look up
     * @param string|null $subtype value of the "Subtype" header; null or an
     *                             empty string means "do not filter"
     *
     * @return array objects keyed by id; empty when the type or subtype is
     *               not present in the dictionary
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // Both null and '' select every object of the type.
        if (null === $subtype || '' === $subtype) {
            return $this->dictionary[$type]['all'];
        }

        return $this->dictionary[$type]['subtype'][$subtype] ?? [];
    }
361
362
    /**
     * Returns every object indexed under the "Font" type.
     *
     * @return Font[]
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }
369
370 49
    /**
     * Returns the first font of the document, or null when it has none.
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();

        // reset() yields false for an empty array.
        $first = reset($fonts);

        return false === $first ? null : $first;
    }
379
380
    /**
     * Resolves the pages of the document.
     *
     * Three sources are tried in order: the "Pages" entry of the first
     * Catalog object, then every object of type "Pages", and finally the
     * plain "Page" objects (unordered).
     *
     * @return Page[]
     *
     * @throws MissingCatalogException when none of the sources is available
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Use the first catalog to list the pages.
            $catalogs = $this->getObjectsByType('Catalog');
            $firstCatalog = reset($catalogs);

            /** @var Pages $root */
            $root = $firstCatalog->get('Pages');
            if (method_exists($root, 'getPages')) {
                return $root->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Gather the kids of every "Pages" node.
            $collected = [];

            /** @var Pages[] $nodes */
            $nodes = $this->getObjectsByType('Pages');
            foreach ($nodes as $node) {
                $collected = array_merge($collected, $node->getPages(true));
            }

            return $collected;
        }

        if ($this->hasObjectsByType('Page')) {
            // Fall back to the 'page' objects themselves (unordered pages).
            return array_values($this->getObjectsByType('Page'));
        }

        throw new MissingCatalogException('Missing catalog.');
    }
421
422 25
    /**
     * Returns the text of the document's pages, separated by blank lines.
     *
     * Pages whose trimmed text is empty are left out of the result.
     *
     * @param int|null $pageLimit when a positive integer, only that many
     *                            leading pages are read; otherwise all pages
     *
     * @throws MissingCatalogException propagated from getPages()
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set and positive.
        if (null !== $pageLimit && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against '' explicitly: a truthiness test would also
            // discard a page whose whole text is "0".
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }
446
447
    /**
     * Returns the trailer header currently attached to the document.
     */
    public function getTrailer(): Header
    {
        return $this->trailer;
    }
451
452 66
    /**
     * Replaces the trailer header of the document. The details array is
     * not rebuilt here.
     */
    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }
456
457 18
    /**
     * Returns the document properties assembled by buildDetails().
     *
     * @return array the details; an empty array when they have not been
     *               built yet, so the declared return type always holds
     */
    public function getDetails(): array
    {
        return $this->details ?? [];
    }
461
}
462