Passed
Push — master ( 2939df...ddf03e )
by Konrad
02:55
created

Document::buildDetails()   B

Complexity

Conditions 10
Paths 63

Size

Total Lines 68
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 25
CRAP Score 10.1228

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 10
eloc 29
c 1
b 1
f 0
nc 63
nop 0
dl 0
loc 68
ccs 25
cts 28
cp 0.8929
crap 10.1228
rs 7.6666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Encoding\PDFDocEncoding;
36
37
/**
38
 * Technical references :
39
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
40
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
41
 * - http://www.php.net/manual/en/ref.pdf.php#74211
42
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
43
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
44
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
45
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
46
 *
47
 * Class Document
48
 */
49
class Document
{
    /**
     * Every object of the document, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Lookup table built by buildDictionary().
     *
     * Shape: [type => ['all' => [id => object], 'subtype' => [subtype => [id => object]]]]
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * Values collected from XMP packets by extractXMPMetadata().
     *
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * Document properties computed by buildDetails().
     *
     * Defaults to an empty array so that getDetails(), which declares an
     * array return type, does not fail when it is called before init().
     *
     * @var array
     */
    protected $details = [];

    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }

    /**
     * Rebuild the type dictionary and the details array, then initialise
     * every object (header first, then the object itself).
     */
    public function init()
    {
        $this->buildDictionary();

        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $object) {
            $object->getHeader()->init();
            $object->init();
        }
    }

    /**
     * Build dictionary based on type header field.
     *
     * Objects whose "Type" header content is empty are not indexed at all;
     * objects whose "Subtype" content is empty are indexed under 'all' only.
     */
    protected function buildDictionary()
    {
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            // Cache objects by type and subtype
            $type = $object->getHeader()->get('Type')->getContent();

            // The comparison is intentionally loose: what getContent() returns
            // for a missing header entry is not visible from this class, so any
            // "empty" value (null, '', false, ...) keeps being skipped.
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = [
                    'all' => [],
                    'subtype' => [],
                ];
            }

            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();

            // Loose on purpose, for the same reason as above.
            if (null == $subtype) {
                continue;
            }

            if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                $this->dictionary[$type]['subtype'][$subtype] = [];
            }

            $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
        }
    }

    /**
     * Build details array.
     *
     * The result combines, in this order: the details of the trailer's
     * "Info" object (when present), the page count under the "Pages" key,
     * and the collected XMP metadata, which overrides equal keys.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count; a document without any page source
        // (getPages() throws) is reported as having zero pages.
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        // Decode and repair encoded document properties
        foreach ($details as $key => $value) {
            if (\is_string($value)) {
                $details[$key] = $this->decodeDetailString($value);
            }
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Decode or repair a single string property of the details array.
     *
     * @param string $value raw property value
     *
     * @return string repaired UTF-8 text, or the result of
     *                PDFDocEncoding::convertPDFDoc2UTF8() for input that is
     *                not valid UTF-8
     */
    private function decodeDetailString(string $value)
    {
        if (!mb_check_encoding($value, 'UTF-8')) {
            // If the string is just PDFDocEncoding, remove any
            // backslash + carriage-return pairs and decode the whole thing.
            return PDFDocEncoding::convertPDFDoc2UTF8(str_replace("\\\r", '', $value));
        }

        // If the string is already UTF-8 encoded, that means we only need to
        // repair Adobe's ham-fisted insertion of line breaks every ~127
        // characters, which doesn't seem to be multi-byte safe.

        // Remove literal backslash + carriage return ("\\\r")
        $value = str_replace("\x5c\x0d", '', $value);

        // Remove backslash plus bytes written into high part of multibyte
        // unicode character. The matched bytes are replaced literally with
        // str_replace(): they must never be interpreted as regex syntax.
        while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
            $diff = (\ord($match[1]) - 182) * 64;
            // "& 0xFF" spells out the modulo-256 wrap-around chr() applies
            // to integers outside 0..255.
            $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr((\ord($match[2]) + $diff) & 0xFF));
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove bytes written into low part of multibyte unicode character
        while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
            $diff = \ord($match[2]) - 181;
            $newbyte = \chr((\ord($match[1]) + $diff) & 0xFF);
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove this byte string that Adobe occasionally adds between two
        // single byte characters in a unicode string
        return str_replace("\xe5\xb0\x8d", '', $value);
    }

    /**
     * Extract XMP Metadata
     *
     * Parses $content as XML and, when the packet declares a "dc:format" of
     * "application/pdf", merges the parsed values over the metadata that
     * has been collected so far. Content that does not parse is ignored.
     */
    public function extractXMPMetadata(string $content): void
    {
        $xml = xml_parser_create();
        xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);

        if (1 === xml_parse_into_struct($xml, $content, $values)) {
            /*
             * short overview about the following code parts:
             *
             * The output of xml_parse_into_struct is a single dimensional array (= $values), and the $stack is a last-on,
             * first-off array of pointers to positions in $metadata, while iterating through it, that potentially turn the
             * results into a more intuitive multi-dimensional array. When an "open" XML tag is encountered,
             * we save the current $metadata context in the $stack, then create a child array of $metadata and
             * make that the current $metadata context. When a "close" XML tag is encountered, the operations are
             * reversed: the most recently added $metadata context from $stack (IOW, the parent of the current
             * element) is set as the current $metadata context.
             *
             * The stack is addressed through count($stack) on purpose: "$stack[] = ..." would keep
             * increasing the index after an unset() and break the "count - 1" lookups.
             */
            $metadata = [];
            $stack = [];
            foreach ($values as $val) {
                // Standardize to lowercase
                $tag = strtolower($val['tag']);

                // Ignore structural x: and rdf: XML elements (rdf:li carries values)
                $isStructural = 0 === strpos($tag, 'x:')
                    || (0 === strpos($tag, 'rdf:') && 'rdf:li' !== $tag);
                if ($isStructural) {
                    continue;
                }

                switch ($val['type']) {
                    case 'open':
                        if ('rdf:li' === $tag) {
                            // Create an array of list items
                            $metadata[] = [];

                            // Remember the parent, then descend into the new item
                            $stack[\count($stack)] = &$metadata;
                            $metadata = &$metadata[\count($metadata) - 1];
                        } else {
                            // Else create an array of named values
                            $metadata[$tag] = [];

                            // Remember the parent, then descend into the new property
                            $stack[\count($stack)] = &$metadata;
                            $metadata = &$metadata[$tag];
                        }
                        break;

                    case 'complete':
                        if (isset($val['value'])) {
                            if ('rdf:li' === $tag) {
                                // Assign a value to this list item
                                $metadata[] = $val['value'];
                            } else {
                                // Else assign a value to this property
                                $metadata[$tag] = $val['value'];
                            }
                        }
                        break;

                    case 'close':
                        // If the value of this property is an array
                        if (\is_array($metadata)) {
                            if (1 === \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) {
                                // A single element array holding a string is
                                // collapsed to that string
                                $metadata = $metadata[0];
                            } elseif (0 === \count($metadata)) {
                                // An empty array becomes the empty string
                                $metadata = '';
                            }
                        }

                        // Return to the parent context and drop it from the stack
                        $metadata = &$stack[\count($stack) - 1];
                        unset($stack[\count($stack) - 1]);
                        break;
                }
            }

            // Only use this metadata if it's referring to a PDF
            if (isset($metadata['dc:format']) && 'application/pdf' === $metadata['dc:format']) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // Source: https://www.adobe.com/devnet/xmp.html
                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
                // So if there are multiple XMP blocks, just merge the values
                // of each found block over top of the existing values
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }

        // Parsers are objects as of PHP 8.0 and xml_parser_free() no longer
        // has any effect there; only older versions need the explicit free.
        if (\PHP_VERSION_ID < 80000) {
            xml_parser_free($xml);
        }
    }

    /**
     * @return array the lookup table built by buildDictionary()
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }

    /**
     * Replace the objects of the document and re-run init().
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        $this->init();
    }

    /**
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }

    /**
     * @return PDFObject|Font|Page|Element|null the object stored under $id, or null when there is none
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }

    /**
     * Tell whether getObjectsByType() would return at least one object.
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return [] !== $this->getObjectsByType($type, $subtype);
    }

    /**
     * Return the indexed objects of a type, optionally narrowed to a subtype.
     *
     * A null or empty $subtype selects every object of the type.
     *
     * @return array objects keyed by id; empty when the type or subtype is unknown
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // For a ?string this is exactly what the former "null != $subtype"
        // evaluated to, without the ambiguity of a loose comparison.
        if (null === $subtype || '' === $subtype) {
            return $this->dictionary[$type]['all'];
        }

        return $this->dictionary[$type]['subtype'][$subtype] ?? [];
    }

    /**
     * @return Font[]
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }

    /**
     * @return Font|null the first indexed font, or null when the document has none
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();

        return [] === $fonts ? null : reset($fonts);
    }

    /**
     * List the pages of the document.
     *
     * Sources are tried in this order: the "Pages" entry of the first
     * catalog, every "Pages" object, and finally the bare "Page" objects
     * (unordered).
     *
     * @return Page[]
     *
     * @throws \Exception when none of these sources exists
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogues = $this->getObjectsByType('Catalog');
            $catalogue = reset($catalogues);

            /** @var Pages $object */
            $object = $catalogue->get('Pages');
            if (method_exists($object, 'getPages')) {
                return $object->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $pages = [];

            /** @var Pages[] $objects */
            $objects = $this->getObjectsByType('Pages');
            foreach ($objects as $object) {
                $pages = array_merge($pages, $object->getPages(true));
            }

            return $pages;
        }

        if ($this->hasObjectsByType('Page')) {
            // Search for 'page' (unordered pages).
            return array_values($this->getObjectsByType('Page'));
        }

        throw new \Exception('Missing catalog.');
    }

    /**
     * Return the text of the pages, trimmed per page and joined by a blank line.
     *
     * @param int|null $pageLimit when a positive integer, only the first
     *                            $pageLimit pages are read
     *
     * @throws \Exception when getPages() cannot find any page source
     */
    public function getText(?int $pageLimit = null): string
    {
        $texts = [];
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set and numeric.
        if (\is_int($pageLimit) && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against the empty string instead of relying on
            // truthiness: a page whose whole text is "0" must be kept.
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }

    public function getTrailer(): Header
    {
        return $this->trailer;
    }

    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }

    /**
     * @return array the details computed by the last init(); empty before that
     */
    public function getDetails(): array
    {
        return $this->details;
    }
}
461