Test Failed
Pull Request — master (#611)
by
unknown
02:00
created

Document::buildDetails()   B

Complexity

Conditions 10
Paths 63

Size

Total Lines 71
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 27
CRAP Score 10.0328

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 10
eloc 29
c 1
b 1
f 0
nc 63
nop 0
dl 0
loc 71
ccs 27
cts 29
cp 0.931
crap 10.0328
rs 7.6666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

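Commonly applied refactorings include Extract Method: move a cohesive group of statements (for example, a commented section) into its own well-named method.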

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Encoding\PDFDocEncoding;
36
37
/**
38
 * Technical references :
39
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
40
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
41
 * - http://www.php.net/manual/en/ref.pdf.php#74211
42
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
43
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
44
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
45
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
46
 *
47
 * Class Document
48
 */
49
class Document
50
{
51
    /**
     * Objects registered on this document, indexed by the id that
     * getObjectById() looks up.
     *
52
     * @var PDFObject[]
53
     */
54
    protected $objects = [];
55
56
    /**
     * Lookup table rebuilt by buildDictionary(): for each 'Type' header value
     * it holds an 'all' list (id => object) and a 'subtype' map
     * (subtype => id => object).
     *
57
     * @var array
58
     */
59
    protected $dictionary = [];
60
61
    /**
     * Document trailer. Starts as an empty Header created in the constructor
     * and can be replaced through setTrailer(); buildDetails() reads its
     * 'Info' entry.
     *
62
     * @var Header
63
     */
64
    protected $trailer;
65
66
    /**
     * XMP metadata accumulated by extractXMPMetadata(); buildDetails() merges
     * it over the values taken from the trailer.
     *
67
     * @var array<mixed>
68
     */
69 72
    protected $metadata = [];
70
71 72
    /**
     * Details assigned by buildDetails(). There is no default value, so this
     * stays null until buildDetails() has run.
     *
72 72
     * @var array
73
     */
74 49
    protected $details;
75
76 49
    /**
     * Creates a document whose trailer is a new Header built from an empty
     * element list and bound to this document.
     */
    public function __construct()
    {
        $emptyTrailer = new Header([], $this);

        $this->trailer = $emptyTrailer;
    }
80
81 49
    /**
     * Rebuilds the type dictionary and the details array, then initialises
     * every registered object: its header first, the object itself second.
     */
    public function init()
    {
        $this->buildDictionary();
        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $pdfObject) {
            $header = $pdfObject->getHeader();
            $header->init();

            $pdfObject->init();
        }
    }
93 49
94
    /**
     * Rebuilds the lookup dictionary from the 'Type' and 'Subtype' header
     * fields of every registered object.
     *
     * Resulting shape: [type => ['all' => [id => object],
     *                            'subtype' => [subtype => [id => object]]]].
     * Objects whose 'Type' content loosely equals null are left out entirely;
     * objects whose 'Subtype' content loosely equals null are only listed
     * under 'all'.
     */
    protected function buildDictionary()
    {
        // Always start from an empty dictionary.
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            $type = $object->getHeader()->get('Type')->getContent();
            if (null == $type) {
                // Untyped object: nothing to index.
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = ['all' => [], 'subtype' => []];
            }
            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();
            if (null == $subtype) {
                continue;
            }

            if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                $this->dictionary[$type]['subtype'][$subtype] = [];
            }
            $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
        }
    }
126 49
127
    /**
     * Build details array.
     *
     * Starts from the details of the trailer's 'Info' object (when there is
     * one), adds the page count under 'Pages' (0 when getPages() throws),
     * decodes/repairs every string value, and finally merges the XMP metadata
     * on top before storing the result in $this->details.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        // Decode and repair encoded document properties
        foreach ($details as $key => $value) {
            if (\is_string($value)) {
                $details[$key] = $this->repairDetailString($value);
            }
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Decodes a single string property of the details array.
     *
     * Valid UTF-8 input only gets its stray escape sequences repaired; any
     * other input has its backslash + carriage-return pairs removed and is
     * then converted with PDFDocEncoding::convertPDFDoc2UTF8().
     *
     * @param string $value raw property value
     *
     * @return mixed the repaired value (whatever convertPDFDoc2UTF8() yields
     *               in the non-UTF-8 case; presumably a string)
     */
    private function repairDetailString(string $value)
    {
        // If the string is just PDFDocEncoding, remove the inserted
        // backslash + carriage-return pairs and decode the whole thing.
        if (!mb_check_encoding($value, 'UTF-8')) {
            $value = str_replace("\\\r", '', $value);

            return PDFDocEncoding::convertPDFDoc2UTF8($value);
        }

        // The string is already UTF-8 encoded, so we only need to repair the
        // breaks Adobe inserts every ~127 characters, which don't seem to be
        // multi-byte safe.

        // Remove literal backslash + carriage return
        $value = str_replace("\x5c\x0d", '', $value);

        // Remove backslash plus bytes written into high part of multibyte
        // unicode character. The exact matched bytes are replaced with
        // str_replace(): rebuilding a regex from them would break as soon as
        // the captured byte is a regex metacharacter or the delimiter.
        while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
            $diff = (\ord($match[1]) - 182) * 64;
            $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove bytes written into low part of multibyte unicode character
        while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
            $diff = \ord($match[2]) - 181;
            $newbyte = \chr(\ord($match[1]) + $diff);
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove this byte string that Adobe occasionally adds between two
        // single byte characters in a unicode string
        return str_replace("\xe5\xb0\x8d", '', $value);
    }
202
203
    /**
     * Extract XMP Metadata
     *
     * Parses $content as XML and, if the packet declares
     * dc:format = application/pdf, merges the extracted values over the
     * metadata collected so far. Content that fails to parse is ignored.
     *
     * @param string $content raw XMP packet
     */
    public function extractXMPMetadata(string $content): void
    {
        $xml = xml_parser_create();
        xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);

        if (1 === xml_parse_into_struct($xml, $content, $values)) {
            /*
             * short overview about the following code parts:
             *
             * The output of xml_parse_into_struct is a single dimensional array (= $values), and the $stack is a last-on,
             * first-off array of pointers to positions in $metadata, while iterating through it, that potentially turn the
             * results into a more intuitive multi-dimensional array. When an "open" XML tag is encountered,
             * we save the current $metadata context in the $stack, then create a child array of $metadata and
             * make that the current $metadata context. When a "close" XML tag is encountered, the operations are
             * reversed: the most recently added $metadata context from $stack (IOW, the parent of the current
             * element) is set as the current $metadata context.
             */
            $metadata = [];
            $stack = [];
            foreach ($values as $val) {
                // Standardize to lowercase
                $val['tag'] = strtolower($val['tag']);

                // Ignore structural x: and rdf: XML elements
                if (0 === strpos($val['tag'], 'x:')) {
                    continue;
                } elseif (0 === strpos($val['tag'], 'rdf:') && 'rdf:li' != $val['tag']) {
                    continue;
                }

                switch ($val['type']) {
                    case 'open':
                        if ('rdf:li' == $val['tag']) {
                            // Create an array of list items. The key of the
                            // appended element is looked up explicitly:
                            // count() - 1 is only correct while the context
                            // is a pure list without named entries.
                            $metadata[] = [];
                            end($metadata);
                            $childKey = key($metadata);
                        } else {
                            // Else create an array of named values
                            $metadata[$val['tag']] = [];
                            $childKey = $val['tag'];
                        }

                        // Move up one level in the stack
                        $stack[\count($stack)] = &$metadata;
                        $metadata = &$metadata[$childKey];
                        break;

                    case 'complete':
                        if (isset($val['value'])) {
                            if ('rdf:li' == $val['tag']) {
                                // Assign a value to this list item
                                $metadata[] = $val['value'];
                            } else {
                                // Else assign a value to this property
                                $metadata[$val['tag']] = $val['value'];
                            }
                        }
                        break;

                    case 'close':
                        // If the value of this property is a single-
                        // element array where the element is of type
                        // string, use the value of the first list item
                        // as the value for this property
                        if (\is_array($metadata) && isset($metadata[0]) && 1 == \count($metadata) && \is_string($metadata[0])) {
                            $metadata = $metadata[0];
                        }

                        // Move down one level in the stack
                        $metadata = &$stack[\count($stack) - 1];
                        unset($stack[\count($stack) - 1]);
                        break;
                }
            }

            // Only use this metadata if it's referring to a PDF
            if (isset($metadata['dc:format']) && 'application/pdf' == $metadata['dc:format']) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // Source: https://www.adobe.com/devnet/xmp.html
                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
                // So if there are multiple XMP blocks, just merge the values
                // of each found block over top of the existing values
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }
        xml_parser_free($xml);
    }
298
299 41
    /**
     * Returns the lookup table produced by buildDictionary().
     *
     * @return array type => ['all' => [...], 'subtype' => [...]]
     */
    public function getDictionary(): array
    {
        $dictionary = $this->dictionary;

        return $dictionary;
    }
303
304 12
    /**
     * Replaces the registered objects and re-runs init().
     *
     * The argument is cast to an array before it is stored.
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $asArray = (array) $objects;
        $this->objects = $asArray;

        // The dictionary and details depend on the object list.
        $this->init();
    }
313
314
    /**
     * Returns every object registered on this document.
     *
     * @return PDFObject[]
     */
    public function getObjects()
    {
        $objects = $this->objects;

        return $objects;
    }
321
322
    /**
     * Looks up a registered object by its id.
     *
     * @return PDFObject|Font|Page|Element|null the object, or null when no
     *                                          object is stored under $id
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }
333
334
    /**
     * Tells whether getObjectsByType() finds at least one object.
     *
     * @param string      $type    value of the 'Type' header field
     * @param string|null $subtype optional 'Subtype' filter, passed through
     *                             unchanged to getObjectsByType()
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return 0 < \count($this->getObjectsByType($type, $subtype));
    }
338
339
    /**
     * Returns the objects indexed under a type, optionally narrowed to a
     * subtype.
     *
     * A null or empty-string $subtype means "no subtype filter"; the empty
     * string is accepted on purpose because it has always been treated that
     * way by this method.
     *
     * @param string      $type    value of the 'Type' header field
     * @param string|null $subtype optional value of the 'Subtype' header field
     *
     * @return array id => object; empty when the type or subtype is unknown
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // No filter requested: hand back every object of that type.
        if (null === $subtype || '' === $subtype) {
            return $this->dictionary[$type]['all'];
        }

        if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
            return [];
        }

        return $this->dictionary[$type]['subtype'][$subtype];
    }
355
356
    /**
     * Returns every object indexed under the 'Font' type.
     *
     * @return Font[]
     */
    public function getFonts()
    {
        $fonts = $this->getObjectsByType('Font');

        return $fonts;
    }
363
364
    /**
     * Returns the first entry of getFonts(), or null when the document has
     * no fonts.
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();

        return [] === $fonts ? null : reset($fonts);
    }
373
374
    /**
     * Returns the pages of the document, trying three sources in order:
     *
     *  1. the 'Pages' entry of the first 'Catalog' object, when that entry
     *     offers a getPages() method;
     *  2. every 'Pages' object, with their getPages(true) results merged;
     *  3. the bare 'Page' objects, re-indexed from 0 (unordered).
     *
     * @return Page[]
     *
     * @throws \Exception when none of the three sources is available
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogs = $this->getObjectsByType('Catalog');
            $firstCatalog = reset($catalogs);

            /** @var Pages $pageTree */
            $pageTree = $firstCatalog->get('Pages');
            if (method_exists($pageTree, 'getPages')) {
                return $pageTree->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $collected = [];

            /** @var Pages[] $pageNodes */
            $pageNodes = $this->getObjectsByType('Pages');
            foreach ($pageNodes as $pageNode) {
                $collected = array_merge($collected, $pageNode->getPages(true));
            }

            return $collected;
        }

        if ($this->hasObjectsByType('Page')) {
            // Search for 'page' (unordered pages).
            return array_values($this->getObjectsByType('Page'));
        }

        throw new \Exception('Missing catalog.');
    }
415
416
    /**
     * Returns the text of the document's pages, joined by a blank line.
     *
     * Null pages and pages whose trimmed text is empty are skipped.
     *
     * @param int|null $pageLimit when a positive integer, only the first
     *                            $pageLimit pages are read; otherwise all
     *                            pages are read
     *
     * @throws \Exception when getPages() cannot find any page source
     */
    public function getText(?int $pageLimit = null): string
    {
        $texts = [];
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is a positive integer.
        if (\is_int($pageLimit) && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        foreach ($pages as $page) {
            /*
             * In some cases, the $page variable may be null.
             */
            if (null === $page) {
                continue;
            }

            // Compare against '' explicitly: a plain truthiness test would
            // also discard a page whose whole text is "0".
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }
440
441
    /**
     * Returns the trailer header currently attached to this document.
     */
    public function getTrailer(): Header
    {
        $trailer = $this->trailer;

        return $trailer;
    }
445
446
    /**
     * Replaces the trailer header of this document.
     *
     * Only the property is updated; details are not rebuilt until
     * buildDetails() runs again.
     *
     * @param Header $trailer the new trailer
     */
    public function setTrailer(Header $trailer)
    {
        $newTrailer = $trailer;

        $this->trailer = $newTrailer;
    }
450
451
    /**
     * Returns the details assembled by buildDetails().
     *
     * The property has no default value, so an empty array is returned when
     * the details were never built; returning the raw null would violate the
     * declared array return type.
     *
     * @return array the details, or [] when they have not been built yet
     */
    public function getDetails(): array
    {
        return $this->details ?? [];
    }
455
}
456