Test Failed
Push — master ( 66ddf4...d03ef9 )
by Konrad
02:44
created

Document::buildDetails()   B

Complexity

Conditions 10
Paths 63

Size

Total Lines 68
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 27
CRAP Score 10.0045

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 10
eloc 29
c 1
b 1
f 0
nc 63
nop 0
dl 0
loc 68
ccs 27
cts 28
cp 0.9643
crap 10.0045
rs 7.6666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Encoding\PDFDocEncoding;
36
37
/**
38
 * Technical references :
39
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
40
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
41
 * - http://www.php.net/manual/en/ref.pdf.php#74211
42
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
43
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
44
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
45
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
46
 *
47
 * Class Document
48
 */
49
class Document
50
{
51
    /**
52
     * @var PDFObject[]
53
     */
54
    protected $objects = [];
55
56
    /**
57
     * @var array
58
     */
59
    protected $dictionary = [];
60
61
    /**
62
     * @var Header
63
     */
64
    protected $trailer;
65
66
    /**
67
     * @var array<mixed>
68
     */
69 72
    protected $metadata = [];
70
71 72
    /**
72 72
     * @var array
73
     */
74 49
    protected $details;
75
76 49
    /**
     * Create a document with no objects and a trailer built from an empty
     * element list; the new Header receives this document as its second
     * constructor argument.
     */
    public function __construct()
    {
        $emptyTrailer = new Header([], $this);

        $this->trailer = $emptyTrailer;
    }
80
81 49
    /**
     * Rebuild the type dictionary and the details array, then initialise
     * every registered object (its header first, then the object itself).
     */
    public function init()
    {
        $this->buildDictionary();
        $this->buildDetails();

        // Cascade initialisation down to each object.
        foreach ($this->objects as $pdfObject) {
            $pdfObject->getHeader()->init();
            $pdfObject->init();
        }
    }
93 49
94
    /**
     * Index $this->objects by the 'Type' field of their header and, when the
     * header also carries one, by its 'Subtype' field.
     *
     * Resulting shape:
     *   [type => ['all' => [id => object], 'subtype' => [subtype => [id => object]]]]
     *
     * Objects whose 'Type' content is loosely equal to null are not indexed.
     */
    protected function buildDictionary()
    {
        // Start from scratch on every call.
        $this->dictionary = [];

        foreach ($this->objects as $objectId => $pdfObject) {
            $type = $pdfObject->getHeader()->get('Type')->getContent();
            if (null == $type) {
                continue;
            }

            // First object of this type: create both buckets.
            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = [
                    'all' => [],
                    'subtype' => [],
                ];
            }
            $this->dictionary[$type]['all'][$objectId] = $pdfObject;

            $subtype = $pdfObject->getHeader()->get('Subtype')->getContent();
            if (null == $subtype) {
                continue;
            }

            if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                $this->dictionary[$type]['subtype'][$subtype] = [];
            }
            $this->dictionary[$type]['subtype'][$subtype][$objectId] = $pdfObject;
        }
    }
126 49
127
    /**
     * Build the details array from the trailer's 'Info' entry, the page
     * count and the XMP metadata collected so far.
     *
     * String properties are decoded/repaired before being stored. Entries of
     * $this->metadata are merged last, so they win over same-named 'Info'
     * entries.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count; when getPages() throws, report zero pages
        // instead of failing the whole build.
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        // Decode and repair encoded document properties
        foreach ($details as $key => $value) {
            if (\is_string($value)) {
                $details[$key] = $this->decodeDetailString($value);
            }
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Decode one string property and undo the byte-level damage found in it.
     *
     * @param string $value raw property value
     *
     * @return string the repaired value; for input that is not valid UTF-8
     *                this is whatever PDFDocEncoding::convertPDFDoc2UTF8() yields
     */
    private function decodeDetailString(string $value)
    {
        if (!mb_check_encoding($value, 'UTF-8')) {
            // If the string is just PDFDocEncoding, remove any
            // backslash + carriage-return breaks and decode the whole thing.
            return PDFDocEncoding::convertPDFDoc2UTF8(str_replace("\\\r", '', $value));
        }

        // The string is already UTF-8 encoded, so we only need to repair
        // Adobe's insertion of breaks every ~127 characters, which doesn't
        // seem to be multi-byte safe.

        // Remove literal backslash + carriage return (0x5c 0x0d)
        $value = str_replace("\x5c\x0d", '', $value);

        // Remove backslash plus bytes written into high part of
        // multibyte unicode character
        while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
            $diff = (\ord($match[1]) - 182) * 64;
            $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
            // Replace the matched bytes literally. Building a second regex
            // from them breaks when the captured byte is a metacharacter
            // ('/', '(', '*', '.', ...) and would also interpret "$1"/"\1"
            // sequences inside the replacement.
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove bytes written into low part of multibyte unicode
        // character
        while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
            $diff = \ord($match[2]) - 181;
            $newbyte = \chr(\ord($match[1]) + $diff);
            // Literal replacement for the same reason as above.
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove this byte string that Adobe occasionally adds
        // between two single byte characters in a unicode string
        return str_replace("\xe5\xb0\x8d", '', $value);
    }
199
200
    /**
     * Parse an XMP packet and merge its properties into $this->metadata.
     *
     * Nothing is merged unless the XML parses successfully and the packet's
     * dc:format property equals 'application/pdf'.
     */
    public function extractXMPMetadata(string $content): void
    {
        $parser = xml_parser_create();
        xml_parser_set_option($parser, \XML_OPTION_SKIP_WHITE, 1);

        if (1 === xml_parse_into_struct($parser, $content, $values, $index)) {
            /*
             * How the flat parser output becomes a nested array:
             *
             * xml_parse_into_struct() yields a one-dimensional list ($values).
             * $metadata is always a reference to the array currently being
             * filled and $stack is a LIFO list of references to its ancestors.
             * An "open" tag pushes the current context on $stack, creates a
             * child array and descends into it; a "close" tag pops the parent
             * back from $stack and makes it the current context again.
             */
            $metadata = [];
            $stack = [];

            foreach ($values as $node) {
                // Standardize to lowercase
                $tag = strtolower($node['tag']);

                // Ignore structural x: and rdf: XML elements (rdf:li is kept)
                $isStructural = 0 === strpos($tag, 'x:')
                    || (0 === strpos($tag, 'rdf:') && 'rdf:li' != $tag);
                if ($isStructural) {
                    continue;
                }

                $type = $node['type'];

                if ('open' == $type) {
                    if ('rdf:li' == $tag) {
                        // List item: append an anonymous child array
                        $metadata[] = [];
                        $childKey = \count($metadata) - 1;
                    } else {
                        // Named property: child array keyed by the tag
                        $metadata[$tag] = [];
                        $childKey = $tag;
                    }

                    // Remember the parent, then descend into the child.
                    // The slot index must be count($stack): "$stack[] =" would
                    // drift because unset() does not reset the next free key.
                    $stack[\count($stack)] = &$metadata;
                    $metadata = &$metadata[$childKey];
                } elseif ('complete' == $type) {
                    if (isset($node['value'])) {
                        if ('rdf:li' == $tag) {
                            // Assign a value to this list item
                            $metadata[] = $node['value'];
                        } else {
                            // Assign a value to this property
                            $metadata[$tag] = $node['value'];
                        }
                    }
                } elseif ('close' == $type) {
                    // A property holding exactly one string list item is
                    // collapsed to that string
                    $isSingleString = \is_array($metadata)
                        && isset($metadata[0])
                        && 1 == \count($metadata)
                        && \is_string($metadata[0]);
                    if ($isSingleString) {
                        $metadata = $metadata[0];
                    }

                    // Return to the parent context
                    $metadata = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                }
            }

            // Only use this metadata if it's referring to a PDF
            if (isset($metadata['dc:format']) && 'application/pdf' == $metadata['dc:format']) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // Source: https://www.adobe.com/devnet/xmp.html
                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
                // So if there are multiple XMP blocks, just merge the values
                // of each found block over top of the existing values
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }

        xml_parser_free($parser);
    }
295
296
    /**
     * Return the type/subtype index of the document's objects as last built
     * by buildDictionary().
     *
     * @return array
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }
300 41
301
    /**
     * Replace the document's objects and re-run init() so the dictionary,
     * the details and the objects themselves are refreshed.
     *
     * @param PDFObject[] $objects cast to array before being stored
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;
        $this->init();
    }
310
311
    /**
     * Return every object currently registered on the document.
     *
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }
318
319
    /**
     * Look up a single object by its identifier.
     *
     * @return PDFObject|Font|Page|Element|null null when no object is stored under $id
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }
330
331
    /**
     * Tell whether getObjectsByType() would return at least one object.
     *
     * @param string      $type    value of the 'Type' header field
     * @param string|null $subtype optional value of the 'Subtype' header field
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return 0 < \count($this->getObjectsByType($type, $subtype));
    }
335
336
    /**
     * Return the objects indexed under a type, optionally narrowed to a subtype.
     *
     * @param string      $type    value of the 'Type' header field
     * @param string|null $subtype optional value of the 'Subtype' header field;
     *                             null and '' both mean "no subtype filter"
     *
     * @return array objects keyed by id, or an empty array when the type or
     *               the requested subtype is not in the dictionary
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // Spelled out instead of a loose "null != $subtype": the empty string
        // is the only string loosely equal to null, so behaviour is unchanged
        // while the intent is no longer ambiguous.
        if (null === $subtype || '' === $subtype) {
            return $this->dictionary[$type]['all'];
        }

        return $this->dictionary[$type]['subtype'][$subtype] ?? [];
    }
352
353
    /**
     * Return every object indexed under the 'Font' type.
     *
     * @return Font[]
     */
    public function getFonts()
    {
        $fonts = $this->getObjectsByType('Font');

        return $fonts;
    }
360
361
    /**
     * Return the first font of the document, or null when it has none.
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();

        return [] === $fonts ? null : reset($fonts);
    }
370
371
    /**
     * Collect the document's pages, trying three sources in order:
     * the 'Pages' entry of the first 'Catalog' object, then every 'Pages'
     * object, then the bare 'Page' objects (in dictionary order).
     *
     * @return Page[]
     *
     * @throws \Exception when none of the three sources is available
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Preferred source: the page tree referenced by the catalog.
            $catalogs = $this->getObjectsByType('Catalog');
            $firstCatalog = reset($catalogs);

            /** @var Pages $pageTree */
            $pageTree = $firstCatalog->get('Pages');
            if (method_exists($pageTree, 'getPages')) {
                return $pageTree->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Fallback: concatenate the kids of every 'Pages' object.
            $collected = [];

            /** @var Pages[] $pageTrees */
            $pageTrees = $this->getObjectsByType('Pages');
            foreach ($pageTrees as $pageTree) {
                $collected = array_merge($collected, $pageTree->getPages(true));
            }

            return $collected;
        }

        if (!$this->hasObjectsByType('Page')) {
            throw new \Exception('Missing catalog.');
        }

        // Last resort: loose 'Page' objects (unordered pages), re-indexed from 0.
        return array_values($this->getObjectsByType('Page'));
    }
412
413
    /**
     * Return the text of the document's pages, separated by blank lines.
     *
     * @param int|null $pageLimit when a positive integer, only the first
     *                            $pageLimit pages are read; otherwise all pages
     *
     * @return string trimmed page texts joined with "\n\n"; pages that are
     *                null or whose trimmed text is empty are left out
     *
     * @throws \Exception propagated from getPages()
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is a positive integer.
        if (\is_int($pageLimit) && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            $text = trim($page->getText());
            // Compare against '' explicitly: a truthiness test would also
            // discard a page whose whole text is "0".
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }
437
438
    /**
     * Return the trailer header currently attached to the document.
     */
    public function getTrailer(): Header
    {
        return $this->trailer;
    }
442
443
    /**
     * Replace the document's trailer header.
     *
     * Only the property is assigned here; the details array is not rebuilt
     * until init() runs.
     */
    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }
447
448
    /**
     * Return the details array produced by buildDetails().
     *
     * @return array an empty array when the details have not been built yet
     */
    public function getDetails(): array
    {
        // $details has no default value and the constructor does not set it,
        // so fall back to [] rather than violating the array return type.
        return $this->details ?? [];
    }
452
}
453