Test Failed
Push — master ( 66ddf4...d03ef9 )
by Konrad
02:44
created

Document   F

Complexity

Total Complexity 65

Size/Duplication

Total Lines 402
Duplicated Lines 0 %

Test Coverage

Coverage 94%

Importance

Changes 8
Bugs 2 Features 0
Metric Value
eloc 142
c 8
b 2
f 0
dl 0
loc 402
ccs 94
cts 100
cp 0.94
rs 3.2
wmc 65

18 Methods

Rating   Name   Duplication   Size   Complexity  
A buildDictionary() 0 25 6
A init() 0 10 2
A __construct() 0 3 1
A getObjectsByType() 0 15 4
A getPages() 0 35 6
A getDetails() 0 3 1
A getObjects() 0 3 1
A setObjects() 0 5 1
A getObjectById() 0 7 2
A getFonts() 0 3 1
A getFirstFont() 0 8 2
A hasObjectsByType() 0 3 1
A getDictionary() 0 3 1
A getText() 0 23 6
A getTrailer() 0 3 1
D extractXMPMetadata() 0 91 18
A setTrailer() 0 3 1
B buildDetails() 0 68 10

How to fix   Complexity   

Complex Class

Complex classes like Document often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Document, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Encoding\PDFDocEncoding;
36
37
/**
38
 * Technical references :
39
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
40
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
41
 * - http://www.php.net/manual/en/ref.pdf.php#74211
42
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
43
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
44
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
45
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
46
 *
47
 * Class Document
48
 */
49
class Document
{
    /**
     * All objects of the document, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Lookup table built by buildDictionary():
     * [type => ['all' => [id => object], 'subtype' => [subtype => [id => object]]]].
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * XMP metadata collected by extractXMPMetadata(); merged into the
     * details array by buildDetails().
     *
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * Document properties computed by buildDetails(). Defaults to an empty
     * array so that getDetails() honours its array return type even when it
     * is called before init().
     *
     * @var array
     */
    protected $details = [];

    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }

    /**
     * Rebuild the type dictionary and the details array, then propagate the
     * initialisation to every object header and object.
     */
    public function init()
    {
        $this->buildDictionary();

        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $object) {
            $object->getHeader()->init();
            $object->init();
        }
    }

    /**
     * Build dictionary based on type header field.
     *
     * Objects are cached by 'Type' and, when present, by 'Subtype'.
     */
    protected function buildDictionary()
    {
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            $type = $object->getHeader()->get('Type')->getContent();

            // Loose comparison kept on purpose: it also rejects falsy
            // non-null content (presumably what a missing header entry
            // yields - TODO confirm against the Element classes).
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = [
                    'all' => [],
                    'subtype' => [],
                ];
            }

            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();
            // Same deliberate loose comparison as for $type above.
            if (null != $subtype) {
                if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                    $this->dictionary[$type]['subtype'][$subtype] = [];
                }
                $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
            }
        }
    }

    /**
     * Build details array.
     *
     * Sources, in order: the trailer's 'Info' object (if any), the page
     * count (0 when the pages cannot be resolved), and finally the XMP
     * metadata, which overrides equally named keys.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        // Decode and repair encoded document properties
        foreach ($details as $key => $value) {
            if (\is_string($value)) {
                $details[$key] = $this->decodeDetailValue($value);
            }
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Repair and decode a single string property of the details array.
     *
     * @param string $value raw property value
     *
     * @return string repaired (UTF-8 input) or PDFDocEncoding-decoded value
     */
    private function decodeDetailValue(string $value)
    {
        if (!mb_check_encoding($value, 'UTF-8')) {
            // If the string is just PDFDocEncoding, remove any backslash +
            // carriage-return pairs and decode the whole thing.
            return PDFDocEncoding::convertPDFDoc2UTF8(str_replace("\\\r", '', $value));
        }

        // The string is already UTF-8 encoded, so we only need to repair
        // Adobe's ham-fisted insertion of line breaks every ~127 characters,
        // which doesn't seem to be multi-byte safe.

        // Remove literal backslash + carriage return
        $value = str_replace("\x5c\x0d", '', $value);

        // Remove backslash plus bytes written into high part of
        // multibyte unicode character
        while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
            $diff = (\ord($match[1]) - 182) * 64;
            $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
            // Replace the matched byte sequence literally. Building a new
            // regex from the raw bytes would let a metacharacter change the
            // pattern and could keep this loop from terminating.
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove bytes written into low part of multibyte unicode
        // character
        while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
            $diff = \ord($match[2]) - 181;
            $newbyte = \chr(\ord($match[1]) + $diff);
            // Literal replacement for the same reason as above.
            $value = str_replace($match[0], $newbyte, $value);
        }

        // Remove this byte string that Adobe occasionally adds
        // between two single byte characters in a unicode string
        return str_replace("\xe5\xb0\x8d", '', $value);
    }

    /**
     * Extract XMP Metadata
     *
     * Parses the given XMP packet and, if it describes a PDF
     * (dc:format = application/pdf), merges its values over the metadata
     * collected so far.
     */
    public function extractXMPMetadata(string $content): void
    {
        $xml = xml_parser_create();
        xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);

        if (1 === xml_parse_into_struct($xml, $content, $values)) {
            /*
             * short overview about the following code parts:
             *
             * The output of xml_parse_into_struct is a single dimensional array (= $values), and the $stack is a last-on,
             * first-off array of pointers to positions in $metadata, while iterating through it, that potentially turn the
             * results into a more intuitive multi-dimensional array. When an "open" XML tag is encountered,
             * we save the current $metadata context in the $stack, then create a child array of $metadata and
             * make that the current $metadata context. When a "close" XML tag is encountered, the operations are
             * reversed: the most recently added $metadata context from $stack (IOW, the parent of the current
             * element) is set as the current $metadata context.
             */
            $metadata = [];
            $stack = [];
            foreach ($values as $val) {
                // Standardize to lowercase
                $val['tag'] = strtolower($val['tag']);

                // Ignore structural x: and rdf: XML elements
                if (0 === strpos($val['tag'], 'x:')) {
                    continue;
                } elseif (0 === strpos($val['tag'], 'rdf:') && 'rdf:li' !== $val['tag']) {
                    continue;
                }

                switch ($val['type']) {
                    case 'open':
                        // Create an array of list items
                        if ('rdf:li' === $val['tag']) {
                            $metadata[] = [];

                            // Move up one level in the stack
                            $stack[\count($stack)] = &$metadata;
                            $metadata = &$metadata[\count($metadata) - 1];
                        } else {
                            // Else create an array of named values
                            $metadata[$val['tag']] = [];

                            // Move up one level in the stack
                            $stack[\count($stack)] = &$metadata;
                            $metadata = &$metadata[$val['tag']];
                        }
                        break;

                    case 'complete':
                        if (isset($val['value'])) {
                            if ('rdf:li' === $val['tag']) {
                                // Assign a value to this list item
                                $metadata[] = $val['value'];
                            } else {
                                // Else assign a value to this property
                                $metadata[$val['tag']] = $val['value'];
                            }
                        }
                        break;

                    case 'close':
                        // If the value of this property is a single-
                        // element array where the element is of type
                        // string, use the value of the first list item
                        // as the value for this property
                        if (\is_array($metadata) && isset($metadata[0]) && 1 === \count($metadata) && \is_string($metadata[0])) {
                            $metadata = $metadata[0];
                        }

                        // Move down one level in the stack
                        $metadata = &$stack[\count($stack) - 1];
                        unset($stack[\count($stack) - 1]);
                        break;
                }
            }

            // Only use this metadata if it's referring to a PDF
            if (isset($metadata['dc:format']) && 'application/pdf' === $metadata['dc:format']) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // Source: https://www.adobe.com/devnet/xmp.html
                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
                // So if there are multiple XMP blocks, just merge the values
                // of each found block over top of the existing values
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }
        xml_parser_free($xml);
    }

    /**
     * @return array the type/subtype lookup table built by buildDictionary()
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }

    /**
     * Replace the document's objects and re-run init().
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        $this->init();
    }

    /**
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }

    /**
     * @return PDFObject|Font|Page|Element|null the object stored under $id, or null if there is none
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }

    /**
     * Tell whether at least one object of the given type (and subtype) exists.
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return 0 < \count($this->getObjectsByType($type, $subtype));
    }

    /**
     * Return the objects of the given type, keyed by object id.
     *
     * A null or empty $subtype means "no subtype filter"; any other value
     * restricts the result to objects registered under that subtype.
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // Explicit strict checks: null and '' both select every object of
        // the type, exactly like the former loose "null != $subtype" test.
        if (null === $subtype || '' === $subtype) {
            return $this->dictionary[$type]['all'];
        }

        return $this->dictionary[$type]['subtype'][$subtype] ?? [];
    }

    /**
     * @return Font[]
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }

    /**
     * Return the first registered font, or null when the document has none.
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();
        if ([] === $fonts) {
            return null;
        }

        return reset($fonts);
    }

    /**
     * Resolve the pages of the document: via the first 'Catalog' object if
     * possible, otherwise via all 'Pages' objects, otherwise via the
     * (unordered) 'Page' objects.
     *
     * @return Page[]
     *
     * @throws \Exception when none of the three lookups is possible
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogues = $this->getObjectsByType('Catalog');
            $catalogue = reset($catalogues);

            /** @var Pages $object */
            $object = $catalogue->get('Pages');
            if (method_exists($object, 'getPages')) {
                return $object->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $pages = [];

            /** @var Pages[] $objects */
            $objects = $this->getObjectsByType('Pages');
            foreach ($objects as $object) {
                $pages = array_merge($pages, $object->getPages(true));
            }

            return $pages;
        }

        if ($this->hasObjectsByType('Page')) {
            // Search for 'page' (unordered pages).
            $pages = $this->getObjectsByType('Page');

            return array_values($pages);
        }

        throw new \Exception('Missing catalog.');
    }

    /**
     * Return the text of all pages, separated by a blank line.
     *
     * @param int|null $pageLimit when a positive integer, only the first
     *                            $pageLimit pages are used
     *
     * @throws \Exception when the pages cannot be resolved (see getPages())
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set and positive.
        if (\is_int($pageLimit) && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against '' instead of relying on truthiness, so that a
            // page whose whole text is "0" is not silently dropped.
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }

    public function getTrailer(): Header
    {
        return $this->trailer;
    }

    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }

    /**
     * @return array the document properties computed by buildDetails(); empty before init()
     */
    public function getDetails(): array
    {
        return $this->details;
    }
}
453