Document   F
last analyzed

Complexity

Total Complexity 66

Size/Duplication

Total Lines 410
Duplicated Lines 0 %

Test Coverage

Coverage 94.16%

Importance

Changes 9
Bugs 2 Features 0
Metric Value
eloc 145
c 9
b 2
f 0
dl 0
loc 410
ccs 145
cts 154
cp 0.9416
rs 3.12
wmc 66

18 Methods

Rating   Name   Duplication   Size   Complexity  
A getFirstFont() 0 8 2
A hasObjectsByType() 0 3 1
A getTrailer() 0 3 1
D extractXMPMetadata() 0 99 19
A getObjectsByType() 0 15 4
A getPages() 0 35 6
A buildDictionary() 0 25 6
A getDetails() 0 3 1
A init() 0 10 2
A getObjects() 0 3 1
A __construct() 0 3 1
A setObjects() 0 5 1
A getObjectById() 0 7 2
A getFonts() 0 3 1
B buildDetails() 0 68 10
A getDictionary() 0 3 1
A getText() 0 23 6
A setTrailer() 0 3 1

How to fix   Complexity   

Complex Class

Complex classes like Document often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Document, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <sebastien@malot.fr>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <sebastien@malot.fr>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Encoding\PDFDocEncoding;
36
use Smalot\PdfParser\Exception\MissingCatalogException;
37
38
/**
39
 * Technical references :
40
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
41
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
42
 * - http://www.php.net/manual/en/ref.pdf.php#74211
43
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
44
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
45
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
46
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
47
 *
48
 * Class Document
49
 */
50
class Document
51
{
52
    /**
     * Objects of the document, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Objects indexed by the "Type" value of their header. Every entry holds
     * an "all" list and a "subtype" list (indexed by "Subtype" value).
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * Values collected by extractXMPMetadata().
     *
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * Details assembled by buildDetails(). Starts as an empty array so that
     * getDetails() can honour its array return type before init() has run.
     *
     * @var array
     */
    protected $details = [];
76
77 104
    /**
     * Creates a document whose trailer is a Header built from an empty array.
     */
    public function __construct()
    {
        $emptyTrailer = new Header([], $this);

        $this->trailer = $emptyTrailer;
    }
81
82 75
    /**
     * Rebuilds the type dictionary and the details array, then initialises
     * the header of every object followed by the object itself.
     */
    public function init()
    {
        $this->buildDictionary();
        $this->buildDetails();

        // Forward the initialisation to each object, header first.
        foreach ($this->objects as $pdfObject) {
            $header = $pdfObject->getHeader();
            $header->init();

            $pdfObject->init();
        }
    }
94
95
    /**
     * Indexes the document objects by the "Type" value of their header and,
     * when one is present, additionally by their "Subtype" value.
     *
     * The previous dictionary is discarded before indexing starts.
     */
    protected function buildDictionary()
    {
        $this->dictionary = [];

        foreach ($this->objects as $objectId => $pdfObject) {
            $type = $pdfObject->getHeader()->get('Type')->getContent();
            if (null == $type) {
                // Objects without a usable type are not indexed at all.
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = ['all' => [], 'subtype' => []];
            }
            $this->dictionary[$type]['all'][$objectId] = $pdfObject;

            $subtype = $pdfObject->getHeader()->get('Subtype')->getContent();
            if (null == $subtype) {
                continue;
            }

            // Writing through the nested keys creates the per-subtype list
            // the first time a subtype is seen.
            $this->dictionary[$type]['subtype'][$subtype][$objectId] = $pdfObject;
        }
    }
127
128
    /**
     * Builds the details array from the trailer's "Info" entry, the page
     * count and the metadata collected by extractXMPMetadata().
     *
     * String values are repaired or decoded on the way: a string that is
     * valid UTF-8 only has known Adobe artefacts removed, any other string is
     * treated as PDFDocEncoding and converted to UTF-8. Metadata entries are
     * merged last and therefore win over "Info" entries with the same key.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count; a document whose pages cannot be listed
        // is reported as having zero pages instead of failing.
        try {
            $pages = $this->getPages();
            $details['Pages'] = \count($pages);
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        // Decode and repair encoded document properties
        foreach ($details as $key => $value) {
            if (!\is_string($value)) {
                continue;
            }

            if (!mb_check_encoding($value, 'UTF-8')) {
                // If the string is just PDFDocEncoding, remove any
                // backslash + carriage-return pairs and decode the whole
                // thing.
                $value = str_replace("\\\r", '', $value);
                $details[$key] = PDFDocEncoding::convertPDFDoc2UTF8($value);

                continue;
            }

            // The string is already UTF-8 encoded, so we only need to repair
            // Adobe's ham-fisted insertion of line breaks every ~127
            // characters, which doesn't seem to be multi-byte safe.

            // Remove literal backslash + carriage return
            $value = str_replace("\x5c\x0d", '', $value);

            // Remove backslash plus bytes written into high part of
            // multibyte unicode character
            while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
                $diff = (\ord($match[1]) - 182) * 64;
                $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
                // Replace the matched bytes literally. The captured byte can
                // be any character, including regex metacharacters or the
                // pattern delimiter, so it must not be spliced into a new
                // regular expression unquoted.
                $value = str_replace($match[0], $newbyte, $value);
            }

            // Remove bytes written into low part of multibyte unicode
            // character
            while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
                $diff = \ord($match[2]) - 181;
                $newbyte = \chr(\ord($match[1]) + $diff);
                // Literal replacement for the same reason as above.
                $value = str_replace($match[0], $newbyte, $value);
            }

            // Remove this byte string that Adobe occasionally adds
            // between two single byte characters in a unicode string
            $value = str_replace("\xe5\xb0\x8d", '', $value);

            $details[$key] = $value;
        }

        $this->details = array_merge($details, $this->metadata);
    }
200
201
    /**
     * Parses an XMP packet and merges the properties found in it into the
     * document metadata.
     *
     * Nothing is stored when the content cannot be parsed as XML, or when the
     * packet carries a "dc:format" value other than "application/pdf".
     */
    public function extractXMPMetadata(string $content): void
    {
        $parser = xml_parser_create();
        xml_parser_set_option($parser, \XML_OPTION_SKIP_WHITE, 1);

        $parsed = xml_parse_into_struct($parser, $content, $nodes, $index);
        if (1 === $parsed) {
            /*
             * How the flat node list becomes a nested array:
             *
             * xml_parse_into_struct() yields a one-dimensional list ($nodes).
             * $metadata always references the array currently being filled
             * and $parents is a last-in, first-out list of references to the
             * enclosing arrays. An "open" node stores the current context in
             * $parents, creates a child array and makes that child the
             * current context. A "close" node does the reverse: the most
             * recently stored parent becomes the current context again.
             */
            $metadata = [];
            $parents = [];

            foreach ($nodes as $node) {
                // Tag names are compared in lowercase
                $tag = strtolower($node['tag']);

                // Structural x: and rdf: elements are skipped, rdf:li excepted
                $isStructural = 0 === strpos($tag, 'x:')
                    || (0 === strpos($tag, 'rdf:') && 'rdf:li' !== $tag);
                if ($isStructural) {
                    continue;
                }

                $nodeType = $node['type'];

                if ('open' === $nodeType) {
                    if ('rdf:li' === $tag) {
                        // List items become a new entry of the current list
                        $metadata[] = [];
                        $childKey = \count($metadata) - 1;
                    } else {
                        // Anything else becomes a named entry
                        $metadata[$tag] = [];
                        $childKey = $tag;
                    }

                    // Remember the enclosing array, then descend into the child
                    $parents[\count($parents)] = &$metadata;
                    $metadata = &$metadata[$childKey];
                } elseif ('complete' === $nodeType) {
                    // Elements without a value are ignored
                    if (isset($node['value'])) {
                        if ('rdf:li' === $tag) {
                            $metadata[] = $node['value'];
                        } else {
                            $metadata[$tag] = $node['value'];
                        }
                    }
                } elseif ('close' === $nodeType) {
                    if (\is_array($metadata)) {
                        $size = \count($metadata);
                        if (1 === $size && isset($metadata[0]) && \is_string($metadata[0])) {
                            // A list holding exactly one string collapses to
                            // that string
                            $metadata = $metadata[0];
                        } elseif (0 === $size) {
                            // An empty array collapses to the empty string
                            $metadata = '';
                        }
                    }

                    // Return to the enclosing array and forget the stored reference
                    $metadata = &$parents[\count($parents) - 1];
                    unset($parents[\count($parents) - 1]);
                }
            }

            // Only use this metadata if it's referring to a PDF
            $format = $metadata['dc:format'] ?? null;
            if (null === $format || 'application/pdf' == $format) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // Source: https://www.adobe.com/devnet/xmp.html
                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
                // So if there are multiple XMP blocks, the values of each
                // block are merged over the values already present
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }

        xml_parser_free($parser);
    }
304
305 1
    /**
     * Returns the dictionary filled by buildDictionary().
     */
    public function getDictionary(): array
    {
        $dictionary = $this->dictionary;

        return $dictionary;
    }
309
310
    /**
     * Replaces the objects of the document and runs init() afterwards.
     *
     * The given value is cast to an array before it is stored.
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $objectList = (array) $objects;
        $this->objects = $objectList;

        $this->init();
    }
319
320
    /**
     * Returns the objects currently held by the document.
     *
     * @return PDFObject[]
     */
    public function getObjects()
    {
        $objectList = $this->objects;

        return $objectList;
    }
327
328
    /**
     * Looks up a single object by its id.
     *
     * @return PDFObject|Font|Page|Element|null the stored object, or null when
     *                                          nothing is stored under the id
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }
339
340 76
    /**
     * Tells whether getObjectsByType() yields at least one object for the
     * given type and optional subtype.
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        $matches = $this->getObjectsByType($type, $subtype);

        return [] !== $matches;
    }
344
345 80
    /**
     * Returns the objects registered in the dictionary for a type, optionally
     * narrowed down to one subtype.
     *
     * An unknown type or an unknown subtype yields an empty array. A subtype
     * of null or '' is treated as "no subtype given", in which case all
     * objects of the type are returned.
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        $entry = $this->dictionary[$type] ?? null;
        if (null === $entry) {
            return [];
        }

        // The empty string is deliberately handled like null here.
        if (null === $subtype || '' === $subtype) {
            return $entry['all'];
        }

        return $entry['subtype'][$subtype] ?? [];
    }
361
362
    /**
     * Returns every object registered under the type "Font".
     *
     * @return Font[]
     */
    public function getFonts()
    {
        $fonts = $this->getObjectsByType('Font');

        return $fonts;
    }
369
370 49
    /**
     * Returns the first font reported by getFonts(), or null when the
     * document has none.
     */
    public function getFirstFont(): ?Font
    {
        // Leaving the loop on its first pass hands back the first font.
        foreach ($this->getFonts() as $font) {
            return $font;
        }

        return null;
    }
379
380
    /**
     * Lists the pages of the document.
     *
     * Three sources are tried in order: the "Pages" entry of the first
     * "Catalog" object, then all "Pages" objects, then all "Page" objects.
     *
     * @return Page[]
     *
     * @throws MissingCatalogException when none of the three sources exists
     */
    public function getPages()
    {
        // 1. Ask the page tree referenced by the first catalog.
        if ($this->hasObjectsByType('Catalog')) {
            $catalogs = $this->getObjectsByType('Catalog');
            $firstCatalog = reset($catalogs);

            /** @var Pages $pageTree */
            $pageTree = $firstCatalog->get('Pages');
            if (method_exists($pageTree, 'getPages')) {
                return $pageTree->getPages(true);
            }
        }

        // 2. Collect the kids of every "Pages" object.
        if ($this->hasObjectsByType('Pages')) {
            $collected = [];

            /** @var Pages[] $pageTrees */
            $pageTrees = $this->getObjectsByType('Pages');
            foreach ($pageTrees as $pageTree) {
                $collected = array_merge($collected, $pageTree->getPages(true));
            }

            return $collected;
        }

        // 3. Fall back to the plain "Page" objects (unordered pages).
        if ($this->hasObjectsByType('Page')) {
            return array_values($this->getObjectsByType('Page'));
        }

        throw new MissingCatalogException('Missing catalog.');
    }
421
422 25
    /**
     * Returns the text of the document's pages, joined by a blank line.
     *
     * Pages that are null and pages whose trimmed text is empty are left out.
     *
     * @param int|null $pageLimit when a positive integer, only that many
     *                            leading pages are read; otherwise all pages
     *
     * @throws MissingCatalogException propagated from getPages()
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X pages if $pageLimit is a positive integer.
        if (null !== $pageLimit && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            $text = trim($page->getText());
            // Compare against '' explicitly: a plain truthiness test would
            // also discard a page whose entire text is "0".
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }
446
447
    /**
     * Returns the trailer header of the document.
     */
    public function getTrailer(): Header
    {
        $trailer = $this->trailer;

        return $trailer;
    }
451
452 66
    /**
     * Replaces the trailer header of the document.
     *
     * Only the property is assigned; init() is not called from here.
     */
    public function setTrailer(Header $trailer)
    {
        $newTrailer = $trailer;

        $this->trailer = $newTrailer;
    }
456
457 18
    /**
     * Returns the details array assembled by buildDetails().
     */
    public function getDetails(): array
    {
        $details = $this->details;

        return $details;
    }
461
}
462