Test Failed
Pull Request — master (#606)
by Konrad
01:56
created

Document::extractXMPMetadata()   D

Complexity

Conditions 18
Paths 3

Size

Total Lines 91
Code Lines 38

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 34
CRAP Score 18.1728

Importance

Changes 5
Bugs 1 Features 1
Metric Value
cc 18
eloc 38
c 5
b 1
f 1
nc 3
nop 1
dl 0
loc 91
ccs 34
cts 37
cp 0.9189
crap 18.1728
rs 4.8666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a self-contained part of the method body into a new, well-named method.

1
<?php
2
3
/**
 * @file
 *          This file is part of the PdfParser library.
 *
 * @author  Sébastien MALOT <[email protected]>
 *
 * @date    2017-01-03
 *
 * @license LGPLv3
 *
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */
32
33
namespace Smalot\PdfParser;
34
35
/**
 * Technical references :
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
 *
 * Class Document
 */
47
class Document
{
    /**
     * Objects of the document, keyed by object id (see getObjectById()).
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Objects grouped by the content of their 'Type' header. Each entry has
     * the keys 'all' and 'subtype'; rebuilt by buildDictionary().
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * Values collected from XMP packets by extractXMPMetadata().
     *
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * Details assembled by buildDetails(). Defaults to an empty array so that
     * getDetails() honours its array return type even before init() ran.
     *
     * @var array
     */
    protected $details = [];

    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }

    /**
     * Rebuild the dictionary and the details, then initialise every object
     * (header first, then the object itself).
     */
    public function init()
    {
        $this->buildDictionary();

        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $object) {
            $object->getHeader()->init();
            $object->init();
        }
    }

    /**
     * Build dictionary based on type header field.
     *
     * Every object with a 'Type' is stored under ['all'][$id]; if it also has
     * a 'Subtype' it is additionally stored under ['subtype'][$subtype][$id].
     */
    protected function buildDictionary()
    {
        // Build dictionary.
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            // Cache objects by type and subtype
            $type = $object->getHeader()->get('Type')->getContent();

            // NOTE(review): the loose comparisons below are kept on purpose.
            // What getContent() returns for a missing header is not visible
            // from this class, so a strict check could change which objects
            // get indexed - confirm before tightening.
            if (null != $type) {
                if (!isset($this->dictionary[$type])) {
                    $this->dictionary[$type] = [
                        'all' => [],
                        'subtype' => [],
                    ];
                }

                $this->dictionary[$type]['all'][$id] = $object;

                $subtype = $object->getHeader()->get('Subtype')->getContent();
                if (null != $subtype) {
                    if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                        $this->dictionary[$type]['subtype'][$subtype] = [];
                    }
                    $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
                }
            }
        }
    }

    /**
     * Build details array.
     *
     * Combines the details of the trailer's 'Info' object (when present),
     * the page count under the key 'Pages' (0 if the pages cannot be
     * determined) and the collected XMP metadata. On key collisions the XMP
     * metadata wins because it is merged last.
     */
    protected function buildDetails()
    {
        // Build details array.
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count
        try {
            $pages = $this->getPages();
            $details['Pages'] = \count($pages);
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        $details = array_merge($details, $this->metadata);

        $this->details = $details;
    }

    /**
     * Extract XMP Metadata
     *
     * Parses $content as XML and, if it parses successfully and describes a
     * PDF ('dc:format' equals 'application/pdf'), merges the extracted values
     * over the metadata collected so far. Anything else is ignored silently.
     *
     * @param string $content XML of one XMP packet
     */
    public function extractXMPMetadata(string $content): void
    {
        $xml = xml_parser_create();
        xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);
        $parsed = xml_parse_into_struct($xml, $content, $values, $index);
        // $values is a plain array, so the parser is no longer needed.
        xml_parser_free($xml);

        if (1 !== $parsed) {
            return;
        }

        $metadata = $this->buildXMPMetadataTree($values);

        // Only use this metadata if it's referring to a PDF
        if (isset($metadata['dc:format']) && 'application/pdf' == $metadata['dc:format']) {
            // According to the XMP specifications: 'Conflict resolution
            // for separate packets that describe the same resource is
            // beyond the scope of this document.' - Section 6.1
            // Source: https://www.adobe.com/devnet/xmp.html
            // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
            // So if there are multiple XMP blocks, just merge the values
            // of each found block over top of the existing values
            $this->metadata = array_merge($this->metadata, $metadata);
        }
    }

    /**
     * Turn the flat element list of xml_parse_into_struct() into a nested array.
     *
     * $stack is a last-in, first-out list of references to positions in
     * $metadata. When an "open" element is encountered, the current $metadata
     * context is saved on the stack, a child array is created and becomes
     * the current context. When a "close" element is encountered the
     * operation is reversed: the parent context is taken from the stack
     * again. Tag names are lower-cased; structural elements are skipped.
     *
     * @param array<mixed> $values element list produced by xml_parse_into_struct()
     *
     * @return array<mixed>
     */
    private function buildXMPMetadataTree(array $values)
    {
        $metadata = [];
        $stack = [];

        foreach ($values as $val) {
            // Standardize to lowercase
            $tag = strtolower($val['tag']);

            if (self::isStructuralXMPTag($tag)) {
                continue;
            }

            switch ($val['type']) {
                case 'open':
                    if ('rdf:li' == $tag) {
                        // Create an array of list items. Use the key that
                        // was really appended: count - 1 is only correct as
                        // long as the parent holds nothing but list items.
                        $metadata[] = [];
                        end($metadata);
                        $childKey = key($metadata);
                    } else {
                        // Else create an array of named values
                        $metadata[$tag] = [];
                        $childKey = $tag;
                    }

                    // Move up one level in the stack. The index must be
                    // count($stack): "$stack[] =" would leave gaps after the
                    // unset() in the 'close' branch.
                    $stack[\count($stack)] = &$metadata;
                    $metadata = &$metadata[$childKey];
                    break;

                case 'complete':
                    if (isset($val['value'])) {
                        if ('rdf:li' == $tag) {
                            // Assign a value to this list item
                            $metadata[] = $val['value'];
                        } else {
                            // Else assign a value to this property
                            $metadata[$tag] = $val['value'];
                        }
                    }
                    break;

                case 'close':
                    // If the value of this property is a single-
                    // element array where the element is of type
                    // string, use the value of the first list item
                    // as the value for this property
                    if (\is_array($metadata) && isset($metadata[0]) && 1 == \count($metadata) && \is_string($metadata[0])) {
                        $metadata = $metadata[0];
                    }

                    // Move down one level in the stack
                    $metadata = &$stack[\count($stack) - 1];
                    unset($stack[\count($stack) - 1]);
                    break;
            }
        }

        return $metadata;
    }

    /**
     * Tell whether a lower-cased tag is a structural x: or rdf: element.
     * 'rdf:li' is not structural because it carries list items.
     */
    private static function isStructuralXMPTag(string $tag): bool
    {
        if (str_starts_with($tag, 'x:')) {
            return true;
        }

        return str_starts_with($tag, 'rdf:') && 'rdf:li' != $tag;
    }

    /**
     * @return array the dictionary built by buildDictionary()
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }

    /**
     * Replace the objects of the document and re-run init().
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        $this->init();
    }

    /**
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }

    /**
     * @return PDFObject|Font|Page|Element|null the object stored under $id, null if there is none
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }

    /**
     * Tell whether getObjectsByType() finds at least one object.
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return 0 < \count($this->getObjectsByType($type, $subtype));
    }

    /**
     * Get the objects of a type, optionally restricted to a subtype.
     *
     * @param string      $type    content of the 'Type' header
     * @param string|null $subtype content of the 'Subtype' header; null and
     *                             the empty string both mean "any subtype"
     *
     * @return array matching objects keyed by id, empty if there are none
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // Same result as the former loose "null != $subtype", but explicit
        // about the empty string.
        if (null === $subtype || '' === $subtype) {
            return $this->dictionary[$type]['all'];
        }

        return $this->dictionary[$type]['subtype'][$subtype] ?? [];
    }

    /**
     * @return Font[]
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }

    /**
     * @return Font|null the first font of getFonts(), null if there is none
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();
        if ([] === $fonts) {
            return null;
        }

        return reset($fonts);
    }

    /**
     * Get the pages of the document.
     *
     * Sources are tried in this order: the 'Pages' entry of the first
     * 'Catalog' object, all 'Pages' objects, all 'Page' objects.
     *
     * @return Page[]
     *
     * @throws \Exception if none of the sources is available
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogues = $this->getObjectsByType('Catalog');
            $catalogue = reset($catalogues);

            /** @var Pages $object */
            $object = $catalogue->get('Pages');
            if (method_exists($object, 'getPages')) {
                return $object->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $pages = [];

            /** @var Pages[] $objects */
            $objects = $this->getObjectsByType('Pages');
            foreach ($objects as $object) {
                $pages = array_merge($pages, $object->getPages(true));
            }

            return $pages;
        }

        if ($this->hasObjectsByType('Page')) {
            // Search for 'page' (unordered pages).
            $pages = $this->getObjectsByType('Page');

            return array_values($pages);
        }

        throw new \Exception('Missing catalog.');
    }

    /**
     * Get the text of the pages, separated by a blank line.
     *
     * @param int|null $pageLimit only use the first $pageLimit pages; null
     *                            or a value below 1 means all pages
     *
     * @throws \Exception if getPages() cannot determine the pages
     */
    public function getText(?int $pageLimit = null): string
    {
        $texts = [];
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set.
        if (null !== $pageLimit && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against '' instead of testing truthiness, otherwise a
            // page whose whole text is "0" would be dropped.
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }

    public function getTrailer(): Header
    {
        return $this->trailer;
    }

    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }

    /**
     * @return array the details built by buildDetails(), empty before init()
     */
    public function getDetails(): array
    {
        return $this->details;
    }
}
410