Test Failed
Pull Request — master (#606)
by Konrad
01:55
created

Document::extractXMPMetadata()   D

Complexity

Conditions 18
Paths 3

Size

Total Lines 93
Code Lines 36

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 35
CRAP Score 18.1591

Importance

Changes 4
Bugs 0 Features 1
Metric Value
cc 18
eloc 36
c 4
b 0
f 1
nc 3
nop 1
dl 0
loc 93
ccs 35
cts 38
cp 0.9211
crap 18.1591
rs 4.8666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the method body into its own, well-named method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <sebastien@malot.fr>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <sebastien@malot.fr>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
/**
36
 * Technical references :
37
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
38
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
39
 * - http://www.php.net/manual/en/ref.pdf.php#74211
40
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
41
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
42
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
43
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
44
 *
45
 * Class Document
46
 */
47
class Document
48
{
49
    /**
     * The document's objects, as stored by setObjects().
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Objects grouped by their 'Type' header value; each entry holds the
     * keys 'all' and 'subtype' (see buildDictionary()).
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * Values gathered by extractXMPMetadata(); merged into the details by
     * buildDetails().
     *
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * Assigned by buildDetails(); has no value before init() has run.
     *
     * @var array
     */
    protected $details;
73
74 49
    /**
     * Creates a document whose trailer is a Header built from an empty
     * element list and bound to this document.
     */
    public function __construct()
    {
        $emptyTrailer = new Header([], $this);

        $this->trailer = $emptyTrailer;
    }
78 49
79
    /**
     * Rebuilds the type dictionary and the details array, then initialises
     * every object: its header first, the object itself second.
     */
    public function init()
    {
        $this->buildDictionary();
        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $pdfObject) {
            $header = $pdfObject->getHeader();
            $header->init();

            $pdfObject->init();
        }
    }
91
92
    /**
     * Build dictionary based on type header field.
     *
     * The dictionary is reset, then every object with a usable 'Type' header
     * is stored under $dictionary[type]['all'][id]; when it also has a usable
     * 'Subtype' header it is additionally stored under
     * $dictionary[type]['subtype'][subtype][id].
     */
    protected function buildDictionary()
    {
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            $type = $object->getHeader()->get('Type')->getContent();

            // Loose comparison on purpose: skips null as well as any content
            // that compares equal to null (for instance '' or false).
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = ['all' => [], 'subtype' => []];
            }
            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();
            if (null == $subtype) {
                continue;
            }

            // The per-subtype list is created on first write.
            $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
        }
    }
124
125
    /**
     * Build details array.
     *
     * Starts from the details of the trailer's 'Info' entry (when present and
     * exposing getHeader()), adds a 'Pages' count (0 when getPages() throws
     * an \Exception) and finally merges the collected metadata on top.
     */
    protected function buildDetails()
    {
        $details = [];

        /** @var PDFObject|null $info */
        $info = $this->trailer->has('Info') ? $this->trailer->get('Info') : null;

        // This could be an ElementMissing object, so we need to check for
        // the getHeader method first.
        if (null !== $info && method_exists($info, 'getHeader')) {
            $details = $info->getHeader()->getDetails();
        }

        // Retrieve the page count
        try {
            $pageCount = \count($this->getPages());
        } catch (\Exception $e) {
            $pageCount = 0;
        }
        $details['Pages'] = $pageCount;

        $this->details = array_merge($details, $this->metadata);
    }
156
157
    /**
     * Extract XMP Metadata
     *
     * Parses $content as XML and turns the flat element list into a nested
     * array. When that array carries a 'dc:format' of 'application/pdf' it is
     * merged over the metadata collected so far; otherwise, or when the XML
     * parser rejects $content, the stored metadata is left untouched.
     *
     * @param string $content XML text of one XMP packet
     */
    public function extractXMPMetadata(string $content): void
    {
        $xml = xml_parser_create();
        xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);

        if (1 === xml_parse_into_struct($xml, $content, $values)) {
            /*
             * Short overview of the following code:
             *
             * xml_parse_into_struct() yields a flat list of elements
             * (= $values). $metadata is a reference to the array currently
             * being filled and $stack is a last-in, first-out list of
             * references to its ancestors. On an "open" element the current
             * context is pushed onto $stack and a fresh child array becomes
             * the context; on a "close" element the most recently pushed
             * context (the parent) becomes current again.
             */
            $metadata = [];
            $stack = [];
            foreach ($values as $val) {
                // Standardize to lowercase
                $tag = strtolower($val['tag']);

                // Ignore structural x: and rdf: XML elements
                if (0 === strpos($tag, 'x:')) {
                    continue;
                }
                if (0 === strpos($tag, 'rdf:') && 'rdf:li' !== $tag) {
                    continue;
                }

                switch ($val['type']) {
                    case 'open':
                        if ('rdf:li' === $tag) {
                            // Create an array of list items
                            $metadata[] = [];

                            // Ask for the key that was just appended rather
                            // than assuming count - 1, which is wrong as soon
                            // as the context also holds named entries.
                            end($metadata);
                            $childKey = key($metadata);
                        } else {
                            // Else create an array of named values
                            $metadata[$tag] = [];
                            $childKey = $tag;
                        }

                        // Move up one level in the stack
                        $stack[\count($stack)] = &$metadata;
                        $metadata = &$metadata[$childKey];
                        break;

                    case 'complete':
                        if (isset($val['value'])) {
                            if ('rdf:li' === $tag) {
                                // Assign a value to this list item
                                $metadata[] = $val['value'];
                            } else {
                                // Else assign a value to this property
                                $metadata[$tag] = $val['value'];
                            }
                        }
                        break;

                    case 'close':
                        // If the value of this property is a single-
                        // element array where the element is of type
                        // string, use the value of the first list item
                        // as the value for this property
                        if (\is_array($metadata) && isset($metadata[0]) && 1 === \count($metadata) && \is_string($metadata[0])) {
                            $metadata = $metadata[0];
                        }

                        // Move down one level in the stack
                        $metadata = &$stack[\count($stack) - 1];
                        unset($stack[\count($stack) - 1]);
                        break;
                }
            }

            // Only use this metadata if it's referring to a PDF
            if (isset($metadata['dc:format']) && 'application/pdf' === $metadata['dc:format']) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // Source: https://www.adobe.com/devnet/xmp.html
                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
                // So if there are multiple XMP blocks, just merge the values
                // of each found block over top of the existing values
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }
        xml_parser_free($xml);
    }
254 1
255
    /**
     * Returns the dictionary filled by buildDictionary(): per type, the
     * objects under 'all' and, grouped by subtype, under 'subtype'.
     */
    public function getDictionary(): array
    {
        $dictionary = $this->dictionary;

        return $dictionary;
    }
259 7
260
    /**
     * Replaces the document's objects and runs init() again.
     *
     * @param PDFObject[] $objects cast to array before being stored
     */
    public function setObjects($objects = [])
    {
        $objectList = (array) $objects;
        $this->objects = $objectList;

        $this->init();
    }
269 12
270 12
    /**
     * Returns the objects currently stored in the document.
     *
     * @return PDFObject[]
     */
    public function getObjects()
    {
        $objectList = $this->objects;

        return $objectList;
    }
277 12
278
    /**
     * Looks an object up by its key in the object list.
     *
     * @return PDFObject|Font|Page|Element|null null when no object is stored under $id
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }
289 12
290
    /**
     * Tells whether getObjectsByType() finds at least one object.
     *
     * @param string      $type    value of the 'Type' header to look for
     * @param string|null $subtype optional 'Subtype' value, passed through unchanged
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        // Explicit ?string: relying on "= null" for nullability is deprecated in PHP 8.4.
        return 0 < \count($this->getObjectsByType($type, $subtype));
    }
294
295
    /**
     * Returns the objects registered in the dictionary for a type.
     *
     * @param string      $type    value of the 'Type' header to look for
     * @param string|null $subtype when given and not empty, only objects with
     *                             this 'Subtype' are returned
     *
     * @return array objects keyed by id; empty when the type or the requested
     *               subtype is not in the dictionary
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // null and '' both mean "no subtype filter". This spells out what the
        // former loose "null != $subtype" did, without the ambiguity.
        if (null === $subtype || '' === $subtype) {
            return $this->dictionary[$type]['all'];
        }

        return $this->dictionary[$type]['subtype'][$subtype] ?? [];
    }
311
312
    /**
     * Returns every object whose type is 'Font'.
     *
     * @return Font[]
     */
    public function getFonts()
    {
        $fonts = $this->getObjectsByType('Font');

        return $fonts;
    }
319
320
    /**
     * Returns the first entry of getFonts(), or null when there is none.
     */
    public function getFirstFont(): ?Font
    {
        foreach ($this->getFonts() as $font) {
            // The first iteration already is the answer.
            return $font;
        }

        return null;
    }
329
330
    /**
     * Lists the pages of the document.
     *
     * Three sources are tried in order: the 'Pages' entry of the first
     * 'Catalog' object (when it exposes getPages()), then all 'Pages'
     * objects merged together, then the plain 'Page' objects.
     *
     * @return Page[]
     *
     * @throws \Exception when none of the three sources is available
     */
    public function getPages()
    {
        // Search for catalog to list pages.
        $catalogues = $this->getObjectsByType('Catalog');
        if ([] !== $catalogues) {
            $catalogue = reset($catalogues);

            /** @var Pages $root */
            $root = $catalogue->get('Pages');
            if (method_exists($root, 'getPages')) {
                return $root->getPages(true);
            }
        }

        // Search for pages to list kids.
        /** @var Pages[] $pageTrees */
        $pageTrees = $this->getObjectsByType('Pages');
        if ([] !== $pageTrees) {
            $chunks = [];
            foreach ($pageTrees as $pageTree) {
                $chunks[] = $pageTree->getPages(true);
            }

            return array_merge([], ...$chunks);
        }

        // Search for 'page' (unordered pages).
        $loosePages = $this->getObjectsByType('Page');
        if ([] !== $loosePages) {
            return array_values($loosePages);
        }

        throw new \Exception('Missing catalog.');
    }
371
372
    /**
     * Returns the text of the pages, joined by a blank line.
     *
     * Pages that are null or whose trimmed text is empty are left out.
     *
     * @param int|null $pageLimit when a positive integer, only that many
     *                            leading pages are read
     *
     * @throws \Exception when getPages() cannot find any page source
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set and positive.
        if (null !== $pageLimit && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against '' explicitly: a truthiness test would also
            // discard a page whose whole text is "0".
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }
396
397
    /**
     * Returns the trailer header currently attached to the document.
     */
    public function getTrailer(): Header
    {
        $trailer = $this->trailer;

        return $trailer;
    }
401
402
    /**
     * Replaces the trailer header; the details are not rebuilt here.
     */
    public function setTrailer(Header $trailer)
    {
        $newTrailer = $trailer;

        $this->trailer = $newTrailer;
    }
406
407
    /**
     * Returns the details assembled by buildDetails().
     *
     * @return array empty when the details have not been built yet
     */
    public function getDetails(): array
    {
        // $details has no value until init() has run; returning it as-is
        // would violate the array return type.
        return $this->details ?? [];
    }
411
}
412