Test Failed
Pull Request — master (#606)
by
unknown
02:00
created

Document::extractXMPMetadata()   D

Complexity

Conditions 18
Paths 3

Size

Total Lines 83
Code Lines 36

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 30
CRAP Score 18.2433

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 18
eloc 36
nc 3
nop 1
dl 0
loc 83
ccs 30
cts 33
cp 0.9091
crap 18.2433
rs 4.8666
c 3
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
/**
 * In-memory view of a parsed PDF document.
 *
 * The document keeps the raw list of objects, an index of those objects by
 * their "Type" / "Subtype" header entries, the trailer header, any XMP
 * metadata collected through extractXMPMetadata(), and a flat "details" array
 * that combines the trailer's Info entries, the page count and the XMP
 * metadata.
 *
 * Technical references :
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
 *
 * Class Document
 */
class Document
{
    /**
     * Objects of the document, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Index of $objects by type, shaped as
     * [type => ['all' => [id => object], 'subtype' => [subtype => [id => object]]]].
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * Values collected from XMP packets by extractXMPMetadata().
     *
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * Details built by buildDetails(). Defaults to an empty array so that
     * getDetails() honours its "array" return type even before init() ran.
     *
     * @var array
     */
    protected $details = [];

    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }

    /**
     * Rebuild the dictionary and the details, then initialise every object
     * (header first, then the object itself).
     */
    public function init()
    {
        $this->buildDictionary();

        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $object) {
            $object->getHeader()->init();
            $object->init();
        }
    }

    /**
     * Build dictionary based on type header field.
     *
     * Objects without a usable "Type" are left out of the dictionary; objects
     * without a usable "Subtype" are only registered under 'all'.
     */
    protected function buildDictionary()
    {
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            // Cache objects by type and subtype
            $type = $object->getHeader()->get('Type')->getContent();

            // The loose comparison is deliberate: it skips null as well as
            // any other value that compares equal to null (false, '').
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = [
                    'all' => [],
                    'subtype' => [],
                ];
            }

            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();

            // Same deliberate loose comparison as for the type above.
            if (null == $subtype) {
                continue;
            }

            if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                $this->dictionary[$type]['subtype'][$subtype] = [];
            }

            $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
        }
    }

    /**
     * Build details array.
     *
     * Starts from the details of the trailer's "Info" object (when present),
     * adds the page count under 'Pages' (0 when the pages cannot be listed)
     * and finally merges the XMP metadata on top.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Extract XMP Metadata
     *
     * Parses $content as XML and, when the packet declares
     * dc:format = application/pdf, merges its properties into the metadata
     * of this document. Content that does not parse is ignored.
     *
     * @param string $content raw XMP packet
     */
    public function extractXMPMetadata(string $content): void
    {
        $parser = xml_parser_create();
        xml_parser_set_option($parser, \XML_OPTION_SKIP_WHITE, 1);
        $parsed = 1 === xml_parse_into_struct($parser, $content, $values);
        xml_parser_free($parser);

        if (!$parsed) {
            return;
        }

        $metadata = $this->buildXmpTree($values);

        // Only use this metadata if it's referring to a PDF
        if (isset($metadata['dc:format']) && 'application/pdf' === $metadata['dc:format']) {
            // According to the XMP specifications: 'Conflict resolution
            // for separate packets that describe the same resource is
            // beyond the scope of this document.' - Section 6.1
            // Source: https://www.adobe.com/devnet/xmp.html
            // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
            // So if there are multiple XMP blocks, just merge the values
            // of each found block over top of the existing values
            $this->metadata = array_merge($this->metadata, $metadata);
        }
    }

    /**
     * Turn the flat element list produced by xml_parse_into_struct() into a
     * nested array of XMP properties keyed by lower-cased tag name.
     *
     * @param array $values element list filled by xml_parse_into_struct()
     */
    private function buildXmpTree(array $values): array
    {
        // $node is the array currently being filled; $parents holds
        // references to the enclosing arrays, innermost last.
        $node = [];
        $parents = [];

        foreach ($values as $val) {
            // Standardize to lowercase
            $tag = strtolower($val['tag']);

            if ($this->isStructuralXmpTag($tag)) {
                continue;
            }

            switch ($val['type']) {
                case 'open':
                    // Explicit index instead of []: the top entry is removed
                    // with unset() on close, which does not rewind the next
                    // automatic index.
                    $parents[\count($parents)] = &$node;

                    if ('rdf:li' === $tag) {
                        // Create an array of list items and descend into the
                        // element that was just appended.
                        $node[] = [];
                        end($node);
                        $node = &$node[key($node)];
                    } else {
                        // Else create an array of named values
                        $node[$tag] = [];
                        $node = &$node[$tag];
                    }
                    break;

                case 'complete':
                    if (isset($val['value'])) {
                        if ('rdf:li' === $tag) {
                            // Assign a value to this list item
                            $node[] = $val['value'];
                        } else {
                            // Else assign a value to this property
                            $node[$tag] = $val['value'];
                        }
                    }
                    break;

                case 'close':
                    // If the value of this property is a single-element
                    // array where the element is of type string, use the
                    // value of the first list item as the value for this
                    // property
                    if (\is_array($node) && isset($node[0]) && 1 === \count($node) && \is_string($node[0])) {
                        $node = $node[0];
                    }

                    // Move back to the enclosing array
                    $top = \count($parents) - 1;
                    $node = &$parents[$top];
                    unset($parents[$top]);
                    break;
            }
        }

        return \is_array($node) ? $node : [];
    }

    /**
     * Tell whether a lower-cased tag is a structural x: or rdf: element that
     * carries no property of its own. rdf:li is the one rdf: element kept.
     */
    private function isStructuralXmpTag(string $tag): bool
    {
        if (0 === strpos($tag, 'x:')) {
            return true;
        }

        return 0 === strpos($tag, 'rdf:') && 'rdf:li' !== $tag;
    }

    public function getDictionary(): array
    {
        return $this->dictionary;
    }

    /**
     * Replace the objects of the document and re-run init().
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        $this->init();
    }

    /**
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }

    /**
     * @return PDFObject|Font|Page|Element|null the object stored under $id, or null when there is none
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }

    /**
     * Tell whether getObjectsByType() would return at least one object.
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return 0 < \count($this->getObjectsByType($type, $subtype));
    }

    /**
     * Return the objects registered in the dictionary for a type.
     *
     * @param string      $type    value of the "Type" header entry
     * @param string|null $subtype when null or an empty string, every object
     *                             of the type is returned; otherwise only the
     *                             objects registered under that subtype
     *
     * @return array objects keyed by id, or an empty array when nothing matches
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // An empty string is treated like "no subtype", exactly as the
        // former loose comparison against null did.
        if (null === $subtype || '' === $subtype) {
            return $this->dictionary[$type]['all'];
        }

        return $this->dictionary[$type]['subtype'][$subtype] ?? [];
    }

    /**
     * @return Font[]
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }

    /**
     * Return the first font of the document, or null when it has none.
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();
        if ([] === $fonts) {
            return null;
        }

        return reset($fonts);
    }

    /**
     * List the pages of the document.
     *
     * Tries, in order: the "Pages" entry of the first Catalog object, every
     * object of type "Pages", and finally the objects of type "Page".
     *
     * @return Page[]
     *
     * @throws \Exception when none of the lookups above yields pages
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogues = $this->getObjectsByType('Catalog');
            $catalogue = reset($catalogues);

            /** @var Pages $object */
            $object = $catalogue->get('Pages');
            if (method_exists($object, 'getPages')) {
                return $object->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $pages = [];

            /** @var Pages[] $objects */
            $objects = $this->getObjectsByType('Pages');
            foreach ($objects as $object) {
                $pages = array_merge($pages, $object->getPages(true));
            }

            return $pages;
        }

        if ($this->hasObjectsByType('Page')) {
            // Search for 'page' (unordered pages).
            $pages = $this->getObjectsByType('Page');

            return array_values($pages);
        }

        throw new \Exception('Missing catalog.');
    }

    /**
     * Return the text of the document, one block per page, separated by a
     * blank line. Pages whose trimmed text is empty are left out.
     *
     * @param int|null $pageLimit when a positive integer, only the first
     *                            $pageLimit pages are read
     *
     * @throws \Exception when the pages cannot be listed (see getPages())
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set and positive.
        if (null !== $pageLimit && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against '' rather than testing truthiness, so that a
            // page whose whole text is "0" is not dropped.
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }

    public function getTrailer(): Header
    {
        return $this->trailer;
    }

    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }

    /**
     * Return the details built by the last init(); an empty array before that.
     */
    public function getDetails(): array
    {
        return $this->details;
    }
}
402