Test Failed
Pull Request — master (#606)
by
unknown
01:54
created

Document::extractXMPMetadata()   D

Complexity

Conditions 18
Paths 3

Size

Total Lines 81
Code Lines 36

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 30
CRAP Score 18.2433

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 18
eloc 36
c 2
b 0
f 0
nc 3
nop 1
dl 0
loc 81
ccs 30
cts 33
cp 0.9091
crap 18.2433
rs 4.8666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive group of statements into its own well-named method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
/**
36
 * Technical references :
37
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
38
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
39
 * - http://www.php.net/manual/en/ref.pdf.php#74211
40
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
41
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
42
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
43
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
44
 *
45
 * Class Document
46
 */
47
class Document
{
    /**
     * Objects of the document, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Objects indexed by their 'Type' header and, below that, by their
     * 'Subtype' header. Shape (see buildDictionary()):
     *
     *   [type => ['all' => [id => object], 'subtype' => [subtype => [id => object]]]]
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * Values collected by extractXMPMetadata(); merged into the details
     * by buildDetails().
     *
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * Document details built by buildDetails(). Starts as an empty array
     * so that getDetails() honours its `array` return type even when it
     * is called before init().
     *
     * @var array
     */
    protected $details = [];

    /**
     * Creates a document with an empty trailer header.
     */
    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }

    /**
     * Rebuilds the dictionary and the details, then initialises the header
     * of every object followed by the object itself.
     */
    public function init()
    {
        $this->buildDictionary();

        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $object) {
            $object->getHeader()->init();
            $object->init();
        }
    }

    /**
     * Build dictionary based on type header field.
     */
    protected function buildDictionary()
    {
        // Start from scratch so that stale entries never survive a rebuild.
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            // Cache objects by type and subtype.
            $type = $object->getHeader()->get('Type')->getContent();

            // NOTE(review): the comparison is intentionally loose. The return
            // type of getContent() is not visible here and a missing element
            // may well yield a non-null falsy value - confirm before making
            // this strict.
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = [
                    'all' => [],
                    'subtype' => [],
                ];
            }

            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();

            // Loose on purpose, for the same reason as above.
            if (null != $subtype) {
                if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                    $this->dictionary[$type]['subtype'][$subtype] = [];
                }
                $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
            }
        }
    }

    /**
     * Build details array.
     *
     * Combines, in this order: the details of the trailer's 'Info' object
     * (when available), the page count under the 'Pages' key and the
     * collected XMP metadata. Because array_merge() is used, metadata wins
     * over earlier values that share the same string key.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info.
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count; getPages() throws when no page source
        // exists, in which case the count is reported as 0.
        try {
            $pages = $this->getPages();
            $details['Pages'] = \count($pages);
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        $details = array_merge($details, $this->metadata);

        $this->details = $details;
    }

    /**
     * Extract XMP Metadata.
     *
     * Parses $content as XML and converts the flat element list produced by
     * xml_parse_into_struct() into a nested array: an "open" element starts
     * a child array, a "complete" element stores its value and a "close"
     * element returns to the parent. Tag names are lower-cased; structural
     * "x:" and "rdf:" elements are ignored, except "rdf:li" list items.
     *
     * The result is merged over the already collected metadata only when
     * its "dc:format" equals "application/pdf". Content that cannot be
     * parsed is ignored.
     *
     * @param string $content raw XMP packet
     */
    public function extractXMPMetadata(string $content): void
    {
        $xml = xml_parser_create();

        try {
            xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);

            $values = [];
            if (1 !== xml_parse_into_struct($xml, $content, $values)) {
                return;
            }

            // $metadata always refers to the array currently being filled,
            // $stack holds references to its ancestors (last in, first out).
            $metadata = [];
            $stack = [];
            foreach ($values as $val) {
                // Standardize to lowercase
                $val['tag'] = strtolower($val['tag']);

                // Ignore structural x: and rdf: XML elements
                if (0 === strpos($val['tag'], 'x:')) {
                    continue;
                }
                if (0 === strpos($val['tag'], 'rdf:') && 'rdf:li' != $val['tag']) {
                    continue;
                }

                switch ($val['type']) {
                    case 'open':
                        if ('rdf:li' == $val['tag']) {
                            // Create an array of list items
                            $metadata[] = [];

                            // Move up one level in the stack
                            $stack[\count($stack)] = &$metadata;
                            $metadata = &$metadata[\count($metadata) - 1];
                        } else {
                            // Else create an array of named values
                            $metadata[$val['tag']] = [];

                            // Move up one level in the stack
                            $stack[\count($stack)] = &$metadata;
                            $metadata = &$metadata[$val['tag']];
                        }
                        break;

                    case 'complete':
                        if (isset($val['value'])) {
                            if ('rdf:li' == $val['tag']) {
                                // Assign a value to this list item
                                $metadata[] = $val['value'];
                            } else {
                                // Else assign a value to this property
                                $metadata[$val['tag']] = $val['value'];
                            }
                        }
                        break;

                    case 'close':
                        // If the value of this property is a single-
                        // element array where the element is of type
                        // string, use the value of the first list item
                        // as the value for this property
                        if (
                            \is_array($metadata)
                            && isset($metadata[0])
                            && 1 == \count($metadata)
                            && \is_string($metadata[0])
                        ) {
                            $metadata = $metadata[0];
                        }

                        // Move down one level in the stack
                        $metadata = &$stack[\count($stack) - 1];
                        unset($stack[\count($stack) - 1]);
                        break;
                }
            }

            // Only use this metadata if it's referring to a PDF
            if (isset($metadata['dc:format']) && 'application/pdf' == $metadata['dc:format']) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // So if there are multiple XMP blocks, just merge the values
                // of each found block over top of the existing values
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        } finally {
            // Release the parser on every path, including exceptions.
            xml_parser_free($xml);
        }
    }

    /**
     * Returns the type/subtype index built by buildDictionary().
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }

    /**
     * Replaces the objects of the document and re-runs init().
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        $this->init();
    }

    /**
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }

    /**
     * Returns the object stored under the given id, or null when there is
     * none.
     *
     * @return PDFObject|Font|Page|Element|null
     */
    public function getObjectById(string $id)
    {
        if (isset($this->objects[$id])) {
            return $this->objects[$id];
        }

        return null;
    }

    /**
     * Tells whether getObjectsByType() finds at least one object.
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return 0 < \count($this->getObjectsByType($type, $subtype));
    }

    /**
     * Returns the objects of the given type, keyed by object id.
     *
     * When $subtype is null every object of the type is returned. Any other
     * value, including the empty string, restricts the result to objects of
     * exactly that subtype.
     *
     * @return array empty when the type or the subtype is unknown
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // Strict comparison: only null means "no subtype filter". A loose
        // check would treat '' like null and return all objects of the type.
        if (null !== $subtype) {
            if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                return [];
            }

            return $this->dictionary[$type]['subtype'][$subtype];
        }

        return $this->dictionary[$type]['all'];
    }

    /**
     * @return Font[]
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }

    /**
     * Returns the first font of the document, or null when it has none.
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();
        if ([] === $fonts) {
            return null;
        }

        return reset($fonts);
    }

    /**
     * Returns the pages of the document.
     *
     * Sources are tried in this order: the 'Pages' entry of the first
     * 'Catalog' object, then all 'Pages' objects, then the plain 'Page'
     * objects (unordered).
     *
     * @return Page[]
     *
     * @throws \Exception when none of the sources is available
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogues = $this->getObjectsByType('Catalog');
            $catalogue = reset($catalogues);

            /** @var Pages $object */
            $object = $catalogue->get('Pages');
            if (method_exists($object, 'getPages')) {
                return $object->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $pages = [];

            /** @var Pages[] $objects */
            $objects = $this->getObjectsByType('Pages');
            foreach ($objects as $object) {
                $pages = array_merge($pages, $object->getPages(true));
            }

            return $pages;
        }

        if ($this->hasObjectsByType('Page')) {
            // Search for 'page' (unordered pages).
            $pages = $this->getObjectsByType('Page');

            return array_values($pages);
        }

        throw new \Exception('Missing catalog.');
    }

    /**
     * Returns the trimmed text of the pages, joined by a blank line.
     *
     * @param int|null $pageLimit when a positive integer, only that many
     *                            leading pages are read
     *
     * @throws \Exception when getPages() cannot find any page source
     */
    public function getText(?int $pageLimit = null): string
    {
        $texts = [];
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set and numeric.
        if (\is_int($pageLimit) && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against '' instead of relying on truthiness, otherwise
            // a page whose whole text is "0" would be dropped.
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }

    /**
     * Returns the trailer header of the document.
     */
    public function getTrailer(): Header
    {
        return $this->trailer;
    }

    /**
     * Replaces the trailer header of the document.
     */
    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }

    /**
     * Returns the details built by buildDetails(); an empty array until
     * init() has run.
     */
    public function getDetails(): array
    {
        return $this->details;
    }
}
400