Test Failed
Pull Request — master (#606)
by
unknown
07:11
created

Document::extractXMPMetadata()   C

Complexity

Conditions 15
Paths 2

Size

Total Lines 77
Code Lines 35

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 27
CRAP Score 15.225

Importance

Changes 0
Metric Value
cc 15
eloc 35
c 0
b 0
f 0
nc 2
nop 1
dl 0
loc 77
ccs 27
cts 30
cp 0.9
crap 15.225
rs 5.9166

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the long method into its own, well-named method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
/**
36
 * Technical references :
37
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
38
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
39
 * - http://www.php.net/manual/en/ref.pdf.php#74211
40
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
41
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
42
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
43
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
44
 *
45
 * Class Document
46
 */
47
class Document
48
{
49
    /**
     * Every object of the document, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Objects indexed by type, as filled by buildDictionary():
     * $dictionary[type]['all'][id] and $dictionary[type]['subtype'][subtype][id].
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * Values collected by extractXMPMetadata(); merged into the details by buildDetails().
     *
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * Document details as assembled by buildDetails().
     *
     * Starts as an empty array so that getDetails(), which declares an array
     * return type, does not fail when it is called before init() has run.
     *
     * @var array
     */
    protected $details = [];
73
74 49
    /**
     * Creates a document whose trailer is a new Header built from an empty
     * array and this document.
     */
    public function __construct()
    {
        $emptyTrailer = new Header([], $this);

        $this->trailer = $emptyTrailer;
    }
78 49
79
    /**
     * Rebuilds the type dictionary and the details array, then initialises
     * every registered object.
     */
    public function init()
    {
        $this->buildDictionary();
        $this->buildDetails();

        // Propagate init to objects: the header first, then the object itself.
        foreach ($this->objects as $pdfObject) {
            $pdfObject->getHeader()->init();
            $pdfObject->init();
        }
    }
91
92
    /**
     * Indexes the registered objects by the 'Type' and 'Subtype' entries of
     * their headers.
     *
     * For each type the dictionary holds an 'all' map (id => object) and a
     * 'subtype' map (subtype => id => object). Objects whose type content
     * loosely equals null are not indexed at all; objects whose subtype
     * content loosely equals null are only listed under 'all'.
     */
    protected function buildDictionary()
    {
        // Start from scratch on every call.
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            $type = $object->getHeader()->get('Type')->getContent();

            // Loose comparison kept on purpose: it decides which contents are skipped.
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = ['all' => [], 'subtype' => []];
            }

            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();

            if (null == $subtype) {
                continue;
            }

            if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                $this->dictionary[$type]['subtype'][$subtype] = [];
            }

            $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
        }
    }
124
125
    /**
     * Assembles the details array from the trailer's 'Info' entry, the page
     * count and the collected metadata.
     *
     * Metadata is merged last, so for string keys its values replace those
     * coming from the 'Info' entry.
     */
    protected function buildDetails()
    {
        $details = [];

        // Document info, when the trailer has an 'Info' entry.
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');

            // This could be an ElementMissing object, so only use it when it
            // actually offers getHeader().
            $hasHeader = null !== $info && method_exists($info, 'getHeader');
            if ($hasHeader) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Page count; falls back to 0 when getPages() throws.
        try {
            $pageCount = \count($this->getPages());
        } catch (\Exception $e) {
            $pageCount = 0;
        }
        $details['Pages'] = $pageCount;

        $this->details = array_merge($details, $this->metadata);
    }
156
157
    /**
     * Parses one XMP packet and merges the properties found in it into the
     * document metadata.
     *
     * The flat list produced by xml_parse_into_struct() is turned into a
     * nested array: an "open" element creates a child array and makes it the
     * current context (the parent context is pushed on $stack by reference),
     * a "complete" element stores its value in the current context, and a
     * "close" element pops back to the parent context. Tag names are
     * lower-cased; "x:*" elements and all "rdf:*" elements except "rdf:li"
     * are ignored. Content that fails to parse leaves the metadata unchanged.
     *
     * @param string $content XML of the XMP packet
     */
    public function extractXMPMetadata(string $content): void
    {
        $xml = xml_parser_create();
        xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);

        if (xml_parse_into_struct($xml, $content, $values)) {
            $metadata = [];
            $stack = [];

            foreach ($values as $val) {
                // Standardize to lowercase
                $val['tag'] = strtolower($val['tag']);

                // Ignore structural x: and rdf: XML elements
                if (0 === strpos($val['tag'], 'x:')) {
                    continue;
                }
                if (0 === strpos($val['tag'], 'rdf:') && 'rdf:li' != $val['tag']) {
                    continue;
                }

                switch ($val['type']) {
                    case 'open':
                        if ('rdf:li' == $val['tag']) {
                            // Create an array of list items. The key of the
                            // appended item is read back from the array: it
                            // is not count() - 1 when the container already
                            // holds named keys.
                            $metadata[] = [];
                            end($metadata);
                            $childKey = key($metadata);
                        } else {
                            // Else create an array of named values
                            $metadata[$val['tag']] = [];
                            $childKey = $val['tag'];
                        }

                        // Move up one level in the stack. The slot is indexed
                        // with count() because slots are unset on "close" and
                        // $stack[] would not reuse the freed index.
                        $stack[\count($stack)] = &$metadata;
                        $metadata = &$metadata[$childKey];
                        break;

                    case 'complete':
                        if (isset($val['value'])) {
                            if ('rdf:li' == $val['tag']) {
                                // Assign a value to this list item
                                $metadata[] = $val['value'];
                            } else {
                                // Else assign a value to this property
                                $metadata[$val['tag']] = $val['value'];
                            }
                        }
                        break;

                    case 'close':
                        // If the value of this item is a single element
                        // array of just one list item, use the value of
                        // the first list item as the value for this
                        // property
                        if (\is_array($metadata) && isset($metadata[0]) && 1 == \count($metadata)) {
                            $metadata = $metadata[0];
                        }

                        // Move down one level in the stack
                        $metadata = &$stack[\count($stack) - 1];
                        unset($stack[\count($stack) - 1]);
                        break;
                }
            }

            // According to the XMP specifications: 'Conflict resolution
            // for separate packets that describe the same resource is
            // beyond the scope of this document.' - Section 6.1
            // So if there are multiple XMP blocks, just merge the values
            // of each found block over top of the existing values
            $this->metadata = array_merge($this->metadata, $metadata);
        }

        // xml_parser_free() only has an effect before PHP 8.0; skipping it on
        // newer versions avoids calling a function that does nothing there.
        if (\PHP_VERSION_ID < 80000) {
            xml_parser_free($xml);
        }
    }
238 42
239 42
    /**
     * Returns the type/subtype index built by buildDictionary().
     */
    public function getDictionary(): array
    {
        $dictionary = $this->dictionary;

        return $dictionary;
    }
243
244 9
    /**
     * Replaces the document's objects and re-runs init() so that the
     * dictionary and details reflect the new set.
     *
     * @param PDFObject[] $objects cast to array before being stored
     */
    public function setObjects($objects = [])
    {
        $asArray = (array) $objects;
        $this->objects = $asArray;

        $this->init();
    }
253
254 1
    /**
     * Returns every object registered on the document.
     *
     * @return PDFObject[]
     */
    public function getObjects()
    {
        $objects = $this->objects;

        return $objects;
    }
261 7
262
    /**
     * Looks up a single object by its id.
     *
     * @return PDFObject|Font|Page|Element|null the object, or null when no
     *                                          object is stored under $id
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }
273 12
274 1
    /**
     * Tells whether at least one object is indexed under the given type and,
     * when a subtype is given, under that subtype.
     *
     * @param string      $type    value of the object's 'Type' header entry
     * @param string|null $subtype value of the 'Subtype' header entry; see
     *                             getObjectsByType() for how it is applied
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return 0 < \count($this->getObjectsByType($type, $subtype));
    }
278
279
    /**
     * Returns the objects indexed under a type, optionally narrowed to one
     * subtype.
     *
     * @param string      $type    value of the object's 'Type' header entry
     * @param string|null $subtype value of the 'Subtype' header entry; null
     *                             or an empty string selects every object of
     *                             the type
     *
     * @return array id => object map; empty when the type or the subtype is
     *               not present in the dictionary
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // The comparison is deliberately loose: an empty-string subtype is
        // treated like null. A strict !== would change what callers passing
        // '' receive, so it is kept as is.
        if (null != $subtype) {
            return $this->dictionary[$type]['subtype'][$subtype] ?? [];
        }

        return $this->dictionary[$type]['all'];
    }
295
296
    /**
     * Returns every object indexed under the 'Font' type.
     *
     * @return Font[]
     */
    public function getFonts()
    {
        $fonts = $this->getObjectsByType('Font');

        return $fonts;
    }
303
304 12
    /**
     * Returns the first font of the document, or null when it has none.
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();

        return [] === $fonts ? null : reset($fonts);
    }
313
314
    /**
     * Lists the pages of the document.
     *
     * Three sources are tried in order: the 'Pages' entry of the first
     * 'Catalog' object, then every 'Pages' object, then the bare 'Page'
     * objects (whose order is simply the dictionary order).
     *
     * @return Page[]
     *
     * @throws \Exception when none of the three sources is available
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogues = $this->getObjectsByType('Catalog');
            $firstCatalogue = reset($catalogues);

            /** @var Pages $root */
            $root = $firstCatalogue->get('Pages');
            if (method_exists($root, 'getPages')) {
                return $root->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $collected = [];

            /** @var Pages $node */
            foreach ($this->getObjectsByType('Pages') as $node) {
                $collected = array_merge($collected, $node->getPages(true));
            }

            return $collected;
        }

        if (!$this->hasObjectsByType('Page')) {
            throw new \Exception('Missing catalog.');
        }

        // Search for 'page' (unordered pages).
        return array_values($this->getObjectsByType('Page'));
    }
355
356
    /**
     * Returns the text of the document's pages, separated by blank lines.
     *
     * Each page's text is trimmed; pages that are null or whose trimmed text
     * is empty are left out. A page whose text is "0" is kept.
     *
     * @param int|null $pageLimit when positive, only the first $pageLimit
     *                            pages are read; null, zero or a negative
     *                            value reads every page
     *
     * @throws \Exception when getPages() cannot find any page source
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set and positive.
        if (null !== $pageLimit && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against '' explicitly: a truthiness test would also
            // discard the legitimate text "0".
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }
380
381
    /**
     * Returns the trailer header currently attached to the document.
     */
    public function getTrailer(): Header
    {
        $trailer = $this->trailer;

        return $trailer;
    }
385
386
    /**
     * Replaces the document's trailer header.
     *
     * Only the reference is stored; the dictionary and details are not
     * rebuilt by this call.
     */
    public function setTrailer(Header $trailer)
    {
        $newTrailer = $trailer;

        $this->trailer = $newTrailer;
    }
390
391
    /**
     * Returns the details array assembled by buildDetails().
     */
    public function getDetails(): array
    {
        $details = $this->details;

        return $details;
    }
395
}
396