Test Failed
Pull Request — master (#606)
by
unknown
02:13
created

Document::extractXMPMetadata()   D

Complexity

Conditions 48
Paths 35

Size

Total Lines 97
Code Lines 66

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 36
CRAP Score 49.0477

Importance

Changes 0
Metric Value
cc 48
eloc 66
c 0
b 0
f 0
nc 35
nop 1
dl 0
loc 97
ccs 36
cts 39
cp 0.9231
crap 49.0477
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <sebastien@malot.fr>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <sebastien@malot.fr>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
/**
36
 * Technical references :
37
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
38
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
39
 * - http://www.php.net/manual/en/ref.pdf.php#74211
40
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
41
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
42
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
43
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
44
 *
45
 * Class Document
46
 */
47
class Document
48
{
49
    /**
50
     * @var PDFObject[]
51
     */
52
    protected $objects = [];
53
54
    /**
55
     * @var array
56
     */
57
    protected $dictionary = [];
58
59
    /**
60
     * @var Header
61
     */
62
    protected $trailer;
63
64
    /**
65
     * @var array
0 ignored issues
show
Bug introduced by
The type Smalot\PdfParser\Metadata was not found. Maybe you did not declare it correctly or list all dependencies?

The issue could also be caused by a filter entry in the build configuration. If the path has been excluded in your configuration, e.g. excluded_paths: ["lib/*"], you can move it to the dependency path list as follows:

filter:
    dependency_paths: ["lib/*"]

For further information see https://scrutinizer-ci.com/docs/tools/php/php-scrutinizer/#list-dependency-paths

Loading history...
66
     */
67
    protected $metadata = [];
68
69 72
    /**
70
     * @var array
71 72
     */
72 72
    protected $details;
73
74 49
    /**
     * Create a document whose trailer is a new Header built from an empty
     * element list and bound to this document.
     */
    public function __construct()
    {
        $emptyTrailer = new Header([], $this);

        $this->trailer = $emptyTrailer;
    }
78 49
79
    /**
     * Rebuild the type dictionary and the details array, then initialise
     * every registered object.
     *
     * For each object the header is initialised before the object itself.
     */
    public function init()
    {
        $this->buildDictionary();
        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $pdfObject) {
            $header = $pdfObject->getHeader();
            $header->init();

            $pdfObject->init();
        }
    }
91
92
    /**
     * Index the registered objects by the 'Type' and 'Subtype' header fields.
     *
     * After the call, $this->dictionary[$type] holds two maps:
     *  - 'all':     object id => object, for every object of that type;
     *  - 'subtype': subtype => (object id => object).
     *
     * Objects whose type content loosely equals null are not indexed at all;
     * objects whose subtype content loosely equals null are indexed under
     * 'all' only.
     */
    protected function buildDictionary()
    {
        // Start from scratch on every call.
        $this->dictionary = [];

        foreach ($this->objects as $objectId => $pdfObject) {
            $type = $pdfObject->getHeader()->get('Type')->getContent();
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = ['all' => [], 'subtype' => []];
            }
            $this->dictionary[$type]['all'][$objectId] = $pdfObject;

            $subtype = $pdfObject->getHeader()->get('Subtype')->getContent();
            if (null == $subtype) {
                continue;
            }

            if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                $this->dictionary[$type]['subtype'][$subtype] = [];
            }
            $this->dictionary[$type]['subtype'][$subtype][$objectId] = $pdfObject;
        }
    }
124
125
    /**
     * Build the details array exposed by getDetails().
     *
     * The array starts from the details of the trailer's 'Info' header (when
     * one is available), gets a 'Pages' entry holding the page count, and is
     * finally merged with the collected XMP metadata. On key collisions the
     * metadata values win, because they are the later array_merge() argument.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');

            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count; when getPages() throws an \Exception the
        // count is reported as 0.
        try {
            $pageCount = \count($this->getPages());
        } catch (\Exception $e) {
            $pageCount = 0;
        }
        $details['Pages'] = $pageCount;

        $this->details = array_merge($details, $this->metadata);
    }
156
157
    /**
     * Extract XMP metadata from an XML packet and store it in $this->metadata.
     *
     * Nothing is stored when xml_parse_into_struct() reports a failure.
     * Two kinds of tags are recognised:
     *  - container tags (dc:creator, dc:description, dc:title, dc:subject):
     *    the value is taken from a complete rdf:li element seen while the
     *    container is the most recently opened one;
     *  - simple tags: the value is taken from the complete element itself.
     *
     * @param string $content XML to parse
     */
    public function extractXMPMetadata(string $content): void
    {
        // Container tag => metadata key filled by a nested RDF:LI element.
        $containerKeys = [
            'DC:CREATOR' => 'Author',
            'DC:DESCRIPTION' => 'Description',
            'DC:TITLE' => 'Title',
            'DC:SUBJECT' => 'Subject',
        ];

        // Simple tag => metadata key filled by the element's own value.
        $simpleKeys = [
            'DC:FORMAT' => 'Format',
            'PDF:KEYWORDS' => 'Keywords',
            'PDF:PRODUCER' => 'Producer',
            'PDFX:SOURCEMODIFIED' => 'SourceModified',
            'PDFX:COMPANY' => 'Company',
            'XMP:CREATEDATE' => 'CreationDate',
            'XMP:CREATORTOOL' => 'Creator',
            'XMP:MODIFYDATE' => 'ModDate',
            'XMP:METADATADATE' => 'MetadataDate',
            'XMPMM:DOCUMENTID' => 'DocumentUUID',
            'XMPMM:INSTANCEID' => 'InstanceUUID',
        ];

        $parser = xml_parser_create();
        xml_parser_set_option($parser, \XML_OPTION_SKIP_WHITE, 1);

        if (!xml_parse_into_struct($parser, $content, $nodes)) {
            return;
        }

        // Metadata key of the container currently open, '' when none is.
        $pendingKey = '';

        foreach ($nodes as $node) {
            $tag = $node['tag'];

            if (isset($containerKeys[$tag])) {
                // Any non-"open" occurrence (close/complete) resets the state.
                $pendingKey = ('open' == $node['type']) ? $containerKeys[$tag] : '';
                continue;
            }

            $hasValue = 'complete' == $node['type'] && isset($node['value']);
            if (!$hasValue) {
                continue;
            }

            if ('RDF:LI' === $tag) {
                if ($pendingKey) {
                    $this->metadata[$pendingKey] = $node['value'];
                }
            } elseif (isset($simpleKeys[$tag])) {
                $this->metadata[$simpleKeys[$tag]] = $node['value'];
            }
        }
    }
261 7
262
    /**
     * Return the type/subtype index built by buildDictionary().
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }
266
267 12
    /**
     * Replace the document's objects and run init() on the new set.
     *
     * @param PDFObject[] $objects cast to array before being stored
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        // Dictionary, details and the objects themselves depend on the new list.
        $this->init();
    }
276
277 12
    /**
     * Return every object currently registered in the document.
     *
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }
284 12
285 12
    /**
     * Look up a registered object by its id.
     *
     * @return PDFObject|Font|Page|Element|null the object, or null when no
     *                                          object is registered under $id
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }
296
297 41
    /**
     * Tell whether getObjectsByType() finds at least one object.
     *
     * @param string      $type    value of the 'Type' header field
     * @param string|null $subtype optional value of the 'Subtype' header field
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        // Explicit ?string: implicitly nullable parameters are deprecated since PHP 8.4.
        return [] !== $this->getObjectsByType($type, $subtype);
    }
301
302 12
    /**
     * Return the objects indexed under a type, optionally narrowed to a subtype.
     *
     * @param string      $type    value of the 'Type' header field
     * @param string|null $subtype value of the 'Subtype' header field; null or
     *                             an empty string means "do not filter"
     *
     * @return array object id => object; empty when the type (or the requested
     *               subtype) is not present in the dictionary
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // Spelled out strictly instead of the loose `null != $subtype`: for a
        // ?string the loose form is false exactly for null and '', so both
        // keep meaning "no subtype filter".
        if (null === $subtype || '' === $subtype) {
            return $this->dictionary[$type]['all'];
        }

        return $this->dictionary[$type]['subtype'][$subtype] ?? [];
    }
318
319
    /**
     * Return the objects indexed under the 'Font' type.
     *
     * @return Font[]
     */
    public function getFonts()
    {
        $fonts = $this->getObjectsByType('Font');

        return $fonts;
    }
326
327
    /**
     * Return the first font returned by getFonts(), or null when there is none.
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();

        return [] === $fonts ? null : reset($fonts);
    }
336
337
    /**
     * Collect the pages of the document.
     *
     * Three strategies are tried in order:
     *  1. the 'Pages' entry of the first 'Catalog' object, when that entry
     *     offers a getPages() method;
     *  2. every 'Pages' object, concatenating what each one returns;
     *  3. the bare 'Page' objects, re-indexed from 0 (unordered).
     *
     * @return Page[]
     *
     * @throws \Exception when none of the strategies applies
     */
    public function getPages()
    {
        // 1. Search for catalog to list pages.
        if ($this->hasObjectsByType('Catalog')) {
            $catalogs = $this->getObjectsByType('Catalog');
            $firstCatalog = reset($catalogs);

            /** @var Pages $root */
            $root = $firstCatalog->get('Pages');
            if (method_exists($root, 'getPages')) {
                return $root->getPages(true);
            }
        }

        // 2. Search for pages to list kids.
        if ($this->hasObjectsByType('Pages')) {
            $collected = [];

            /** @var Pages[] $pageTrees */
            $pageTrees = $this->getObjectsByType('Pages');
            foreach ($pageTrees as $pageTree) {
                $collected = array_merge($collected, $pageTree->getPages(true));
            }

            return $collected;
        }

        // 3. Search for 'page' (unordered pages).
        if ($this->hasObjectsByType('Page')) {
            return array_values($this->getObjectsByType('Page'));
        }

        throw new \Exception('Missing catalog.');
    }
378
379
    /**
     * Return the text of the document's pages, separated by blank lines.
     *
     * Each page's text is trimmed; pages that are null or whose trimmed text
     * is empty are skipped.
     *
     * @param int|null $pageLimit when a positive integer, only the first
     *                            $pageLimit pages are read; otherwise all pages
     *
     * @throws \Exception propagated from getPages()
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set and positive.
        if (null !== $pageLimit && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against '' explicitly: a truthiness test would also
            // discard a page whose whole text is "0".
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }
403
404
    /**
     * Return the trailer header currently attached to the document.
     */
    public function getTrailer(): Header
    {
        return $this->trailer;
    }
408
409
    /**
     * Replace the trailer header of the document.
     *
     * The details array is not rebuilt here; it only changes on the next init().
     */
    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }
413
414
    /**
     * Return the details array assembled by buildDetails().
     *
     * The property has no default value and is only filled during init(), so
     * an empty array is returned when init() has not run yet; returning the
     * unset property would violate the declared array return type.
     */
    public function getDetails(): array
    {
        return $this->details ?? [];
    }
418
}
419