Test Failed
Pull Request — master (#606)
by
unknown
07:37
created

Document::getXMPMetadata()   D

Complexity

Conditions 48
Paths 35

Size

Total Lines 98
Code Lines 66

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 37
CRAP Score 48.972

Importance

Changes 0
Metric Value
cc 48
eloc 66
c 0
b 0
f 0
nc 35
nop 1
dl 0
loc 98
ccs 37
cts 40
cp 0.925
crap 48.972
rs 4.1666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
/**
 * Technical references:
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
 *
 * Class Document
 *
 * Holds the objects of a parsed PDF, an index of those objects by their
 * "Type" / "Subtype" header fields, the trailer header, and a flat array of
 * document details (trailer "Info" fields, page count and XMP metadata).
 */
class Document
{
    /**
     * XMP elements whose value is carried by a nested RDF:LI item, mapped to
     * the metadata key that item is stored under. Tag names are upper-case
     * because the XML parser folds case by default.
     */
    private const XMP_CONTAINER_TAGS = [
        'DC:CREATOR' => 'Author',
        'DC:DESCRIPTION' => 'Description',
        'DC:TITLE' => 'Title',
        'DC:SUBJECT' => 'Subject',
    ];

    /**
     * XMP elements that carry their value directly, mapped to the metadata
     * key the value is stored under.
     */
    private const XMP_VALUE_TAGS = [
        'DC:FORMAT' => 'Format',
        'PDF:KEYWORDS' => 'Keywords',
        'PDF:PRODUCER' => 'Producer',
        'PDFX:SOURCEMODIFIED' => 'SourceModified',
        'PDFX:COMPANY' => 'Company',
        'XMP:CREATEDATE' => 'CreationDate',
        'XMP:CREATORTOOL' => 'Creator',
        'XMP:MODIFYDATE' => 'ModifyDate',
        'XMP:METADATADATE' => 'MetadataDate',
        'XMPMM:DOCUMENTID' => 'DocumentUUID',
        'XMPMM:INSTANCEID' => 'InstanceUUID',
    ];

    /**
     * Objects of the document, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Objects indexed by type: [type => ['all' => [id => object],
     * 'subtype' => [subtype => [id => object]]]].
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * Metadata collected by getXMPMetadata(), keyed by detail name.
     *
     * @var array
     */
    protected $metadata = [];

    /**
     * Details computed by buildDetails(); empty until init() has run.
     *
     * @var array
     */
    protected $details = [];

    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }

    /**
     * Rebuild the dictionary and the details, then initialise every object
     * (its header first, then the object itself).
     */
    public function init()
    {
        $this->buildDictionary();

        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $object) {
            $object->getHeader()->init();
            $object->init();
        }
    }

    /**
     * Build dictionary based on type header field.
     *
     * Objects without a usable "Type" are left out of the dictionary.
     */
    protected function buildDictionary()
    {
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            // Cache objects by type and subtype.
            $type = $object->getHeader()->get('Type')->getContent();

            // The loose comparison is deliberate: it skips every "empty"
            // content value (null, false, ''), not only null, so none of
            // them is ever used as a dictionary key.
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = [
                    'all' => [],
                    'subtype' => [],
                ];
            }

            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();

            // Same deliberate loose comparison as for the type above.
            if (null != $subtype) {
                if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                    $this->dictionary[$type]['subtype'][$subtype] = [];
                }

                $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
            }
        }
    }

    /**
     * Build details array.
     *
     * Starts from the details of the trailer's "Info" object (when there is
     * one), adds the page count under "Pages" (0 when the pages cannot be
     * listed), then merges the collected XMP metadata on top, so metadata
     * wins over "Info" on identical keys.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Get XMP Metadata
     *
     * Parses an XMP packet and stores the recognised fields in
     * $this->metadata. Nothing is stored when the XML cannot be parsed.
     * A field that occurs several times keeps its last value.
     *
     * @param string $content XML source of the XMP packet
     */
    public function getXMPMetadata(string $content)
    {
        $xml = xml_parser_create();
        xml_parser_set_option($xml, XML_OPTION_SKIP_WHITE, 1);

        $values = [];
        $index = [];
        $parsed = xml_parse_into_struct($xml, $content, $values, $index);

        // Before PHP 8 the parser is a resource that has to be released by
        // hand; from PHP 8 on it is an object that is collected on its own.
        if (\is_resource($xml)) {
            xml_parser_free($xml);
        }

        if (!$parsed) {
            return;
        }

        // Name of the detail the next RDF:LI item belongs to, '' when none.
        $detail = '';

        foreach ($values as $val) {
            $tag = $val['tag'];
            $type = $val['type'];

            $container = self::XMP_CONTAINER_TAGS[$tag] ?? null;
            if (null !== $container) {
                // Entering the container selects the detail; any other
                // event on that tag (close, cdata, complete) clears it.
                $detail = ('open' === $type) ? $container : '';
                continue;
            }

            if ('complete' !== $type || !isset($val['value'])) {
                continue;
            }

            if ('RDF:LI' === $tag) {
                if ('' !== $detail) {
                    $this->metadata[$detail] = $val['value'];
                }
                continue;
            }

            $key = self::XMP_VALUE_TAGS[$tag] ?? null;
            if (null !== $key) {
                $this->metadata[$key] = $val['value'];
            }
        }
    }

    /**
     * @return array the objects indexed by type and subtype
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }

    /**
     * Replace the objects of the document and re-run init().
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        $this->init();
    }

    /**
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }

    /**
     * @return PDFObject|Font|Page|Element|null the object stored under $id,
     *                                          or null when there is none
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }

    /**
     * Tell whether getObjectsByType() would return at least one object.
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return [] !== $this->getObjectsByType($type, $subtype);
    }

    /**
     * Get the objects of a type, optionally narrowed to one subtype.
     *
     * @param string      $type    value of the "Type" header field
     * @param string|null $subtype value of the "Subtype" header field; null
     *                             and '' both mean "any subtype"
     *
     * @return array matching objects keyed by object id, [] when none match
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // An empty subtype has always been treated like "no subtype"; the
        // two strict checks spell out what the former loose check did.
        if (null !== $subtype && '' !== $subtype) {
            return $this->dictionary[$type]['subtype'][$subtype] ?? [];
        }

        return $this->dictionary[$type]['all'];
    }

    /**
     * @return Font[]
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }

    /**
     * @return Font|null the first font of the document, null when it has none
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();
        if ([] === $fonts) {
            return null;
        }

        return reset($fonts);
    }

    /**
     * List the pages, trying in order: the "Pages" entry of the first
     * catalog, every "Pages" object, then the bare "Page" objects.
     *
     * @return Page[]
     *
     * @throws \Exception when none of the three sources is available
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogues = $this->getObjectsByType('Catalog');
            $catalogue = reset($catalogues);

            /** @var Pages $object */
            $object = $catalogue->get('Pages');
            // Falls through to the next strategy when the entry does not
            // offer getPages().
            if (method_exists($object, 'getPages')) {
                return $object->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $pages = [];

            /** @var Pages[] $objects */
            $objects = $this->getObjectsByType('Pages');
            foreach ($objects as $object) {
                $pages = array_merge($pages, $object->getPages(true));
            }

            return $pages;
        }

        if ($this->hasObjectsByType('Page')) {
            // Search for 'page' (unordered pages).
            $pages = $this->getObjectsByType('Page');

            return array_values($pages);
        }

        throw new \Exception('Missing catalog.');
    }

    /**
     * Get the text of the document: the trimmed text of each page, empty
     * pages left out, joined by a blank line.
     *
     * @param int|null $pageLimit only read the first $pageLimit pages;
     *                            null, zero or a negative value reads all
     *
     * @throws \Exception when the pages cannot be listed
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set.
        if (null !== $pageLimit && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against '' rather than testing truthiness, so a page
            // whose whole text is "0" is not thrown away.
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }

    public function getTrailer(): Header
    {
        return $this->trailer;
    }

    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }

    /**
     * @return array the details computed by the last init(), [] before that
     */
    public function getDetails(): array
    {
        return $this->details;
    }
}
422