Test Failed
Pull Request — master (#606)
by
unknown
01:52
created

Document::getText()   A

Complexity

Conditions 6
Paths 8

Size

Total Lines 23
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 42

Importance

Changes 3
Bugs 1 Features 0
Metric Value
cc 6
eloc 10
c 3
b 1
f 0
nc 8
nop 1
dl 0
loc 23
ccs 0
cts 0
cp 0
crap 42
rs 9.2222
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
/**
 * In-memory model of a parsed PDF document.
 *
 * Holds the document's objects, an index of those objects by their "Type"
 * and "Subtype" header fields, the trailer header, and a flat "details"
 * array combining the trailer's Info entries, the page count and any XMP
 * metadata collected through extractXMPMetadata().
 *
 * Technical references:
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
 *
 * Class Document
 */
class Document
{
    /**
     * XMP tags whose value is carried by a nested RDF:LI element, mapped to
     * the details key the value is stored under.
     */
    private const XMP_CONTAINER_TAGS = [
        'DC:CREATOR' => 'Author',
        'DC:DESCRIPTION' => 'Description',
        'DC:TITLE' => 'Title',
        'DC:SUBJECT' => 'Subject',
    ];

    /**
     * XMP tags that carry their value directly, mapped to the details key
     * the value is stored under.
     */
    private const XMP_VALUE_TAGS = [
        'DC:FORMAT' => 'Format',
        'PDF:KEYWORDS' => 'Keywords',
        'PDF:PRODUCER' => 'Producer',
        'PDFX:SOURCEMODIFIED' => 'SourceModified',
        'PDFX:COMPANY' => 'Company',
        'XMP:CREATEDATE' => 'CreationDate',
        'XMP:CREATORTOOL' => 'Creator',
        'XMP:MODIFYDATE' => 'ModDate',
        'XMP:METADATADATE' => 'MetadataDate',
        'XMPMM:DOCUMENTID' => 'DocumentUUID',
        'XMPMM:INSTANCEID' => 'InstanceUUID',
    ];

    /**
     * All objects of the document, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Objects indexed by type: each entry has an 'all' list (id => object)
     * and a 'subtype' map (subtype => id => object).
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * Values collected by extractXMPMetadata(), keyed by details name.
     *
     * @var array<string, string>
     */
    protected $metadata = [];

    /**
     * Merged document details; empty until init() has run.
     *
     * @var array
     */
    protected $details = [];

    /**
     * Creates a document with an empty trailer header bound to it.
     */
    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }

    /**
     * Rebuilds the type dictionary and the details array, then propagates
     * initialisation to every object and its header.
     */
    public function init()
    {
        $this->buildDictionary();

        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $object) {
            $object->getHeader()->init();
            $object->init();
        }
    }

    /**
     * Build dictionary based on type header field.
     *
     * Objects without a usable "Type" are left out of the dictionary.
     */
    protected function buildDictionary()
    {
        // Start from scratch so repeated init() calls do not accumulate entries.
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            // Cache objects by type and subtype.
            $type = $object->getHeader()->get('Type')->getContent();

            // Loose comparison kept on purpose: it also rejects '' and false.
            // NOTE(review): getContent() return values are not visible here;
            // confirm before tightening this to a strict check.
            if (null != $type) {
                if (!isset($this->dictionary[$type])) {
                    $this->dictionary[$type] = [
                        'all' => [],
                        'subtype' => [],
                    ];
                }

                $this->dictionary[$type]['all'][$id] = $object;

                $subtype = $object->getHeader()->get('Subtype')->getContent();

                // Same loose comparison as for $type above.
                if (null != $subtype) {
                    if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                        $this->dictionary[$type]['subtype'][$subtype] = [];
                    }

                    $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
                }
            }
        }
    }

    /**
     * Build details array.
     *
     * Combines, in this order, the trailer's "Info" details, the page count
     * under 'Pages', and the XMP metadata; on key collisions the later
     * source wins (array_merge semantics).
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info.
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');

            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count; a document without pages reports 0.
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Extract XMP Metadata.
     *
     * Parses $content as XML and stores recognised values in the metadata
     * array. Tags listed in XMP_CONTAINER_TAGS take their value from a nested
     * RDF:LI element; tags listed in XMP_VALUE_TAGS carry it directly.
     * Content that fails to parse is ignored. Values found here are added to
     * those from previous calls and only reach getDetails() on the next
     * init().
     *
     * @param string $content raw XMP packet
     */
    public function extractXMPMetadata(string $content): void
    {
        $parser = xml_parser_create();
        xml_parser_set_option($parser, XML_OPTION_SKIP_WHITE, 1);

        if (!xml_parse_into_struct($parser, $content, $values)) {
            return;
        }

        // Details key of the container element currently open, '' if none.
        $detail = '';

        foreach ($values as $val) {
            $tag = $val['tag'];

            $container = self::XMP_CONTAINER_TAGS[$tag] ?? null;
            if (null !== $container) {
                // Anything other than the opening tag ends the container.
                $detail = 'open' === $val['type'] ? $container : '';
                continue;
            }

            // Every remaining tag of interest needs a complete element with a value.
            if ('complete' !== $val['type'] || !isset($val['value'])) {
                continue;
            }

            if ('RDF:LI' === $tag) {
                if ('' !== $detail) {
                    $this->metadata[$detail] = $val['value'];
                }
                continue;
            }

            $field = self::XMP_VALUE_TAGS[$tag] ?? null;
            if (null !== $field) {
                $this->metadata[$field] = $val['value'];
            }
        }
    }

    /**
     * Returns the objects indexed by type and subtype, as built by init().
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }

    /**
     * Replaces the document's objects and re-initialises the document.
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        $this->init();
    }

    /**
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }

    /**
     * Returns the object stored under the given id, or null if there is none.
     *
     * @return PDFObject|Font|Page|Element|null
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }

    /**
     * Tells whether at least one object matches the given type (and subtype).
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return [] !== $this->getObjectsByType($type, $subtype);
    }

    /**
     * Returns the objects of the given type, keyed by object id.
     *
     * When $subtype is null or an empty string, every object of the type is
     * returned; otherwise only those with that subtype. Unknown types or
     * subtypes yield an empty array.
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // An empty string is treated like "no subtype", as it always has been.
        if (null === $subtype || '' === $subtype) {
            return $this->dictionary[$type]['all'];
        }

        return $this->dictionary[$type]['subtype'][$subtype] ?? [];
    }

    /**
     * @return Font[]
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }

    /**
     * Returns the first font of the document, or null if it has none.
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();
        if ([] === $fonts) {
            return null;
        }

        return reset($fonts);
    }

    /**
     * Returns the pages of the document.
     *
     * Sources are tried in order: the first "Catalog" object's "Pages" entry,
     * then every "Pages" object, then the bare "Page" objects (unordered).
     *
     * @return Page[]
     *
     * @throws \Exception when none of these sources is available
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogues = $this->getObjectsByType('Catalog');
            $catalogue = reset($catalogues);

            /** @var Pages $object */
            $object = $catalogue->get('Pages');
            if (method_exists($object, 'getPages')) {
                return $object->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $pages = [];

            /** @var Pages[] $objects */
            $objects = $this->getObjectsByType('Pages');
            foreach ($objects as $object) {
                $pages = array_merge($pages, $object->getPages(true));
            }

            return $pages;
        }

        if ($this->hasObjectsByType('Page')) {
            // Search for 'page' (unordered pages).
            return array_values($this->getObjectsByType('Page'));
        }

        throw new \Exception('Missing catalog.');
    }

    /**
     * Returns the text of the document, pages separated by a blank line.
     *
     * Pages that are null or whose trimmed text is empty are skipped.
     *
     * @param int|null $pageLimit only read the first $pageLimit pages; null
     *                            or a non-positive value reads all pages
     *
     * @throws \Exception when the pages cannot be located (see getPages())
     */
    public function getText(?int $pageLimit = null): string
    {
        $texts = [];
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is set and positive.
        if (null !== $pageLimit && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against '' rather than relying on truthiness, so a page
            // whose whole text is "0" is not discarded.
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }

    /**
     * Returns the trailer header of the document.
     */
    public function getTrailer(): Header
    {
        return $this->trailer;
    }

    /**
     * Replaces the trailer header; details are not rebuilt until init() runs.
     */
    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }

    /**
     * Returns the details built by the last init(), or an empty array if the
     * document has not been initialised yet.
     */
    public function getDetails(): array
    {
        return $this->details;
    }
}
422