Document::getDictionary() - Code Metrics - Inspection of "Fixes Scrutinizer integration (mostly failing test..." - smalot/pdfparser - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 2939df...ddf03e )

by Konrad

created 2024-02-27 07:23 UTC

Document::getDictionary() A

↳ Parent: Document

Complexity

Conditions	1
Paths	1

Size

Total Lines	3
Code Lines	1

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	2
CRAP Score	1

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
eloc	1
dl	0
loc	3
rs	10
c	1
b	0
f	0
ccs	2
cts	2
cp	1
cc	1
nc	1
nop	0
crap	1

<?php

/**
 * @file
 *          This file is part of the PdfParser library.
 *
 * @author  Sébastien MALOT <[email protected]>
 *
 * @date    2017-01-03
 *
 * @license LGPLv3
 *
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */

namespace Smalot\PdfParser;

use Smalot\PdfParser\Encoding\PDFDocEncoding;

/**
 * Technical references :
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
 *
 * Class Document
 */
class Document
{
    /**
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * @var array
     */
    protected $details;

    /**
     * Create an empty document.
     *
     * The trailer is initialised to a Header constructed with an empty array
     * and this document; it can be replaced later through setTrailer().
     */
    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }

    /**
     * (Re)build the derived state of the document and initialise its objects.
     *
     * The type dictionary and the details array are rebuilt first; afterwards
     * every registered object has its header, and then itself, initialised.
     */
    public function init()
    {
        $this->buildDictionary();
        $this->buildDetails();

        // Cascade the initialisation: header first, then the object itself.
        foreach ($this->objects as $pdfObject) {
            $header = $pdfObject->getHeader();
            $header->init();

            $pdfObject->init();
        }
    }

    /**
     * Index all objects by the 'Type' (and, where present, 'Subtype') entry
     * of their header.
     *
     * Resulting shape of $this->dictionary:
     *   [type => ['all' => [id => object], 'subtype' => [subtype => [id => object]]]]
     */
    protected function buildDictionary()
    {
        // Always start from scratch so entries of a previous build never survive.
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            $type = $object->getHeader()->get('Type')->getContent();

            // Loose comparison on purpose: any content that compares equal to
            // null (null, '', false, ...) means "no type" and is not indexed.
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = ['all' => [], 'subtype' => []];
            }
            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();

            // Same loose check as for the type: nothing more to do without a subtype.
            if (null == $subtype) {
                continue;
            }

            // The per-subtype bucket is created implicitly on first assignment.
            $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
        }
    }

    /**
     * Build the details array and store it in $this->details.
     *
     * Sources, in order:
     *  - the details of the header of the trailer's 'Info' entry (if present),
     *  - the page count under the 'Pages' key (0 when getPages() throws),
     *  - the previously extracted XMP metadata, merged over the above.
     *
     * Every string value coming from the first two sources is repaired and/or
     * decoded before the metadata is merged in.
     */
    protected function buildDetails()
    {
        // Build details array.
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count
        try {
            $pages = $this->getPages();
            $details['Pages'] = \count($pages);
        } catch (\Exception $e) {
            // getPages() throws when no catalog/pages/page object exists.
            $details['Pages'] = 0;
        }

        // Decode and repair encoded document properties
        foreach ($details as $key => $value) {
            if (!\is_string($value)) {
                continue;
            }

            // If the string is already UTF-8 encoded, that means we only
            // need to repair Adobe's ham-fisted insertion of line breaks
            // every ~127 characters, which doesn't seem to be multi-byte
            // safe
            if (mb_check_encoding($value, 'UTF-8')) {
                // Remove literal backslash + carriage return ("\\\r")
                $value = str_replace("\x5c\x0d", '', $value);

                // Remove backslash plus bytes written into high part of
                // multibyte unicode character
                while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
                    $diff = (\ord($match[1]) - 182) * 64;
                    $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
                    // Replace the matched byte sequence literally. Building a
                    // second regex from the captured bytes would break (or
                    // match too much) whenever a captured byte is a regex
                    // metacharacter or the delimiter.
                    $value = str_replace($match[0], $newbyte, $value);
                }

                // Remove bytes written into low part of multibyte unicode
                // character
                while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
                    $diff = \ord($match[2]) - 181;
                    $newbyte = \chr(\ord($match[1]) + $diff);
                    // Literal replacement for the same reason as above.
                    $value = str_replace($match[0], $newbyte, $value);
                }

                // Remove this byte string that Adobe occasionally adds
                // between two single byte characters in a unicode string
                $value = str_replace("\xe5\xb0\x8d", '', $value);

                $details[$key] = $value;
            } else {
                // If the string is just PDFDocEncoding, remove any
                // backslash + carriage return pairs and decode the whole thing.
                $value = str_replace("\\\r", '', $value);
                $details[$key] = PDFDocEncoding::convertPDFDoc2UTF8($value);
            }
        }

        // XMP metadata wins over equally named Info entries.
        $details = array_merge($details, $this->metadata);

        $this->details = $details;
    }

    /**
     * Extract XMP metadata from an XML string and merge it into
     * $this->metadata.
     *
     * The XML is flattened with xml_parse_into_struct() and rebuilt into a
     * nested array keyed by lowercased tag names. The result is only merged
     * when its top-level 'dc:format' value equals 'application/pdf'; if the
     * XML cannot be parsed, nothing is changed.
     *
     * @param string $content XML to parse
     */
    public function extractXMPMetadata(string $content): void
    {
        $xml = xml_parser_create();
        xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);

        // Only continue when the parser reports success (return value 1).
        if (1 === xml_parse_into_struct($xml, $content, $values, $index)) {
            /*
             * Short overview of the following code:
             *
             * The output of xml_parse_into_struct is a one-dimensional array (= $values). $stack is a last-in,
             * first-out array of references to positions in $metadata which, while iterating, is used to turn the
             * flat result into a more intuitive multi-dimensional array. When an "open" XML tag is encountered,
             * we save the current $metadata context in the $stack, then create a child array of $metadata and
             * make that the current $metadata context. When a "close" XML tag is encountered, the operations are
             * reversed: the most recently added $metadata context from $stack (IOW, the parent of the current
             * element) is set as the current $metadata context.
             */
            $metadata = [];
            $stack = [];
            foreach ($values as $val) {
                // Standardize to lowercase
                $val['tag'] = strtolower($val['tag']);

                // Ignore structural x: and rdf: XML elements (rdf:li is kept,
                // because it carries list items)
                if (0 === strpos($val['tag'], 'x:')) {
                    continue;
                } elseif (0 === strpos($val['tag'], 'rdf:') && 'rdf:li' != $val['tag']) {
                    continue;
                }

                switch ($val['type']) {
                    case 'open':
                        // Create an array of list items
                        if ('rdf:li' == $val['tag']) {
                            $metadata[] = [];

                            // Descend one level: remember the parent context
                            // and make the new list entry the current one
                            $stack[\count($stack)] = &$metadata;
                            $metadata = &$metadata[\count($metadata) - 1];
                        } else {
                            // Else create an array of named values
                            $metadata[$val['tag']] = [];

                            // Descend one level: remember the parent context
                            // and make the new named entry the current one
                            $stack[\count($stack)] = &$metadata;
                            $metadata = &$metadata[$val['tag']];
                        }
                        break;

                    case 'complete':
                        // Elements without a value are skipped entirely
                        if (isset($val['value'])) {
                            // Assign a value to this list item
                            if ('rdf:li' == $val['tag']) {
                                $metadata[] = $val['value'];

                            // Else assign a value to this property
                            } else {
                                $metadata[$val['tag']] = $val['value'];
                            }
                        }
                        break;

                    case 'close':
                        // If the value of this property is an array
                        if (\is_array($metadata)) {
                            // If the value is a single element array
                            // where the element is of type string, use
                            // the value of the first list item as the
                            // value for this property
                            if (1 == \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) {
                                $metadata = $metadata[0];
                            } elseif (0 == \count($metadata)) {
                                // if the value is an empty array, set
                                // the value of this property to the empty
                                // string
                                $metadata = '';
                            }
                        }

                        // Ascend one level: make the parent context current
                        // again and drop it from the stack
                        $metadata = &$stack[\count($stack) - 1];
                        unset($stack[\count($stack) - 1]);
                        break;
                }
            }

            // Only use this metadata if it's referring to a PDF
            if (isset($metadata['dc:format']) && 'application/pdf' == $metadata['dc:format']) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // Source: https://www.adobe.com/devnet/xmp.html
                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
                // So if there are multiple XMP blocks, just merge the values
                // of each found block over top of the existing values
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }
        // Release the parser whether or not parsing succeeded.
        xml_parser_free($xml);
    }

    /**
     * Return the type dictionary as last built by buildDictionary().
     *
     * Keys are type names; each entry holds an 'all' list and a 'subtype'
     * map of the objects of that type, both keyed by object id.
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }

    /**
     * Replace the document's objects and re-initialise the document.
     *
     * The argument is cast to an array before being stored; init() is then
     * called, which rebuilds the dictionary and details and initialises
     * every object.
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        $this->init();
    }

    /**
     * Return all objects currently registered on the document, keyed as
     * they were passed to setObjects().
     *
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }

    /**
     * Look up a single object by its id.
     *
     * @return PDFObject|Font|Page|Element|null the object, or null when no
     *                                          object is stored under $id
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }

    /**
     * Tell whether at least one object matches the given type (and subtype).
     *
     * Takes the same arguments as getObjectsByType() and is true exactly
     * when that method returns a non-empty array.
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        $matches = $this->getObjectsByType($type, $subtype);

        return [] !== $matches;
    }

    /**
     * Return the objects indexed under a type, optionally narrowed to a
     * subtype.
     *
     * A null or empty-string subtype means "no subtype filter" and yields
     * every object of the type. Unknown types or subtypes yield an empty
     * array.
     *
     * @return array objects keyed by object id
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        $byType = $this->dictionary[$type] ?? null;
        if (null === $byType) {
            return [];
        }

        // Spelled out explicitly: for a ?string these are the only two values
        // that compare loosely equal to null.
        if (null === $subtype || '' === $subtype) {
            return $byType['all'];
        }

        return $byType['subtype'][$subtype] ?? [];
    }

    /**
     * Return every object indexed under the 'Font' type, keyed by object id.
     *
     * @return Font[]
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }

    /**
     * Return the first font of the document, or null when it has none.
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();

        // reset() would yield false on an empty array, hence the explicit guard.
        return [] === $fonts ? null : reset($fonts);
    }

    /**
     * Return the pages of the document, trying three strategies in order:
     *
     *  1. the 'Pages' entry of the first 'Catalog' object,
     *  2. the merged pages of every 'Pages' object,
     *  3. all 'Page' objects, re-indexed (order not guaranteed).
     *
     * @return Page[]
     *
     * @throws \Exception when none of the strategies finds anything
     */
    public function getPages()
    {
        // 1) Preferred: follow the page tree referenced by the first catalog.
        $catalogues = $this->getObjectsByType('Catalog');
        if ([] !== $catalogues) {
            /** @var Pages $pageTree */
            $pageTree = reset($catalogues)->get('Pages');
            // The entry may be something without getPages(); fall through then.
            if (method_exists($pageTree, 'getPages')) {
                return $pageTree->getPages(true);
            }
        }

        // 2) Fallback: collect the kids of every 'Pages' object.
        /** @var Pages[] $pageTrees */
        $pageTrees = $this->getObjectsByType('Pages');
        if ([] !== $pageTrees) {
            $collected = [];
            foreach ($pageTrees as $tree) {
                $collected = array_merge($collected, $tree->getPages(true));
            }

            return $collected;
        }

        // 3) Last resort: plain 'Page' objects (unordered pages).
        $loosePages = $this->getObjectsByType('Page');
        if ([] !== $loosePages) {
            return array_values($loosePages);
        }

        throw new \Exception('Missing catalog.');
    }

    /**
     * Return the text of the document: the trimmed text of each page,
     * joined by a blank line. Pages whose trimmed text is empty are left out.
     *
     * @param int|null $pageLimit when a positive integer, only the first
     *                            $pageLimit pages are used; otherwise all
     *
     * @throws \Exception when getPages() cannot find any pages
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is a positive integer.
        if (\is_int($pageLimit) && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            $text = trim($page->getText());

            // Compare against '' explicitly: a truthiness test would also
            // discard a page whose whole text is "0".
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }

    /**
     * Return the trailer header currently attached to the document.
     */
    public function getTrailer(): Header
    {
        return $this->trailer;
    }

    /**
     * Replace the trailer header of the document.
     *
     * Only the reference is stored; the details array is not rebuilt here,
     * so call init() (or setObjects()) afterwards to pick up the new trailer.
     */
    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }

    /**
     * Return the details built by the last init().
     *
     * The details are only populated by buildDetails(); before the document
     * has been initialised an empty array is returned instead of violating
     * the declared array return type.
     */
    public function getDetails(): array
    {
        return $this->details ?? [];
    }
}


1		<?php
2
3		/**
4		* @file
5		* This file is part of the PdfParser library.
6		*
7		* @author Sébastien MALOT <[email protected]>
8		*
9		* @date 2017-01-03
10		*
11		* @license LGPLv3
12		*
13		* @url <https://github.com/smalot/pdfparser>
14		*
15		* PdfParser is a pdf library written in PHP, extraction oriented.
16		* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17		*
18		* This program is free software: you can redistribute it and/or modify
19		* it under the terms of the GNU Lesser General Public License as published by
20		* the Free Software Foundation, either version 3 of the License, or
21		* (at your option) any later version.
22		*
23		* This program is distributed in the hope that it will be useful,
24		* but WITHOUT ANY WARRANTY; without even the implied warranty of
25		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26		* GNU Lesser General Public License for more details.
27		*
28		* You should have received a copy of the GNU Lesser General Public License
29		* along with this program.
30		* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31		*/
32
33		namespace Smalot\PdfParser;
34
35		use Smalot\PdfParser\Encoding\PDFDocEncoding;
36
37		/**
38		* Technical references :
39		* - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
40		* - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
41		* - http://www.php.net/manual/en/ref.pdf.php#74211
42		* - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
43		* - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
44		* - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
45		* - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
46		*
47		* Class Document
48		*/
49		class Document
50		{
51		/**
52		* @var PDFObject[]
53		*/
54		protected $objects = [];
55
56		/**
57		* @var array
58		*/
59		protected $dictionary = [];
60
61		/**
62		* @var Header
63		*/
64		protected $trailer;
65
66		/**
67		* @var array<mixed>
68		*/
69		protected $metadata = [];
70
71		/**
72		* @var array
73		*/
74		protected $details;
75
76	98	public function __construct()
77		{
78	98	$this->trailer = new Header([], $this);
79		}
80
81	71	public function init()
82		{
83	71	$this->buildDictionary();
84
85	71	$this->buildDetails();
86
87		// Propagate init to objects.
88	71	foreach ($this->objects as $object) {
89	71	$object->getHeader()->init();
90	71	$object->init();
91		}
92		}
93
94		/**
95		* Build dictionary based on type header field.
96		*/
97	71	protected function buildDictionary()
98		{
99		// Build dictionary.
100	71	$this->dictionary = [];
101
102	71	foreach ($this->objects as $id => $object) {
103		// Cache objects by type and subtype
104	71	$type = $object->getHeader()->get('Type')->getContent();
105
106	71	if (null != $type) {
107	71	if (!isset($this->dictionary[$type])) {
108	71	$this->dictionary[$type] = [
109	71	'all' => [],
110	71	'subtype' => [],
111	71	];
112		}
113
114	71	$this->dictionary[$type]['all'][$id] = $object;
115
116	71	$subtype = $object->getHeader()->get('Subtype')->getContent();
117	71	if (null != $subtype) {
118	64	if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
119	64	$this->dictionary[$type]['subtype'][$subtype] = [];
120		}
121	64	$this->dictionary[$type]['subtype'][$subtype][$id] = $object;
122		}
123		}
124		}
125		}
126
127		/**
128		* Build details array.
129		*/
130	71	protected function buildDetails()
131		{
132		// Build details array.
133	71	$details = [];
134
135		// Extract document info
136	71	if ($this->trailer->has('Info')) {
137		/** @var PDFObject $info */
138	60	$info = $this->trailer->get('Info');
139		// This could be an ElementMissing object, so we need to check for
140		// the getHeader method first.
141	60	if (null !== $info && method_exists($info, 'getHeader')) {
142	60	$details = $info->getHeader()->getDetails();
143		}
144		}
145
146		// Retrieve the page count
147		try {
148	71	$pages = $this->getPages();
149	70	$details['Pages'] = \count($pages);
150	2	} catch (\Exception $e) {
151	2	$details['Pages'] = 0;
152		}
153
154		// Decode and repair encoded document properties
155	71	foreach ($details as $key => $value) {
156	71	if (\is_string($value)) {
157		// If the string is already UTF-8 encoded, that means we only
158		// need to repair Adobe's ham-fisted insertion of line-feeds
159		// every ~127 characters, which doesn't seem to be multi-byte
160		// safe
161	59	if (mb_check_encoding($value, 'UTF-8')) {
162		// Remove literal backslash + line-feed "\\r"
163	58	$value = str_replace("\x5c\x0d", '', $value);
164
165		// Remove backslash plus bytes written into high part of
166		// multibyte unicode character
167	58	while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
168	1	$diff = (\ord($match[1]) - 182) * 64;
169	1	$newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
170	1	$value = preg_replace("/\x5c\x5c\xe0".$match[1].$match[2].'/', $newbyte, $value);
171		}
172
173		// Remove bytes written into low part of multibyte unicode
174		// character
175	58	while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
176		$diff = \ord($match[2]) - 181;
177		$newbyte = \chr(\ord($match[1]) + $diff);
178		$value = preg_replace('/'.$match[1]."\x9c\xe0".$match[2].'/', $newbyte, $value);
179		}
180
181		// Remove this byte string that Adobe occasionally adds
182		// between two single byte characters in a unicode string
183	58	$value = str_replace("\xe5\xb0\x8d", '', $value);
184
185	58	$details[$key] = $value;
186		} else {
187		// If the string is just PDFDocEncoding, remove any line-feeds
188		// and decode the whole thing.
189	11	$value = str_replace("\\\r", '', $value);
190	11	$details[$key] = PDFDocEncoding::convertPDFDoc2UTF8($value);
191		}
192		}
193		}
194
195	71	$details = array_merge($details, $this->metadata);
196
197	71	$this->details = $details;
198		}
199
200		/**
201		* Extract XMP Metadata
202		*/
203	41	public function extractXMPMetadata(string $content): void
204		{
205	41	$xml = xml_parser_create();
206	41	xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);
207
208	41	if (1 === xml_parse_into_struct($xml, $content, $values, $index)) {
209		/*
210		* short overview about the following code parts:
211		*
212		* The output of xml_parse_into_struct is a single dimensional array (= $values), and the $stack is a last-on,
213		* first-off array of pointers to positions in $metadata, while iterating through it, that potentially turn the
214		* results into a more intuitive multi-dimensional array. When an "open" XML tag is encountered,
215		* we save the current $metadata context in the $stack, then create a child array of $metadata and
216		* make that the current $metadata context. When a "close" XML tag is encountered, the operations are
217		* reversed: the most recently added $metadata context from $stack (IOW, the parent of the current
218		* element) is set as the current $metadata context.
219		*/
220	41	$metadata = [];
221	41	$stack = [];
222	41	foreach ($values as $val) {
223		// Standardize to lowercase
224	41	$val['tag'] = strtolower($val['tag']);
225
226		// Ignore structural x: and rdf: XML elements
227	41	if (0 === strpos($val['tag'], 'x:')) {
228	41	continue;
229	41	} elseif (0 === strpos($val['tag'], 'rdf:') && 'rdf:li' != $val['tag']) {
230	41	continue;
231		}
232
233	41	switch ($val['type']) {
234	41	case 'open':
235		// Create an array of list items
236	37	if ('rdf:li' == $val['tag']) {
237	5	$metadata[] = [];
238
239		// Move up one level in the stack
240	5	$stack[\count($stack)] = &$metadata;
241	5	$metadata = &$metadata[\count($metadata) - 1];
242		} else {
243		// Else create an array of named values
244	37	$metadata[$val['tag']] = [];
245
246		// Move up one level in the stack
247	37	$stack[\count($stack)] = &$metadata;
248	37	$metadata = &$metadata[$val['tag']];
249		}
250	37	break;
251
252	41	case 'complete':
253	41	if (isset($val['value'])) {
254		// Assign a value to this list item
255	41	if ('rdf:li' == $val['tag']) {
256	33	$metadata[] = $val['value'];
257
258		// Else assign a value to this property
259		} else {
260	41	$metadata[$val['tag']] = $val['value'];
261		}
262		}
263	41	break;
264
265	37	case 'close':
266		// If the value of this property is an array
267	37	if (\is_array($metadata)) {
268		// If the value is a single element array
269		// where the element is of type string, use
270		// the value of the first list item as the
271		// value for this property
272	37	if (1 == \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) {
273	33	$metadata = $metadata[0];
274	10	} elseif (0 == \count($metadata)) {
275		// if the value is an empty array, set
276		// the value of this property to the empty
277		// string
278	7	$metadata = '';
279		}
280		}
281
282		// Move down one level in the stack
283	37	$metadata = &$stack[\count($stack) - 1];
284	37	unset($stack[\count($stack) - 1]);
285	37	break;
286		}
287		}
288
289		// Only use this metadata if it's referring to a PDF
290	41	if (isset($metadata['dc:format']) && 'application/pdf' == $metadata['dc:format']) {
291		// According to the XMP specifications: 'Conflict resolution
292		// for separate packets that describe the same resource is
293		// beyond the scope of this document.' - Section 6.1
294		// Source: https://www.adobe.com/devnet/xmp.html
295		// Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
296		// So if there are multiple XMP blocks, just merge the values
297		// of each found block over top of the existing values
298	22	$this->metadata = array_merge($this->metadata, $metadata);
299		}
300		}
301	41	xml_parser_free($xml);
302		}
303
304	1	public function getDictionary(): array
305		{
306	1	return $this->dictionary;
307		}
308
309		/**
310		* @param PDFObject[] $objects
311		*/
312	71	public function setObjects($objects = [])
313		{
314	71	$this->objects = (array) $objects;
315
316	71	$this->init();
317		}
318
319		/**
320		* @return PDFObject[]
321		*/
322	2	public function getObjects()
323		{
324	2	return $this->objects;
325		}
326
327		/**
328		* @return PDFObject\|Font\|Page\|Element\|null
329		*/
330	68	public function getObjectById(string $id)
331		{
332	68	if (isset($this->objects[$id])) {
333	68	return $this->objects[$id];
334		}
335
336	3	return null;
337		}
338
339	72	public function hasObjectsByType(string $type, ?string $subtype = null): bool
340		{
341	72	return 0 < \count($this->getObjectsByType($type, $subtype));
342		}
343
344	76	public function getObjectsByType(string $type, ?string $subtype = null): array
345		{
346	76	if (!isset($this->dictionary[$type])) {
347	13	return [];
348		}
349
350	70	if (null != $subtype) {
		Issue (Bug, introduced 2021-06-13 00:39 UTC): It seems like you are loosely comparing `$subtype` of type `null|string` against `null`; this is ambiguous if the string can be empty. Consider using a strict comparison `!==` instead.
351		if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
352		return [];
353		}
354
355		return $this->dictionary[$type]['subtype'][$subtype];
356		}
357
358	70	return $this->dictionary[$type]['all'];
359		}
360
361		/**
362		* @return Font[]
363		*/
364	52	public function getFonts()
365		{
366	52	return $this->getObjectsByType('Font');
367		}
368
369	46	public function getFirstFont(): ?Font
370		{
371	46	$fonts = $this->getFonts();
372	46	if ([] === $fonts) {
373	4	return null;
374		}
375
376	42	return reset($fonts);
377		}
378
379		/**
380		* @return Page[]
381		*
382		* @throws \Exception
383		*/
384	72	public function getPages()
385		{
386	72	if ($this->hasObjectsByType('Catalog')) {
387		// Search for catalog to list pages.
388	64	$catalogues = $this->getObjectsByType('Catalog');
389	64	$catalogue = reset($catalogues);
390
391		/** @var Pages $object */
392	64	$object = $catalogue->get('Pages');
393	64	if (method_exists($object, 'getPages')) {
394	64	return $object->getPages(true);
395		}
396		}
397
398	9	if ($this->hasObjectsByType('Pages')) {
399		// Search for pages to list kids.
400	1	$pages = [];
401
402		/** @var Pages[] $objects */
403	1	$objects = $this->getObjectsByType('Pages');
404	1	foreach ($objects as $object) {
405	1	$pages = array_merge($pages, $object->getPages(true));
406		}
407
408	1	return $pages;
409		}
410
411	9	if ($this->hasObjectsByType('Page')) {
412		// Search for 'page' (unordered pages).
413	7	$pages = $this->getObjectsByType('Page');
414
415	7	return array_values($pages);
416		}
417
418	3	throw new \Exception('Missing catalog.');
419		}
420
421	23	public function getText(?int $pageLimit = null): string
422		{
423	23	$texts = [];
424	23	$pages = $this->getPages();
425
426		// Only use the first X number of pages if $pageLimit is set and numeric.
427	23	if (\is_int($pageLimit) && 0 < $pageLimit) {
428	1	$pages = \array_slice($pages, 0, $pageLimit);
429		}
430
431	23	foreach ($pages as $index => $page) {
432		/**
433		* In some cases, the $page variable may be null.
434		*/
435	23	if (null === $page) {
436		continue;
437		}
438	23	if ($text = trim($page->getText())) {
439	23	$texts[] = $text;
440		}
441		}
442
443	23	return implode("\n\n", $texts);
444		}
445
446		public function getTrailer(): Header
447		{
448		return $this->trailer;
449		}
450
451	63	public function setTrailer(Header $trailer)
452		{
453	63	$this->trailer = $trailer;
454		}
455
456	16	public function getDetails(): array
457		{
458	16	return $this->details;
459		}
460		}
461

smalot / pdfparser

Push — master ( 2939df...ddf03e )

Document::getDictionary() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like