Document - Code Metrics - smalot/pdfparser - Measure and Improve Code Quality continuously with Scrutinizer

Document F
last analyzed 2025-09-04 08:56 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	419
Duplicated Lines	0 %

Importance

Changes	9
Bugs	2	Features	0

Metric	Value
eloc	147
c	9
b	2
f	0
dl	0
loc	419
rs	3.04
wmc	67

18 Methods

Rating	Name	Size	Complexity
A	buildDictionary()	25	6
A	init()	10	2
A	__construct()	3	1
B	buildDetails()	68	10
A	getObjectsByType()	15	4
A	getPages()	35	6
A	getDetails()	3	1
A	getObjects()	3	1
A	setObjects()	5	1
A	getObjectById()	7	2
A	getFonts()	3	1
A	getFirstFont()	8	2
A	hasObjectsByType()	3	1
A	getDictionary()	3	1
A	getText()	23	6
A	getTrailer()	3	1
D	extractXMPMetadata()	107	20
A	setTrailer()	3	1

How to fix Complexity

<?php

/**
 * @file
 *          This file is part of the PdfParser library.
 *
 * @author  Sébastien MALOT <[email protected]>
 *
 * @date    2017-01-03
 *
 * @license LGPLv3
 *
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */

namespace Smalot\PdfParser;

use Smalot\PdfParser\Encoding\PDFDocEncoding;
use Smalot\PdfParser\Exception\MissingCatalogException;

/**
 * Technical references :
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
 *
 * Class Document
 */
class Document
{
    /**
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * @var array
     */
    protected $details;

    /**
     * Create a document with no objects yet.
     *
     * The trailer starts out as a Header built from an empty element list and
     * bound to this document; it can be replaced later via setTrailer().
     */
    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }

    /**
     * (Re)initialise the document from its current object list.
     *
     * Order matters: the type dictionary is built first because the details
     * step resolves pages through it; only afterwards is every object's
     * header, then the object itself, initialised.
     */
    public function init()
    {
        $this->buildDictionary();
        $this->buildDetails();

        // Cascade initialisation: header first, then the owning object.
        foreach ($this->objects as $pdfObject) {
            $header = $pdfObject->getHeader();
            $header->init();

            $pdfObject->init();
        }
    }

    /**
     * Rebuild the lookup table that indexes objects by their 'Type' header
     * entry and, below that, by their 'Subtype' header entry.
     *
     * Resulting shape, keyed by type name:
     *   [ 'all' => [id => object, ...], 'subtype' => [name => [id => object, ...], ...] ]
     *
     * Objects whose type content loosely equals null are left out entirely;
     * objects whose subtype content loosely equals null appear under 'all' only.
     */
    protected function buildDictionary()
    {
        // Start from scratch on every call.
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            $type = $object->getHeader()->get('Type')->getContent();
            if (null == $type) {
                // Nothing to index this object under.
                continue;
            }

            // First object of this type: create both buckets.
            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = [
                    'all' => [],
                    'subtype' => [],
                ];
            }
            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $object->getHeader()->get('Subtype')->getContent();
            if (null == $subtype) {
                continue;
            }

            // The nested assignment creates the per-subtype list on first use.
            $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
        }
    }

    /**
     * Build the details array returned by getDetails().
     *
     * Steps, in order:
     *  1. Seed with the details of the trailer's 'Info' object header, when
     *     the trailer has such an entry and it exposes getHeader().
     *  2. Store the page count under 'Pages' (0 when the pages cannot be
     *     resolved and getPages() throws).
     *  3. Repair / decode every string value.
     *  4. Overlay the XMP metadata gathered by extractXMPMetadata(); on key
     *     collisions the XMP value wins (array_merge semantics).
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count; a document without resolvable pages is
        // reported as having zero pages rather than failing the whole build.
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        // Decode and repair encoded document properties
        foreach ($details as $key => $value) {
            if (!\is_string($value)) {
                continue;
            }

            if (!mb_check_encoding($value, 'UTF-8')) {
                // Not valid UTF-8: treat the string as PDFDocEncoding. Drop
                // the backslash + carriage-return pairs, then decode it all.
                $value = str_replace("\\\r", '', $value);
                $details[$key] = PDFDocEncoding::convertPDFDoc2UTF8($value);

                continue;
            }

            // The string is already UTF-8, so we only need to repair the
            // line breaks Adobe inserts every ~127 characters, which do not
            // seem to be multi-byte safe.

            // Remove literal backslash + carriage return (0x5C 0x0D)
            $value = str_replace("\x5c\x0d", '', $value);

            // Remove backslash plus bytes written into high part of
            // multibyte unicode character
            while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
                $diff = (\ord($match[1]) - 182) * 64;
                $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
                // Substitute the matched bytes literally. Rebuilding a regex
                // from them would let a byte that is a PCRE metacharacter or
                // the delimiter alter or break the pattern.
                $value = str_replace($match[0], $newbyte, $value);
            }

            // Remove bytes written into low part of multibyte unicode
            // character
            while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
                $diff = \ord($match[2]) - 181;
                $newbyte = \chr(\ord($match[1]) + $diff);
                // Literal substitution, for the same reason as above.
                $value = str_replace($match[0], $newbyte, $value);
            }

            // Remove this byte string that Adobe occasionally adds
            // between two single byte characters in a unicode string
            $value = str_replace("\xe5\xb0\x8d", '', $value);

            $details[$key] = $value;
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Parse one XMP metadata packet and merge its properties into
     * $this->metadata.
     *
     * The XML is flattened by xml_parse_into_struct() and then rebuilt into a
     * nested array keyed by lowercased tag name. Structural "x:" and "rdf:"
     * elements are skipped, except "rdf:li" list items. Nothing is merged
     * when parsing fails, or when the packet declares a dc:format other than
     * 'application/pdf'.
     *
     * The collected metadata only reaches getDetails() through
     * buildDetails(), which merges it over the document details.
     *
     * @param string $content raw XML of the XMP packet
     */
    public function extractXMPMetadata(string $content): void
    {
        $xml = xml_parser_create();
        xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);

        // Only proceed when the parse succeeded (return value 1). $index is
        // filled in by the call but not used here.
        if (1 === xml_parse_into_struct($xml, $content, $values, $index)) {
            /*
             * short overview about the following code parts:
             *
             * The output of xml_parse_into_struct is a single dimensional array (= $values), and the $stack is a last-on,
             * first-off array of pointers to positions in $metadata, while iterating through it, that potentially turn the
             * results into a more intuitive multi-dimensional array. When an "open" XML tag is encountered,
             * we save the current $metadata context in the $stack, then create a child array of $metadata and
             * make that the current $metadata context. When a "close" XML tag is encountered, the operations are
             * reversed: the most recently added $metadata context from $stack (IOW, the parent of the current
             * element) is set as the current $metadata context.
             */
            $metadata = [];
            $stack = [];
            foreach ($values as $val) {
                // Standardize to lowercase
                $val['tag'] = strtolower($val['tag']);

                // Ignore structural x: and rdf: XML elements; rdf:li is kept
                // because it carries the list item values.
                if (0 === strpos($val['tag'], 'x:')) {
                    continue;
                } elseif (0 === strpos($val['tag'], 'rdf:') && 'rdf:li' != $val['tag']) {
                    continue;
                }

                switch ($val['type']) {
                    case 'open':
                        // A list item with children: append an anonymous
                        // child array to the current context.
                        if ('rdf:li' == $val['tag']) {
                            $metadata[] = [];

                            // Remember the current context on the stack (by
                            // reference), then descend into the new child.
                            $stack[\count($stack)] = &$metadata;
                            $metadata = &$metadata[\count($metadata) - 1];
                        } else {
                            // Any other element: create a child array keyed
                            // by the tag name.
                            $metadata[$val['tag']] = [];

                            // Remember the current context on the stack (by
                            // reference), then descend into the new child.
                            $stack[\count($stack)] = &$metadata;
                            $metadata = &$metadata[$val['tag']];
                        }
                        break;

                    case 'complete':
                        // Elements without a value are ignored.
                        if (isset($val['value'])) {
                            // Assign a value to this list item
                            if ('rdf:li' == $val['tag']) {
                                $metadata[] = $val['value'];

                                // Else assign a value to this property
                            } else {
                                $metadata[$val['tag']] = $val['value'];
                            }
                        }
                        break;

                    case 'close':
                        // $metadata still references the element being
                        // closed, so the assignments below rewrite that
                        // element's value inside its parent.
                        if (\is_array($metadata)) {
                            // If the value is a single element array
                            // where the element is of type string, use
                            // the value of the first list item as the
                            // value for this property
                            if (1 == \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) {
                                $metadata = $metadata[0];
                            } elseif (0 == \count($metadata)) {
                                // if the value is an empty array, set
                                // the value of this property to the empty
                                // string
                                $metadata = '';
                            }
                        }

                        // Return to the parent context: re-bind $metadata to
                        // the most recently saved reference and pop it.
                        $metadata = &$stack[\count($stack) - 1];
                        unset($stack[\count($stack) - 1]);
                        break;
                }
            }

            // Only use this metadata if it's referring to a PDF (or does not
            // declare a format at all)
            if (!isset($metadata['dc:format']) || 'application/pdf' == $metadata['dc:format']) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // Source: https://www.adobe.com/devnet/xmp.html
                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
                // So if there are multiple XMP blocks, just merge the values
                // of each found block over top of the existing values
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }

        // TODO: remove this if-clause and its content when dropping PHP 7 support
        if (version_compare(PHP_VERSION, '8.0.0', '<')) {
            // ref: https://www.php.net/manual/en/function.xml-parser-free.php
            xml_parser_free($xml);

            // to avoid memory leaks; documentation said:
            // > it was necessary to also explicitly unset the reference to parser to avoid memory leaks
            unset($xml);
        }
    }

    /**
     * Return the type/subtype lookup table built by buildDictionary().
     *
     * Each entry is keyed by type name and holds an 'all' list (object id =>
     * object) and a 'subtype' map (subtype name => object id => object).
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }

    /**
     * Replace the document's objects and re-run init().
     *
     * The argument is cast to an array before being stored. Because init()
     * is invoked immediately, the dictionary and details are rebuilt and
     * every object (and its header) is initialised as part of this call.
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        $this->init();
    }

    /**
     * Return every object currently held by the document, as last passed to
     * setObjects() (empty until then).
     *
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }

    /**
     * Look up a single object by its key in the object list.
     *
     * @return PDFObject|Font|Page|Element|null the object, or null when no
     *                                           object is stored under $id
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }

    /**
     * Tell whether getObjectsByType() would return at least one object for
     * the given type (and optional subtype).
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        $matches = $this->getObjectsByType($type, $subtype);

        return [] !== $matches;
    }

    /**
     * Return the objects indexed under a type, optionally narrowed to one
     * subtype.
     *
     * The subtype check is a loose comparison against null, so null, '' and
     * '0' all select the unfiltered list. This mirrors buildDictionary(),
     * which never indexes such subtype values.
     *
     * @return array objects keyed by object id; empty when the type or the
     *               requested subtype is unknown
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        $entry = $this->dictionary[$type] ?? null;
        if (null === $entry) {
            return [];
        }

        if (null == $subtype) {
            return $entry['all'];
        }

        return $entry['subtype'][$subtype] ?? [];
    }

    /**
     * Return every object indexed under the 'Font' type, keyed by object id.
     *
     * @return Font[]
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }

    /**
     * Return the first font in dictionary order, or null when the document
     * has no object of type 'Font'.
     */
    public function getFirstFont(): ?Font
    {
        // The first iteration yields the first element; an empty list skips
        // the loop entirely.
        foreach ($this->getFonts() as $font) {
            return $font;
        }

        return null;
    }

    /**
     * Resolve the document's pages, trying three sources in order:
     *
     *  1. the 'Pages' entry of the first 'Catalog' object, if that entry
     *     exposes getPages();
     *  2. every object of type 'Pages', concatenated in dictionary order;
     *  3. every object of type 'Page' (unordered), re-indexed from zero.
     *
     * @return Page[]
     *
     * @throws MissingCatalogException when none of the three sources exists
     */
    public function getPages()
    {
        // 1. Catalog -> Pages tree.
        if ($this->hasObjectsByType('Catalog')) {
            $catalogues = $this->getObjectsByType('Catalog');
            $firstCatalogue = reset($catalogues);

            /** @var Pages $tree */
            $tree = $firstCatalogue->get('Pages');
            if (method_exists($tree, 'getPages')) {
                return $tree->getPages(true);
            }
        }

        // 2. Stand-alone Pages nodes: gather each node's pages, then
        //    concatenate them in a single merge.
        if ($this->hasObjectsByType('Pages')) {
            $perNode = [];

            /** @var Pages $node */
            foreach ($this->getObjectsByType('Pages') as $node) {
                $perNode[] = $node->getPages(true);
            }

            return array_merge([], ...$perNode);
        }

        // 3. Loose Page objects, in whatever order the dictionary holds them.
        if ($this->hasObjectsByType('Page')) {
            return array_values($this->getObjectsByType('Page'));
        }

        throw new MissingCatalogException('Missing catalog.');
    }

    /**
     * Extract the text of the document, page by page.
     *
     * Each page's text is trimmed; pages that are null or whose trimmed text
     * is empty are skipped, and the remaining texts are joined with a blank
     * line ("\n\n").
     *
     * @param int|null $pageLimit when a positive integer, only the first
     *                            $pageLimit pages are read; null, zero or a
     *                            negative value means all pages
     *
     * @throws MissingCatalogException when the pages cannot be resolved
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if a positive limit is given.
        if (null !== $pageLimit && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against '' explicitly: a plain truthiness test would
            // also discard a page whose entire text is "0".
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }

    /**
     * Return the trailer header: the one created by the constructor, or the
     * one last supplied through setTrailer().
     */
    public function getTrailer(): Header
    {
        return $this->trailer;
    }

    /**
     * Replace the trailer header.
     *
     * This only stores the header; it does not call init(). Details derived
     * from the trailer's 'Info' entry are computed by buildDetails(), which
     * runs as part of init().
     */
    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }

    /**
     * Return the document details assembled by buildDetails(): Info header
     * values, the 'Pages' count and any merged XMP metadata.
     *
     * The $details property has no initial value, so before init() has run
     * (for example on a freshly constructed document) an empty array is
     * returned instead of violating the declared array return type.
     */
    public function getDetails(): array
    {
        return $this->details ?? [];
    }
}


1			<?php
2
3			/**
4			* @file
5			* This file is part of the PdfParser library.
6			*
7			* @author Sébastien MALOT <[email protected]>
8			*
9			* @date 2017-01-03
10			*
11			* @license LGPLv3
12			*
13			* @url <https://github.com/smalot/pdfparser>
14			*
15			* PdfParser is a pdf library written in PHP, extraction oriented.
16			* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17			*
18			* This program is free software: you can redistribute it and/or modify
19			* it under the terms of the GNU Lesser General Public License as published by
20			* the Free Software Foundation, either version 3 of the License, or
21			* (at your option) any later version.
22			*
23			* This program is distributed in the hope that it will be useful,
24			* but WITHOUT ANY WARRANTY; without even the implied warranty of
25			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26			* GNU Lesser General Public License for more details.
27			*
28			* You should have received a copy of the GNU Lesser General Public License
29			* along with this program.
30			* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31			*/
32
33			namespace Smalot\PdfParser;
34
35			use Smalot\PdfParser\Encoding\PDFDocEncoding;
36			use Smalot\PdfParser\Exception\MissingCatalogException;
37
38			/**
39			* Technical references :
40			* - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
41			* - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
42			* - http://www.php.net/manual/en/ref.pdf.php#74211
43			* - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
44			* - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
45			* - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
46			* - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
47			*
48			* Class Document
49			*/
50			class Document
51			{
52			/**
53			* @var PDFObject[]
54			*/
55			protected $objects = [];
56
57			/**
58			* @var array
59			*/
60			protected $dictionary = [];
61
62			/**
63			* @var Header
64			*/
65			protected $trailer;
66
67			/**
68			* @var array<mixed>
69			*/
70			protected $metadata = [];
71
72			/**
73			* @var array
74			*/
75			protected $details;
76
77			public function __construct()
78			{
79			$this->trailer = new Header([], $this);
80			}
81
82			public function init()
83			{
84			$this->buildDictionary();
85
86			$this->buildDetails();
87
88			// Propagate init to objects.
89			foreach ($this->objects as $object) {
90			$object->getHeader()->init();
91			$object->init();
92			}
93			}
94
95			/**
96			* Build dictionary based on type header field.
97			*/
98			protected function buildDictionary()
99			{
100			// Build dictionary.
101			$this->dictionary = [];
102
103			foreach ($this->objects as $id => $object) {
104			// Cache objects by type and subtype
105			$type = $object->getHeader()->get('Type')->getContent();
106
107			if (null != $type) {
108			if (!isset($this->dictionary[$type])) {
109			$this->dictionary[$type] = [
110			'all' => [],
111			'subtype' => [],
112			];
113			}
114
115			$this->dictionary[$type]['all'][$id] = $object;
116
117			$subtype = $object->getHeader()->get('Subtype')->getContent();
118			if (null != $subtype) {
119			if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
120			$this->dictionary[$type]['subtype'][$subtype] = [];
121			}
122			$this->dictionary[$type]['subtype'][$subtype][$id] = $object;
123			}
124			}
125			}
126			}
127
128			/**
129			* Build details array.
130			*/
131			protected function buildDetails()
132			{
133			// Build details array.
134			$details = [];
135
136			// Extract document info
137			if ($this->trailer->has('Info')) {
138			/** @var PDFObject $info */
139			$info = $this->trailer->get('Info');
140			// This could be an ElementMissing object, so we need to check for
141			// the getHeader method first.
142			if (null !== $info && method_exists($info, 'getHeader')) {
143			$details = $info->getHeader()->getDetails();
144			}
145			}
146
147			// Retrieve the page count
148			try {
149			$pages = $this->getPages();
150			$details['Pages'] = \count($pages);
151			} catch (\Exception $e) {
152			$details['Pages'] = 0;
153			}
154
155			// Decode and repair encoded document properties
156			foreach ($details as $key => $value) {
157			if (\is_string($value)) {
158			// If the string is already UTF-8 encoded, that means we only
159			// need to repair Adobe's ham-fisted insertion of line-feeds
160			// every ~127 characters, which doesn't seem to be multi-byte
161			// safe
162			if (mb_check_encoding($value, 'UTF-8')) {
163			// Remove literal backslash + line-feed "\\r"
164			$value = str_replace("\x5c\x0d", '', $value);
165
166			// Remove backslash plus bytes written into high part of
167			// multibyte unicode character
168			while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
169			$diff = (\ord($match[1]) - 182) * 64;
170			$newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
171			$value = preg_replace("/\x5c\x5c\xe0".$match[1].$match[2].'/', $newbyte, $value);
172			}
173
174			// Remove bytes written into low part of multibyte unicode
175			// character
176			while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
177			$diff = \ord($match[2]) - 181;
178			$newbyte = \chr(\ord($match[1]) + $diff);
179			$value = preg_replace('/'.$match[1]."\x9c\xe0".$match[2].'/', $newbyte, $value);
180			}
181
182			// Remove this byte string that Adobe occasionally adds
183			// between two single byte characters in a unicode string
184			$value = str_replace("\xe5\xb0\x8d", '', $value);
185
186			$details[$key] = $value;
187			} else {
188			// If the string is just PDFDocEncoding, remove any line-feeds
189			// and decode the whole thing.
190			$value = str_replace("\\\r", '', $value);
191			$details[$key] = PDFDocEncoding::convertPDFDoc2UTF8($value);
192			}
193			}
194			}
195
196			$details = array_merge($details, $this->metadata);
197
198			$this->details = $details;
199			}
200
201			/**
202			* Extract XMP Metadata
203			*/
204			public function extractXMPMetadata(string $content): void
205			{
206			$xml = xml_parser_create();
207			xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);
208
209			if (1 === xml_parse_into_struct($xml, $content, $values, $index)) {
210			/*
211			* short overview about the following code parts:
212			*
213			* The output of xml_parse_into_struct is a single dimensional array (= $values), and the $stack is a last-on,
214			* first-off array of pointers to positions in $metadata, while iterating through it, that potentially turn the
215			* results into a more intuitive multi-dimensional array. When an "open" XML tag is encountered,
216			* we save the current $metadata context in the $stack, then create a child array of $metadata and
217			* make that the current $metadata context. When a "close" XML tag is encountered, the operations are
218			* reversed: the most recently added $metadata context from $stack (IOW, the parent of the current
219			* element) is set as the current $metadata context.
220			*/
221			$metadata = [];
222			$stack = [];
223			foreach ($values as $val) {
224			// Standardize to lowercase
225			$val['tag'] = strtolower($val['tag']);
226
227			// Ignore structural x: and rdf: XML elements
228			if (0 === strpos($val['tag'], 'x:')) {
229			continue;
230			} elseif (0 === strpos($val['tag'], 'rdf:') && 'rdf:li' != $val['tag']) {
231			continue;
232			}
233
234			switch ($val['type']) {
235			case 'open':
236			// Create an array of list items
237			if ('rdf:li' == $val['tag']) {
238			$metadata[] = [];
239
240			// Move up one level in the stack
241			$stack[\count($stack)] = &$metadata;
242			$metadata = &$metadata[\count($metadata) - 1];
243			} else {
244			// Else create an array of named values
245			$metadata[$val['tag']] = [];
246
247			// Move up one level in the stack
248			$stack[\count($stack)] = &$metadata;
249			$metadata = &$metadata[$val['tag']];
250			}
251			break;
252
253			case 'complete':
254			if (isset($val['value'])) {
255			// Assign a value to this list item
256			if ('rdf:li' == $val['tag']) {
257			$metadata[] = $val['value'];
258
259			// Else assign a value to this property
260			} else {
261			$metadata[$val['tag']] = $val['value'];
262			}
263			}
264			break;
265
266			case 'close':
267			// If the value of this property is an array
268			if (\is_array($metadata)) {
269			// If the value is a single element array
270			// where the element is of type string, use
271			// the value of the first list item as the
272			// value for this property
273			if (1 == \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) {
274			$metadata = $metadata[0];
275			} elseif (0 == \count($metadata)) {
276			// if the value is an empty array, set
277			// the value of this property to the empty
278			// string
279			$metadata = '';
280			}
281			}
282
283			// Move down one level in the stack
284			$metadata = &$stack[\count($stack) - 1];
285			unset($stack[\count($stack) - 1]);
286			break;
287			}
288			}
289
290			// Only use this metadata if it's referring to a PDF
291			if (!isset($metadata['dc:format']) \|\| 'application/pdf' == $metadata['dc:format']) {
292			// According to the XMP specifications: 'Conflict resolution
293			// for separate packets that describe the same resource is
294			// beyond the scope of this document.' - Section 6.1
295			// Source: https://www.adobe.com/devnet/xmp.html
296			// Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
297			// So if there are multiple XMP blocks, just merge the values
298			// of each found block over top of the existing values
299			$this->metadata = array_merge($this->metadata, $metadata);
300			}
301			}
302
303			// TODO: remove this if-clause and its content when dropping PHP 7 support
304			if (version_compare(PHP_VERSION, '8.0.0', '<')) {
305			// ref: https://www.php.net/manual/en/function.xml-parser-free.php
306			xml_parser_free($xml);
307
308			// to avoid memory leaks; documentation said:
309			// > it was necessary to also explicitly unset the reference to parser to avoid memory leaks
310			unset($xml);
311			}
312			}
313
314			public function getDictionary(): array
315			{
316			return $this->dictionary;
317			}
318
319			/**
320			* @param PDFObject[] $objects
321			*/
322			public function setObjects($objects = [])
323			{
324			$this->objects = (array) $objects;
325
326			$this->init();
327			}
328
329			/**
330			* @return PDFObject[]
331			*/
332			public function getObjects()
333			{
334			return $this->objects;
335			}
336
337			/**
338			* @return PDFObject\|Font\|Page\|Element\|null
339			*/
340			public function getObjectById(string $id)
341			{
342			if (isset($this->objects[$id])) {
343			return $this->objects[$id];
344			}
345
346			return null;
347			}
348
349			public function hasObjectsByType(string $type, ?string $subtype = null): bool
350			{
351			return 0 < \count($this->getObjectsByType($type, $subtype));
352			}
353
354			public function getObjectsByType(string $type, ?string $subtype = null): array
355			{
356			if (!isset($this->dictionary[$type])) {
357			return [];
358			}
359
360			if (null != $subtype) {
			0 ignored issues – show Bug introduced 2021-06-13 00:39 UTC by Report Bug Copy Issue Report It seems like you are loosely comparing `$subtype` of type `null|string` against `null`; this is ambiguous if the string can be empty. Consider using a strict comparison `!==` instead. Loading history...
361			if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
362			return [];
363			}
364
365			return $this->dictionary[$type]['subtype'][$subtype];
366			}
367
368			return $this->dictionary[$type]['all'];
369			}
370
371			/**
372			* @return Font[]
373			*/
374			public function getFonts()
375			{
376			return $this->getObjectsByType('Font');
377			}
378
379			public function getFirstFont(): ?Font
380			{
381			$fonts = $this->getFonts();
382			if ([] === $fonts) {
383			return null;
384			}
385
386			return reset($fonts);
387			}
388
389			/**
390			* @return Page[]
391			*
392			* @throws MissingCatalogException
393			*/
394			public function getPages()
395			{
396			if ($this->hasObjectsByType('Catalog')) {
397			// Search for catalog to list pages.
398			$catalogues = $this->getObjectsByType('Catalog');
399			$catalogue = reset($catalogues);
400
401			/** @var Pages $object */
402			$object = $catalogue->get('Pages');
403			if (method_exists($object, 'getPages')) {
404			return $object->getPages(true);
405			}
406			}
407
408			if ($this->hasObjectsByType('Pages')) {
409			// Search for pages to list kids.
410			$pages = [];
411
412			/** @var Pages[] $objects */
413			$objects = $this->getObjectsByType('Pages');
414			foreach ($objects as $object) {
415			$pages = array_merge($pages, $object->getPages(true));
416			}
417
418			return $pages;
419			}
420
421			if ($this->hasObjectsByType('Page')) {
422			// Search for 'page' (unordered pages).
423			$pages = $this->getObjectsByType('Page');
424
425			return array_values($pages);
426			}
427
428			throw new MissingCatalogException('Missing catalog.');
429			}
430
431			public function getText(?int $pageLimit = null): string
432			{
433			$texts = [];
434			$pages = $this->getPages();
435
436			// Only use the first X number of pages if $pageLimit is set and numeric.
437			if (\is_int($pageLimit) && 0 < $pageLimit) {
438			$pages = \array_slice($pages, 0, $pageLimit);
439			}
440
441			foreach ($pages as $index => $page) {
442			/**
443			* In some cases, the $page variable may be null.
444			*/
445			if (null === $page) {
446			continue;
447			}
448			if ($text = trim($page->getText())) {
449			$texts[] = $text;
450			}
451			}
452
453			return implode("\n\n", $texts);
454			}
455
456			public function getTrailer(): Header
457			{
458			return $this->trailer;
459			}
460
461			public function setTrailer(Header $trailer)
462			{
463			$this->trailer = $trailer;
464			}
465
466			public function getDetails(): array
467			{
468			return $this->details;
469			}
470			}
471

smalot / pdfparser

Document F last analyzed 2025-09-04 08:56 UTC

Complexity

Size/Duplication

Importance

18 Methods

How to fix Complexity

Complex Class

Duplication Side-by-Side

Filter issues like

Document F
last analyzed 2025-09-04 08:56 UTC