Document - Code Metrics - Inspection of "Read XMP Metadata and add it to data returned by g..." - smalot/pdfparser - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Pull Request — master (#606)

by Konrad

created 2023-07-04 06:06 UTC

Document F

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	363
Duplicated Lines	0 %

Test Coverage

Coverage

94%

Importance

Changes	13
Bugs	2	Features	1

Metric	Value
eloc	123
c	13
b	2
f	1
dl	0
loc	363
ccs	94
cts	100
cp	0.94
rs	3.6
wmc	60

18 Methods

Rating	Name	Size	Complexity
A	buildDictionary()	25	6
A	init()	10	2
A	__construct()	3	1
A	buildDetails()	27	5
A	getObjectsByType()	15	4
A	getPages()	35	6
A	getDetails()	3	1
A	getObjects()	3	1
A	setObjects()	5	1
A	getObjectById()	7	2
A	getFonts()	3	1
A	getFirstFont()	8	2
A	hasObjectsByType()	3	1
A	getDictionary()	3	1
A	getText()	23	6
A	getTrailer()	3	1
D	extractXMPMetadata()	93	18
A	setTrailer()	3	1

How to fix Complexity

<?php

/**
 * @file
 *          This file is part of the PdfParser library.
 *
 * @author  Sébastien MALOT <[email protected]>
 *
 * @date    2017-01-03
 *
 * @license LGPLv3
 *
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */

namespace Smalot\PdfParser;

/**
 * Technical references :
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
 * - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
 * - http://www.php.net/manual/en/ref.pdf.php#74211
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
 * - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
 *
 * Class Document
 */
class Document
{
    /**
     * All objects of the document, keyed by their object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * Objects indexed by the 'Type' (and optionally 'Subtype') header field.
     *
     * Shape: [type => ['all' => [id => object], 'subtype' => [subtype => [id => object]]]]
     *
     * @var array
     */
    protected $dictionary = [];

    /**
     * @var Header
     */
    protected $trailer;

    /**
     * Values collected from XMP metadata packets by extractXMPMetadata().
     *
     * @var array<mixed>
     */
    protected $metadata = [];

    /**
     * Document details as assembled by buildDetails().
     *
     * Defaults to an empty array so that getDetails() honours its `array`
     * return type even when it is called before init().
     *
     * @var array
     */
    protected $details = [];

    public function __construct()
    {
        $this->trailer = new Header([], $this);
    }

    /**
     * (Re)build the type dictionary and the details array, then propagate
     * the initialization to every object header and object.
     */
    public function init()
    {
        $this->buildDictionary();

        $this->buildDetails();

        // Propagate init to objects.
        foreach ($this->objects as $object) {
            $object->getHeader()->init();
            $object->init();
        }
    }

    /**
     * Build dictionary based on type header field.
     *
     * Objects whose 'Type' content loosely equals null are not indexed.
     */
    protected function buildDictionary()
    {
        $this->dictionary = [];

        foreach ($this->objects as $id => $object) {
            $header = $object->getHeader();

            // Cache objects by type and subtype
            $type = $header->get('Type')->getContent();
            if (null == $type) {
                continue;
            }

            if (!isset($this->dictionary[$type])) {
                $this->dictionary[$type] = [
                    'all' => [],
                    'subtype' => [],
                ];
            }

            $this->dictionary[$type]['all'][$id] = $object;

            $subtype = $header->get('Subtype')->getContent();
            if (null == $subtype) {
                continue;
            }

            if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
                $this->dictionary[$type]['subtype'][$subtype] = [];
            }
            $this->dictionary[$type]['subtype'][$subtype][$id] = $object;
        }
    }

    /**
     * Build details array.
     *
     * Starts from the trailer's 'Info' entry (when present and usable), adds
     * the page count under 'Pages' and finally merges the collected XMP
     * metadata on top, so XMP values win over equally named earlier entries.
     */
    protected function buildDetails()
    {
        $details = [];

        // Extract document info
        if ($this->trailer->has('Info')) {
            /** @var PDFObject $info */
            $info = $this->trailer->get('Info');
            // This could be an ElementMissing object, so we need to check for
            // the getHeader method first.
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Retrieve the page count; a document without resolvable pages
        // reports zero instead of failing.
        try {
            $details['Pages'] = \count($this->getPages());
        } catch (\Exception $e) {
            $details['Pages'] = 0;
        }

        $this->details = array_merge($details, $this->metadata);
    }

    /**
     * Extract XMP Metadata
     *
     * Parses one XMP packet and, if it describes a PDF (its 'dc:format' is
     * 'application/pdf'), merges the extracted values into $this->metadata.
     * Content that cannot be parsed as XML is silently ignored.
     *
     * Note: this method only updates $this->metadata; the details array is
     * refreshed the next time buildDetails() runs (e.g. through init()).
     *
     * @param string $content raw XML of the XMP packet
     */
    public function extractXMPMetadata(string $content): void
    {
        $xml = xml_parser_create();
        xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);

        if (1 === xml_parse_into_struct($xml, $content, $values)) {
            /*
             * Short overview of the following code:
             *
             * The output of xml_parse_into_struct is a one-dimensional array (= $values). $stack is a
             * last-in, first-out array of references to positions in $metadata; it is used while iterating
             * over $values to turn the flat result into a more intuitive multi-dimensional array.
             * When an "open" XML tag is encountered, we save the current $metadata context in $stack,
             * then create a child array of $metadata and make that the current $metadata context.
             * When a "close" XML tag is encountered, the operations are reversed: the most recently
             * added $metadata context from $stack (in other words, the parent of the current element)
             * becomes the current $metadata context again.
             */
            $metadata = [];
            $stack = [];
            foreach ($values as $val) {
                // Standardize to lowercase
                $tag = strtolower($val['tag']);

                // Ignore structural x: and rdf: XML elements (rdf:li carries list values, so keep it).
                // Open and close entries are skipped alike, which keeps $stack balanced.
                if (0 === strpos($tag, 'x:')) {
                    continue;
                }
                if (0 === strpos($tag, 'rdf:') && 'rdf:li' !== $tag) {
                    continue;
                }

                $isListItem = 'rdf:li' === $tag;

                switch ($val['type']) {
                    case 'open':
                        if ($isListItem) {
                            // Create an array of list items
                            $metadata[] = [];
                            $childKey = \count($metadata) - 1;
                        } else {
                            // Else create an array of named values
                            $metadata[$tag] = [];
                            $childKey = $tag;
                        }

                        // Move up one level in the stack
                        $stack[\count($stack)] = &$metadata;
                        $metadata = &$metadata[$childKey];
                        break;

                    case 'complete':
                        if (isset($val['value'])) {
                            if ($isListItem) {
                                // Assign a value to this list item
                                $metadata[] = $val['value'];
                            } else {
                                // Else assign a value to this property
                                $metadata[$tag] = $val['value'];
                            }
                        }
                        break;

                    case 'close':
                        // If the value of this property is a single-
                        // element array where the element is of type
                        // string, use the value of the first list item
                        // as the value for this property
                        if (
                            \is_array($metadata)
                            && 1 === \count($metadata)
                            && isset($metadata[0])
                            && \is_string($metadata[0])
                        ) {
                            $metadata = $metadata[0];
                        }

                        // Move down one level in the stack
                        $metadata = &$stack[\count($stack) - 1];
                        unset($stack[\count($stack) - 1]);
                        break;
                }
            }

            // Only use this metadata if it's referring to a PDF
            if (isset($metadata['dc:format']) && 'application/pdf' === $metadata['dc:format']) {
                // According to the XMP specifications: 'Conflict resolution
                // for separate packets that describe the same resource is
                // beyond the scope of this document.' - Section 6.1
                // Source: https://www.adobe.com/devnet/xmp.html
                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
                // So if there are multiple XMP blocks, just merge the values
                // of each found block over top of the existing values
                $this->metadata = array_merge($this->metadata, $metadata);
            }
        }

        // Since PHP 8.0 the parser is an object that is released automatically
        // and xml_parser_free() is a no-op, so the explicit call is only
        // needed on PHP 7.
        if (\PHP_VERSION_ID < 80000) {
            xml_parser_free($xml);
        }
    }

    /**
     * @return array the type dictionary built by buildDictionary()
     */
    public function getDictionary(): array
    {
        return $this->dictionary;
    }

    /**
     * Replace the document's objects and re-run init().
     *
     * @param PDFObject[] $objects
     */
    public function setObjects($objects = [])
    {
        $this->objects = (array) $objects;

        $this->init();
    }

    /**
     * @return PDFObject[]
     */
    public function getObjects()
    {
        return $this->objects;
    }

    /**
     * @return PDFObject|Font|Page|Element|null the object with the given id, or null if there is none
     */
    public function getObjectById(string $id)
    {
        return $this->objects[$id] ?? null;
    }

    /**
     * Tell whether at least one object of the given type (and subtype) exists.
     */
    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return 0 < \count($this->getObjectsByType($type, $subtype));
    }

    /**
     * Return the objects of the given type, keyed by object id.
     *
     * A null or empty $subtype means "any subtype".
     */
    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
        }

        // Spelled out strictly; for a ?string this matches the former loose
        // `null != $subtype` check exactly (null and '' select all objects).
        if (null === $subtype || '' === $subtype) {
            return $this->dictionary[$type]['all'];
        }

        return $this->dictionary[$type]['subtype'][$subtype] ?? [];
    }

    /**
     * @return Font[]
     */
    public function getFonts()
    {
        return $this->getObjectsByType('Font');
    }

    /**
     * @return Font|null the first font of the document, or null if it has none
     */
    public function getFirstFont(): ?Font
    {
        $fonts = $this->getFonts();
        if ([] === $fonts) {
            return null;
        }

        return reset($fonts);
    }

    /**
     * Return the pages of the document.
     *
     * Lookup order: the 'Pages' entry of the first 'Catalog' object, then all
     * 'Pages' objects, then all bare 'Page' objects (unordered).
     *
     * @return Page[]
     *
     * @throws \Exception if none of the above yields any pages
     */
    public function getPages()
    {
        if ($this->hasObjectsByType('Catalog')) {
            // Search for catalog to list pages.
            $catalogues = $this->getObjectsByType('Catalog');
            $catalogue = reset($catalogues);

            /** @var Pages $object */
            $object = $catalogue->get('Pages');
            if (method_exists($object, 'getPages')) {
                return $object->getPages(true);
            }
        }

        if ($this->hasObjectsByType('Pages')) {
            // Search for pages to list kids.
            $pages = [];

            /** @var Pages[] $objects */
            $objects = $this->getObjectsByType('Pages');
            foreach ($objects as $object) {
                $pages = array_merge($pages, $object->getPages(true));
            }

            return $pages;
        }

        if ($this->hasObjectsByType('Page')) {
            // Search for 'page' (unordered pages).
            $pages = $this->getObjectsByType('Page');

            return array_values($pages);
        }

        throw new \Exception('Missing catalog.');
    }

    /**
     * Return the text of the document's pages, separated by blank lines.
     *
     * Pages whose trimmed text is empty are left out.
     *
     * @param int|null $pageLimit only use the first X pages if set to a positive integer
     *
     * @throws \Exception if the pages cannot be resolved (see getPages())
     */
    public function getText(?int $pageLimit = null): string
    {
        $pages = $this->getPages();

        // Only use the first X number of pages if $pageLimit is a positive integer.
        if (\is_int($pageLimit) && 0 < $pageLimit) {
            $pages = \array_slice($pages, 0, $pageLimit);
        }

        $texts = [];
        foreach ($pages as $page) {
            // In some cases, the $page variable may be null.
            if (null === $page) {
                continue;
            }

            // Compare against '' explicitly: a truthiness test would also
            // discard a page whose whole text is "0".
            $text = trim($page->getText());
            if ('' !== $text) {
                $texts[] = $text;
            }
        }

        return implode("\n\n", $texts);
    }

    public function getTrailer(): Header
    {
        return $this->trailer;
    }

    public function setTrailer(Header $trailer)
    {
        $this->trailer = $trailer;
    }

    /**
     * @return array the details built by buildDetails(); empty before the first init()
     */
    public function getDetails(): array
    {
        return $this->details;
    }
}


1		<?php
2
3		/**
4		* @file
5		* This file is part of the PdfParser library.
6		*
7		* @author Sébastien MALOT <[email protected]>
8		*
9		* @date 2017-01-03
10		*
11		* @license LGPLv3
12		*
13		* @url <https://github.com/smalot/pdfparser>
14		*
15		* PdfParser is a pdf library written in PHP, extraction oriented.
16		* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17		*
18		* This program is free software: you can redistribute it and/or modify
19		* it under the terms of the GNU Lesser General Public License as published by
20		* the Free Software Foundation, either version 3 of the License, or
21		* (at your option) any later version.
22		*
23		* This program is distributed in the hope that it will be useful,
24		* but WITHOUT ANY WARRANTY; without even the implied warranty of
25		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26		* GNU Lesser General Public License for more details.
27		*
28		* You should have received a copy of the GNU Lesser General Public License
29		* along with this program.
30		* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31		*/
32
33		namespace Smalot\PdfParser;
34
35		/**
36		* Technical references :
37		* - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
38		* - http://framework.zend.com/issues/secure/attachment/12512/Pdf.php
39		* - http://www.php.net/manual/en/ref.pdf.php#74211
40		* - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin1Encoding.pm
41		* - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/ISOLatin9Encoding.pm
42		* - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/StandardEncoding.pm
43		* - http://cpansearch.perl.org/src/JV/PostScript-Font-1.10.02/lib/PostScript/WinAnsiEncoding.pm
44		*
45		* Class Document
46		*/
47		class Document
48		{
49		/**
50		* @var PDFObject[]
51		*/
52		protected $objects = [];
53
54		/**
55		* @var array
56		*/
57		protected $dictionary = [];
58
59		/**
60		* @var Header
61		*/
62		protected $trailer;
63
64		/**
65		* @var array<mixed>
66		*/
67		protected $metadata = [];
68
69	72	/**
70		* @var array
71	72	*/
72	72	protected $details;
73
74	49	public function __construct()
75		{
76	49	$this->trailer = new Header([], $this);
77		}
78	49
79		public function init()
80		{
81	49	$this->buildDictionary();
82	49
83	49	$this->buildDetails();
84
85	49	// Propagate init to objects.
86		foreach ($this->objects as $object) {
87		$object->getHeader()->init();
88		$object->init();
89		}
90	49	}
91
92		/**
93	49	* Build dictionary based on type header field.
94		*/
95	49	protected function buildDictionary()
96		{
97	49	// Build dictionary.
98		$this->dictionary = [];
99	49
100	49	foreach ($this->objects as $id => $object) {
101	49	// Cache objects by type and subtype
102		$type = $object->getHeader()->get('Type')->getContent();
103
104		if (null != $type) {
105		if (!isset($this->dictionary[$type])) {
106		$this->dictionary[$type] = [
107	49	'all' => [],
108		'subtype' => [],
109	49	];
110	49	}
111	42
112	42	$this->dictionary[$type]['all'][$id] = $object;
113
114	42	$subtype = $object->getHeader()->get('Subtype')->getContent();
115		if (null != $subtype) {
116		if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
117		$this->dictionary[$type]['subtype'][$subtype] = [];
118	49	}
119		$this->dictionary[$type]['subtype'][$subtype][$id] = $object;
120		}
121		}
122		}
123	49	}
124
125		/**
126	49	* Build details array.
127		*/
128		protected function buildDetails()
129	49	{
130		// Build details array.
131	40	$details = [];
132
133		// Extract document info
134	40	if ($this->trailer->has('Info')) {
135	40	/** @var PDFObject $info */
136		$info = $this->trailer->get('Info');
137		// This could be an ElementMissing object, so we need to check for
138		// the getHeader method first.
139		if (null !== $info && method_exists($info, 'getHeader')) {
140		$details = $info->getHeader()->getDetails();
141	49	}
142	48	}
143	2
144	2	// Retrieve the page count
145		try {
146		$pages = $this->getPages();
147	49	$details['Pages'] = \count($pages);
148	49	} catch (\Exception $e) {
149		$details['Pages'] = 0;
150	1	}
151
152	1	$details = array_merge($details, $this->metadata);
153
154		$this->details = $details;
155		}
156
157		/**
158	49	* Extract XMP Metadata
159		*/
160	49	public function extractXMPMetadata(string $content): void
161		{
162	49	$xml = xml_parser_create();
163	49	xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);
164
165		if (1 === xml_parse_into_struct($xml, $content, $values, $index)) {
166		/*
167		* short overview about the following code parts:
168	1	*
169		* The output of xml_parse_into_struct is a single dimensional array (= $values), and the $stack is a last-on,
170	1	* first-off array of pointers to positions in $metadata, while iterating through it, that potentially turn the
171		* results into a more intuitive multi-dimensional array. When an "open" XML tag is encountered,
172		* we save the current $metadata context in the $stack, then create a child array of $metadata and
173		* make that the current $metadata context. When a "close" XML tag is encountered, the operations are
174		* reversed: the most recently added $metadata context from $stack (IOW, the parent of the current
175		* element) is set as the current $metadata context.
176	46	*/
177		$metadata = [];
178	46	$stack = [];
179	46	foreach ($values as $val) {
180
181		// Standardize to lowercase
182	3	$val['tag'] = strtolower($val['tag']);
183
184		// Ignore structural x: and rdf: XML elements
185	50	if (0 === strpos($val['tag'], 'x:')) continue;
186		if (0 === strpos($val['tag'], 'rdf:') && 'rdf:li' != $val['tag']) continue;
187	50
188		switch ($val['type']) {
189		case 'open':
190	53	// Create an array of list items
191		if ('rdf:li' == $val['tag']) {
192	53	$metadata[] = [];
193	12
194		// Move up one level in the stack
195		$stack[count($stack)] = &$metadata;
196	48	$metadata = &$metadata[count($metadata) - 1];
197
198		// Else create an array of named values
199		} else {
200		$metadata[$val['tag']] = [];
201
202		// Move up one level in the stack
203		$stack[count($stack)] = &$metadata;
204	48	$metadata = &$metadata[$val['tag']];
205		}
206		break;
207
208		case 'complete':
209		if (isset($val['value'])) {
210	27
211		// Assign a value to this list item
212	27	if ('rdf:li' == $val['tag']) {
213		$metadata[] = $val['value'];
214
215	21	// Else assign a value to this property
216		} else {
217	21	$metadata[$val['tag']] = $val['value'];
218	21	}
219	3	}
220		break;
221
222	18	case 'close':
223		// If the value of this property is a single-
224		// element array where the element is of type
225		// string, use the value of the first list item
226		// as the value for this property
227		if (is_array($metadata) && isset($metadata[0]) && 1 == count($metadata) && is_string($metadata[0])) {
228		$metadata = $metadata[0];
229		}
230	50
231		// Move down one level in the stack
232	50	$metadata = &$stack[count($stack) - 1];
233		unset($stack[count($stack) - 1]);
234	42	break;
235	42
236		}
237		}
238	42
239	42	// Only use this metadata if it's referring to a PDF
240	42	if (isset($metadata['dc:format']) && 'application/pdf' == $metadata['dc:format']) {
241
242		// According to the XMP specifications: 'Conflict resolution
243		// for separate packets that describe the same resource is
244	9	// beyond the scope of this document.' - Section 6.1
245		// Source: https://www.adobe.com/devnet/xmp.html
246	1	// Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
247		// So if there are multiple XMP blocks, just merge the values
248		// of each found block over top of the existing values
249	1	$this->metadata = array_merge($this->metadata, $metadata);
250	1	}
251	1	}
252		xml_parser_free($xml);
253		}
254	1
255		public function getDictionary(): array
256		{
257	9	return $this->dictionary;
258		}
259	7
260		/**
261	7	* @param PDFObject[] $objects
262		*/
263		public function setObjects($objects = [])
264	3	{
265		$this->objects = (array) $objects;
266
267	12	$this->init();
268		}
269	12
270	12	/**
271		* @return PDFObject[]
272		*/
273	12	public function getObjects()
274	1	{
275		return $this->objects;
276		}
277	12
278		/**
279		* @return PDFObject\|Font\|Page\|Element\|null
280		*/
281	12	public function getObjectById(string $id)
282		{
283		if (isset($this->objects[$id])) {
284	12	return $this->objects[$id];
285	12	}
286
287		return null;
288		}
289	12
290		public function hasObjectsByType(string $type, string $subtype = null): bool
291		{
292		return 0 < \count($this->getObjectsByType($type, $subtype));
293		}
294
295		public function getObjectsByType(string $type, string $subtype = null): array
296		{
297	41	if (!isset($this->dictionary[$type])) {
298		return [];
299	41	}
300	41
301		if (null != $subtype) {
		0 ignored issues – show Bug introduced 2021-06-13 00:39 UTC by Report Bug Copy Issue Report It seems like you are loosely comparing `$subtype` of type `null\|string` against `null`; this is ambiguous if the string can be empty. Consider using a strict comparison `!==` instead. Loading history...
302	12	if (!isset($this->dictionary[$type]['subtype'][$subtype])) {
303		return [];
304	12	}
305
306		return $this->dictionary[$type]['subtype'][$subtype];
307		}
308
309		return $this->dictionary[$type]['all'];
310		}
311
312		/**
313		* @return Font[]
314		*/
315		public function getFonts()
316		{
317		return $this->getObjectsByType('Font');
318		}
319
320		public function getFirstFont(): ?Font
321		{
322		$fonts = $this->getFonts();
323		if ([] === $fonts) {
324		return null;
325		}
326
327		return reset($fonts);
328		}
329
330		/**
331		* @return Page[]
332		*
333		* @throws \Exception
334		*/
335		public function getPages()
336		{
337		if ($this->hasObjectsByType('Catalog')) {
338		// Search for catalog to list pages.
339		$catalogues = $this->getObjectsByType('Catalog');
340		$catalogue = reset($catalogues);
341
342		/** @var Pages $object */
343		$object = $catalogue->get('Pages');
344		if (method_exists($object, 'getPages')) {
345		return $object->getPages(true);
346		}
347		}
348
349		if ($this->hasObjectsByType('Pages')) {
350		// Search for pages to list kids.
351		$pages = [];
352
353		/** @var Pages[] $objects */
354		$objects = $this->getObjectsByType('Pages');
355		foreach ($objects as $object) {
356		$pages = array_merge($pages, $object->getPages(true));
357		}
358
359		return $pages;
360		}
361
362		if ($this->hasObjectsByType('Page')) {
363		// Search for 'page' (unordered pages).
364		$pages = $this->getObjectsByType('Page');
365
366		return array_values($pages);
367		}
368
369		throw new \Exception('Missing catalog.');
370		}
371
372		public function getText(int $pageLimit = null): string
373		{
374		$texts = [];
375		$pages = $this->getPages();
376
377		// Only use the first X number of pages if $pageLimit is set and numeric.
378		if (\is_int($pageLimit) && 0 < $pageLimit) {
379		$pages = \array_slice($pages, 0, $pageLimit);
380		}
381
382		foreach ($pages as $index => $page) {
383		/**
384		* In some cases, the $page variable may be null.
385		*/
386		if (null === $page) {
387		continue;
388		}
389		if ($text = trim($page->getText())) {
390		$texts[] = $text;
391		}
392		}
393
394		return implode("\n\n", $texts);
395		}
396
397		public function getTrailer(): Header
398		{
399		return $this->trailer;
400		}
401
402		public function setTrailer(Header $trailer)
403		{
404		$this->trailer = $trailer;
405		}
406
407		public function getDetails(): array
408		{
409		return $this->details;
410		}
411		}
412

smalot / pdfparser

Pull Request — master (#606)

Document F

Complexity

Size/Duplication

Test Coverage

Importance

18 Methods

How to fix Complexity

Complex Class

Duplication Side-by-Side

Filter issues like