Issues in Parser.php (master) - Issues in master - smalot/pdfparser - Measure and Improve Code Quality continuously with Scrutinizer

Issues (82)

src/Smalot/PdfParser/Parser.php (1 issue)

Showing only issues like (Show All)

php_analyzer.check_variables.key_is_overwritten_by_foreach

Bug Comprehensibility Minor

<?php

/**
 * @file
 *          This file is part of the PdfParser library.
 *
 * @author  Sébastien MALOT <[email protected]>
 *
 * @date    2017-01-03
 *
 * @license LGPLv3
 *
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */

namespace Smalot\PdfParser;

use Smalot\PdfParser\Element\ElementArray;
use Smalot\PdfParser\Element\ElementBoolean;
use Smalot\PdfParser\Element\ElementDate;
use Smalot\PdfParser\Element\ElementHexa;
use Smalot\PdfParser\Element\ElementName;
use Smalot\PdfParser\Element\ElementNull;
use Smalot\PdfParser\Element\ElementNumeric;
use Smalot\PdfParser\Element\ElementString;
use Smalot\PdfParser\Element\ElementXRef;
use Smalot\PdfParser\RawData\RawDataParser;

/**
 * Class Parser
 *
 * Converts the raw structure produced by RawDataParser into a Document
 * populated with PDFObject, Header and Element instances.
 */
class Parser
{
    /**
     * @var Config
     */
    private $config;

    /**
     * Objects built during the current parseContent() run, keyed by object id.
     *
     * @var PDFObject[]
     */
    protected $objects = [];

    /**
     * @var RawDataParser
     */
    protected $rawDataParser;

    /**
     * @param array       $cfg    forwarded unchanged to RawDataParser
     * @param Config|null $config library configuration; a default Config is created when omitted
     */
    public function __construct($cfg = [], ?Config $config = null)
    {
        $this->config = $config ?: new Config();
        $this->rawDataParser = new RawDataParser($cfg, $this->config);
    }

    public function getConfig(): Config
    {
        return $this->config;
    }

    /**
     * Reads a PDF file from disk and parses it.
     *
     * The error control operator (@) is deliberately not used on the read so
     * that PHP's own warning stays visible (see
     * https://github.com/smalot/pdfparser/issues/204). In addition, a failed
     * read is reported explicitly instead of being forwarded to
     * parseContent() as an empty string.
     *
     * @throws \Exception if the file cannot be read or its content cannot be parsed
     */
    public function parseFile(string $filename): Document
    {
        $content = file_get_contents($filename);

        // file_get_contents() returns false on failure; without this guard the
        // false would be silently coerced to '' by parseContent()'s string type.
        if (false === $content) {
            throw new \Exception('Unable to read file "'.$filename.'".');
        }

        return $this->parseContent($content);
    }

    /**
     * @param string $content PDF content to parse
     *
     * @throws \Exception if secured PDF file was detected
     * @throws \Exception if no object list was found
     */
    public function parseContent(string $content): Document
    {
        // Create structure from raw data.
        [$xref, $data] = $this->rawDataParser->parseData($content);

        if (isset($xref['trailer']['encrypt']) && false === $this->config->getIgnoreEncryption()) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

        if (empty($data)) {
            throw new \Exception('Object list not found. Possible secured file.');
        }

        // Create destination object and forget objects from any previous run.
        $document = new Document();
        $this->objects = [];

        foreach ($data as $id => $structure) {
            $this->parseObject($id, $structure, $document);
            // Drop each raw entry as soon as it has been converted.
            unset($data[$id]);
        }

        $document->setTrailer($this->parseTrailer($xref['trailer'], $document));
        $document->setObjects($this->objects);

        return $document;
    }

    /**
     * Builds a Header from the raw trailer structure.
     *
     * Each entry is converted according to its raw value: numeric values
     * become ElementNumeric, arrays are converted recursively and wrapped in
     * an ElementArray, strings containing "_" become ElementXRef, and
     * anything else is parsed as a "(" (string/date) header element.
     *
     * @param array         $structure raw trailer entries keyed by name
     * @param Document|null $document  owning document; null for nested arrays
     *
     * @return Header
     *
     * @throws \Exception
     */
    protected function parseTrailer(array $structure, ?Document $document)
    {
        $trailer = [];

        foreach ($structure as $name => $values) {
            $name = ucfirst($name);

            if (is_numeric($values)) {
                $trailer[$name] = new ElementNumeric($values);
            } elseif (\is_array($values)) {
                $value = $this->parseTrailer($values, null);
                $trailer[$name] = new ElementArray($value, null);
            } elseif (false !== strpos($values, '_')) {
                $trailer[$name] = new ElementXRef($values, $document);
            } else {
                $trailer[$name] = $this->parseHeaderElement('(', $values, $document);
            }
        }

        return new Header($trailer, $document);
    }

    /**
     * Converts one raw object structure into a PDFObject stored in $this->objects.
     *
     * The parts of the structure are visited in order; each one either
     * replaces the current header or supplies the stream content. A stream
     * whose header Type is "ObjStm" is expanded into its embedded objects
     * and nothing is stored for the container itself.
     *
     * @param string        $id        object id under which the result is stored
     * @param array         $structure raw parts of the object
     * @param Document|null $document
     *
     * @throws \Exception
     */
    protected function parseObject(string $id, array $structure, ?Document $document)
    {
        $header = new Header([], $document);
        $content = '';

        foreach ($structure as $part) {
            // Integer parts are replaced by an empty pair so the switch below
            // falls through to the default branch.
            if (\is_int($part)) {
                $part = [null, null];
            }
            switch ($part[0]) {
                case '[':
                    $elements = [];

                    foreach ($part[1] as $sub_element) {
                        $sub_type = $sub_element[0];
                        $sub_value = $sub_element[1];
                        $elements[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }

                    $header = new Header($elements, $document);
                    break;

                case '<<':
                    $header = $this->parseHeader($part[1], $document);
                    break;

                case 'stream':
                    // Use the entry at index 3 when present, otherwise the entry at index 1.
                    $content = isset($part[3][0]) ? $part[3][0] : $part[1];

                    if ($header->get('Type')->equals('ObjStm')) {
                        $this->parseObjectStream($content, $document);

                        // It is not necessary to store this content.

                        return;
                    } elseif ($header->get('Type')->equals('Metadata')) {
                        // Attempt to parse XMP XML Metadata
                        $document->extractXMPMetadata($content);
                    }
                    break;

                default:
                    // NOTE(review): when $part is an array this loose comparison
                    // with the string 'null' looks like it is always true - confirm intent.
                    if ('null' != $part) {
                        $element = $this->parseHeaderElement($part[0], $part[1], $document);

                        if ($element) {
                            $header = new Header([$element], $document);
                        }
                    }
                    break;
            }
        }

        if (!isset($this->objects[$id])) {
            $this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
        }
    }

    /**
     * Expands the content of an "ObjStm" stream into individual objects.
     *
     * The content is expected to start with a run of integer pairs
     * (object number, position) followed by the object bodies. Each body is
     * cut out using the distance to the next position, parsed as a header and
     * registered in $this->objects under "<object number>_0".
     *
     * Kept separate from parseObject() so that its locals cannot overwrite
     * that method's $id parameter or loop variables.
     *
     * @param string        $content  decoded stream content
     * @param Document|null $document
     */
    private function parseObjectStream($content, ?Document $document): void
    {
        $match = [];

        // Split the leading integer pairs from the object bodies.
        preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
        $body = $match[3];

        // Break the leading run into one "number position" chunk per object.
        $pairs = preg_split(
            '/(\d+\s+\d+\s*)/s',
            $match[1],
            -1,
            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
        );
        $table = [];

        foreach ($pairs as $pair) {
            [$objectNumber, $offset] = preg_split('/\s+/', trim($pair));
            $table[$offset] = $objectNumber;
        }

        // Order by position so each body ends where the next one starts.
        ksort($table);

        $objectNumbers = array_values($table);
        $offsets = array_keys($table);

        foreach ($offsets as $index => $offset) {
            $objectId = $objectNumbers[$index].'_0';
            // The last body runs to the end of the content.
            $nextOffset = $offsets[$index + 1] ?? \strlen($body);
            $subContent = substr($body, $offset, (int) $nextOffset - (int) $offset);

            $subHeader = Header::parse($subContent, $document);
            $this->objects[$objectId] = PDFObject::factory($document, $subHeader, '', $this->config);
        }
    }

    /**
     * Builds a Header from a flat raw list of alternating name and value entries.
     *
     * Entries at even indexes supply the element name (their index 1), the
     * following entry supplies the type (index 0) and raw value (index 1).
     *
     * @throws \Exception
     */
    protected function parseHeader(array $structure, ?Document $document): Header
    {
        $elements = [];
        $count = \count($structure);

        for ($position = 0; $position < $count; $position += 2) {
            $name = $structure[$position][1];
            $type = $structure[$position + 1][0];
            $value = $structure[$position + 1][1];

            $elements[$name] = $this->parseHeaderElement($type, $value, $document);
        }

        return new Header($elements, $document);
    }

    /**
     * Converts one raw (type, value) pair into the matching element object.
     *
     * @param string|null  $type  raw type token, e.g. "<<", "numeric", "(", "/", "objref", "["
     * @param string|array $value raw value belonging to the token
     *
     * @return Element|Header|null null for tokens that carry no element ("endstream", "obj", "")
     *
     * @throws \Exception if the type token is not recognised
     */
    protected function parseHeaderElement(?string $type, $value, ?Document $document)
    {
        // An empty dictionary value is normalised to an empty array so that
        // parseHeader() can be called on it.
        $valueIsEmpty = null == $value || '' == $value || false == $value;
        if (('<<' === $type || '>>' === $type) && $valueIsEmpty) {
            $value = [];
        }

        switch ($type) {
            case '<<':
            case '>>':
                $header = $this->parseHeader($value, $document);
                PDFObject::factory($document, $header, null, $this->config);

                return $header;

            case 'numeric':
                return new ElementNumeric($value);

            case 'boolean':
                return new ElementBoolean($value);

            case 'null':
                return new ElementNull();

            case '(':
                // Try a date first; fall back to a plain string.
                if ($date = ElementDate::parse('('.$value.')', $document)) {
                    return $date;
                }

                return ElementString::parse('('.$value.')', $document);

            case '<':
                // Hexadecimal strings are decoded, then handled like "(" strings.
                return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);

            case '/':
                return ElementName::parse('/'.$value, $document);

            case 'ojbref': // old mistake in tcpdf parser
            case 'objref':
                return new ElementXRef($value, $document);

            case '[':
                $values = [];

                if (\is_array($value)) {
                    foreach ($value as $sub_element) {
                        $sub_type = $sub_element[0];
                        $sub_value = $sub_element[1];
                        $values[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
                    }
                }

                return new ElementArray($values, $document);

            case 'endstream':
            case 'obj': // Ignored; the reason is not documented in this file.
            case '':
                // Nothing to do with.
                return null;

            default:
                throw new \Exception('Invalid type: "'.$type.'".');
        }
    }
}


1		<?php
2
3		/**
4		* @file
5		* This file is part of the PdfParser library.
6		*
7		* @author Sébastien MALOT <[email protected]>
8		*
9		* @date 2017-01-03
10		*
11		* @license LGPLv3
12		*
13		* @url <https://github.com/smalot/pdfparser>
14		*
15		* PdfParser is a pdf library written in PHP, extraction oriented.
16		* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17		*
18		* This program is free software: you can redistribute it and/or modify
19		* it under the terms of the GNU Lesser General Public License as published by
20		* the Free Software Foundation, either version 3 of the License, or
21		* (at your option) any later version.
22		*
23		* This program is distributed in the hope that it will be useful,
24		* but WITHOUT ANY WARRANTY; without even the implied warranty of
25		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26		* GNU Lesser General Public License for more details.
27		*
28		* You should have received a copy of the GNU Lesser General Public License
29		* along with this program.
30		* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31		*/
32
33		namespace Smalot\PdfParser;
34
35		use Smalot\PdfParser\Element\ElementArray;
36		use Smalot\PdfParser\Element\ElementBoolean;
37		use Smalot\PdfParser\Element\ElementDate;
38		use Smalot\PdfParser\Element\ElementHexa;
39		use Smalot\PdfParser\Element\ElementName;
40		use Smalot\PdfParser\Element\ElementNull;
41		use Smalot\PdfParser\Element\ElementNumeric;
42		use Smalot\PdfParser\Element\ElementString;
43		use Smalot\PdfParser\Element\ElementXRef;
44		use Smalot\PdfParser\RawData\RawDataParser;
45
46		/**
47		* Class Parser
48		*/
49		class Parser
50		{
51		/**
52		* @var Config
53		*/
54		private $config;
55
56		/**
57		* @var PDFObject[]
58		*/
59		protected $objects = [];
60
61		protected $rawDataParser;
62
63	73	public function __construct($cfg = [], ?Config $config = null)
64		{
65	73	$this->config = $config ?: new Config();
66	73	$this->rawDataParser = new RawDataParser($cfg, $this->config);
67		}
68
69	1	public function getConfig(): Config
70		{
71	1	return $this->config;
72		}
73
74		/**
75		* @throws \Exception
76		*/
77	68	public function parseFile(string $filename): Document
78		{
79	68	$content = file_get_contents($filename);
80
81		/*
82		* 2018/06/20 @doganoo as multiple times a
83		* users have complained that the parseFile()
84		* method dies silently, it is an better option
85		* to remove the error control operator (@) and
86		* let the users know that the method throws an exception
87		* by adding @throws tag to PHPDoc.
88		*
89		* See here for an example: https://github.com/smalot/pdfparser/issues/204
90		*/
91	68	return $this->parseContent($content);
92		}
93
94		/**
95		* @param string $content PDF content to parse
96		*
97		* @throws \Exception if secured PDF file was detected
98		* @throws \Exception if no object list was found
99		*/
100	68	public function parseContent(string $content): Document
101		{
102		// Create structure from raw data.
103	68	list($xref, $data) = $this->rawDataParser->parseData($content);
104
105	67	if (isset($xref['trailer']['encrypt']) && false === $this->config->getIgnoreEncryption()) {
106	1	throw new \Exception('Secured pdf file are currently not supported.');
107		}
108
109	66	if (empty($data)) {
110		throw new \Exception('Object list not found. Possible secured file.');
111		}
112
113		// Create destination object.
114	66	$document = new Document();
115	66	$this->objects = [];
116
117	66	foreach ($data as $id => $structure) {
118	66	$this->parseObject($id, $structure, $document);
119	66	unset($data[$id]);
120		}
121
122	66	$document->setTrailer($this->parseTrailer($xref['trailer'], $document));
123	66	$document->setObjects($this->objects);
124
125	66	return $document;
126		}
127
128	66	protected function parseTrailer(array $structure, ?Document $document)
129		{
130	66	$trailer = [];
131
132	66	foreach ($structure as $name => $values) {
133	66	$name = ucfirst($name);
134
135	66	if (is_numeric($values)) {
136	66	$trailer[$name] = new ElementNumeric($values);
137	66	} elseif (\is_array($values)) {
138	53	$value = $this->parseTrailer($values, null);
139	53	$trailer[$name] = new ElementArray($value, null);
140	66	} elseif (false !== strpos($values, '_')) {
141	66	$trailer[$name] = new ElementXRef($values, $document);
142		} else {
143	53	$trailer[$name] = $this->parseHeaderElement('(', $values, $document);
144		}
145		}
146
147	66	return new Header($trailer, $document);
148		}
149
150	67	protected function parseObject(string $id, array $structure, ?Document $document)
151		{
152	67	$header = new Header([], $document);
153	67	$content = '';
154
155	67	foreach ($structure as $position => $part) {
156	67	if (\is_int($part)) {
157		$part = [null, null];
158		}
159	67	switch ($part[0]) {
160	67	case '[':
161	27	$elements = [];
162
163	27	foreach ($part[1] as $sub_element) {
164	27	$sub_type = $sub_element[0];
165	27	$sub_value = $sub_element[1];
166	27	$elements[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
167		}
168
169	27	$header = new Header($elements, $document);
170	27	break;
171
172	67	case '<<':
173	67	$header = $this->parseHeader($part[1], $document);
174	67	break;
175
176	67	case 'stream':
177	67	$content = isset($part[3][0]) ? $part[3][0] : $part[1];
178
179	67	if ($header->get('Type')->equals('ObjStm')) {
180	17	$match = [];
181
182		// Split xrefs and contents.
183	17	preg_match('/^((\d+\s+\d+\s*)*)(.*)$/s', $content, $match);
184	17	$content = $match[3];
185
186		// Extract xrefs.
187	17	$xrefs = preg_split(
188	17	'/(\d+\s+\d+\s*)/s',
189	17	$match[1],
190	17	-1,
191	17	\PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
192	17	);
193	17	$table = [];
194
195	17	foreach ($xrefs as $xref) {
196	17	list($id, $position) = preg_split("/\s+/", trim($xref));
197	17	$table[$position] = $id;
198		}
199
200	17	ksort($table);
201
202	17	$ids = array_values($table);
203	17	$positions = array_keys($table);
204
205	17	foreach ($positions as $index => $position) {
		Issue (Bug / Comprehensibility, introduced 2020-05-26 20:14 UTC): `$position` is overwriting a variable from the outer foreach loop.
206	17	$id = $ids[$index].'_0';
207	17	$next_position = isset($positions[$index + 1]) ? $positions[$index + 1] : \strlen($content);
208	17	$sub_content = substr($content, $position, (int) $next_position - (int) $position);
209
210	17	$sub_header = Header::parse($sub_content, $document);
211	17	$object = PDFObject::factory($document, $sub_header, '', $this->config);
212	17	$this->objects[$id] = $object;
213		}
214
215		// It is not necessary to store this content.
216
217	17	return;
218	66	} elseif ($header->get('Type')->equals('Metadata')) {
219		// Attempt to parse XMP XML Metadata
220	41	$document->extractXMPMetadata($content);
221		}
222	66	break;
223
224		default:
225	66	if ('null' != $part) {
226	66	$element = $this->parseHeaderElement($part[0], $part[1], $document);
227
228	66	if ($element) {
229	29	$header = new Header([$element], $document);
230		}
231		}
232	66	break;
233		}
234		}
235
236	66	if (!isset($this->objects[$id])) {
237	66	$this->objects[$id] = PDFObject::factory($document, $header, $content, $this->config);
238		}
239		}
240
241		/**
242		* @throws \Exception
243		*/
244	67	protected function parseHeader(array $structure, ?Document $document): Header
245		{
246	67	$elements = [];
247	67	$count = \count($structure);
248
249	67	for ($position = 0; $position < $count; $position += 2) {
250	67	$name = $structure[$position][1];
251	67	$type = $structure[$position + 1][0];
252	67	$value = $structure[$position + 1][1];
253
254	67	$elements[$name] = $this->parseHeaderElement($type, $value, $document);
255		}
256
257	67	return new Header($elements, $document);
258		}
259
260		/**
261		* @param string|array $value
262		*
263		* @return Element|Header|null
264		*
265		* @throws \Exception
266		*/
267	67	protected function parseHeaderElement(?string $type, $value, ?Document $document)
268		{
269	67	$valueIsEmpty = null == $value || '' == $value || false == $value;
270	67	if (('<<' === $type || '>>' === $type) && $valueIsEmpty) {
271	13	$value = [];
272		}
273
274		switch ($type) {
275	67	case '<<':
276	67	case '>>':
277	66	$header = $this->parseHeader($value, $document);
278	66	PDFObject::factory($document, $header, null, $this->config);
279
280	66	return $header;
281
282	67	case 'numeric':
283	66	return new ElementNumeric($value);
284
285	67	case 'boolean':
286	30	return new ElementBoolean($value);
287
288	67	case 'null':
289	11	return new ElementNull();
290
291	67	case '(':
292	65	if ($date = ElementDate::parse('('.$value.')', $document)) {
293	53	return $date;
294		}
295
296	65	return ElementString::parse('('.$value.')', $document);
297
298	67	case '<':
299	31	return $this->parseHeaderElement('(', ElementHexa::decode($value), $document);
300
301	67	case '/':
302	67	return ElementName::parse('/'.$value, $document);
303
304	66	case 'ojbref': // old mistake in tcpdf parser
305	66	case 'objref':
306	66	return new ElementXRef($value, $document);
307
308	66	case '[':
309	66	$values = [];
310
311	66	if (\is_array($value)) {
312	66	foreach ($value as $sub_element) {
313	66	$sub_type = $sub_element[0];
314	66	$sub_value = $sub_element[1];
315	66	$values[] = $this->parseHeaderElement($sub_type, $sub_value, $document);
316		}
317		}
318
319	66	return new ElementArray($values, $document);
320
321	66	case 'endstream':
322	1	case 'obj': // I don't know what it means but got my project fixed.
323		case '':
324		// Nothing to do with.
325	66	return null;
326
327		default:
328		throw new \Exception('Invalid type: "'.$type.'".');
329		}
330		}
331		}
332

smalot / pdfparser

Issues (82)

src/Smalot/PdfParser/Parser.php (1 issue)

Showing only issues like (Show All)

Introduced By

Duplication Side-by-Side

Filter issues like