RawDataParserTest::testGetXrefDataIssue673() - Code Metrics - Inspection of "Account for inaccurate offsets in getXrefData() (#..." - smalot/pdfparser - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( ed3fc0...fb77ea )

by Konrad

created 2024-04-02 06:27 UTC

RawDataParserTest::testGetXrefDataIssue673() A

↳ Parent: RawDataParserTest

Complexity

Conditions	1
Paths	1

Size

Total Lines	10
Code Lines	5

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	1
eloc	5
c	1
b	0
f	0
nc	1
nop	0
dl	0
loc	10
rs	10

<?php

/**
 * @file This file is part of the PdfParser library.
 *
 * @author  Konrad Abicht <[email protected]>
 *
 * @date    2020-06-01
 *
 * @author  Sébastien MALOT <[email protected]>
 *
 * @date    2017-01-03
 *
 * @license LGPLv3
 *
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */

namespace PHPUnitTests\Integration\RawData;

use PHPUnitTests\TestCase;
use Smalot\PdfParser\Config;
use Smalot\PdfParser\RawData\RawDataParser;

class RawDataParserHelper extends RawDataParser
{
    /**
     * Test-only bridge to the inherited "getRawObject" method, which is not
     * publicly callable on RawDataParser itself.
     *
     * Both arguments are handed to "getRawObject" unchanged and its result
     * is returned as-is.
     *
     * @param mixed $pdfData raw PDF data to read from
     * @param mixed $offset  position within $pdfData to start at (defaults to 0)
     *
     * @return mixed whatever "getRawObject" returns for the given input
     */
    public function exposeGetRawObject($pdfData, $offset = 0)
    {
        $rawObject = $this->getRawObject($pdfData, $offset);

        return $rawObject;
    }
}

class RawDataParserTest extends TestCase
{
    /**
     * Builds the parser helper used by the getRawObject tests.
     *
     * The helper subclass is used instead of RawDataParser directly because
     * it offers "exposeGetRawObject" as a public entry point.
     */
    protected function setUp(): void
    {
        parent::setUp();

        $this->fixture = new RawDataParserHelper([], new Config());
    }

    /**
     * Tests buggy behavior of getRawObject.
     *
     * When PDF has corrupted xref table getRawObject may run into an infinite loop.
     *
     * @see https://github.com/smalot/pdfparser/issues/372
     * @see https://github.com/smalot/pdfparser/pull/377
     */
    public function testGetRawObjectIssue372(): void
    {
        // The following $data content is a minimal example to trigger the infinite loop
        $data = '<</Producer (eDkºãa˜þõ‚LÅòÕ�PïÙ��)©)>>';

        // calling "getRawObject" via "exposeGetRawObject" would result in an infinite loop
        // if the fix is not there.
        $result = $this->fixture->exposeGetRawObject($data);

        $this->assertEquals(
            [
                '<<',
                [
                    ['/', 'Producer', 11],
                    ['(', 'eDkºãa˜þõ‚LÅòÕ�PïÙ��', 52],
                ],
                52,
            ],
            $result
        );
    }

    /**
     * Tests that spaces after a 'stream' declaration are absorbed by getRawObject.
     *
     * Kept separate from the issue 372 test so that a failure in one scenario
     * does not hide the result of the other.
     *
     * @see https://github.com/smalot/pdfparser/issues/641
     */
    public function testGetRawObjectIssue641(): void
    {
        // Note the trailing space between the 'stream' keyword and the line break
        $data = 'stream '."\n";
        $data .= 'streamdata'."\n";
        $data .= 'endstream'."\n";
        $data .= 'endobj';

        $result = $this->fixture->exposeGetRawObject($data);

        // Value 'streamdata'."\n" would be empty string without the fix
        $this->assertEquals(
            [
                'stream',
                'streamdata'."\n",
                19,
            ],
            $result
        );
    }

    /**
     * Tests buggy behavior of decodeXrefStream.
     *
     * Parses the sample document for issue 356 and checks that the text of
     * its first page contains the expected Greek phrase.
     *
     * @see https://github.com/smalot/pdfparser/issues/30
     * @see https://github.com/smalot/pdfparser/issues/192
     * @see https://github.com/smalot/pdfparser/issues/209
     * @see https://github.com/smalot/pdfparser/issues/330
     * @see https://github.com/smalot/pdfparser/issues/356
     * @see https://github.com/smalot/pdfparser/issues/373
     * @see https://github.com/smalot/pdfparser/issues/392
     * @see https://github.com/smalot/pdfparser/issues/397
     */
    public function testDecodeXrefStreamIssue356(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue356.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        $this->assertStringContainsString('Ημερήσια έκθεση επιδημιολογικής', $pages[0]->getText());
    }

    /**
     * Regression test based on the sample document "Issue405.pdf".
     *
     * Parses the document and checks that the text of its first page
     * contains the marker string 'Bug fix: PR #405'.
     */
    public function testDecodeObjectHeaderIssue405(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue405.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        $this->assertStringContainsString('Bug fix: PR #405', $pages[0]->getText());
    }

    /**
     * Tests buggy behavior of decodeXrefStream.
     *
     * When PDF has more than one entry in the /Index area (for example by changing
     * the document description), only the first entry is used.
     * If the fix is not used the array returned by getDetails() contains only the entry
     * with the key 'Pages'. All other entries like 'Author', 'Creator', 'Title',
     * 'Subject' (which come from the 'Info' object) are not listed, because the
     * 'Info' object gets a wrong object id during parsing the data into the xref structure.
     * So the object id listed at the /Info entry is not valid and the data of the info object
     * cannot be loaded during executing Document::buildDetails().
     *
     * @see https://github.com/smalot/pdfparser/pull/479
     */
    public function testDecodeXrefStreamIssue479(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue479.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $details = $document->getDetails();

        // Every one of these keys is expected to be present in the document details
        $this->assertArrayHasKey('Author', $details);
        $this->assertArrayHasKey('CreationDate', $details);
        $this->assertArrayHasKey('Creator', $details);
        $this->assertArrayHasKey('ModDate', $details);
        $this->assertArrayHasKey('Producer', $details);
        $this->assertArrayHasKey('Subject', $details);
        $this->assertArrayHasKey('Title', $details);
    }

    /**
     * Account for inaccurate offset values in getXrefData.
     *
     * Normally offset values extracted from the PDF document are exact.
     * However in some cases, they may point to whitespace *before* a
     * valid xref keyword. Move the offset forward past whitespace to
     * make this function a little more lenient.
     *
     * @see https://github.com/smalot/pdfparser/issues/673
     */
    public function testGetXrefDataIssue673(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue673.pdf';

        // Parsing this document would previously throw an Exception
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $text = $document->getText();

        // Uses $this-> like every other assertion in this class
        $this->assertStringContainsString('6 rue des Goutais', $text);
    }
}


1			<?php
2
3			/**
4			* @file This file is part of the PdfParser library.
5			*
6			* @author Konrad Abicht <[email protected]>
7			*
8			* @date 2020-06-01
9			*
10			* @author Sébastien MALOT <[email protected]>
11			*
12			* @date 2017-01-03
13			*
14			* @license LGPLv3
15			*
16			* @url <https://github.com/smalot/pdfparser>
17			*
18			* PdfParser is a pdf library written in PHP, extraction oriented.
19			* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
20			*
21			* This program is free software: you can redistribute it and/or modify
22			* it under the terms of the GNU Lesser General Public License as published by
23			* the Free Software Foundation, either version 3 of the License, or
24			* (at your option) any later version.
25			*
26			* This program is distributed in the hope that it will be useful,
27			* but WITHOUT ANY WARRANTY; without even the implied warranty of
28			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29			* GNU Lesser General Public License for more details.
30			*
31			* You should have received a copy of the GNU Lesser General Public License
32			* along with this program.
33			* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
34			*/
35
36			namespace PHPUnitTests\Integration\RawData;
37
38			use PHPUnitTests\TestCase;
39			use Smalot\PdfParser\Config;
40			use Smalot\PdfParser\RawData\RawDataParser;
41
42			class RawDataParserHelper extends RawDataParser
43			{
44			/**
45			* Expose protected function "getRawObject".
46			*/
47			public function exposeGetRawObject($pdfData, $offset = 0)
48			{
49			return $this->getRawObject($pdfData, $offset);
50			}
51			}
52
53			class RawDataParserTest extends TestCase
54			{
55			protected function setUp(): void
56			{
57			parent::setUp();
58
59			$this->fixture = new RawDataParserHelper([], new Config());
60			}
61
62			/**
63			* Tests buggy behavior of getRawObject.
64			*
65			* When PDF has corrupted xref table getRawObject may run into an infinite loop.
66			*
67			* @see https://github.com/smalot/pdfparser/issues/372
68			* @see https://github.com/smalot/pdfparser/pull/377
69			*/
70			public function testGetRawObjectIssue372(): void
71			{
72			// The following $data content is a minimal example to trigger the infinite loop
73			$data = '<</Producer (eDkºãa˜þõ‚LÅòÕ�PïÙ��)©)>>';
74
75			// calling "getRawObject" via "exposeGetRawObject" would result in an infinite loop
76			// if the fix is not there.
77			$result = $this->fixture->exposeGetRawObject($data);
78
79			$this->assertEquals(
80			[
81			'<<',
82			[
83			['/', 'Producer', 11],
84			['(', 'eDkºãa˜þõ‚LÅòÕ�PïÙ��', 52],
85			],
86			52,
87			],
88			$result
89			);
90
91			// Test that spaces after a 'stream' declaration are absorbed
92			// See: https://github.com/smalot/pdfparser/issues/641
93			$data = 'stream '."\n";
94			$data .= 'streamdata'."\n";
95			$data .= 'endstream'."\n";
96			$data .= 'endobj';
97
98			$result = $this->fixture->exposeGetRawObject($data);
99
100			// Value 'streamdata'."\n" would be empty string without the fix
101			$this->assertEquals(
102			[
103			'stream',
104			'streamdata'."\n",
105			19,
106			],
107			$result
108			);
109			}
110
111			/**
112			* Tests buggy behavior of decodeXrefStream.
113			*
114			* @see https://github.com/smalot/pdfparser/issues/30
115			* @see https://github.com/smalot/pdfparser/issues/192
116			* @see https://github.com/smalot/pdfparser/issues/209
117			* @see https://github.com/smalot/pdfparser/issues/330
118			* @see https://github.com/smalot/pdfparser/issues/356
119			* @see https://github.com/smalot/pdfparser/issues/373
120			* @see https://github.com/smalot/pdfparser/issues/392
121			* @see https://github.com/smalot/pdfparser/issues/397
122			*/
123			public function testDecodeXrefStreamIssue356(): void
124			{
125			$filename = $this->rootDir.'/samples/bugs/Issue356.pdf';
126
127			$parser = $this->getParserInstance();
128			$document = $parser->parseFile($filename);
129			$pages = $document->getPages();
130
131			$this->assertStringContainsString('Ημερήσια έκθεση επιδημιολογικής', $pages[0]->getText());
132			}
133
134			public function testDecodeObjectHeaderIssue405(): void
135			{
136			$filename = $this->rootDir.'/samples/bugs/Issue405.pdf';
137
138			$parser = $this->getParserInstance();
139			$document = $parser->parseFile($filename);
140			$pages = $document->getPages();
141
142			$this->assertStringContainsString('Bug fix: PR #405', $pages[0]->getText());
143			}
144
145			/**
146			* Tests buggy behavior of decodeXrefStream.
147			*
148			* When PDF has more than one entry in the /Index area (for example by changing
149			* the document description), only the first entry is used.
150			* If the fix is not used the array returned by getDetails() contains only the entry
151			* with the key 'Pages'. All other entries like 'Author', 'Creator', 'Title',
152			* 'Subject' (which come from the 'Info' object) are not listed, because the
153			* 'Info' object gets a wrong object id during parsing the data into the xref structure.
154			* So the object id listed at the /Info entry is not valid and the data of the info object
155			* cannot be loaded during executing Document::buildDetails().
156			*
157			* @see https://github.com/smalot/pdfparser/pull/479
158			*/
159			public function testDecodeXrefStreamIssue479(): void
160			{
161			$filename = $this->rootDir.'/samples/bugs/Issue479.pdf';
162
163			$parser = $this->getParserInstance();
164			$document = $parser->parseFile($filename);
165			$details = $document->getDetails();
166
167			$this->assertArrayHasKey('Author', $details);
168			$this->assertArrayHasKey('CreationDate', $details);
169			$this->assertArrayHasKey('Creator', $details);
170			$this->assertArrayHasKey('ModDate', $details);
171			$this->assertArrayHasKey('Producer', $details);
172			$this->assertArrayHasKey('Subject', $details);
173			$this->assertArrayHasKey('Title', $details);
174			}
175
176			/**
177			* Account for inaccurate offset values in getXrefData.
178			*
179			* Normally offset values extracted from the PDF document are exact.
180			* However in some cases, they may point to whitespace before a
181			* valid xref keyword. Move the offset forward past whitespace to
182			* make this function a little more lenient.
183			*
184			* @see https://github.com/smalot/pdfparser/issues/673
185			*/
186			public function testGetXrefDataIssue673(): void
187			{
188			$filename = $this->rootDir.'/samples/bugs/Issue673.pdf';
189
190			// Parsing this document would previously throw an Exception
191			$parser = $this->getParserInstance();
192			$document = $parser->parseFile($filename);
193			$text = $document->getText();
194
195			self::assertStringContainsString('6 rue des Goutais', $text);
196			}
197			}
198

smalot / pdfparser

Push — master ( ed3fc0...fb77ea )

RawDataParserTest::testGetXrefDataIssue673() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like