RawDataParserTest::testGetXrefDataTracksVisitedOffsets() - Code Metrics - Inspection of "Fix memory exhaustion caused by circular reference..." - smalot/pdfparser - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Push — master ( 6b52c6...61c9bc )

by Konrad

created 2026-01-08 08:04 UTC

testGetXrefDataTracksVisitedOffsets() A

↳ Parent: RawDataParserTest

Complexity

Conditions	1
Paths	1

Size

Total Lines	19
Code Lines	12

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	1
eloc	12
c	1
b	0
f	0
nc	1
nop	0
dl	0
loc	19
rs	9.8666

<?php

/**
 * @file This file is part of the PdfParser library.
 *
 * @author  Konrad Abicht <[email protected]>
 *
 * @date    2020-06-01
 *
 * @author  Sébastien MALOT <[email protected]>
 *
 * @date    2017-01-03
 *
 * @license LGPLv3
 *
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */

namespace PHPUnitTests\Integration\RawData;

use PHPUnitTests\TestCase;
use Smalot\PdfParser\Config;
use Smalot\PdfParser\RawData\RawDataParser;

/**
 * Test helper that widens the visibility of selected protected
 * RawDataParser methods so the tests in this file can call them directly.
 *
 * Every wrapper forwards its arguments unchanged to the parent method and
 * returns the parent's result as-is.
 */
class RawDataParserHelper extends RawDataParser
{
    /**
     * Expose protected function "getRawObject".
     *
     * @param string $pdfData raw PDF data handed to getRawObject
     * @param int    $offset  start position inside $pdfData (defaults to 0)
     *
     * @return array result of getRawObject; the tests in this file expect a
     *               three element list (presumably type, value and end offset)
     */
    public function exposeGetRawObject(string $pdfData, int $offset = 0): array
    {
        return $this->getRawObject($pdfData, $offset);
    }

    /**
     * Expose protected function "getXrefData".
     *
     * @param string $pdfData        raw PDF data handed to getXrefData
     * @param int    $offset         xref offset to read from
     * @param array  $xref           xref data collected so far
     * @param array  $visitedOffsets offsets that count as already processed
     *
     * @return array result of getXrefData
     */
    public function exposeGetXrefData(string $pdfData, int $offset = 0, array $xref = [], array $visitedOffsets = []): array
    {
        return $this->getXrefData($pdfData, $offset, $xref, $visitedOffsets);
    }

    /**
     * Expose protected function "decodeXref".
     *
     * @param string $pdfData        raw PDF data handed to decodeXref
     * @param int    $startxref      offset of the xref section to decode
     * @param array  $xref           xref data collected so far
     * @param array  $visitedOffsets offsets that count as already processed
     *
     * @return array result of decodeXref
     */
    public function exposeDecodeXref(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
    {
        return $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets);
    }

    /**
     * Expose protected function "decodeXrefStream".
     *
     * @param string $pdfData        raw PDF data handed to decodeXrefStream
     * @param int    $startxref      offset of the xref stream to decode
     * @param array  $xref           xref data collected so far
     * @param array  $visitedOffsets offsets that count as already processed
     *
     * @return array result of decodeXrefStream
     */
    public function exposeDecodeXrefStream(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
    {
        return $this->decodeXrefStream($pdfData, $startxref, $xref, $visitedOffsets);
    }
}

/**
 * Regression tests for RawDataParser.
 *
 * Protected parser methods are reached through RawDataParserHelper; the
 * file based tests parse sample documents from "samples/bugs".
 *
 * NOTE(review): $this->fixture, $this->rootDir and getParserInstance() are not
 * declared in this file; presumably they come from PHPUnitTests\TestCase.
 */
class RawDataParserTest extends TestCase
{
    /**
     * Creates a fresh RawDataParserHelper (empty first argument, default Config)
     * for every test.
     */
    protected function setUp(): void
    {
        parent::setUp();

        $this->fixture = new RawDataParserHelper([], new Config());
    }

    /**
     * Tests buggy behavior of getRawObject.
     *
     * When PDF has corrupted xref table getRawObject may run into an infinite loop.
     *
     * @see https://github.com/smalot/pdfparser/issues/372
     * @see https://github.com/smalot/pdfparser/pull/377
     */
    public function testGetRawObjectIssue372(): void
    {
        // The following $data content is a minimal example to trigger the infinite loop
        $data = '<</Producer (eDkºãa˜þõ‚LÅòÕ�PïÙ��)©)>>';

        // calling "getRawObject" via "exposeGetRawObject" would result in an infinite loop
        // if the fix is not there.
        $result = $this->fixture->exposeGetRawObject($data);

        self::assertEquals(
            [
                '<<',
                [
                    ['/', 'Producer', 11],
                    ['(', 'eDkºãa˜þõ‚LÅòÕ�PïÙ��', 52],
                ],
                52,
            ],
            $result
        );

        // Test that spaces after a 'stream' declaration are absorbed
        // See: https://github.com/smalot/pdfparser/issues/641
        $data = 'stream '."\n";
        $data .= 'streamdata'."\n";
        $data .= 'endstream'."\n";
        $data .= 'endobj';

        $result = $this->fixture->exposeGetRawObject($data);

        // Value 'streamdata'."\n" would be empty string without the fix
        self::assertEquals(
            [
                'stream',
                'streamdata'."\n",
                19,
            ],
            $result
        );
    }

    /**
     * Tests buggy behavior of decodeXrefStream.
     *
     * @see https://github.com/smalot/pdfparser/issues/30
     * @see https://github.com/smalot/pdfparser/issues/192
     * @see https://github.com/smalot/pdfparser/issues/209
     * @see https://github.com/smalot/pdfparser/issues/330
     * @see https://github.com/smalot/pdfparser/issues/356
     * @see https://github.com/smalot/pdfparser/issues/373
     * @see https://github.com/smalot/pdfparser/issues/392
     * @see https://github.com/smalot/pdfparser/issues/397
     */
    public function testDecodeXrefStreamIssue356(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue356.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        self::assertStringContainsString('Ημερήσια έκθεση επιδημιολογικής', $pages[0]->getText());
    }

    /**
     * Parses the sample document for PR #405 and checks the text of its first page.
     *
     * Judging by the test name this guards the decoding of object headers.
     *
     * @see https://github.com/smalot/pdfparser/pull/405
     */
    public function testDecodeObjectHeaderIssue405(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue405.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        self::assertStringContainsString('Bug fix: PR #405', $pages[0]->getText());
    }

    /**
     * Tests buggy behavior of decodeXrefStream.
     *
     * When PDF has more than one entry in the /Index area (for example by changing
     * the document description), only the first entry is used.
     * If the fix is not used the array returned by getDetails() contains only the entry
     * with the key 'Pages'. All other entries like 'Author', 'Creator', 'Title',
     * 'Subject' (which come from the 'Info' object) are not listed, because the
     * 'Info' object gets a wrong object id during parsing the data into the xref structure.
     * So the object id listed at the /Info entry is not valid and the data of the info object
     * cannot be loaded during executing Document::buildDetails().
     *
     * @see https://github.com/smalot/pdfparser/pull/479
     */
    public function testDecodeXrefStreamIssue479(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue479.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $details = $document->getDetails();

        self::assertArrayHasKey('Author', $details);
        self::assertArrayHasKey('CreationDate', $details);
        self::assertArrayHasKey('Creator', $details);
        self::assertArrayHasKey('ModDate', $details);
        self::assertArrayHasKey('Producer', $details);
        self::assertArrayHasKey('Subject', $details);
        self::assertArrayHasKey('Title', $details);
    }

    /**
     * Account for inaccurate offset values in getXrefData.
     *
     * Normally offset values extracted from the PDF document are exact.
     * However in some cases, they may point to whitespace *before* a
     * valid xref keyword. Move the offset forward past whitespace to
     * make this function a little more lenient.
     *
     * @see https://github.com/smalot/pdfparser/issues/673
     */
    public function testGetXrefDataIssue673(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue673.pdf';

        // Parsing this document would previously throw an Exception
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $text = $document->getText();

        self::assertStringContainsString('6 rue des Goutais', $text);
    }

    /**
     * Handle self referencing xref
     *
     * It seems that some PDF creators output `Prev 0` when there is no previous xref.
     *
     * @see https://github.com/smalot/pdfparser/pull/727
     */
    public function testDecodeXrefIssue727(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue727.pdf';

        // Parsing this document would previously cause an infinite loop
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $text = $document->getText();

        // The real check is that parsing terminates at all. No particular text is
        // expected, so only the result type is asserted (an empty-needle
        // "contains" assertion would pass for any string and hide that intent).
        self::assertIsString($text);
    }

    /**
     * Test that getXrefData prevents circular references
     *
     * When a PDF has circular references in xref chain (e.g., Prev pointing to already visited offset),
     * the parser should detect this and stop recursion to prevent infinite loops.
     */
    public function testGetXrefDataPreventsCircularReferences(): void
    {
        // Create a minimal PDF structure with xref that would create a circular reference.
        // The header line "%PDF-1.5\n" is 9 bytes long, so the xref keyword starts at offset 9.
        $pdfData = "%PDF-1.5\n";
        $pdfData .= "xref\n";
        $pdfData .= "0 1\n";
        $pdfData .= "0000000000 65535 f \n";
        $pdfData .= "trailer\n";
        $pdfData .= "<</Size 1/Prev 9>>\n";  // Prev points back to offset 9 (the xref keyword)
        $pdfData .= "startxref\n";
        $pdfData .= "9\n";
        $pdfData .= "%%EOF\n";

        // Test with visitedOffsets containing the offset we're trying to visit
        $result = $this->fixture->exposeGetXrefData($pdfData, 9, [], [9]);

        // Should return the (empty) xref array that was passed in, without recursing
        self::assertSame([], $result);
    }

    /**
     * Test that decodeXref passes visitedOffsets correctly when handling Prev
     *
     * This ensures that circular reference detection works when decodeXref
     * calls getXrefData for a Prev pointer.
     */
    public function testDecodeXrefPassesVisitedOffsets(): void
    {
        // Create a minimal xref structure with Prev
        $pdfData = "xref\n";
        $pdfData .= "0 1\n";
        $pdfData .= "0000000000 65535 f \n";
        $pdfData .= "trailer\n";
        $pdfData .= "<</Size 1/Prev 100>>\n";

        // Call decodeXref with visitedOffsets that includes the Prev offset
        // This should not cause infinite recursion
        $result = $this->fixture->exposeDecodeXref($pdfData, 0, [], [100]);

        // Should complete without error and return an array
        self::assertIsArray($result);
        self::assertArrayHasKey('trailer', $result);
    }

    /**
     * Test that getXrefData skips any offset listed in visitedOffsets
     *
     * The offset used here (50) does not point at an xref keyword; it is only
     * present in visitedOffsets. The call must therefore return the xref array
     * it was given without trying to parse anything at that position.
     *
     * NOTE(review): this does not observe offsets being *added* to
     * visitedOffsets; it only covers the early return for a listed offset.
     */
    public function testGetXrefDataTracksVisitedOffsets(): void
    {
        // Test that calling with an already-visited offset returns immediately.
        // "%PDF-1.5\n" is 9 bytes long, hence startxref 9 for the xref keyword.
        $pdfData = "%PDF-1.5\n";
        $pdfData .= "xref\n";
        $pdfData .= "0 1\n";
        $pdfData .= "0000000000 65535 f \n";
        $pdfData .= "trailer\n";
        $pdfData .= "<</Size 1>>\n";
        $pdfData .= "startxref\n";
        $pdfData .= "9\n";
        $pdfData .= "%%EOF\n";

        // Call with offset 50 already in visitedOffsets - should return immediately
        $result = $this->fixture->exposeGetXrefData($pdfData, 50, [], [50]);

        // Should return empty array without processing
        self::assertSame([], $result);
    }
}


1			<?php
2
3			/**
4			* @file This file is part of the PdfParser library.
5			*
6			* @author Konrad Abicht <[email protected]>
7			*
8			* @date 2020-06-01
9			*
10			* @author Sébastien MALOT <[email protected]>
11			*
12			* @date 2017-01-03
13			*
14			* @license LGPLv3
15			*
16			* @url <https://github.com/smalot/pdfparser>
17			*
18			* PdfParser is a pdf library written in PHP, extraction oriented.
19			* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
20			*
21			* This program is free software: you can redistribute it and/or modify
22			* it under the terms of the GNU Lesser General Public License as published by
23			* the Free Software Foundation, either version 3 of the License, or
24			* (at your option) any later version.
25			*
26			* This program is distributed in the hope that it will be useful,
27			* but WITHOUT ANY WARRANTY; without even the implied warranty of
28			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29			* GNU Lesser General Public License for more details.
30			*
31			* You should have received a copy of the GNU Lesser General Public License
32			* along with this program.
33			* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
34			*/
35
36			namespace PHPUnitTests\Integration\RawData;
37
38			use PHPUnitTests\TestCase;
39			use Smalot\PdfParser\Config;
40			use Smalot\PdfParser\RawData\RawDataParser;
41
42			class RawDataParserHelper extends RawDataParser
43			{
44			/**
45			* Expose protected function "getRawObject".
46			*/
47			public function exposeGetRawObject($pdfData, $offset = 0)
48			{
49			return $this->getRawObject($pdfData, $offset);
50			}
51
52			/**
53			* Expose protected function "getXrefData".
54			*/
55			public function exposeGetXrefData(string $pdfData, int $offset = 0, array $xref = [], array $visitedOffsets = []): array
56			{
57			return $this->getXrefData($pdfData, $offset, $xref, $visitedOffsets);
58			}
59
60			/**
61			* Expose protected function "decodeXref".
62			*/
63			public function exposeDecodeXref(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
64			{
65			return $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets);
66			}
67
68			/**
69			* Expose protected function "decodeXrefStream".
70			*/
71			public function exposeDecodeXrefStream(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
72			{
73			return $this->decodeXrefStream($pdfData, $startxref, $xref, $visitedOffsets);
74			}
75			}
76
77			class RawDataParserTest extends TestCase
78			{
79			protected function setUp(): void
80			{
81			parent::setUp();
82
83			$this->fixture = new RawDataParserHelper([], new Config());
84			}
85
86			/**
87			* Tests buggy behavior of getRawObject.
88			*
89			* When PDF has corrupted xref table getRawObject may run into an infinite loop.
90			*
91			* @see https://github.com/smalot/pdfparser/issues/372
92			* @see https://github.com/smalot/pdfparser/pull/377
93			*/
94			public function testGetRawObjectIssue372(): void
95			{
96			// The following $data content is a minimal example to trigger the infinite loop
97			$data = '<</Producer (eDkºãa˜þõ‚LÅòÕ�PïÙ��)©)>>';
98
99			// calling "getRawObject" via "exposeGetRawObject" would result in an infinite loop
100			// if the fix is not there.
101			$result = $this->fixture->exposeGetRawObject($data);
102
103			$this->assertEquals(
104			[
105			'<<',
106			[
107			['/', 'Producer', 11],
108			['(', 'eDkºãa˜þõ‚LÅòÕ�PïÙ��', 52],
109			],
110			52,
111			],
112			$result
113			);
114
115			// Test that spaces after a 'stream' declaration are absorbed
116			// See: https://github.com/smalot/pdfparser/issues/641
117			$data = 'stream '."\n";
118			$data .= 'streamdata'."\n";
119			$data .= 'endstream'."\n";
120			$data .= 'endobj';
121
122			$result = $this->fixture->exposeGetRawObject($data);
123
124			// Value 'streamdata'."\n" would be empty string without the fix
125			$this->assertEquals(
126			[
127			'stream',
128			'streamdata'."\n",
129			19,
130			],
131			$result
132			);
133			}
134
135			/**
136			* Tests buggy behavior of decodeXrefStream.
137			*
138			* @see https://github.com/smalot/pdfparser/issues/30
139			* @see https://github.com/smalot/pdfparser/issues/192
140			* @see https://github.com/smalot/pdfparser/issues/209
141			* @see https://github.com/smalot/pdfparser/issues/330
142			* @see https://github.com/smalot/pdfparser/issues/356
143			* @see https://github.com/smalot/pdfparser/issues/373
144			* @see https://github.com/smalot/pdfparser/issues/392
145			* @see https://github.com/smalot/pdfparser/issues/397
146			*/
147			public function testDecodeXrefStreamIssue356(): void
148			{
149			$filename = $this->rootDir.'/samples/bugs/Issue356.pdf';
150
151			$parser = $this->getParserInstance();
152			$document = $parser->parseFile($filename);
153			$pages = $document->getPages();
154
155			$this->assertStringContainsString('Ημερήσια έκθεση επιδημιολογικής', $pages[0]->getText());
156			}
157
158			public function testDecodeObjectHeaderIssue405(): void
159			{
160			$filename = $this->rootDir.'/samples/bugs/Issue405.pdf';
161
162			$parser = $this->getParserInstance();
163			$document = $parser->parseFile($filename);
164			$pages = $document->getPages();
165
166			$this->assertStringContainsString('Bug fix: PR #405', $pages[0]->getText());
167			}
168
169			/**
170			* Tests buggy behavior of decodeXrefStream.
171			*
172			* When PDF has more than one entry in the /Index area (for example by changing
173			* the document description), only the first entry is used.
174			* If the fix is not used the array returned by getDetails() contains only the entry
175			* with the key 'Pages'. All other entries like 'Author', 'Creator', 'Title',
176			* 'Subject' (which come from the 'Info' object) are not listed, because the
177			* 'Info' object gets a wrong object id during parsing the data into the xref structure.
178			* So the object id listed at the /Info entry is not valid and the data of the info object
179			* cannot be loaded during executing Document::buildDetails().
180			*
181			* @see https://github.com/smalot/pdfparser/pull/479
182			*/
183			public function testDecodeXrefStreamIssue479(): void
184			{
185			$filename = $this->rootDir.'/samples/bugs/Issue479.pdf';
186
187			$parser = $this->getParserInstance();
188			$document = $parser->parseFile($filename);
189			$details = $document->getDetails();
190
191			$this->assertArrayHasKey('Author', $details);
192			$this->assertArrayHasKey('CreationDate', $details);
193			$this->assertArrayHasKey('Creator', $details);
194			$this->assertArrayHasKey('ModDate', $details);
195			$this->assertArrayHasKey('Producer', $details);
196			$this->assertArrayHasKey('Subject', $details);
197			$this->assertArrayHasKey('Title', $details);
198			}
199
200			/**
201			* Account for inaccurate offset values in getXrefData.
202			*
203			* Normally offset values extracted from the PDF document are exact.
204			* However in some cases, they may point to whitespace before a
205			* valid xref keyword. Move the offset forward past whitespace to
206			* make this function a little more lenient.
207			*
208			* @see https://github.com/smalot/pdfparser/issues/673
209			*/
210			public function testGetXrefDataIssue673(): void
211			{
212			$filename = $this->rootDir.'/samples/bugs/Issue673.pdf';
213
214			// Parsing this document would previously throw an Exception
215			$parser = $this->getParserInstance();
216			$document = $parser->parseFile($filename);
217			$text = $document->getText();
218
219			self::assertStringContainsString('6 rue des Goutais', $text);
220			}
221
222			/**
223			* Handle self referencing xref
224			*
225			* It seems that some PDF creators output `Prev 0` when there is no previous xref.
226			*
227			* @see https://github.com/smalot/pdfparser/pull/727
228			*/
229			public function testDecodeXrefIssue727(): void
230			{
231			$filename = $this->rootDir.'/samples/bugs/Issue727.pdf';
232
233			// Parsing this document would previously cause an infinite loop
234			$parser = $this->getParserInstance();
235			$document = $parser->parseFile($filename);
236			$text = $document->getText();
237
238			self::assertStringContainsString('', $text);
239			}
240
241			/**
242			* Test that getXrefData prevents circular references
243			*
244			* When a PDF has circular references in xref chain (e.g., Prev pointing to already visited offset),
245			* the parser should detect this and stop recursion to prevent infinite loops.
246			*/
247			public function testGetXrefDataPreventsCircularReferences(): void
248			{
249			// Create a minimal PDF structure with xref that would create a circular reference
250			$pdfData = "%PDF-1.5\n";
251			$pdfData .= "xref\n";
252			$pdfData .= "0 1\n";
253			$pdfData .= "0000000000 65535 f \n";
254			$pdfData .= "trailer\n";
255			$pdfData .= "<</Size 1/Prev 7>>\n"; // Prev points back to offset 7 (the xref keyword)
256			$pdfData .= "startxref\n";
257			$pdfData .= "7\n";
258			$pdfData .= "%%EOF\n";
259
260			// Test with visitedOffsets containing the offset we're trying to visit
261			$result = $this->fixture->exposeGetXrefData($pdfData, 7, [], [7]);
262
263			// Should return empty xref array without recursing
264			$this->assertIsArray($result);
265			$this->assertEmpty($result);
266			}
267
268			/**
269			* Test that decodeXref passes visitedOffsets correctly when handling Prev
270			*
271			* This ensures that circular reference detection works when decodeXref
272			* calls getXrefData for a Prev pointer.
273			*/
274			public function testDecodeXrefPassesVisitedOffsets(): void
275			{
276			// Create a minimal xref structure with Prev
277			$pdfData = "xref\n";
278			$pdfData .= "0 1\n";
279			$pdfData .= "0000000000 65535 f \n";
280			$pdfData .= "trailer\n";
281			$pdfData .= "<</Size 1/Prev 100>>\n";
282
283			// Call decodeXref with visitedOffsets that includes the Prev offset
284			// This should not cause infinite recursion
285			$result = $this->fixture->exposeDecodeXref($pdfData, 0, [], [100]);
286
287			// Should complete without error and return an array
288			$this->assertIsArray($result);
289			$this->assertArrayHasKey('trailer', $result);
290			}
291
292			/**
293			* Test that getXrefData tracks visited offsets correctly
294			*
295			* Ensures that offsets are added to visitedOffsets array to prevent
296			* circular references in subsequent calls.
297			*/
298			public function testGetXrefDataTracksVisitedOffsets(): void
299			{
300			// Test that calling with an already-visited offset returns immediately
301			$pdfData = "%PDF-1.5\n";
302			$pdfData .= "xref\n";
303			$pdfData .= "0 1\n";
304			$pdfData .= "0000000000 65535 f \n";
305			$pdfData .= "trailer\n";
306			$pdfData .= "<</Size 1>>\n";
307			$pdfData .= "startxref\n";
308			$pdfData .= "7\n";
309			$pdfData .= "%%EOF\n";
310
311			// Call with offset 50 already in visitedOffsets - should return immediately
312			$result = $this->fixture->exposeGetXrefData($pdfData, 50, [], [50]);
313
314			// Should return empty array without processing
315			$this->assertIsArray($result);
316			$this->assertEmpty($result);
317			}
318			}
319

smalot / pdfparser

Push — master ( 6b52c6...61c9bc )

testGetXrefDataTracksVisitedOffsets() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like