Test Failed
Push — master ( 6b52c6...61c9bc )
by Konrad
14:36
created

RawDataParserHelper::exposeGetXrefData()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
c 1
b 0
f 0
nc 1
nop 4
dl 0
loc 3
rs 10
1
<?php
2
3
/**
 * @file This file is part of the PdfParser library.
 *
 * @author  Konrad Abicht <[email protected]>
 *
 * @date    2020-06-01
 *
 * @author  Sébastien MALOT <[email protected]>
 *
 * @date    2017-01-03
 *
 * @license LGPLv3
 *
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */
35
36
namespace PHPUnitTests\Integration\RawData;
37
38
use PHPUnitTests\TestCase;
39
use Smalot\PdfParser\Config;
40
use Smalot\PdfParser\RawData\RawDataParser;
41
42
/**
 * Test-only subclass of RawDataParser.
 *
 * Each "expose*" method is a thin public wrapper that forwards its arguments
 * unchanged to the protected parser method of the same name, so the tests can
 * call those methods directly.
 */
class RawDataParserHelper extends RawDataParser
{
    /**
     * Expose protected function "getRawObject".
     *
     * @param string $pdfData raw PDF data to read from
     * @param int    $offset  position in $pdfData at which to start reading
     *
     * @return array result of getRawObject(); the tests in this file expect
     *               the shape [type, value, offset]
     */
    public function exposeGetRawObject(string $pdfData, int $offset = 0): array
    {
        return $this->getRawObject($pdfData, $offset);
    }

    /**
     * Expose protected function "getXrefData".
     *
     * @param string $pdfData        raw PDF data
     * @param int    $offset         offset handed to getXrefData()
     * @param array  $xref           xref data collected so far
     * @param array  $visitedOffsets offsets that were already processed
     *
     * @return array whatever getXrefData() returns
     */
    public function exposeGetXrefData(string $pdfData, int $offset = 0, array $xref = [], array $visitedOffsets = []): array
    {
        return $this->getXrefData($pdfData, $offset, $xref, $visitedOffsets);
    }

    /**
     * Expose protected function "decodeXref".
     *
     * @param string $pdfData        raw PDF data
     * @param int    $startxref      offset handed to decodeXref()
     * @param array  $xref           xref data collected so far
     * @param array  $visitedOffsets offsets that were already processed
     *
     * @return array whatever decodeXref() returns
     */
    public function exposeDecodeXref(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
    {
        return $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets);
    }

    /**
     * Expose protected function "decodeXrefStream".
     *
     * @param string $pdfData        raw PDF data
     * @param int    $startxref      offset handed to decodeXrefStream()
     * @param array  $xref           xref data collected so far
     * @param array  $visitedOffsets offsets that were already processed
     *
     * @return array whatever decodeXrefStream() returns
     */
    public function exposeDecodeXrefStream(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array
    {
        return $this->decodeXrefStream($pdfData, $startxref, $xref, $visitedOffsets);
    }
}
76
77
/**
 * Integration tests for RawDataParser.
 *
 * Protected parser methods are reached through RawDataParserHelper; the
 * remaining tests parse sample PDF files below $this->rootDir.
 */
class RawDataParserTest extends TestCase
{
    /**
     * Creates a fresh RawDataParserHelper for every test.
     */
    protected function setUp(): void
    {
        parent::setUp();

        // NOTE(review): $fixture is not declared in this class; presumably the
        // parent TestCase declares it - confirm.
        $this->fixture = new RawDataParserHelper([], new Config());
    }

    /**
     * Tests buggy behavior of getRawObject.
     *
     * When PDF has corrupted xref table getRawObject may run into an infinite loop.
     *
     * @see https://github.com/smalot/pdfparser/issues/372
     * @see https://github.com/smalot/pdfparser/pull/377
     */
    public function testGetRawObjectIssue372(): void
    {
        // The following $data content is a minimal example to trigger the infinite loop.
        // The expected offsets below are byte offsets, so this literal must stay
        // byte-for-byte unchanged.
        $data = '<</Producer (eDkºãa˜þõ‚LÅòÕ�PïÙ��)©)>>';

        // calling "getRawObject" via "exposeGetRawObject" would result in an infinite loop
        // if the fix is not there.
        $result = $this->fixture->exposeGetRawObject($data);

        $this->assertEquals(
            [
                '<<',
                [
                    ['/', 'Producer', 11],
                    ['(', 'eDkºãa˜þõ‚LÅòÕ�PïÙ��', 52],
                ],
                52,
            ],
            $result
        );

        // Test that spaces after a 'stream' declaration are absorbed
        // See: https://github.com/smalot/pdfparser/issues/641
        $data = 'stream '."\n";
        $data .= 'streamdata'."\n";
        $data .= 'endstream'."\n";
        $data .= 'endobj';

        $result = $this->fixture->exposeGetRawObject($data);

        // Value 'streamdata'."\n" would be empty string without the fix
        $this->assertEquals(
            [
                'stream',
                'streamdata'."\n",
                19,
            ],
            $result
        );
    }

    /**
     * Tests buggy behavior of decodeXrefStream.
     *
     * @see https://github.com/smalot/pdfparser/issues/30
     * @see https://github.com/smalot/pdfparser/issues/192
     * @see https://github.com/smalot/pdfparser/issues/209
     * @see https://github.com/smalot/pdfparser/issues/330
     * @see https://github.com/smalot/pdfparser/issues/356
     * @see https://github.com/smalot/pdfparser/issues/373
     * @see https://github.com/smalot/pdfparser/issues/392
     * @see https://github.com/smalot/pdfparser/issues/397
     */
    public function testDecodeXrefStreamIssue356(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue356.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        $this->assertStringContainsString('Ημερήσια έκθεση επιδημιολογικής', $pages[0]->getText());
    }

    /**
     * Parses the sample file for PR #405 and checks the text of its first page.
     */
    public function testDecodeObjectHeaderIssue405(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue405.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        $this->assertStringContainsString('Bug fix: PR #405', $pages[0]->getText());
    }

    /**
     * Tests buggy behavior of decodeXrefStream.
     *
     * When PDF has more than one entry in the /Index area (for example by changing
     * the document description), only the first entry is used.
     * If the fix is not used the array returned by getDetails() contains only the entry
     * with the key 'Pages'. All other entries like 'Author', 'Creator', 'Title',
     * 'Subject' (which come from the 'Info' object) are not listed, because the
     * 'Info' object gets a wrong object id during parsing the data into the xref structure.
     * So the object id listed at the /Info entry is not valid and the data of the info object
     * cannot be loaded during executing Document::buildDetails().
     *
     * @see https://github.com/smalot/pdfparser/pull/479
     */
    public function testDecodeXrefStreamIssue479(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue479.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $details = $document->getDetails();

        $this->assertArrayHasKey('Author', $details);
        $this->assertArrayHasKey('CreationDate', $details);
        $this->assertArrayHasKey('Creator', $details);
        $this->assertArrayHasKey('ModDate', $details);
        $this->assertArrayHasKey('Producer', $details);
        $this->assertArrayHasKey('Subject', $details);
        $this->assertArrayHasKey('Title', $details);
    }

    /**
     * Account for inaccurate offset values in getXrefData.
     *
     * Normally offset values extracted from the PDF document are exact.
     * However in some cases, they may point to whitespace *before* a
     * valid xref keyword. Move the offset forward past whitespace to
     * make this function a little more lenient.
     *
     * @see https://github.com/smalot/pdfparser/issues/673
     */
    public function testGetXrefDataIssue673(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue673.pdf';

        // Parsing this document would previously throw an Exception
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $text = $document->getText();

        self::assertStringContainsString('6 rue des Goutais', $text);
    }

    /**
     * Handle self referencing xref
     *
     * It seems that some PDF creators output `Prev 0` when there is no previous xref.
     *
     * @see https://github.com/smalot/pdfparser/pull/727
     */
    public function testDecodeXrefIssue727(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue727.pdf';

        // Parsing this document would previously cause an infinite loop
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $text = $document->getText();

        // An empty needle matches any string, so this assertion says nothing
        // about the extracted text. What this test really verifies is that
        // parseFile() and getText() above return at all.
        self::assertStringContainsString('', $text);
    }

    /**
     * Test that getXrefData prevents circular references
     *
     * When a PDF has circular references in xref chain (e.g., Prev pointing to already visited offset),
     * the parser should detect this and stop recursion to prevent infinite loops.
     */
    public function testGetXrefDataPreventsCircularReferences(): void
    {
        // Create a minimal PDF structure whose trailer refers back to its own
        // startxref offset.
        // NOTE(review): "%PDF-1.5\n" is 9 bytes long, so the "xref" keyword
        // starts at byte 9, not 7. The offsets only have to agree with each
        // other here, because offset 7 is passed in as already visited and
        // the data is presumably never parsed - confirm.
        $pdfData = "%PDF-1.5\n";
        $pdfData .= "xref\n";
        $pdfData .= "0 1\n";
        $pdfData .= "0000000000 65535 f \n";
        $pdfData .= "trailer\n";
        $pdfData .= "<</Size 1/Prev 7>>\n";  // Prev repeats the startxref value below
        $pdfData .= "startxref\n";
        $pdfData .= "7\n";
        $pdfData .= "%%EOF\n";

        // Test with visitedOffsets containing the offset we're trying to visit
        $result = $this->fixture->exposeGetXrefData($pdfData, 7, [], [7]);

        // Should return empty xref array without recursing
        $this->assertIsArray($result);
        $this->assertEmpty($result);
    }

    /**
     * Test that decodeXref passes visitedOffsets correctly when handling Prev
     *
     * This ensures that circular reference detection works when decodeXref
     * calls getXrefData for a Prev pointer.
     */
    public function testDecodeXrefPassesVisitedOffsets(): void
    {
        // Create a minimal xref structure with Prev
        $pdfData = "xref\n";
        $pdfData .= "0 1\n";
        $pdfData .= "0000000000 65535 f \n";
        $pdfData .= "trailer\n";
        $pdfData .= "<</Size 1/Prev 100>>\n";

        // Call decodeXref with visitedOffsets that includes the Prev offset
        // This should not cause infinite recursion
        $result = $this->fixture->exposeDecodeXref($pdfData, 0, [], [100]);

        // Should complete without error and return an array
        $this->assertIsArray($result);
        $this->assertArrayHasKey('trailer', $result);
    }

    /**
     * Test that getXrefData returns early for an already-visited offset
     *
     * The offset passed in (50) is also listed in visitedOffsets, so the
     * xref array handed in (empty here) is expected to come back empty.
     * This test does not check that new offsets get added to the visited list.
     */
    public function testGetXrefDataTracksVisitedOffsets(): void
    {
        // Test that calling with an already-visited offset returns immediately
        $pdfData = "%PDF-1.5\n";
        $pdfData .= "xref\n";
        $pdfData .= "0 1\n";
        $pdfData .= "0000000000 65535 f \n";
        $pdfData .= "trailer\n";
        $pdfData .= "<</Size 1>>\n";
        $pdfData .= "startxref\n";
        $pdfData .= "7\n";
        $pdfData .= "%%EOF\n";

        // Call with offset 50 already in visitedOffsets - should return immediately
        $result = $this->fixture->exposeGetXrefData($pdfData, 50, [], [50]);

        // Should return empty array without processing
        $this->assertIsArray($result);
        $this->assertEmpty($result);
    }
}
319