Passed
Push — master ( ed3fc0...fb77ea )
by Konrad
13:24 queued 10:54
created

RawDataParserTest::testGetXrefDataIssue673()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 10
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 5
c 1
b 0
f 0
nc 1
nop 0
dl 0
loc 10
rs 10
1
<?php
2
3
/**
 * @file This file is part of the PdfParser library.
 *
 * @author  Konrad Abicht <[email protected]>
 *
 * @date    2020-06-01
 *
 * @author  Sébastien MALOT <[email protected]>
 *
 * @date    2017-01-03
 *
 * @license LGPLv3
 *
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */
35
36
namespace PHPUnitTests\Integration\RawData;
37
38
use PHPUnitTests\TestCase;
39
use Smalot\PdfParser\Config;
40
use Smalot\PdfParser\RawData\RawDataParser;
41
42
class RawDataParserHelper extends RawDataParser
{
    /**
     * Public pass-through to the protected "getRawObject" method, so that
     * tests can invoke it directly.
     *
     * @param mixed $pdfData forwarded unchanged as first argument of getRawObject
     * @param mixed $offset  forwarded unchanged as second argument of getRawObject (defaults to 0)
     *
     * @return mixed whatever getRawObject returns for the given arguments
     */
    public function exposeGetRawObject($pdfData, $offset = 0)
    {
        $rawObject = $this->getRawObject($pdfData, $offset);

        return $rawObject;
    }
}
52
53
class RawDataParserTest extends TestCase
{
    /**
     * Prepares the fixture: a RawDataParserHelper built with an empty array
     * as first constructor argument and a default Config instance.
     */
    protected function setUp(): void
    {
        parent::setUp();

        $this->fixture = new RawDataParserHelper([], new Config());
    }

    /**
     * Tests buggy behavior of getRawObject.
     *
     * When PDF has corrupted xref table getRawObject may run into an infinite loop.
     *
     * @see https://github.com/smalot/pdfparser/issues/372
     * @see https://github.com/smalot/pdfparser/pull/377
     */
    public function testGetRawObjectIssue372(): void
    {
        // The following $data content is a minimal example to trigger the infinite loop
        $data = '<</Producer (eDkºãa˜þõ‚LÅòÕ�PïÙ��)©)>>';

        // calling "getRawObject" via "exposeGetRawObject" would result in an infinite loop
        // if the fix is not there.
        $result = $this->fixture->exposeGetRawObject($data);

        $this->assertEquals(
            [
                '<<',
                [
                    ['/', 'Producer', 11],
                    ['(', 'eDkºãa˜þõ‚LÅòÕ�PïÙ��', 52],
                ],
                52,
            ],
            $result
        );
    }

    /**
     * Tests that spaces after a 'stream' declaration are absorbed by getRawObject.
     *
     * Kept separate from the issue 372 test so that a failure of one check
     * does not hide the result of the other.
     *
     * @see https://github.com/smalot/pdfparser/issues/641
     */
    public function testGetRawObjectIssue641(): void
    {
        // Note the blank between the "stream" keyword and the line break.
        $data = 'stream '."\n";
        $data .= 'streamdata'."\n";
        $data .= 'endstream'."\n";
        $data .= 'endobj';

        $result = $this->fixture->exposeGetRawObject($data);

        // Value 'streamdata'."\n" would be empty string without the fix
        $this->assertEquals(
            [
                'stream',
                'streamdata'."\n",
                19,
            ],
            $result
        );
    }

    /**
     * Tests buggy behavior of decodeXrefStream.
     *
     * @see https://github.com/smalot/pdfparser/issues/30
     * @see https://github.com/smalot/pdfparser/issues/192
     * @see https://github.com/smalot/pdfparser/issues/209
     * @see https://github.com/smalot/pdfparser/issues/330
     * @see https://github.com/smalot/pdfparser/issues/356
     * @see https://github.com/smalot/pdfparser/issues/373
     * @see https://github.com/smalot/pdfparser/issues/392
     * @see https://github.com/smalot/pdfparser/issues/397
     */
    public function testDecodeXrefStreamIssue356(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue356.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        $this->assertStringContainsString('Ημερήσια έκθεση επιδημιολογικής', $pages[0]->getText());
    }

    /**
     * Parses the sample file Issue405.pdf and checks that the text of its
     * first page contains the expected marker string.
     *
     * NOTE(review): presumably covers the object header decoding fix from
     * PR #405, as the method name and asserted text suggest - confirm.
     */
    public function testDecodeObjectHeaderIssue405(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue405.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        $this->assertStringContainsString('Bug fix: PR #405', $pages[0]->getText());
    }

    /**
     * Tests buggy behavior of decodeXrefStream.
     *
     * When PDF has more than one entry in the /Index area (for example by changing
     * the document description), only the first entry is used.
     * If the fix is not used the array returned by getDetails() contains only the entry
     * with the key 'Pages'. All other entries like 'Author', 'Creator', 'Title',
     * 'Subject' (which come from the 'Info' object) are not listed, because the
     * 'Info' object gets a wrong object id during parsing the data into the xref structure.
     * So the object id listed at the /Info entry is not valid and the data of the info object
     * cannot be loaded during executing Document::buildDetails().
     *
     * @see https://github.com/smalot/pdfparser/pull/479
     */
    public function testDecodeXrefStreamIssue479(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue479.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $details = $document->getDetails();

        $this->assertArrayHasKey('Author', $details);
        $this->assertArrayHasKey('CreationDate', $details);
        $this->assertArrayHasKey('Creator', $details);
        $this->assertArrayHasKey('ModDate', $details);
        $this->assertArrayHasKey('Producer', $details);
        $this->assertArrayHasKey('Subject', $details);
        $this->assertArrayHasKey('Title', $details);
    }

    /**
     * Account for inaccurate offset values in getXrefData.
     *
     * Normally offset values extracted from the PDF document are exact.
     * However in some cases, they may point to whitespace *before* a
     * valid xref keyword. Move the offset forward past whitespace to
     * make this function a little more lenient.
     *
     * @see https://github.com/smalot/pdfparser/issues/673
     */
    public function testGetXrefDataIssue673(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue673.pdf';

        // Parsing this document would previously throw an Exception
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $text = $document->getText();

        // Uses $this-> like every other assertion in this class.
        $this->assertStringContainsString('6 rue des Goutais', $text);
    }
}
198