Test Failed
Pull Request — master (#634)
by
unknown
01:58
created

DocumentTest::testGetTextPull634SmallPDF()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 22
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 11
c 1
b 0
f 0
nc 1
nop 0
dl 0
loc 22
rs 9.9
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 *
8
 * @date    2020-06-01
9
 *
10
 * @author  Sébastien MALOT <[email protected]>
11
 *
12
 * @date    2017-01-03
13
 *
14
 * @license LGPLv3
15
 *
16
 * @url     <https://github.com/smalot/pdfparser>
17
 *
18
 *  PdfParser is a pdf library written in PHP, extraction oriented.
19
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
20
 *
21
 *  This program is free software: you can redistribute it and/or modify
22
 *  it under the terms of the GNU Lesser General Public License as published by
23
 *  the Free Software Foundation, either version 3 of the License, or
24
 *  (at your option) any later version.
25
 *
26
 *  This program is distributed in the hope that it will be useful,
27
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
28
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
29
 *  GNU Lesser General Public License for more details.
30
 *
31
 *  You should have received a copy of the GNU Lesser General Public License
32
 *  along with this program.
33
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
34
 */
35
36
namespace PHPUnitTests\Integration;
37
38
use PHPUnitTests\TestCase;
39
use Smalot\PdfParser\Document;
40
use Smalot\PdfParser\Header;
41
use Smalot\PdfParser\Page;
42
use Smalot\PdfParser\Pages;
43
use Smalot\PdfParser\Parser;
44
use Smalot\PdfParser\PDFObject;
45
46
class DocumentTest extends TestCase
47
{
48
    /**
     * Builds the empty Document that each test populates.
     */
    protected function getDocumentInstance(): Document
    {
        $document = new Document();

        return $document;
    }
52
53
    /**
     * Creates a generic PDFObject bound to the given document.
     *
     * The header is optional. It is declared explicitly nullable because
     * relying on the implicit nullability of a "= null" default is
     * deprecated as of PHP 8.4.
     *
     * @param Document    $document document the object belongs to
     * @param Header|null $header   header to attach, or null for none
     */
    protected function getPDFObjectInstance(Document $document, ?Header $header = null): PDFObject
    {
        return new PDFObject($document, $header);
    }
57
58
    /**
     * Creates a Page bound to the given document and header.
     *
     * The declared return type is the PDFObject base type.
     */
    protected function getPageInstance(Document $document, Header $header): PDFObject
    {
        $page = new Page($document, $header);

        return $page;
    }
62
63
    /**
     * Creates a Pages object bound to the given document and header.
     *
     * The declared return type is the PDFObject base type.
     */
    protected function getPagesInstance(Document $document, Header $header): PDFObject
    {
        $pages = new Pages($document, $header);

        return $pages;
    }
67
68
    /**
     * Checks that objects passed to setObjects() can be retrieved by id, and
     * that a second setObjects() call leaves a previously set id unresolved.
     */
    public function testSetObjects(): void
    {
        $document = $this->getDocumentInstance();
        $object = $this->getPDFObjectInstance($document);

        // Obj #1 is missing
        $this->assertNull($document->getObjectById(1));
        $document->setObjects([1 => $object]);

        // Obj #1 exists
        $this->assertInstanceOf(PDFObject::class, $document->getObjectById(1));

        $content = '<</Type/Page>>';
        $header = Header::parse($content, $document);
        $object = $this->getPDFObjectInstance($document, $header);
        $document->setObjects([2 => $object]);

        // Obj #1 is missing again after the second setObjects() call
        $this->assertNull($document->getObjectById(1));

        // Obj #2 exists
        $this->assertInstanceOf(PDFObject::class, $document->getObjectById(2));
    }
91
92
    /**
     * Checks that getObjects() returns every object given to setObjects(),
     * keyed by id and keeping its concrete class.
     */
    public function testGetObjects(): void
    {
        $document = $this->getDocumentInstance();
        $object1 = $this->getPDFObjectInstance($document);
        $content = '<</Type/Page>>unparsed content';
        $header = Header::parse($content, $document);

        $object2 = $this->getPageInstance($document, $header);
        $document->setObjects([1 => $object1, 2 => $object2]);

        $objects = $document->getObjects();
        $this->assertCount(2, $objects);
        $this->assertInstanceOf(PDFObject::class, $objects[1]);
        $this->assertInstanceOf(PDFObject::class, $objects[2]);
        $this->assertInstanceOf(Page::class, $objects[2]);
    }
108
109
    /**
     * Checks that getDictionary() is empty for a new document and, once
     * objects are set, lists the typed object under its type name.
     */
    public function testDictionary(): void
    {
        $document = $this->getDocumentInstance();
        $objects = $document->getDictionary();
        $this->assertCount(0, $objects);
        $object1 = $this->getPDFObjectInstance($document);

        $content = '<</Type/Page>>';
        $header = Header::parse($content, $document);
        $object2 = $this->getPageInstance($document, $header);
        $document->setObjects([1 => $object1, 2 => $object2]);

        // Only the object created with a /Type header is expected in the dictionary
        $objects = $document->getDictionary();
        $this->assertCount(1, $objects);
        $this->assertCount(1, $objects['Page']['all']);
        $this->assertEquals($object2, $objects['Page']['all'][2]);
    }
126
127
    /**
     * Checks that getObjectsByType('Page') returns only the object whose
     * header declares that type, keyed by its id.
     */
    public function testGetObjectsByType(): void
    {
        $document = $this->getDocumentInstance();
        $object1 = $this->getPDFObjectInstance($document);
        $content = '<</Type/Page>>';
        $header = Header::parse($content, $document);
        $object2 = $this->getPageInstance($document, $header);
        $document->setObjects([1 => $object1, 2 => $object2]);

        $objects = $document->getObjectsByType('Page');
        $this->assertCount(1, $objects);
        $this->assertInstanceOf(PDFObject::class, $objects[2]);
        $this->assertInstanceOf(Page::class, $objects[2]);
    }
141
142
    /**
     * Checks getPages() in three scenarios: only Page objects, Page objects
     * referenced as kids of Pages objects, and a Catalog pointing at a Pages
     * tree.
     */
    public function testGetPages(): void
    {
        $document = $this->getDocumentInstance();

        // Listing pages from type Page
        $content = '<</Type/Page>>';
        $header = Header::parse($content, $document);
        $object1 = $this->getPageInstance($document, $header);
        $header = Header::parse($content, $document);
        $object2 = $this->getPageInstance($document, $header);
        $document->setObjects([1 => $object1, 2 => $object2]);
        $pages = $document->getPages();

        $this->assertCount(2, $pages);
        $this->assertInstanceOf(Page::class, $pages[0]);
        $this->assertInstanceOf(Page::class, $pages[1]);

        // Listing pages from type Pages (kids)
        $content = '<</Type/Page>>';
        $header = Header::parse($content, $document);
        $object1 = $this->getPageInstance($document, $header);
        $header = Header::parse($content, $document);
        $object2 = $this->getPageInstance($document, $header);
        $header = Header::parse($content, $document);
        $object3 = $this->getPageInstance($document, $header);

        $content = '<</Type/Pages/Kids[1 0 R 2 0 R]>>';
        $header = Header::parse($content, $document);
        $object4 = $this->getPagesInstance($document, $header);

        $content = '<</Type/Pages/Kids[3 0 R]>>';
        $header = Header::parse($content, $document);
        $object5 = $this->getPagesInstance($document, $header);

        $document->setObjects([
            '1_0' => $object1,
            '2_0' => $object2,
            '3_0' => $object3,
            '4_0' => $object4,
            '5_0' => $object5,
        ]);
        $pages = $document->getPages();

        $this->assertCount(3, $pages);
        $this->assertInstanceOf(Page::class, $pages[0]);
        $this->assertInstanceOf(Page::class, $pages[1]);
        $this->assertInstanceOf(Page::class, $pages[2]);

        // Listing pages from type Catalog
        $content = '<</Type/Page>>';
        $header = Header::parse($content, $document);
        $object1 = $this->getPageInstance($document, $header);
        $header = Header::parse($content, $document);
        $object2 = $this->getPageInstance($document, $header);
        $header = Header::parse($content, $document);
        $object3 = $this->getPageInstance($document, $header);
        $content = '<</Type/Pages/Kids[1 0 R 2 0 R]>>';
        $header = Header::parse($content, $document);
        $object4 = $this->getPagesInstance($document, $header);
        $content = '<</Type/Pages/Kids[4 0 R 3 0 R]>>';
        $header = Header::parse($content, $document);
        $object5 = $this->getPagesInstance($document, $header);
        // NOTE(review): the Catalog header is wrapped in a Pages instance here;
        // presumably only the header's /Type matters for the lookup - confirm.
        $content = '<</Type/Catalog/Pages 5 0 R >>';
        $header = Header::parse($content, $document);
        $object6 = $this->getPagesInstance($document, $header);
        $document->setObjects(
            [
                '1_0' => $object1,
                '2_0' => $object2,
                '3_0' => $object3,
                '4_0' => $object4,
                '5_0' => $object5,
                '6_0' => $object6,
            ]
        );
        $pages = $document->getPages();
        $this->assertCount(3, $pages);
        $this->assertInstanceOf(Page::class, $pages[0]);
        $this->assertInstanceOf(Page::class, $pages[1]);
        $this->assertInstanceOf(Page::class, $pages[2]);
    }
223
224
    /**
     * Expects getPages() on a document without any objects to throw an
     * exception with the message "Missing catalog.".
     */
    public function testGetPagesMissingCatalog(): void
    {
        $this->expectException(\Exception::class);
        $this->expectExceptionMessage('Missing catalog.');

        // A freshly created document has no catalog to read pages from
        $emptyDocument = $this->getDocumentInstance();
        $emptyDocument->getPages();
    }
233
234
    /**
     * Without a page limit, getText() must include the phrase checked below.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextNoPageLimit(): void
    {
        $parser = new Parser();
        $document = $parser->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');
        $text = $document->getText();

        self::assertStringContainsString('Medeni Usul ve İcra İflas Hukuku', $text);
    }
245
246
    /**
     * With a page limit of 1, getText() must not include text from page 2.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextWithPageLimit(): void
    {
        $parser = new Parser();
        $document = $parser->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');

        // The phrase lives on page 2, so limiting to one page must exclude it
        $firstPageText = $document->getText(1);

        self::assertStringNotContainsString('Medeni Usul ve İcra İflas Hukuku', $firstPageText);
    }
258
259
    /**
     * Checks that XMP metadata shows up in the details returned by
     * getDetails().
     *
     * @see https://github.com/smalot/pdfparser/pull/606
     */
    public function testExtractXMPMetadata(): void
    {
        $parser = new Parser();
        $details = $parser
            ->parseFile($this->rootDir.'/samples/XMP_Metadata.pdf')
            ->getDetails();

        // The dc:title entry must carry the title taken from the XMP metadata
        $expectedTitle = "Enhance PdfParser\u{2019}s Metadata Capabilities";
        self::assertStringContainsString($expectedTitle, $details['dc:title']);
    }
274
275
    /**
     * Checks PDFDocEncoding decoding of the document properties.
     *
     * @see https://github.com/smalot/pdfparser/issues/609
     */
    public function testPDFDocEncodingDecode(): void
    {
        $parser = new Parser();
        $details = $parser
            ->parseFile($this->rootDir.'/samples/Issue609.pdf')
            ->getDetails();

        // Adobe-inserted \r must be removed from a UTF-8 escaped metadata
        // string and the characters around them repaired.
        // NOTE(review): the literal below has 'ר' between 'Ö' and 'Ù' where the
        // surrounding Latin-1 run would suggest '×Ø', and 'fifl' where fi/fl
        // ligatures might be expected - possibly an artifact of how this file
        // was captured; confirm against the fixture.
        $expectedCharacters = '˘ˇˆ˙˝˛˞˜•†‡…—–ƒ⁄‹›−‰„“”‘’‚™fiflŁŒŠŸŽıłœšž€¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ';
        self::assertStringContainsString($expectedCharacters, $details['Keywords']);

        $expectedSentence = 'added line-feeds often destroy multibyte characters';
        self::assertStringContainsString($expectedSentence, $details['Keywords']);

        // PDFDocEncoding characters that differ from CP-1252 must decode to
        // their correct UTF-8 code points, again with \r line-feeds removed.
        $expectedSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž';
        self::assertStringContainsString($expectedSubject, $details['Subject']);
    }
301
302
    /**
     * Checks the getText() result for a PDF generated with Chromium 116 via
     * its SaveAs dialog.
     */
    public function testGetTextPull634Chromium(): void
    {
        $parser = new Parser();
        $text = $parser
            ->parseFile($this->rootDir.'/samples/R2RML-Spec_Generated_via_Chromium-SaveAs-PDF.pdf')
            ->getText();

        self::assertStringContainsString('R2RML: RDB to RDF Mapping Language', $text);
    }
313
314
    /**
     * Checks the getText() result for a PDF (1.4) generated with LibreOffice
     * Writer (6.4).
     *
     * @see https://help.libreoffice.org/6.4/en-US/text/shared/01/ref_pdf_export.html
     */
    public function testGetTextPull634LibreOffice(): void
    {
        $parser = new Parser();
        $document = $parser->parseFile($this->rootDir.'/samples/RichDocument_Generated_via_Libreoffice-6.4_PDF-v1.4.pdf');

        // Two consecutive lines of the document, joined by a line break
        $expectedLines = 'Some currency symbols: £, €, ¥'."\n".'German characters: ÄÖÜß';

        self::assertStringContainsString($expectedLines, $document->getText());
    }
330
331
    /**
     * Checks the getText() result for a PDF (v 1.4) generated with
     * Inkscape 0.92.
     */
    public function testGetTextPull634InkscapePDF14(): void
    {
        $parser = new Parser();
        $text = $parser
            ->parseFile($this->rootDir.'/samples/SimpleImage_Generated_via_Inkscape-0.92_PDF-v1.4.pdf')
            ->getText();

        self::assertEquals('TEST', $text);
    }
342
343
    /**
     * Checks the getText() result for a PDF (v 1.5) generated with
     * Inkscape 0.92.
     */
    public function testGetTextPull634InkscapePDF15(): void
    {
        $parser = new Parser();
        $text = $parser
            ->parseFile($this->rootDir.'/samples/SimpleImage_Generated_via_Inkscape-0.92_PDF-v1.5.pdf')
            ->getText();

        self::assertEquals('TEST', $text);
    }
354
355
    /**
     * Checks the getText() result for a PDF (v 1.7) generated with Microsoft
     * Print-to-PDF via Firefox.
     */
    public function testGetTextPull634MicrosoftPDF17(): void
    {
        $parser = new Parser();
        $outputText = $parser
            ->parseFile($this->rootDir.'/samples/Wikipedia-PDF_Generated_by_Microsoft_Print-to-PDF.pdf')
            ->getText();

        // Each fragment must appear in the extracted text, checked in order
        $expectedFragments = [
            'Adobe PDF icon'."\n".'Filename'."\n".'extension',
            'are necessary to make, use, sell, and distribute PDF-compliant',
        ];

        foreach ($expectedFragments as $fragment) {
            self::assertStringContainsString($fragment, $outputText);
        }
    }
376
377
    /**
     * Checks the getText() result for a PDF generated from .docx with
     * SmallPDF (https://smallpdf.com).
     */
    public function testGetTextPull634SmallPDF(): void
    {
        $parser = new Parser();
        $outputText = $parser
            ->parseFile($this->rootDir.'/samples/Document_Generated_by_SmallPDF.pdf')
            ->getText();

        // Each fragment must appear in the extracted text, checked in order
        $expectedFragments = [
            // Actual encoded spaces in the document are preserved
            'SmallPDF                       SMALLPDF                             SmallPDF',
            // Hebrew text
            'Hebrew Keyboard - תדלקמ תירבעב - Type Hebrew Online',
            // Russian text
            'Russian Keyboard - русская клавиатура - Type Russian',
        ];

        foreach ($expectedFragments as $fragment) {
            self::assertStringContainsString($fragment, $outputText);
        }
    }
406
}
407