DocumentTest::testGetTextPull634SmallPDF() - Code Metrics - Inspection of "Major Update to PDFObject.php + Ancillary" - smalot/pdfparser - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Pull Request — master (#634)

unknown

created 2023-09-07 18:18 UTC

DocumentTest::testGetTextPull634SmallPDF() A

↳ Parent: DocumentTest

Complexity

Conditions	1
Paths	1

Size

Total Lines	22
Code Lines	11

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	1
eloc	11
c	1
b	0
f	0
nc	1
nop	0
dl	0
loc	22
rs	9.9

<?php

/**
 * @file This file is part of the PdfParser library.
 *
 * @author  Konrad Abicht <[email protected]>
 *
 * @date    2020-06-01
 *
 * @author  Sébastien MALOT <[email protected]>
 *
 * @date    2017-01-03
 *
 * @license LGPLv3
 *
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */

namespace PHPUnitTests\Integration;

use PHPUnitTests\TestCase;
use Smalot\PdfParser\Document;
use Smalot\PdfParser\Header;
use Smalot\PdfParser\Page;
use Smalot\PdfParser\Pages;
use Smalot\PdfParser\Parser;
use Smalot\PdfParser\PDFObject;

/**
 * Integration tests for Document: object registration and lookup, the type
 * dictionary, page listing, and text/metadata extraction from sample PDFs.
 *
 * The file-based tests read fixtures located below $this->rootDir.'/samples'.
 */
class DocumentTest extends TestCase
{
    /**
     * Creates an empty Document to run a test against.
     */
    protected function getDocumentInstance(): Document
    {
        return new Document();
    }

    /**
     * Creates a generic PDFObject attached to the given document.
     *
     * @param Document    $document document handed to the PDFObject constructor
     * @param Header|null $header   optional header; null is passed through to the constructor unchanged
     */
    protected function getPDFObjectInstance(Document $document, ?Header $header = null): PDFObject
    {
        return new PDFObject($document, $header);
    }

    /**
     * Creates a Page object attached to the given document.
     */
    protected function getPageInstance(Document $document, Header $header): PDFObject
    {
        return new Page($document, $header);
    }

    /**
     * Creates a Pages object attached to the given document.
     */
    protected function getPagesInstance(Document $document, Header $header): PDFObject
    {
        return new Pages($document, $header);
    }

    /**
     * Checks that setObjects() registers objects by id and that a later call
     * replaces the previously registered set.
     */
    public function testSetObjects(): void
    {
        $document = $this->getDocumentInstance();
        $object = $this->getPDFObjectInstance($document);

        // Obj #1 is missing
        self::assertNull($document->getObjectById(1));
        $document->setObjects([1 => $object]);

        // Obj #1 exists
        self::assertInstanceOf(PDFObject::class, $document->getObjectById(1));

        $header = Header::parse('<</Type/Page>>', $document);
        $object = $this->getPDFObjectInstance($document, $header);
        $document->setObjects([2 => $object]);

        // Obj #1 is missing again: the second setObjects() call did not keep it
        self::assertNull($document->getObjectById(1));

        // Obj #2 exists
        self::assertInstanceOf(PDFObject::class, $document->getObjectById(2));
    }

    /**
     * Checks that getObjects() returns every registered object under its id,
     * keeping the concrete class of each one.
     */
    public function testGetObjects(): void
    {
        $document = $this->getDocumentInstance();
        $object1 = $this->getPDFObjectInstance($document);
        $header = Header::parse('<</Type/Page>>unparsed content', $document);

        $object2 = $this->getPageInstance($document, $header);
        $document->setObjects([1 => $object1, 2 => $object2]);

        $objects = $document->getObjects();
        self::assertCount(2, $objects);
        self::assertInstanceOf(PDFObject::class, $objects[1]);
        self::assertInstanceOf(PDFObject::class, $objects[2]);
        self::assertInstanceOf(Page::class, $objects[2]);
    }

    /**
     * Checks that the dictionary is empty for a fresh document and, once
     * objects are set, lists the typed object under ['Page']['all'][<id>].
     */
    public function testDictionary(): void
    {
        $document = $this->getDocumentInstance();
        self::assertCount(0, $document->getDictionary());

        $object1 = $this->getPDFObjectInstance($document);
        $header = Header::parse('<</Type/Page>>', $document);
        $object2 = $this->getPageInstance($document, $header);
        $document->setObjects([1 => $object1, 2 => $object2]);

        $dictionary = $document->getDictionary();
        // Only object #2 carries a /Type entry, so a single type is expected.
        self::assertCount(1, $dictionary);
        self::assertCount(1, $dictionary['Page']['all']);
        self::assertEquals($object2, $dictionary['Page']['all'][2]);
    }

    /**
     * Checks that getObjectsByType('Page') returns only the Page object,
     * indexed by its id.
     */
    public function testGetObjectsByType(): void
    {
        $document = $this->getDocumentInstance();
        $object1 = $this->getPDFObjectInstance($document);
        $header = Header::parse('<</Type/Page>>', $document);
        $object2 = $this->getPageInstance($document, $header);
        $document->setObjects([1 => $object1, 2 => $object2]);

        $objects = $document->getObjectsByType('Page');
        self::assertCount(1, $objects);
        self::assertInstanceOf(PDFObject::class, $objects[2]);
        self::assertInstanceOf(Page::class, $objects[2]);
    }

    /**
     * Checks getPages() for three document layouts: plain Page objects,
     * Pages objects referencing their kids, and a Catalog pointing at a
     * Pages tree.
     */
    public function testGetPages(): void
    {
        $document = $this->getDocumentInstance();
        $pageContent = '<</Type/Page>>';

        // Listing pages from type Page
        $object1 = $this->getPageInstance($document, Header::parse($pageContent, $document));
        $object2 = $this->getPageInstance($document, Header::parse($pageContent, $document));
        $document->setObjects([1 => $object1, 2 => $object2]);
        $pages = $document->getPages();

        self::assertCount(2, $pages);
        self::assertInstanceOf(Page::class, $pages[0]);
        self::assertInstanceOf(Page::class, $pages[1]);

        // Listing pages from type Pages (kids)
        $object1 = $this->getPageInstance($document, Header::parse($pageContent, $document));
        $object2 = $this->getPageInstance($document, Header::parse($pageContent, $document));
        $object3 = $this->getPageInstance($document, Header::parse($pageContent, $document));
        $object4 = $this->getPagesInstance(
            $document,
            Header::parse('<</Type/Pages/Kids[1 0 R 2 0 R]>>', $document)
        );
        $object5 = $this->getPagesInstance(
            $document,
            Header::parse('<</Type/Pages/Kids[3 0 R]>>', $document)
        );

        // Keys use the "<id>_<generation>" form so the "N 0 R" references resolve.
        $document->setObjects([
            '1_0' => $object1,
            '2_0' => $object2,
            '3_0' => $object3,
            '4_0' => $object4,
            '5_0' => $object5,
        ]);
        $pages = $document->getPages();

        self::assertCount(3, $pages);
        self::assertInstanceOf(Page::class, $pages[0]);
        self::assertInstanceOf(Page::class, $pages[1]);
        self::assertInstanceOf(Page::class, $pages[2]);

        // Listing pages from type Catalog
        $object1 = $this->getPageInstance($document, Header::parse($pageContent, $document));
        $object2 = $this->getPageInstance($document, Header::parse($pageContent, $document));
        $object3 = $this->getPageInstance($document, Header::parse($pageContent, $document));
        $object4 = $this->getPagesInstance(
            $document,
            Header::parse('<</Type/Pages/Kids[1 0 R 2 0 R]>>', $document)
        );
        $object5 = $this->getPagesInstance(
            $document,
            Header::parse('<</Type/Pages/Kids[4 0 R 3 0 R]>>', $document)
        );
        // NOTE(review): the catalog header is wrapped in a Pages instance here;
        // presumably only the header's /Type matters for the lookup - confirm.
        $object6 = $this->getPagesInstance(
            $document,
            Header::parse('<</Type/Catalog/Pages 5 0 R >>', $document)
        );
        $document->setObjects([
            '1_0' => $object1,
            '2_0' => $object2,
            '3_0' => $object3,
            '4_0' => $object4,
            '5_0' => $object5,
            '6_0' => $object6,
        ]);
        $pages = $document->getPages();

        self::assertCount(3, $pages);
        self::assertInstanceOf(Page::class, $pages[0]);
        self::assertInstanceOf(Page::class, $pages[1]);
        self::assertInstanceOf(Page::class, $pages[2]);
    }

    /**
     * Checks that getPages() on an empty document throws an exception with
     * the message 'Missing catalog.'.
     */
    public function testGetPagesMissingCatalog(): void
    {
        // Expectations must be registered before the call that throws.
        $this->expectException(\Exception::class);
        $this->expectExceptionMessage('Missing catalog.');

        // Missing catalog
        $document = $this->getDocumentInstance();
        $document->getPages();
    }

    /**
     * Tests getText method without a given page limit.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextNoPageLimit(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');

        self::assertStringContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText());
    }

    /**
     * Tests getText method with a given page limit.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextWithPageLimit(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');

        // given text is on page 2, it has to be ignored because of that
        self::assertStringNotContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText(1));
    }

    /**
     * Tests extraction of XMP Metadata vs. getHeader() data.
     *
     * @see https://github.com/smalot/pdfparser/pull/606
     */
    public function testExtractXMPMetadata(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/XMP_Metadata.pdf');

        $details = $document->getDetails();

        // Test that the dc:title data was extracted from the XMP
        // Metadata.
        self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $details['dc:title']);
    }

    /**
     * Tests PDFDocEncoding decode of Document Properties
     *
     * @see https://github.com/smalot/pdfparser/issues/609
     */
    public function testPDFDocEncodingDecode(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/Issue609.pdf');

        $details = $document->getDetails();

        // These test that Adobe-inserted \r are removed from a UTF-8
        // escaped metadata string, and the surrounding characters are
        // repaired
        $testKeywords = '˘ˇˆ˙˝˛˞˜•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ﬁﬂŁŒŠŸŽıłœšž€¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ';
        self::assertStringContainsString($testKeywords, $details['Keywords']);

        $testKeywords = 'added line-feeds often destroy multibyte characters';
        self::assertStringContainsString($testKeywords, $details['Keywords']);

        // This tests that the PDFDocEncoding characters that differ
        // from CP-1252 are decoded to their correct UTF-8 code points
        // as well as removing \r line-feeds
        $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž';
        self::assertStringContainsString($testSubject, $details['Subject']);
    }

    /**
     * Test getText result.
     *
     * PDF generated with Chromium 116 via SaveAs-dialog.
     */
    public function testGetTextPull634Chromium(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/R2RML-Spec_Generated_via_Chromium-SaveAs-PDF.pdf');

        self::assertStringContainsString('R2RML: RDB to RDF Mapping Language', $document->getText());
    }

    /**
     * Test getText result.
     *
     * PDF (1.4) generated with LibreOffice Writer (6.4).
     *
     * @see https://help.libreoffice.org/6.4/en-US/text/shared/01/ref_pdf_export.html
     */
    public function testGetTextPull634LibreOffice(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/RichDocument_Generated_via_Libreoffice-6.4_PDF-v1.4.pdf');

        self::assertStringContainsString(
            'Some currency symbols: £, €, ¥'."\n".'German characters: ÄÖÜß',
            $document->getText()
        );
    }

    /**
     * Test getText result.
     *
     * PDF (v 1.4) generated with Inkscape 0.92.
     */
    public function testGetTextPull634InkscapePDF14(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/SimpleImage_Generated_via_Inkscape-0.92_PDF-v1.4.pdf');

        // The whole extracted text must be exactly 'TEST', not merely contain it.
        self::assertSame('TEST', $document->getText());
    }

    /**
     * Test getText result.
     *
     * PDF (v 1.5) generated with Inkscape 0.92.
     */
    public function testGetTextPull634InkscapePDF15(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/SimpleImage_Generated_via_Inkscape-0.92_PDF-v1.5.pdf');

        // The whole extracted text must be exactly 'TEST', not merely contain it.
        self::assertSame('TEST', $document->getText());
    }

    /**
     * Test getText result.
     *
     * PDF (v 1.7) generated with Microsoft Print-to-PDF via Firefox.
     */
    public function testGetTextPull634MicrosoftPDF17(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/Wikipedia-PDF_Generated_by_Microsoft_Print-to-PDF.pdf');

        // Extract once and reuse the result for both assertions.
        $outputText = $document->getText();

        self::assertStringContainsString(
            'Adobe PDF icon'."\n".'Filename'."\n".'extension',
            $outputText
        );

        self::assertStringContainsString(
            'are necessary to make, use, sell, and distribute PDF-compliant',
            $outputText
        );
    }

    /**
     * Test getText result.
     *
     * PDF generated from .docx with SmallPDF (https://smallpdf.com)
     */
    public function testGetTextPull634SmallPDF(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/Document_Generated_by_SmallPDF.pdf');

        // Extract once and reuse the result for all assertions.
        $outputText = $document->getText();

        // Actual encoded spaces in the document are preserved
        self::assertStringContainsString(
            'SmallPDF                       SMALLPDF                             SmallPDF',
            $outputText
        );

        // Hebrew text
        self::assertStringContainsString(
            'Hebrew Keyboard - תדלקמ תירבעב - Type Hebrew Online',
            $outputText
        );

        // Russian text
        self::assertStringContainsString(
            'Russian Keyboard - русская клавиатура - Type Russian',
            $outputText
        );
    }
}


1			<?php
2
3			/**
4			* @file This file is part of the PdfParser library.
5			*
6			* @author Konrad Abicht <[email protected]>
7			*
8			* @date 2020-06-01
9			*
10			* @author Sébastien MALOT <[email protected]>
11			*
12			* @date 2017-01-03
13			*
14			* @license LGPLv3
15			*
16			* @url <https://github.com/smalot/pdfparser>
17			*
18			* PdfParser is a pdf library written in PHP, extraction oriented.
19			* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
20			*
21			* This program is free software: you can redistribute it and/or modify
22			* it under the terms of the GNU Lesser General Public License as published by
23			* the Free Software Foundation, either version 3 of the License, or
24			* (at your option) any later version.
25			*
26			* This program is distributed in the hope that it will be useful,
27			* but WITHOUT ANY WARRANTY; without even the implied warranty of
28			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29			* GNU Lesser General Public License for more details.
30			*
31			* You should have received a copy of the GNU Lesser General Public License
32			* along with this program.
33			* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
34			*/
35
36			namespace PHPUnitTests\Integration;
37
38			use PHPUnitTests\TestCase;
39			use Smalot\PdfParser\Document;
40			use Smalot\PdfParser\Header;
41			use Smalot\PdfParser\Page;
42			use Smalot\PdfParser\Pages;
43			use Smalot\PdfParser\Parser;
44			use Smalot\PdfParser\PDFObject;
45
46			class DocumentTest extends TestCase
47			{
48			protected function getDocumentInstance(): Document
49			{
50			return new Document();
51			}
52
53			protected function getPDFObjectInstance(Document $document, Header $header = null): PDFObject
54			{
55			return new PDFObject($document, $header);
56			}
57
58			protected function getPageInstance(Document $document, Header $header): PDFObject
59			{
60			return new Page($document, $header);
61			}
62
63			protected function getPagesInstance(Document $document, Header $header): PDFObject
64			{
65			return new Pages($document, $header);
66			}
67
68			public function testSetObjects(): void
69			{
70			$document = $this->getDocumentInstance();
71			$object = $this->getPDFObjectInstance($document);
72
73			// Obj #1 is missing
74			$this->assertNull($document->getObjectById(1));
75			$document->setObjects([1 => $object]);
76
77			// Obj #1 exists
78			$this->assertTrue($document->getObjectById(1) instanceof PDFObject);
79
80			$content = '<</Type/Page>>';
81			$header = Header::parse($content, $document);
82			$object = $this->getPDFObjectInstance($document, $header);
83			$document->setObjects([2 => $object]);
84
85			// Obj #1 is missing
86			$this->assertNull($document->getObjectById(1));
87
88			// Obj #2 exists
89			$this->assertTrue($document->getObjectById(2) instanceof PDFObject);
90			}
91
92			public function testGetObjects(): void
93			{
94			$document = $this->getDocumentInstance();
95			$object1 = $this->getPDFObjectInstance($document);
96			$content = '<</Type/Page>>unparsed content';
97			$header = Header::parse($content, $document);
98
99			$object2 = $this->getPageInstance($document, $header);
100			$document->setObjects([1 => $object1, 2 => $object2]);
101
102			$objects = $document->getObjects();
103			$this->assertEquals(2, \count($objects));
104			$this->assertTrue($objects[1] instanceof PDFObject);
105			$this->assertTrue($objects[2] instanceof PDFObject);
106			$this->assertTrue($objects[2] instanceof Page);
107			}
108
109			public function testDictionary(): void
110			{
111			$document = $this->getDocumentInstance();
112			$objects = $document->getDictionary();
113			$this->assertEquals(0, \count($objects));
114			$object1 = $this->getPDFObjectInstance($document);
115
116			$content = '<</Type/Page>>';
117			$header = Header::parse($content, $document);
118			$object2 = $this->getPageInstance($document, $header);
119			$document->setObjects([1 => $object1, 2 => $object2]);
120
121			$objects = $document->getDictionary();
122			$this->assertEquals(1, \count($objects));
123			$this->assertEquals(1, \count($objects['Page']['all']));
124			$this->assertEquals($object2, $objects['Page']['all'][2]);
125			}
126
127			public function testGetObjectsByType(): void
128			{
129			$document = $this->getDocumentInstance();
130			$object1 = $this->getPDFObjectInstance($document);
131			$content = '<</Type/Page>>';
132			$header = Header::parse($content, $document);
133			$object2 = $this->getPageInstance($document, $header);
134			$document->setObjects([1 => $object1, 2 => $object2]);
135
136			$objects = $document->getObjectsByType('Page');
137			$this->assertEquals(1, \count($objects));
138			$this->assertTrue($objects[2] instanceof PDFObject);
139			$this->assertTrue($objects[2] instanceof Page);
140			}
141
142			public function testGetPages(): void
143			{
144			$document = $this->getDocumentInstance();
145
146			// Listing pages from type Page
147			$content = '<</Type/Page>>';
148			$header = Header::parse($content, $document);
149			$object1 = $this->getPageInstance($document, $header);
150			$header = Header::parse($content, $document);
151			$object2 = $this->getPageInstance($document, $header);
152			$document->setObjects([1 => $object1, 2 => $object2]);
153			$pages = $document->getPages();
154
155			$this->assertEquals(2, \count($pages));
156			$this->assertTrue($pages[0] instanceof Page);
157			$this->assertTrue($pages[1] instanceof Page);
158
159			// Listing pages from type Pages (kids)
160			$content = '<</Type/Page>>';
161			$header = Header::parse($content, $document);
162			$object1 = $this->getPageInstance($document, $header);
163			$header = Header::parse($content, $document);
164			$object2 = $this->getPageInstance($document, $header);
165			$header = Header::parse($content, $document);
166			$object3 = $this->getPageInstance($document, $header);
167
168			$content = '<</Type/Pages/Kids[1 0 R 2 0 R]>>';
169			$header = Header::parse($content, $document);
170			$object4 = $this->getPagesInstance($document, $header);
171
172			$content = '<</Type/Pages/Kids[3 0 R]>>';
173			$header = Header::parse($content, $document);
174			$object5 = $this->getPagesInstance($document, $header);
175
176			$document->setObjects([
177			'1_0' => $object1,
178			'2_0' => $object2,
179			'3_0' => $object3,
180			'4_0' => $object4,
181			'5_0' => $object5,
182			]);
183			$pages = $document->getPages();
184
185			$this->assertEquals(3, \count($pages));
186			$this->assertTrue($pages[0] instanceof Page);
187			$this->assertTrue($pages[1] instanceof Page);
188			$this->assertTrue($pages[2] instanceof Page);
189
190			// Listing pages from type Catalog
191			$content = '<</Type/Page>>';
192			$header = Header::parse($content, $document);
193			$object1 = $this->getPageInstance($document, $header);
194			$header = Header::parse($content, $document);
195			$object2 = $this->getPageInstance($document, $header);
196			$header = Header::parse($content, $document);
197			$object3 = $this->getPageInstance($document, $header);
198			$content = '<</Type/Pages/Kids[1 0 R 2 0 R]>>';
199			$header = Header::parse($content, $document);
200			$object4 = $this->getPagesInstance($document, $header);
201			$content = '<</Type/Pages/Kids[4 0 R 3 0 R]>>';
202			$header = Header::parse($content, $document);
203			$object5 = $this->getPagesInstance($document, $header);
204			$content = '<</Type/Catalog/Pages 5 0 R >>';
205			$header = Header::parse($content, $document);
206			$object6 = $this->getPagesInstance($document, $header);
207			$document->setObjects(
208			[
209			'1_0' => $object1,
210			'2_0' => $object2,
211			'3_0' => $object3,
212			'4_0' => $object4,
213			'5_0' => $object5,
214			'6_0' => $object6,
215			]
216			);
217			$pages = $document->getPages();
218			$this->assertEquals(3, \count($pages));
219			$this->assertTrue($pages[0] instanceof Page);
220			$this->assertTrue($pages[1] instanceof Page);
221			$this->assertTrue($pages[2] instanceof Page);
222			}
223
224			public function testGetPagesMissingCatalog(): void
225			{
226			$this->expectException(\Exception::class);
227			$this->expectExceptionMessage('Missing catalog.');
228
229			// Missing catalog
230			$document = $this->getDocumentInstance();
231			$document->getPages();
232			}
233
234			/**
235			* Tests getText method without a given page limit.
236			*
237			* @see https://github.com/smalot/pdfparser/pull/562
238			*/
239			public function testGetTextNoPageLimit(): void
240			{
241			$document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');
242
243			self::assertStringContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText());
244			}
245
246			/**
247			* Tests getText method with a given page limit.
248			*
249			* @see https://github.com/smalot/pdfparser/pull/562
250			*/
251			public function testGetTextWithPageLimit(): void
252			{
253			$document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');
254
255			// given text is on page 2, it has to be ignored because of that
256			self::assertStringNotContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText(1));
257			}
258
259			/**
260			* Tests extraction of XMP Metadata vs. getHeader() data.
261			*
262			* @see https://github.com/smalot/pdfparser/pull/606
263			*/
264			public function testExtractXMPMetadata(): void
265			{
266			$document = (new Parser())->parseFile($this->rootDir.'/samples/XMP_Metadata.pdf');
267
268			$details = $document->getDetails();
269
270			// Test that the dc:title data was extracted from the XMP
271			// Metadata.
272			self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $details['dc:title']);
273			}
274
275			/**
276			* Tests PDFDocEncoding decode of Document Properties
277			*
278			* @see https://github.com/smalot/pdfparser/issues/609
279			*/
280			public function testPDFDocEncodingDecode(): void
281			{
282			$document = (new Parser())->parseFile($this->rootDir.'/samples/Issue609.pdf');
283
284			$details = $document->getDetails();
285
286			// These test that Adobe-inserted \r are removed from a UTF-8
287			// escaped metadata string, and the surrounding characters are
288			// repaired
289			$testKeywords = '˘ˇˆ˙˝˛˞˜•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ﬁﬂŁŒŠŸŽıłœšž€¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ';
290			self::assertStringContainsString($testKeywords, $details['Keywords']);
291
292			$testKeywords = 'added line-feeds often destroy multibyte characters';
293			self::assertStringContainsString($testKeywords, $details['Keywords']);
294
295			// This tests that the PDFDocEncoding characters that differ
296			// from CP-1252 are decoded to their correct UTF-8 code points
297			// as well as removing \r line-feeds
298			$testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž';
299			self::assertStringContainsString($testSubject, $details['Subject']);
300			}
301
302			/**
303			* Test getText result.
304			*
305			* PDF generated with Chromium 116 via SaveAs-dialog.
306			*/
307			public function testGetTextPull634Chromium(): void
308			{
309			$document = (new Parser())->parseFile($this->rootDir.'/samples/R2RML-Spec_Generated_via_Chromium-SaveAs-PDF.pdf');
310
311			self::assertStringContainsString('R2RML: RDB to RDF Mapping Language', $document->getText());
312			}
313
314			/**
315			* Test getText result.
316			*
317			* PDF (1.4) generated with LibreOffice Writer (6.4).
318			*
319			* @see https://help.libreoffice.org/6.4/en-US/text/shared/01/ref_pdf_export.html
320			*/
321			public function testGetTextPull634LibreOffice(): void
322			{
323			$document = (new Parser())->parseFile($this->rootDir.'/samples/RichDocument_Generated_via_Libreoffice-6.4_PDF-v1.4.pdf');
324
325			self::assertStringContainsString(
326			'Some currency symbols: £, €, ¥'."\n".'German characters: ÄÖÜß',
327			$document->getText()
328			);
329			}
330
331			/**
332			* Test getText result.
333			*
334			* PDF (v 1.4) generated with Inkscape 0.92.
335			*/
336			public function testGetTextPull634InkscapePDF14(): void
337			{
338			$document = (new Parser())->parseFile($this->rootDir.'/samples/SimpleImage_Generated_via_Inkscape-0.92_PDF-v1.4.pdf');
339
340			self::assertEquals('TEST', $document->getText());
341			}
342
343			/**
344			* Test getText result.
345			*
346			* PDF (v 1.5) generated with Inkscape 0.92.
347			*/
348			public function testGetTextPull634InkscapePDF15(): void
349			{
350			$document = (new Parser())->parseFile($this->rootDir.'/samples/SimpleImage_Generated_via_Inkscape-0.92_PDF-v1.5.pdf');
351
352			self::assertEquals('TEST', $document->getText());
353			}
354
355			/**
356			* Test getText result.
357			*
358			* PDF (v 1.7) generated with Microsoft Print-to-PDF via Firefox.
359			*/
360			public function testGetTextPull634MicrosoftPDF17(): void
361			{
362			$document = (new Parser())->parseFile($this->rootDir.'/samples/Wikipedia-PDF_Generated_by_Microsoft_Print-to-PDF.pdf');
363
364			$outputText = $document->getText();
365
366			self::assertStringContainsString(
367			'Adobe PDF icon'."\n".'Filename'."\n".'extension',
368			$outputText
369			);
370
371			self::assertStringContainsString(
372			'are necessary to make, use, sell, and distribute PDF-compliant',
373			$outputText
374			);
375			}
376
377			/**
378			* Test getText result.
379			*
380			* PDF generated from .docx with SmallPDF (https://smallpdf.com)
381			*/
382			public function testGetTextPull634SmallPDF(): void
383			{
384			$document = (new Parser())->parseFile($this->rootDir.'/samples/Document_Generated_by_SmallPDF.pdf');
385
386			$outputText = $document->getText();
387
388			// Actual encoded spaces in the document are preserved
389			self::assertStringContainsString(
390			'SmallPDF                       SMALLPDF                             SmallPDF',
391			$outputText
392			);
393
394			// Hebrew text
395			self::assertStringContainsString(
396			'Hebrew Keyboard - תדלקמ תירבעב - Type Hebrew Online',
397			$outputText
398			);
399
400			// Russian text
401			self::assertStringContainsString(
402			'Russian Keyboard - русская клавиатура - Type Russian',
403			$outputText
404			);
405			}
406			}
407

smalot / pdfparser

Pull Request — master (#634)

DocumentTest::testGetTextPull634SmallPDF() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like