Issues in DocumentTest.php - New Issues - Inspection of "Read XMP Metadata in place of decoded header 'deta..." - smalot/pdfparser - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Pull Request — master (#606)

unknown

created 2023-06-26 14:21 UTC

tests/PHPUnit/Integration/DocumentTest.php (1 issue)

Labels

Severity

Minor 1

<?php

/**
 * @file This file is part of the PdfParser library.
 *
 * @author  Konrad Abicht <[email protected]>
 *
 * @date    2020-06-01
 *
 * @author  Sébastien MALOT <[email protected]>
 *
 * @date    2017-01-03
 *
 * @license LGPLv3
 *
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */

namespace PHPUnitTests\Integration;

use PHPUnitTests\TestCase;
use Smalot\PdfParser\Document;
use Smalot\PdfParser\Header;
use Smalot\PdfParser\Page;
use Smalot\PdfParser\Pages;
use Smalot\PdfParser\Parser;
use Smalot\PdfParser\PDFObject;

/**
 * Integration tests for Document: object registration, dictionary lookup by
 * type, page discovery, text extraction and metadata details.
 */
class DocumentTest extends TestCase
{
    /**
     * Creates an empty Document to be populated by the individual tests.
     */
    protected function getDocumentInstance(): Document
    {
        return new Document();
    }

    /**
     * Creates a generic PDFObject bound to the given document.
     *
     * @param Header|null $header optional header; null is passed through to the PDFObject constructor
     */
    protected function getPDFObjectInstance(Document $document, ?Header $header = null): PDFObject
    {
        return new PDFObject($document, $header);
    }

    /**
     * Creates a Page object bound to the given document and header.
     */
    protected function getPageInstance(Document $document, Header $header): PDFObject
    {
        return new Page($document, $header);
    }

    /**
     * Creates a Pages object bound to the given document and header.
     */
    protected function getPagesInstance(Document $document, Header $header): PDFObject
    {
        return new Pages($document, $header);
    }

    /**
     * Checks that setObjects() registers objects by ID and that a second call
     * replaces the previously registered set.
     */
    public function testSetObjects(): void
    {
        $document = $this->getDocumentInstance();
        $object = $this->getPDFObjectInstance($document);

        // Obj #1 is missing
        $this->assertNull($document->getObjectById(1));
        $document->setObjects([1 => $object]);

        // Obj #1 exists
        $this->assertInstanceOf(PDFObject::class, $document->getObjectById(1));

        $content = '<</Type/Page>>';
        $header = Header::parse($content, $document);
        $object = $this->getPDFObjectInstance($document, $header);
        $document->setObjects([2 => $object]);

        // Obj #1 is missing again: only obj #2 was passed to the second call
        $this->assertNull($document->getObjectById(1));

        // Obj #2 exists
        $this->assertInstanceOf(PDFObject::class, $document->getObjectById(2));
    }

    /**
     * Checks that getObjects() returns every registered object under its ID
     * with its concrete class preserved.
     */
    public function testGetObjects(): void
    {
        $document = $this->getDocumentInstance();
        $object1 = $this->getPDFObjectInstance($document);
        $content = '<</Type/Page>>unparsed content';
        $header = Header::parse($content, $document);

        $object2 = $this->getPageInstance($document, $header);
        $document->setObjects([1 => $object1, 2 => $object2]);

        $objects = $document->getObjects();
        $this->assertCount(2, $objects);
        $this->assertInstanceOf(PDFObject::class, $objects[1]);
        $this->assertInstanceOf(PDFObject::class, $objects[2]);
        $this->assertInstanceOf(Page::class, $objects[2]);
    }

    /**
     * Checks that the dictionary is empty for a new document and, once objects
     * are set, lists the typed object under ['Page']['all'] keyed by its ID.
     */
    public function testDictionary(): void
    {
        $document = $this->getDocumentInstance();
        $objects = $document->getDictionary();
        $this->assertCount(0, $objects);
        $object1 = $this->getPDFObjectInstance($document);

        $content = '<</Type/Page>>';
        $header = Header::parse($content, $document);
        $object2 = $this->getPageInstance($document, $header);
        $document->setObjects([1 => $object1, 2 => $object2]);

        // Only $object2 carries a /Type entry, so a single type is expected.
        $objects = $document->getDictionary();
        $this->assertCount(1, $objects);
        $this->assertCount(1, $objects['Page']['all']);
        $this->assertEquals($object2, $objects['Page']['all'][2]);
    }

    /**
     * Checks that getObjectsByType('Page') returns only the object whose
     * header declares /Type/Page, keyed by its ID.
     */
    public function testGetObjectsByType(): void
    {
        $document = $this->getDocumentInstance();
        $object1 = $this->getPDFObjectInstance($document);
        $content = '<</Type/Page>>';
        $header = Header::parse($content, $document);
        $object2 = $this->getPageInstance($document, $header);
        $document->setObjects([1 => $object1, 2 => $object2]);

        $objects = $document->getObjectsByType('Page');
        $this->assertCount(1, $objects);
        $this->assertInstanceOf(PDFObject::class, $objects[2]);
        $this->assertInstanceOf(Page::class, $objects[2]);
    }

    /**
     * Checks getPages() for three object layouts: plain Page objects, Pages
     * objects referencing kids, and a Catalog pointing at a Pages tree.
     */
    public function testGetPages(): void
    {
        $document = $this->getDocumentInstance();

        // Listing pages from type Page
        $content = '<</Type/Page>>';
        $header = Header::parse($content, $document);
        $object1 = $this->getPageInstance($document, $header);
        $header = Header::parse($content, $document);
        $object2 = $this->getPageInstance($document, $header);
        $document->setObjects([1 => $object1, 2 => $object2]);
        $pages = $document->getPages();

        $this->assertCount(2, $pages);
        $this->assertInstanceOf(Page::class, $pages[0]);
        $this->assertInstanceOf(Page::class, $pages[1]);

        // Listing pages from type Pages (kids)
        $content = '<</Type/Page>>';
        $header = Header::parse($content, $document);
        $object1 = $this->getPageInstance($document, $header);
        $header = Header::parse($content, $document);
        $object2 = $this->getPageInstance($document, $header);
        $header = Header::parse($content, $document);
        $object3 = $this->getPageInstance($document, $header);

        $content = '<</Type/Pages/Kids[1 0 R 2 0 R]>>';
        $header = Header::parse($content, $document);
        $object4 = $this->getPagesInstance($document, $header);

        $content = '<</Type/Pages/Kids[3 0 R]>>';
        $header = Header::parse($content, $document);
        $object5 = $this->getPagesInstance($document, $header);

        // Keys use the "<id>_<generation>" form matching the "N 0 R" references above.
        $document->setObjects([
            '1_0' => $object1,
            '2_0' => $object2,
            '3_0' => $object3,
            '4_0' => $object4,
            '5_0' => $object5,
        ]);
        $pages = $document->getPages();

        $this->assertCount(3, $pages);
        $this->assertInstanceOf(Page::class, $pages[0]);
        $this->assertInstanceOf(Page::class, $pages[1]);
        $this->assertInstanceOf(Page::class, $pages[2]);

        // Listing pages from type Catalog
        $content = '<</Type/Page>>';
        $header = Header::parse($content, $document);
        $object1 = $this->getPageInstance($document, $header);
        $header = Header::parse($content, $document);
        $object2 = $this->getPageInstance($document, $header);
        $header = Header::parse($content, $document);
        $object3 = $this->getPageInstance($document, $header);
        $content = '<</Type/Pages/Kids[1 0 R 2 0 R]>>';
        $header = Header::parse($content, $document);
        $object4 = $this->getPagesInstance($document, $header);
        $content = '<</Type/Pages/Kids[4 0 R 3 0 R]>>';
        $header = Header::parse($content, $document);
        $object5 = $this->getPagesInstance($document, $header);
        $content = '<</Type/Catalog/Pages 5 0 R >>';
        $header = Header::parse($content, $document);
        $object6 = $this->getPagesInstance($document, $header);
        $document->setObjects(
            [
                '1_0' => $object1,
                '2_0' => $object2,
                '3_0' => $object3,
                '4_0' => $object4,
                '5_0' => $object5,
                '6_0' => $object6,
            ]
        );
        $pages = $document->getPages();
        $this->assertCount(3, $pages);
        $this->assertInstanceOf(Page::class, $pages[0]);
        $this->assertInstanceOf(Page::class, $pages[1]);
        $this->assertInstanceOf(Page::class, $pages[2]);
    }

    /**
     * Checks that getPages() on a document without any objects throws an
     * exception with the message 'Missing catalog.'.
     */
    public function testGetPagesMissingCatalog(): void
    {
        $this->expectException(\Exception::class);
        $this->expectExceptionMessage('Missing catalog.');

        // Missing catalog
        $document = $this->getDocumentInstance();
        $document->getPages();
    }

    /**
     * Tests getText method without a given page limit.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextNoPageLimit(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');

        self::assertStringContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText());
    }

    /**
     * Tests getText method with a given page limit.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextWithPageLimit(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');

        // given text is on page 2, it has to be ignored because of that
        self::assertStringNotContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText(1));
    }

    /**
     * Tests extraction of XMP Metadata vs. getHeader() data.
     *
     * @see https://github.com/smalot/pdfparser/pull/606
     */
    public function testExtractXMPMetadata(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/XMP_Metadata.pdf');

        // Get the original parsed details from getHeader(). The trailer is
        // not publicly accessible, hence the reflection.
        $ref = new \ReflectionClass(Document::class);
        $prop = $ref->getProperty('trailer');
        $prop->setAccessible(true);
        $trailer = $prop->getValue($document);

        // Start from an empty array so $details is defined on every path,
        // even when the trailer has no usable Info entry.
        $details = [];
        if ($trailer->has('Info')) {
            $info = $trailer->get('Info');
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Fail with a clear message instead of an undefined-index error if
        // the sample file did not provide the fields compared below.
        self::assertArrayHasKey('Title', $details);
        self::assertArrayHasKey('Creator', $details);

        // Check that the Title does not contain a UTF-8 Right Single
        // Quotation Mark, and that the Creator does not contain a UTF-8
        // Registered Trademark symbol, an indication that getHeader()
        // did not find the correct values.
        self::assertStringNotContainsString("\u{2019}", $details['Title']);

        self::assertStringNotContainsString("\u{00AE}", $details['Creator']);

        $detailsXMP = $document->getDetails();

        // Test two fields for special characters that getHeader() does
        // not handle properly.
        self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $detailsXMP['Title']);
        self::assertStringContainsString("Microsoft\u{00AE} Word for Microsoft 365", $detailsXMP['Creator']);

        // Test that getDetails() data NOT contained in the XMP Metadata
        // is still accessible and not discarded/overwritten.
        self::assertEquals(1, $detailsXMP['Pages']);
    }
}


1			<?php
2
3			/**
4			* @file This file is part of the PdfParser library.
5			*
6			* @author Konrad Abicht <[email protected]>
7			*
8			* @date 2020-06-01
9			*
10			* @author Sébastien MALOT <[email protected]>
11			*
12			* @date 2017-01-03
13			*
14			* @license LGPLv3
15			*
16			* @url <https://github.com/smalot/pdfparser>
17			*
18			* PdfParser is a pdf library written in PHP, extraction oriented.
19			* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
20			*
21			* This program is free software: you can redistribute it and/or modify
22			* it under the terms of the GNU Lesser General Public License as published by
23			* the Free Software Foundation, either version 3 of the License, or
24			* (at your option) any later version.
25			*
26			* This program is distributed in the hope that it will be useful,
27			* but WITHOUT ANY WARRANTY; without even the implied warranty of
28			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29			* GNU Lesser General Public License for more details.
30			*
31			* You should have received a copy of the GNU Lesser General Public License
32			* along with this program.
33			* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
34			*/
35
36			namespace PHPUnitTests\Integration;
37
38			use PHPUnitTests\TestCase;
39			use Smalot\PdfParser\Document;
40			use Smalot\PdfParser\Header;
41			use Smalot\PdfParser\Page;
42			use Smalot\PdfParser\Pages;
43			use Smalot\PdfParser\Parser;
44			use Smalot\PdfParser\PDFObject;
45
46			class DocumentTest extends TestCase
47			{
48			protected function getDocumentInstance(): Document
49			{
50			return new Document();
51			}
52
53			protected function getPDFObjectInstance(Document $document, Header $header = null): PDFObject
54			{
55			return new PDFObject($document, $header);
56			}
57
58			protected function getPageInstance(Document $document, Header $header): PDFObject
59			{
60			return new Page($document, $header);
61			}
62
63			protected function getPagesInstance(Document $document, Header $header): PDFObject
64			{
65			return new Pages($document, $header);
66			}
67
68			public function testSetObjects(): void
69			{
70			$document = $this->getDocumentInstance();
71			$object = $this->getPDFObjectInstance($document);
72
73			// Obj #1 is missing
74			$this->assertNull($document->getObjectById(1));
75			$document->setObjects([1 => $object]);
76
77			// Obj #1 exists
78			$this->assertTrue($document->getObjectById(1) instanceof PDFObject);
79
80			$content = '<</Type/Page>>';
81			$header = Header::parse($content, $document);
82			$object = $this->getPDFObjectInstance($document, $header);
83			$document->setObjects([2 => $object]);
84
85			// Obj #1 is missing
86			$this->assertNull($document->getObjectById(1));
87
88			// Obj #2 exists
89			$this->assertTrue($document->getObjectById(2) instanceof PDFObject);
90			}
91
92			public function testGetObjects(): void
93			{
94			$document = $this->getDocumentInstance();
95			$object1 = $this->getPDFObjectInstance($document);
96			$content = '<</Type/Page>>unparsed content';
97			$header = Header::parse($content, $document);
98
99			$object2 = $this->getPageInstance($document, $header);
100			$document->setObjects([1 => $object1, 2 => $object2]);
101
102			$objects = $document->getObjects();
103			$this->assertEquals(2, \count($objects));
104			$this->assertTrue($objects[1] instanceof PDFObject);
105			$this->assertTrue($objects[2] instanceof PDFObject);
106			$this->assertTrue($objects[2] instanceof Page);
107			}
108
109			public function testDictionary(): void
110			{
111			$document = $this->getDocumentInstance();
112			$objects = $document->getDictionary();
113			$this->assertEquals(0, \count($objects));
114			$object1 = $this->getPDFObjectInstance($document);
115
116			$content = '<</Type/Page>>';
117			$header = Header::parse($content, $document);
118			$object2 = $this->getPageInstance($document, $header);
119			$document->setObjects([1 => $object1, 2 => $object2]);
120
121			$objects = $document->getDictionary();
122			$this->assertEquals(1, \count($objects));
123			$this->assertEquals(1, \count($objects['Page']['all']));
124			$this->assertEquals($object2, $objects['Page']['all'][2]);
125			}
126
127			public function testGetObjectsByType(): void
128			{
129			$document = $this->getDocumentInstance();
130			$object1 = $this->getPDFObjectInstance($document);
131			$content = '<</Type/Page>>';
132			$header = Header::parse($content, $document);
133			$object2 = $this->getPageInstance($document, $header);
134			$document->setObjects([1 => $object1, 2 => $object2]);
135
136			$objects = $document->getObjectsByType('Page');
137			$this->assertEquals(1, \count($objects));
138			$this->assertTrue($objects[2] instanceof PDFObject);
139			$this->assertTrue($objects[2] instanceof Page);
140			}
141
142			public function testGetPages(): void
143			{
144			$document = $this->getDocumentInstance();
145
146			// Listing pages from type Page
147			$content = '<</Type/Page>>';
148			$header = Header::parse($content, $document);
149			$object1 = $this->getPageInstance($document, $header);
150			$header = Header::parse($content, $document);
151			$object2 = $this->getPageInstance($document, $header);
152			$document->setObjects([1 => $object1, 2 => $object2]);
153			$pages = $document->getPages();
154
155			$this->assertEquals(2, \count($pages));
156			$this->assertTrue($pages[0] instanceof Page);
157			$this->assertTrue($pages[1] instanceof Page);
158
159			// Listing pages from type Pages (kids)
160			$content = '<</Type/Page>>';
161			$header = Header::parse($content, $document);
162			$object1 = $this->getPageInstance($document, $header);
163			$header = Header::parse($content, $document);
164			$object2 = $this->getPageInstance($document, $header);
165			$header = Header::parse($content, $document);
166			$object3 = $this->getPageInstance($document, $header);
167
168			$content = '<</Type/Pages/Kids[1 0 R 2 0 R]>>';
169			$header = Header::parse($content, $document);
170			$object4 = $this->getPagesInstance($document, $header);
171
172			$content = '<</Type/Pages/Kids[3 0 R]>>';
173			$header = Header::parse($content, $document);
174			$object5 = $this->getPagesInstance($document, $header);
175
176			$document->setObjects([
177			'1_0' => $object1,
178			'2_0' => $object2,
179			'3_0' => $object3,
180			'4_0' => $object4,
181			'5_0' => $object5,
182			]);
183			$pages = $document->getPages();
184
185			$this->assertEquals(3, \count($pages));
186			$this->assertTrue($pages[0] instanceof Page);
187			$this->assertTrue($pages[1] instanceof Page);
188			$this->assertTrue($pages[2] instanceof Page);
189
190			// Listing pages from type Catalog
191			$content = '<</Type/Page>>';
192			$header = Header::parse($content, $document);
193			$object1 = $this->getPageInstance($document, $header);
194			$header = Header::parse($content, $document);
195			$object2 = $this->getPageInstance($document, $header);
196			$header = Header::parse($content, $document);
197			$object3 = $this->getPageInstance($document, $header);
198			$content = '<</Type/Pages/Kids[1 0 R 2 0 R]>>';
199			$header = Header::parse($content, $document);
200			$object4 = $this->getPagesInstance($document, $header);
201			$content = '<</Type/Pages/Kids[4 0 R 3 0 R]>>';
202			$header = Header::parse($content, $document);
203			$object5 = $this->getPagesInstance($document, $header);
204			$content = '<</Type/Catalog/Pages 5 0 R >>';
205			$header = Header::parse($content, $document);
206			$object6 = $this->getPagesInstance($document, $header);
207			$document->setObjects(
208			[
209			'1_0' => $object1,
210			'2_0' => $object2,
211			'3_0' => $object3,
212			'4_0' => $object4,
213			'5_0' => $object5,
214			'6_0' => $object6,
215			]
216			);
217			$pages = $document->getPages();
218			$this->assertEquals(3, \count($pages));
219			$this->assertTrue($pages[0] instanceof Page);
220			$this->assertTrue($pages[1] instanceof Page);
221			$this->assertTrue($pages[2] instanceof Page);
222			}
223
224			public function testGetPagesMissingCatalog(): void
225			{
226			$this->expectException(\Exception::class);
227			$this->expectExceptionMessage('Missing catalog.');
228
229			// Missing catalog
230			$document = $this->getDocumentInstance();
231			$document->getPages();
232			}
233
234			/**
235			* Tests getText method without a given page limit.
236			*
237			* @see https://github.com/smalot/pdfparser/pull/562
238			*/
239			public function testGetTextNoPageLimit(): void
240			{
241			$document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');
242
243			self::assertStringContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText());
244			}
245
246			/**
247			* Tests getText method with a given page limit.
248			*
249			* @see https://github.com/smalot/pdfparser/pull/562
250			*/
251			public function testGetTextWithPageLimit(): void
252			{
253			$document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');
254
255			// given text is on page 2, it has to be ignored because of that
256			self::assertStringNotContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText(1));
257			}
258
259			/**
260			* Tests extraction of XMP Metadata vs. getHeader() data.
261			*
262			* @see https://github.com/smalot/pdfparser/pull/606
263			*/
264			public function testExtractXMPMetadata(): void
265			{
266			$document = (new Parser())->parseFile($this->rootDir.'/samples/XMP_Metadata.pdf');
267
268			// Get the original parsed details from getHeader().
269			$ref = new \ReflectionClass('\Smalot\PdfParser\Document');
270			$prop = $ref->getProperty('trailer');
271			$prop->setAccessible(true);
272			$trailer = $prop->getValue($document);
273
274			if ($trailer->has('Info')) {
275			$info = $trailer->get('Info');
276			if (null !== $info && method_exists($info, 'getHeader')) {
277			$details = $info->getHeader()->getDetails();
278			}
279			}
280
281			// Check that the Title does not contain a UTF-8 Right Single
282			// Quotation Mark, and that the Creator does not contain a UTF-8
283			// Registered Trademark symbol, an indication that getHeader()
284			// did not find the correct values.
285			self::assertStringNotContainsString("\u{2019}", $details['Title']);
			Issue (Best Practice / Comprehensibility, introduced 2023-06-26 14:23 UTC): The variable `$details` does not seem to be defined for all execution paths leading up to this point.
286			self::assertStringNotContainsString("\u{00AE}", $details['Creator']);
287
288			$detailsXMP = $document->getDetails();
289
290			// Test two fields for special characters that getHeader() does
291			// not handle properly.
292			self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $detailsXMP['Title']);
293			self::assertStringContainsString("Microsoft\u{00AE} Word for Microsoft 365", $detailsXMP['Creator']);
294
295			// Test that getDetails() data NOT contained in the XMP Metadata
296			// is still accessible and not discarded/overwritten.
297			self::assertEquals(1, $detailsXMP['Pages']);
298			}
299			}
300

smalot / pdfparser

Pull Request — master (#606)

tests/PHPUnit/Integration/DocumentTest.php (1 issue)

Labels

Severity

Introduced By

Duplication Side-by-Side

Filter issues like