DocumentIssueFocusTest::testPDFDocEncodingDecode() - Code Metrics - Inspection of "Major Update to PDFObject.php + Ancillary" - smalot/pdfparser - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Pull Request — master (#634)

by Konrad

created 2023-09-19 07:15 UTC

DocumentIssueFocusTest::testPDFDocEncodingDecode() A

↳ Parent: DocumentIssueFocusTest

Complexity

Conditions	1
Paths	1

Size

Total Lines	20
Code Lines	8

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	1
eloc	8
c	1
b	0
f	0
nc	1
nop	0
dl	0
loc	20
rs	10

<?php

/**
 * @file This file is part of the PdfParser library.
 *
 * @author  Konrad Abicht <[email protected]>
 *
 * @date    2020-06-01
 *
 * @author  Sébastien MALOT <[email protected]>
 *
 * @date    2017-01-03
 *
 * @license LGPLv3
 *
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */

namespace PHPUnitTests\Integration;

use PHPUnitTests\TestCase;
use Smalot\PdfParser\Document;
use Smalot\PdfParser\Page;
use Smalot\PdfParser\Parser;

/**
 * Document related tests which are related to certain issues.
 */
class DocumentIssueFocusTest extends TestCase
{
    /**
     * Checks that getText(), called without a page limit, returns text
     * that is located on a later page of the sample document.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextNoPageLimit(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');

        self::assertStringContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText());
    }

    /**
     * Checks that getText(), called with a page limit of 1, does not
     * return text that is located on page 2 of the sample document.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextWithPageLimit(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');

        // The phrase is on page 2, so limiting extraction to the first
        // page must leave it out.
        self::assertStringNotContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText(1));
    }

    /**
     * Checks that XMP metadata is exposed through getDetails().
     *
     * @see https://github.com/smalot/pdfparser/pull/606
     */
    public function testExtractXMPMetadata(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/XMP_Metadata.pdf');

        $details = $document->getDetails();

        // Guard first: a missing key should fail with a readable message
        // instead of an undefined-key warning followed by a type error.
        self::assertArrayHasKey('dc:title', $details);

        // The dc:title value has to come from the XMP metadata.
        self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $details['dc:title']);
    }

    /**
     * Checks the PDFDocEncoding decoding of the document properties
     * "Keywords" and "Subject".
     *
     * @see https://github.com/smalot/pdfparser/issues/609
     */
    public function testPDFDocEncodingDecode(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue609.pdf');

        $details = $document->getDetails();

        // Guard first: a missing key should fail with a readable message
        // instead of an undefined-key warning followed by a type error.
        self::assertArrayHasKey('Keywords', $details);
        self::assertArrayHasKey('Subject', $details);

        // Adobe-inserted \r (carriage return) characters have to be
        // removed from a UTF-8 escaped metadata string, and the
        // characters surrounding them have to be repaired.
        $expectedKeywordCharacters = '˘ˇˆ˙˝˛˞˜•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ﬁﬂŁŒŠŸŽıłœšž€¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ';
        self::assertStringContainsString($expectedKeywordCharacters, $details['Keywords']);

        // Plain ASCII phrase that is also expected in the Keywords value.
        $expectedKeywordPhrase = 'added line-feeds often destroy multibyte characters';
        self::assertStringContainsString($expectedKeywordPhrase, $details['Keywords']);

        // The PDFDocEncoding characters that differ from CP-1252 have to
        // be decoded to their correct UTF-8 code points, again with the
        // inserted \r characters removed.
        $expectedSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž';
        self::assertStringContainsString($expectedSubject, $details['Subject']);
    }
}


1			<?php
2
3			/**
4			* @file This file is part of the PdfParser library.
5			*
6			* @author Konrad Abicht <[email protected]>
7			*
8			* @date 2020-06-01
9			*
10			* @author Sébastien MALOT <[email protected]>
11			*
12			* @date 2017-01-03
13			*
14			* @license LGPLv3
15			*
16			* @url <https://github.com/smalot/pdfparser>
17			*
18			* PdfParser is a pdf library written in PHP, extraction oriented.
19			* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
20			*
21			* This program is free software: you can redistribute it and/or modify
22			* it under the terms of the GNU Lesser General Public License as published by
23			* the Free Software Foundation, either version 3 of the License, or
24			* (at your option) any later version.
25			*
26			* This program is distributed in the hope that it will be useful,
27			* but WITHOUT ANY WARRANTY; without even the implied warranty of
28			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29			* GNU Lesser General Public License for more details.
30			*
31			* You should have received a copy of the GNU Lesser General Public License
32			* along with this program.
33			* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
34			*/
35
36			namespace PHPUnitTests\Integration;
37
38			use PHPUnitTests\TestCase;
39			use Smalot\PdfParser\Document;
40			use Smalot\PdfParser\Page;
41			use Smalot\PdfParser\Parser;
42
43			/**
44			* Document related tests which are related to certain issues.
45			*/
46			class DocumentIssueFocusTest extends TestCase
47			{
48			/**
49			* Tests getText method without a given page limit.
50			*
51			* @see https://github.com/smalot/pdfparser/pull/562
52			*/
53			public function testGetTextNoPageLimit(): void
54			{
55			$document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');
56
57			self::assertStringContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText());
58			}
59
60			/**
61			* Tests getText method with a given page limit.
62			*
63			* @see https://github.com/smalot/pdfparser/pull/562
64			*/
65			public function testGetTextWithPageLimit(): void
66			{
67			$document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');
68
69			// given text is on page 2, it has to be ignored because of that
70			self::assertStringNotContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText(1));
71			}
72
73			/**
74			* Tests extraction of XMP Metadata vs. getHeader() data.
75			*
76			* @see https://github.com/smalot/pdfparser/pull/606
77			*/
78			public function testExtractXMPMetadata(): void
79			{
80			$document = (new Parser())->parseFile($this->rootDir.'/samples/XMP_Metadata.pdf');
81
82			$details = $document->getDetails();
83
84			// Test that the dc:title data was extracted from the XMP
85			// Metadata.
86			self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $details['dc:title']);
87			}
88
89			/**
90			* Tests PDFDocEncoding decode of Document Properties
91			*
92			* @see https://github.com/smalot/pdfparser/issues/609
93			*/
94			public function testPDFDocEncodingDecode(): void
95			{
96			$document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue609.pdf');
97
98			$details = $document->getDetails();
99
100			// These test that Adobe-inserted \r are removed from a UTF-8
101			// escaped metadata string, and the surrounding characters are
102			// repaired
103			$testKeywords = '˘ˇˆ˙˝˛˞˜•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ﬁﬂŁŒŠŸŽıłœšž€¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ';
104			self::assertStringContainsString($testKeywords, $details['Keywords']);
105
106			$testKeywords = 'added line-feeds often destroy multibyte characters';
107			self::assertStringContainsString($testKeywords, $details['Keywords']);
108
109			// This tests that the PDFDocEncoding characters that differ
110			// from CP-1252 are decoded to their correct UTF-8 code points
111			// as well as removing \r line-feeds
112			$testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž';
113			self::assertStringContainsString($testSubject, $details['Subject']);
114			}
115			}
116

smalot / pdfparser

Pull Request — master (#634)

DocumentIssueFocusTest::testPDFDocEncodingDecode() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like