Test Failed
Pull Request — master (#634)
by Konrad
02:31
created

DocumentIssueFocusTest::testPDFDocEncodingDecode()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 20
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 8
c 1
b 0
f 0
nc 1
nop 0
dl 0
loc 20
rs 10
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 *
8
 * @date    2020-06-01
9
 *
10
 * @author  Sébastien MALOT <[email protected]>
11
 *
12
 * @date    2017-01-03
13
 *
14
 * @license LGPLv3
15
 *
16
 * @url     <https://github.com/smalot/pdfparser>
17
 *
18
 *  PdfParser is a pdf library written in PHP, extraction oriented.
19
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
20
 *
21
 *  This program is free software: you can redistribute it and/or modify
22
 *  it under the terms of the GNU Lesser General Public License as published by
23
 *  the Free Software Foundation, either version 3 of the License, or
24
 *  (at your option) any later version.
25
 *
26
 *  This program is distributed in the hope that it will be useful,
27
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
28
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
29
 *  GNU Lesser General Public License for more details.
30
 *
31
 *  You should have received a copy of the GNU Lesser General Public License
32
 *  along with this program.
33
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
34
 */
35
36
namespace PHPUnitTests\Integration;
37
38
use PHPUnitTests\TestCase;
39
use Smalot\PdfParser\Document;
40
use Smalot\PdfParser\Page;
41
use Smalot\PdfParser\Parser;
42
43
/**
 * Integration tests for Document behaviour that was reported in specific
 * upstream issues or pull requests. Each test links the ticket it covers.
 */
class DocumentIssueFocusTest extends TestCase
{
    /**
     * Sample file (relative to $this->rootDir) shared by the getText() tests.
     */
    private const ISSUE_331_SAMPLE = '/samples/bugs/Issue331.pdf';

    /**
     * Phrase used by the getText() tests; the page-limit test relies on it
     * being located on page 2 of the sample.
     */
    private const ISSUE_331_PAGE_TWO_PHRASE = 'Medeni Usul ve İcra İflas Hukuku';

    /**
     * Tests getText method without a given page limit: the phrase from
     * page 2 has to be part of the extracted text.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextNoPageLimit(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.self::ISSUE_331_SAMPLE);

        self::assertStringContainsString(self::ISSUE_331_PAGE_TWO_PHRASE, $document->getText());
    }

    /**
     * Tests getText method with a given page limit of 1.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextWithPageLimit(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.self::ISSUE_331_SAMPLE);

        // given text is on page 2, it has to be ignored because of that
        self::assertStringNotContainsString(self::ISSUE_331_PAGE_TWO_PHRASE, $document->getText(1));
    }

    /**
     * Tests extraction of XMP Metadata vs. getHeader() data.
     *
     * @see https://github.com/smalot/pdfparser/pull/606
     */
    public function testExtractXMPMetadata(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/XMP_Metadata.pdf');

        $details = $document->getDetails();

        // Fail with a readable assertion message instead of an
        // undefined-index error when the XMP title was not extracted.
        self::assertArrayHasKey('dc:title', $details);

        // Test that the dc:title data was extracted from the XMP
        // Metadata.
        self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $details['dc:title']);
    }

    /**
     * Tests PDFDocEncoding decode of Document Properties
     *
     * @see https://github.com/smalot/pdfparser/issues/609
     */
    public function testPDFDocEncodingDecode(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue609.pdf');

        $details = $document->getDetails();

        // Guard the lookups below so a missing property is reported as a
        // failed assertion rather than an undefined-index error.
        self::assertArrayHasKey('Keywords', $details);
        self::assertArrayHasKey('Subject', $details);

        // These test that Adobe-inserted \r are removed from a UTF-8
        // escaped metadata string, and the surrounding characters are
        // repaired.
        //
        // The literal follows the PDFDocEncoding table in byte order, so
        // every character must stay a single code point: in particular the
        // ligatures U+FB01/U+FB02 (bytes 0x93/0x94) and the pair U+00D7,
        // U+00D8 (bytes 0xD7/0xD8). Text tools that expand ligatures or
        // "repair" mojibake corrupt this string; do not run them on it.
        $testKeywords = '˘ˇˆ˙˝˛˞˜•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ﬁﬂŁŒŠŸŽıłœšž€¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ';
        self::assertStringContainsString($testKeywords, $details['Keywords']);

        $testKeywords = 'added line-feeds often destroy multibyte characters';
        self::assertStringContainsString($testKeywords, $details['Keywords']);

        // This tests that the PDFDocEncoding characters that differ
        // from CP-1252 are decoded to their correct UTF-8 code points
        // as well as removing \r line-feeds
        $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž';
        self::assertStringContainsString($testSubject, $details['Subject']);
    }
}
116