Test Failed
Push — master ( 52c4f6...feaf39 )
by Konrad
07:50
created

DocumentIssueFocusTest   A

Complexity

Total Complexity 4

Size/Duplication

Total Lines 68
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 16
c 1
b 0
f 0
dl 0
loc 68
rs 10
wmc 4

4 Methods

Rating   Name   Duplication   Size   Complexity  
A testPDFDocEncodingDecode() 0 20 1
A testGetTextWithPageLimit() 0 6 1
A testExtractXMPMetadata() 0 9 1
A testGetTextNoPageLimit() 0 5 1
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 *
8
 * @date    2020-06-01
9
 *
10
 * @author  Sébastien MALOT <[email protected]>
11
 *
12
 * @date    2017-01-03
13
 *
14
 * @license LGPLv3
15
 *
16
 * @url     <https://github.com/smalot/pdfparser>
17
 *
18
 *  PdfParser is a pdf library written in PHP, extraction oriented.
19
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
20
 *
21
 *  This program is free software: you can redistribute it and/or modify
22
 *  it under the terms of the GNU Lesser General Public License as published by
23
 *  the Free Software Foundation, either version 3 of the License, or
24
 *  (at your option) any later version.
25
 *
26
 *  This program is distributed in the hope that it will be useful,
27
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
28
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
29
 *  GNU Lesser General Public License for more details.
30
 *
31
 *  You should have received a copy of the GNU Lesser General Public License
32
 *  along with this program.
33
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
34
 */
35
36
namespace PHPUnitTests\Integration;
37
38
use PHPUnitTests\TestCase;
39
use Smalot\PdfParser\Document;
40
use Smalot\PdfParser\Parser;
41
42
/**
 * Document related tests, each focused on a specific reported issue or pull request.
 */
class DocumentIssueFocusTest extends TestCase
{
    /**
     * Parses one of the sample PDF files shipped below the project root directory.
     *
     * @param string $relativePath path of the sample relative to $this->rootDir, starting with "/"
     *
     * @return Document the parsed document
     */
    private function parseSample(string $relativePath): Document
    {
        return (new Parser())->parseFile($this->rootDir.$relativePath);
    }

    /**
     * Tests getText method without a given page limit.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextNoPageLimit(): void
    {
        $document = $this->parseSample('/samples/bugs/Issue331.pdf');

        self::assertStringContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText());
    }

    /**
     * Tests getText method with a given page limit.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextWithPageLimit(): void
    {
        $document = $this->parseSample('/samples/bugs/Issue331.pdf');

        // The text is located on page 2, so limiting the extraction to one
        // page must leave it out.
        self::assertStringNotContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText(1));
    }

    /**
     * Tests extraction of XMP Metadata vs. getHeader() data.
     *
     * @see https://github.com/smalot/pdfparser/pull/606
     */
    public function testExtractXMPMetadata(): void
    {
        $details = $this->parseSample('/samples/XMP_Metadata.pdf')->getDetails();

        // Fail with a clear assertion message (instead of an undefined-index
        // warning) when the XMP title was not extracted at all.
        self::assertArrayHasKey('dc:title', $details);

        // Test that the dc:title data was extracted from the XMP Metadata.
        self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $details['dc:title']);
    }

    /**
     * Tests PDFDocEncoding decode of Document Properties.
     *
     * @see https://github.com/smalot/pdfparser/issues/609
     */
    public function testPDFDocEncodingDecode(): void
    {
        $details = $this->parseSample('/samples/bugs/Issue609.pdf')->getDetails();

        self::assertArrayHasKey('Keywords', $details);
        self::assertArrayHasKey('Subject', $details);

        // These test that Adobe-inserted \r are removed from a UTF-8
        // escaped metadata string, and the surrounding characters are
        // repaired.
        //
        // The expected value is split so that the code points most easily
        // damaged by text tooling are spelled as escapes: the "fi"/"fl"
        // ligatures (U+FB01, U+FB02) must not be flattened to the ASCII
        // letters "fifl", and U+00D7 (multiplication sign) followed by
        // U+00D8 (O with stroke) must not be merged into another character.
        $expectedCharacters = '˘ˇˆ˙˝˛˞˜•†‡…—–ƒ⁄‹›−‰„“”‘’‚™'
            ."\u{FB01}\u{FB02}"
            .'ŁŒŠŸŽıłœšž€¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ'
            ."\u{00D7}\u{00D8}"
            .'ÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ';
        self::assertStringContainsString($expectedCharacters, $details['Keywords']);

        $expectedSentence = 'added line-feeds often destroy multibyte characters';
        self::assertStringContainsString($expectedSentence, $details['Keywords']);

        // This tests that the PDFDocEncoding characters that differ
        // from CP-1252 are decoded to their correct UTF-8 code points
        // as well as removing \r line-feeds.
        $expectedSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž';
        self::assertStringContainsString($expectedSubject, $details['Subject']);
    }
}
115