Test Failed
Push — master ( 52c4f6...feaf39 )
by Konrad
07:50
created

testGetTextPull634LibreOffice()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 4
c 1
b 0
f 0
nc 1
nop 0
dl 0
loc 7
rs 10
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 *
8
 * @date    2020-06-01
9
 *
10
 * @license LGPLv3
11
 *
12
 * @url     <https://github.com/smalot/pdfparser>
13
 *
14
 *  PdfParser is a pdf library written in PHP, extraction oriented.
15
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
16
 *
17
 *  This program is free software: you can redistribute it and/or modify
18
 *  it under the terms of the GNU Lesser General Public License as published by
19
 *  the Free Software Foundation, either version 3 of the License, or
20
 *  (at your option) any later version.
21
 *
22
 *  This program is distributed in the hope that it will be useful,
23
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
24
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
25
 *  GNU Lesser General Public License for more details.
26
 *
27
 *  You should have received a copy of the GNU Lesser General Public License
28
 *  along with this program.
29
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
30
 */
31
32
namespace PHPUnitTests\Integration;
33
34
use PHPUnitTests\TestCase;
35
use Smalot\PdfParser\Parser;
36
37
/**
 * Document related tests which focus on certain PDF generators.
 *
 * Each test parses one sample file from samples/grouped-by-generator and
 * checks the text returned by getText() of the parsed document.
 */
class DocumentGeneratorFocusTest extends TestCase
{
    /**
     * Test getText result.
     *
     * PDF generated with Chromium 116 via SaveAs-dialog.
     */
    public function testGetTextPull634Chromium(): void
    {
        $outputText = $this->getTextOfSample('R2RML-Spec_Generated_by_Chromium-SaveAs-PDF.pdf');

        self::assertStringContainsString('R2RML: RDB to RDF Mapping Language', $outputText);
    }

    /**
     * Test getText result.
     *
     * PDF (v 1.4) generated with Inkscape 0.92.
     */
    public function testGetTextPull634InkscapePDF14(): void
    {
        $outputText = $this->getTextOfSample('SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.4.pdf');

        // assertSame compares type and value, so only the exact string passes
        self::assertSame('TEST', $outputText);
    }

    /**
     * Test getText result.
     *
     * PDF (v 1.5) generated with Inkscape 0.92.
     */
    public function testGetTextPull634InkscapePDF15(): void
    {
        $outputText = $this->getTextOfSample('SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.5.pdf');

        // assertSame compares type and value, so only the exact string passes
        self::assertSame('TEST', $outputText);
    }

    /**
     * Test getText result.
     *
     * PDF (1.4) generated with LibreOffice Writer (6.4).
     *
     * @see https://help.libreoffice.org/6.4/en-US/text/shared/01/ref_pdf_export.html
     */
    public function testGetTextPull634LibreOffice(): void
    {
        $outputText = $this->getTextOfSample('RichDocument_Generated_by_Libreoffice-6.4_PDF-v1.4.pdf');

        self::assertStringContainsString(
            'Some currency symbols: £, €, ¥'."\n".'German characters: ÄÖÜß',
            $outputText
        );
    }

    /**
     * Test getText result.
     *
     * PDF (v 1.7) generated with Microsoft Print-to-PDF via Firefox.
     */
    public function testGetTextPull634MicrosoftPDF17(): void
    {
        $outputText = $this->getTextOfSample('Wikipedia-PDF_Generated_by_Microsoft_Print-to-PDF.pdf');

        self::assertStringContainsString(
            'Adobe PDF icon'."\n".'Filename'."\n".'extension',
            $outputText
        );

        self::assertStringContainsString(
            'are necessary to make, use, sell, and distribute PDF-compliant',
            $outputText
        );
    }

    /**
     * Test getText result.
     *
     * PDF (v 1.5) generated by Microsoft Word 2016.
     */
    public function testGetTextPull634MicrosoftWord2016(): void
    {
        $outputText = $this->getTextOfSample('Mostly_German_Text_Generated_By_Microsoft_Word_2016.pdf');

        self::assertStringContainsString('(einschließlich Marktpflegequote) von 4 Mrd € angestrebt.', $outputText);

        // check whitespaces and tab usage; the tab which follows "Fälligkeit: "
        // is written as an escape sequence so that it stays visible in the
        // source and can not be converted to spaces by an editor
        self::assertStringContainsString(
            'Fälligkeit: '."\t".'19. Oktober 2028 '."\n".
            'Zinszahlung: 19. Oktober gzj., Zinslaufbeginn 15. Juni 2023',
            $outputText
        );
    }

    /**
     * Test getText result.
     *
     * PDF (v 1.5) generated with Power PDF Create.
     */
    public function testGetTextPull634PowerPDFCreate(): void
    {
        $outputText = $this->getTextOfSample('Mostly_German_Text_Generated_by_Power_PDF_Create.pdf');

        // located on page 1
        self::assertStringContainsString(
            'Index-Verhältniszahl: 1,17812 (am Valutierungstag 7. September 2023)',
            $outputText
        );

        // located on page 2
        self::assertStringContainsString(
            'Einbeziehung in den '."\n".
            'Börsenhandel: Dienstag, 5. September 2023 '."\n".
            'Valutierungstag: Donnerstag, 7. September 2023',
            $outputText
        );
    }

    /**
     * Test getText result.
     *
     * PDF generated from .docx with SmallPDF (https://smallpdf.com)
     */
    public function testGetTextPull634SmallPDF(): void
    {
        $outputText = $this->getTextOfSample('Document_Generated_by_SmallPDF.pdf');

        // Actual encoded spaces in the document are preserved
        self::assertStringContainsString(
            'SmallPDF                       SMALLPDF                             SmallPDF',
            $outputText
        );

        // Hebrew text
        self::assertStringContainsString(
            'Hebrew Keyboard - תדלקמ תירבעב - Type Hebrew Online',
            $outputText
        );

        // Russian text
        self::assertStringContainsString(
            'Russian Keyboard - русская клавиатура - Type Russian',
            $outputText
        );
    }

    /**
     * Test getText result.
     *
     * PDF (1.6) generated by Word için Acrobat PDFMaker 17.
     */
    public function testGetTextPull634WordIcinAcrobatPDFMaker17(): void
    {
        $outputText = $this->getTextOfSample('Mostly_Turkish_Text_Generated_by_Word_için_Acrobat_PDFMaker_17.pdf');

        self::assertStringContainsString(
            'İhracat ve döviz kazandırıcı hizmetler reeskont kredisi günlük',
            $outputText
        );

        // Unnecessary tabs are not inserted due to font-size being 1,
        // but the text-matrix scale is 9 or 10
        self::assertStringContainsString(
            'dikkate alınmasına devam edilecektir.',
            $outputText
        );

        // This encoded segment contains an escaped backslash right before
        // an octal code: \\\000. Account for this in Font::decodeOctal()
        // See: https://github.com/smalot/pdfparser/pull/640
        self::assertStringContainsString('Sayı: 2023-34', $outputText);
    }

    /**
     * Parses a sample PDF with a fresh Parser instance and returns its text.
     *
     * @param string $fileName name of a file inside samples/grouped-by-generator
     *
     * @return string result of getText() called on the parsed document
     */
    private function getTextOfSample(string $fileName): string
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/'.$fileName);

        return $document->getText();
    }
}
226