Test Failed
Pull Request — master (#634)
by Konrad
02:31
created

testGetTextPull634SmallPDF()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 22
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 11
c 1
b 0
f 0
nc 1
nop 0
dl 0
loc 22
rs 9.9
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 *
8
 * @date    2020-06-01
9
 *
10
 * @license LGPLv3
11
 *
12
 * @url     <https://github.com/smalot/pdfparser>
13
 *
14
 *  PdfParser is a pdf library written in PHP, extraction oriented.
15
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
16
 *
17
 *  This program is free software: you can redistribute it and/or modify
18
 *  it under the terms of the GNU Lesser General Public License as published by
19
 *  the Free Software Foundation, either version 3 of the License, or
20
 *  (at your option) any later version.
21
 *
22
 *  This program is distributed in the hope that it will be useful,
23
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
24
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
25
 *  GNU Lesser General Public License for more details.
26
 *
27
 *  You should have received a copy of the GNU Lesser General Public License
28
 *  along with this program.
29
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
30
 */
31
32
namespace PHPUnitTests\Integration;
33
34
use PHPUnitTests\TestCase;
35
use Smalot\PdfParser\Parser;
36
37
/**
38
 * Document related tests which focus on certain PDF generators.
39
 */
40
class DocumentGeneratorFocusTest extends TestCase
{
    /**
     * Parses one of the sample PDFs stored under samples/grouped-by-generator/.
     *
     * Every test in this class loads its fixture through this helper so the
     * directory prefix is written down exactly once.
     *
     * @param string $fileName file name of the sample, without any directory part
     *
     * @return mixed whatever Parser::parseFile() returns; the tests below call
     *               getText() and getObjectsByType() on it
     */
    private function parseGeneratorSample(string $fileName)
    {
        return (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/'.$fileName);
    }

    /**
     * Test getText result.
     *
     * PDF generated with Chromium 116 via SaveAs-dialog.
     */
    public function testGetTextPull634Chromium(): void
    {
        $document = $this->parseGeneratorSample('R2RML-Spec_Generated_by_Chromium-SaveAs-PDF.pdf');

        self::assertStringContainsString('R2RML: RDB to RDF Mapping Language', $document->getText());
    }

    /**
     * Test getText result.
     *
     * PDF (v 1.4) generated with Inkscape 0.92.
     */
    public function testGetTextPull634InkscapePDF14(): void
    {
        $document = $this->parseGeneratorSample('SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.4.pdf');

        // assertSame: the extracted text must be exactly this string (type and value).
        self::assertSame('TEST', $document->getText());
    }

    /**
     * Test getText result.
     *
     * PDF (v 1.5) generated with Inkscape 0.92.
     */
    public function testGetTextPull634InkscapePDF15(): void
    {
        $document = $this->parseGeneratorSample('SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.5.pdf');

        // assertSame: the extracted text must be exactly this string (type and value).
        self::assertSame('TEST', $document->getText());
    }

    /**
     * Test getText result.
     *
     * PDF (1.4) generated with LibreOffice Writer (6.4).
     *
     * @see https://help.libreoffice.org/6.4/en-US/text/shared/01/ref_pdf_export.html
     */
    public function testGetTextPull634LibreOffice(): void
    {
        $document = $this->parseGeneratorSample('RichDocument_Generated_by_Libreoffice-6.4_PDF-v1.4.pdf');

        self::assertStringContainsString(
            'Some currency symbols: £, €, ¥'."\n".'German characters: ÄÖÜß',
            $document->getText()
        );
    }

    /**
     * Test getText result.
     *
     * PDF (v 1.7) generated with Microsoft Print-to-PDF via Firefox.
     */
    public function testGetTextPull634MicrosoftPDF17(): void
    {
        $document = $this->parseGeneratorSample('Wikipedia-PDF_Generated_by_Microsoft_Print-to-PDF.pdf');

        $outputText = $document->getText();

        self::assertStringContainsString(
            'Adobe PDF icon'."\n".'Filename'."\n".'extension',
            $outputText
        );

        self::assertStringContainsString(
            'are necessary to make, use, sell, and distribute PDF-compliant',
            $outputText
        );
    }

    /**
     * Test Document functions.
     *
     * PDF (v 1.5) generated by Microsoft Word 2016.
     */
    public function testGetTextPull634MicrosoftWord2016(): void
    {
        $document = $this->parseGeneratorSample('Mostly_German_Text_Generated_By_Microsoft_Word_2016.pdf');

        $outputText = $document->getText();

        self::assertStringContainsString('(einschließlich Marktpflegequote) von 4 Mrd € angestrebt.', $outputText);

        // Check whitespace and tab usage. The tab after "Fälligkeit: " is written
        // as an explicit "\t" escape so editors cannot silently turn it into spaces.
        self::assertStringContainsString(
            'Fälligkeit: '."\t".'19. Oktober 2028 '."\n".
            'Zinszahlung: 19. Oktober gzj., Zinslaufbeginn 15. Juni 2023',
            $outputText
        );

        // Get the two images at the top of the PDF.
        // NOTE(review): this image check was added on purpose as a failing example
        // for investigating pull request #634. If the change under review turns out
        // to be unrelated to image handling, remove this assertion.
        $images = $document->getObjectsByType('Image');
        self::assertCount(2, $images);
    }

    /**
     * Test getText result.
     *
     * PDF (v 1.5) generated with Power PDF Create.
     */
    public function testGetTextPull634PowerPDFCreate(): void
    {
        $document = $this->parseGeneratorSample('Mostly_German_Text_Generated_by_Power_PDF_Create.pdf');

        $outputText = $document->getText();

        // located on page 1
        self::assertStringContainsString(
            'Index-Verhältniszahl: 1,17812 (am Valutierungstag 7. September 2023)',
            $outputText
        );

        // located on page 2
        self::assertStringContainsString(
            'Einbeziehung in den '."\n".
            'Börsenhandel: Dienstag, 5. September 2023 '."\n".
            'Valutierungstag: Donnerstag, 7. September 2023',
            $outputText
        );
    }

    /**
     * Test getText result.
     *
     * PDF generated from .docx with SmallPDF (https://smallpdf.com)
     */
    public function testGetTextPull634SmallPDF(): void
    {
        $document = $this->parseGeneratorSample('Document_Generated_by_SmallPDF.pdf');

        $outputText = $document->getText();

        // Actual encoded spaces in the document are preserved
        self::assertStringContainsString(
            'SmallPDF                       SMALLPDF                             SmallPDF',
            $outputText
        );

        // Hebrew text
        self::assertStringContainsString(
            'Hebrew Keyboard - תדלקמ תירבעב - Type Hebrew Online',
            $outputText
        );

        // Russian text
        self::assertStringContainsString(
            'Russian Keyboard - русская клавиатура - Type Russian',
            $outputText
        );
    }

    /**
     * Test getText result.
     *
     * PDF (1.6) generated by Word için Acrobat PDFMaker 17.
     */
    public function testGetTextPull634WordIcinAcrobatPDFMaker17(): void
    {
        $document = $this->parseGeneratorSample('Mostly_Turkish_Text_Generated_by_Word_için_Acrobat_PDFMaker_17.pdf');

        $outputText = $document->getText();

        self::assertStringContainsString(
            'İhracat ve döviz kazandırıcı hizmetler reeskont kredisi günlük',
            $outputText
        );

        // White spaces and tabs are preserved. The tab after "alınmasına " is
        // written as an explicit "\t" escape so it stays visible and intact.
        self::assertStringContainsString(
            'dikkate alınmasına '."\t".'devam edilecektir.',
            $outputText
        );

        // The test author assumed this assertion fails because of a wrongly
        // de/encoded character after "Say000".
        // NOTE(review): the expected string itself contains U+FFFD (replacement
        // character); confirm which character is really intended here.
        self::assertStringContainsString('Say000�: 20 23-34', $outputText);

        // Get the images at the top of the PDF.
        // NOTE(review): the original comment said "the image" (singular) while
        // two are expected; confirm the intended count. This check was added on
        // purpose as a failing example for investigating pull request #634. If the
        // change under review is unrelated to image handling, remove this assertion.
        $images = $document->getObjectsByType('Image');
        self::assertCount(2, $images);
    }
}
236