Passed
Push — master ( f7cc41...5d3746 )
by Konrad
02:11
created

ParserTest::testChangedFontSpaceLimit()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 11
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 6
nc 1
nop 0
dl 0
loc 11
rs 10
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 * @date    2020-06-01
8
 *
9
 * @author  Sébastien MALOT <[email protected]>
10
 * @date    2017-01-03
11
 *
12
 * @license LGPLv3
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Tests\Smalot\PdfParser\Integration;
34
35
use Exception;
36
use Smalot\PdfParser\Config;
37
use Smalot\PdfParser\Document;
38
use Smalot\PdfParser\Parser;
39
use Smalot\PdfParser\XObject\Image;
40
use Tests\Smalot\PdfParser\TestCase;
41
42
/**
 * Integration tests for Smalot\PdfParser\Parser.
 *
 * Most tests parse a sample PDF below $this->rootDir.'/samples' and check the
 * extracted text. A fresh Parser instance is stored in $this->fixture before
 * every test (see setUp()).
 */
class ParserTest extends TestCase
{
    /**
     * Creates a fresh parser with default configuration for each test.
     */
    protected function setUp()
    {
        parent::setUp();

        $this->fixture = new Parser();
    }

    /**
     * Parses every PDF found in samples/bugs and expects each of them to have
     * at least one page and non-empty text on every page.
     *
     * Two kinds of exceptions are tolerated because they describe known
     * limitations: the "secured pdf" message and any message starting with
     * "TCPDF_PARSER". Everything else is re-thrown.
     *
     * If the samples directory does not exist, the test performs no assertions.
     *
     * @throws Exception any exception which is not one of the tolerated ones
     */
    public function testParseFile()
    {
        $directory = $this->rootDir.'/samples/bugs';

        if (!is_dir($directory)) {
            return;
        }

        foreach (scandir($directory) as $file) {
            if (!preg_match('/^.*\.pdf$/i', $file)) {
                continue;
            }

            try {
                $document = $this->fixture->parseFile($directory.'/'.$file);
                $pages = $document->getPages();
                $this->assertGreaterThan(0, \count($pages), 'No pages found in '.$file);

                foreach ($pages as $page) {
                    $content = $page->getText();
                    $this->assertGreaterThan(0, \strlen($content), 'Empty page text in '.$file);
                }
            } catch (Exception $e) {
                $message = $e->getMessage();

                /*
                 * strpos() returns false when the needle is missing, and a loose
                 * comparison treats false like 0. The previous check
                 * (0 != strpos(...)) therefore swallowed every exception without
                 * the marker, including failed assertions from the try block
                 * above. Strict comparison makes "starts with" explicit.
                 */
                $isTolerated = 'Secured pdf file are currently not supported.' === $message
                    || 0 === strpos($message, 'TCPDF_PARSER');

                if (!$isTolerated) {
                    throw $e;
                }
            }
        }
    }

    /**
     * Properly decode international unicode characters
     *
     * @todo the other languages in the test document need work because of issues with UTF-16 decoding (Chinese, Japanese) and missing right-to-left language support
     */
    public function testUnicodeDecoding()
    {
        $filename = $this->rootDir.'/samples/InternationalChars.pdf';

        $document = $this->fixture->parseFile($filename);

        $expectedSnippets = [
            'cyrillic' => "Лорем ипсум долор сит амет, еу сед либрис долорем инцоррупте. Ут лорем долоре граеце хис, модо \nаппареат сапиентем ут мел. Хис ат лаборе омнесяуе сигниферумяуе, тале анциллае ан еум, ех сед синт \nнобис. Сед модус вивендо цопиосае еа, сапиентем цонцептам хис не, яуандо сплендиде еум те.",
            'greek' => "Λορεμ ιπσθμ δολορ σιτ αμετ, τατιον cονστιτθαμ ομιτταντθρ εα σεα, αθδιαμ μανδαμθσ μελ τε. Διcο μθτατ \nινδοcτθμ εοσ ει, ει vιξ σονετ παρτιενδο ινcορρθπτε. Επιcθρι αντιοπαμ εθ νεc, ναμ λεγιμθσ γθβεργρεν ιν. \nVιξ σολετ ρεcτεqθε εα, ηασ νο αλιqθαμ μινιμθμ. Ιδ προ περcιπιτ περιcθλισ δετερρθισσετ, ιν νεc αππετερε \nομιτταντθρ ελοqθεντιαμ, ορατιο δοcτθσ ναμ αδ. Ετ σιτ σολθμ ρεcθσαβο, vιξ θτ λοβορτισ σπλενδιδε \nρεπθδιανδαε.",
            'armenian' => "լոռեմ իպսում դոլոռ սիթ ամեթ վիս ին իմպեդիթ ադմոդում ծու ապպառեաթ սծռիպսեռիթ մել մել եթ \nդոմինգ ծոնսեքուունթուռ ծիվիբուս վիվենդում պռոդեսսեթ ադ մեի թիբիքուե ապպառեաթ սիմիլիքուե թե \nվիմ վիխ ծասե սեմպեռ դոլոռեմ եխ եամ եա սթեթ մեդիոծռեմ ծոնսեթեթուռ ռաթիոնիբուս ինթելլեգամ \nմել թե",
            'georgian' => "ლორემ იფსუმ დოლორ სით ამეთ ესთ ეთ სონეთ ზრილ მელიუს ელიგენდი თორყუათოს \nელოყუენთიამ ესთ ეხ უსუ ფალლი ალთერა ცეთეროს ინ ეთ ომითთამ თრაცთათოს ჰის ეუ ველ \nალთერუმ ვოლუფთათუმ მაზიმ ფერთინახ ჰენდრერით ინ ფრი ნეც ინ თემფორ ფეთენთიუმ ვერო \nფოსთულანთ ელოყუენთიამ უსუ ნე ან ყუი ლიბერ ეფიცური ასსუევერით იდ ნიბჰ ყუას ჰაბემუს სეა",
            'korean' => "그 임기는 4년으로 하며. 이 경우 그 명령에 의하여 개정 또는 폐지되었던 법률은 그 명령이 승인을 얻지 못한 때부터 당연히 효력을 \n회복한다. 가부동수인 때에는 부결된 것으로 본다. 법률과 적법한 절차에 의하지 아니하고는 처벌·보안처분 또는 강제노역을 받지 \n아니한다.",
            'western' => 'ÄÖÜöäüßẞ Ññ¡¿ øÅå',
        ];

        // Extract the text once instead of once per assertion.
        $text = $document->getText();

        foreach ($expectedSnippets as $script => $snippet) {
            $this->assertStringContainsString($snippet, $text, 'Missing or wrongly decoded text: '.$script);
        }
    }

    /**
     * Tests that xrefs with line breaks between id and position are parsed correctly
     *
     * The structure below describes an object stream ("ObjStm") whose stream
     * header contains the id "17" and its position "0" separated by a line
     * break; after parsing, object "17_0" must be known to the parser.
     *
     * @see https://github.com/smalot/pdfparser/issues/336
     */
    public function testIssue19()
    {
        $fixture = new ParserSub();
        $structure = [
            [
                '<<',
                [
                    [
                        '/',
                        'Type',
                        7735,
                    ],
                    [
                        '/',
                        'ObjStm',
                        7742,
                    ],
                ],
            ],
            [
                'stream',
                '',
                7804,
                [
                    "17\n0",
                    [],
                ],
            ],
        ];
        $document = new Document();

        $fixture->exposedParseObject('19_0', $structure, $document);
        $objects = $fixture->getObjects();

        $this->assertArrayHasKey('17_0', $objects);
    }

    /**
     * Properly decode ANSI encodings without producing scrambled UTF-8 characters
     *
     * @see https://github.com/smalot/pdfparser/issues/202
     * @see https://github.com/smalot/pdfparser/pull/257
     */
    public function testIssue202()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue202.pdf';

        $document = $this->fixture->parseFile($filename);

        $this->assertSame('„fööbär“', $document->getText());
    }

    /**
     * Test that issue related pdf can now be parsed
     *
     * @see https://github.com/smalot/pdfparser/issues/267
     */
    public function testIssue267()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue267_array_access_on_int.pdf';

        $document = $this->fixture->parseFile($filename);

        $this->assertSame(Image::class, \get_class($document->getObjectById('128_0')));
        $this->assertStringContainsString('4 von 4', $document->getText());
    }

    /**
     * Test that issue related pdf can now be parsed:
     * Too many slashes were being stripped and resulted
     * in malformed encoding of parts of the text content.
     *
     * @see https://github.com/smalot/pdfparser/issues/322
     */
    public function testIssue322()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue322.pdf';

        $document = $this->fixture->parseFile($filename);

        $this->assertStringContainsString('this text isn’t working properly, I’ve edited it in Google Documents', $document->getText());
    }

    /**
     * Test that issue related pdf can now be parsed:
     * Too many slashes were being stripped and resulted
     * in malformed encoding of parts of the text content.
     *
     * License of the content taken from https://stackoverflow.com in the sample PDF:
     * CC BY-SA 2.5 https://creativecommons.org/licenses/by-sa/2.5/
     *
     * @see https://github.com/smalot/pdfparser/issues/334
     */
    public function testIssue334()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue334.pdf';

        $document = $this->fixture->parseFile($filename);

        $this->assertStringContainsString('This question already has an answer here', $document->getText());
    }

    /**
     * Test that issue related pdf can now be parsed:
     * Glyphs not in the Postscript lookup table would cause "Notice: Undefined offset"
     *
     * @see https://github.com/smalot/pdfparser/issues/359
     */
    public function testIssue359()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue359.pdf';

        $document = $this->fixture->parseFile($filename);

        // Extract the text once instead of once per assertion.
        $text = $document->getText();

        $this->assertStringContainsString(
            'dnia 10 maja 2018 roku o ochronie danych osobowych',
            $text
        );
        $this->assertStringContainsString('sprawie ochrony osób fizycznych w związku', $text);
        /*
         * @todo Note that the "ł" in przepływu is decoded as a space character. This was already
         * the case before the PR that caused this issue and is not currently covered by this
         * test case. However, this issue should be addressed in the future and its fix can then
         * be incorporated into this test by uncommenting the following assertion.
         */
        // $this->assertStringContainsString('sprawie swobodnego przepływu takich danych oraz uchylenia dyrektywy', $text);
    }

    /**
     * Tests behavior when changing default font space limit (-50).
     *
     * Test is based on testIssue359 (above). With a limit of 1 the extracted
     * text is expected to contain additional spaces inside and between words.
     */
    public function testChangedFontSpaceLimit()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue359.pdf';

        $config = new Config();
        $config->setFontSpaceLimit(1); // change default value

        $this->fixture = new Parser([], $config);
        $document = $this->fixture->parseFile($filename);

        $this->assertStringContainsString('dni a  10  maj a  2018', $document->getText());
    }

    /**
     * Tests if a given Config object is really used.
     * Or if a default one is generated, if null was given.
     */
    public function testUsageOfConfigObject()
    {
        // no config argument at all: a default Config is expected
        $this->fixture = new Parser([]);
        $this->assertEquals(new Config(), $this->fixture->getConfig());

        // explicit null: a default Config is expected as well
        $this->fixture = new Parser([], null);
        $this->assertEquals(new Config(), $this->fixture->getConfig());

        // custom config: the parser has to report the given settings
        $config = new Config();
        $config->setFontSpaceLimit(1000);
        $this->fixture = new Parser([], $config);
        $this->assertEquals($config, $this->fixture->getConfig());
    }
}
280
281
/**
 * Test helper which makes internals of Parser reachable from test code.
 *
 * It forwards to Parser::parseObject() and exposes the $objects property,
 * both of which are accessed here via $this only.
 */
class ParserSub extends Parser
{
    /**
     * Forwards the given arguments unchanged to Parser::parseObject().
     *
     * No value is returned on purpose: static analysis of this file reported
     * that Parser::parseObject() always yields null, so forwarding its result
     * was meaningless (NOTE(review): confirm against Parser::parseObject()).
     * Callers still receive null, exactly as before.
     *
     * @param string $id        object id, e.g. '19_0'
     * @param array  $structure parsed structure describing the object
     * @param mixed  $document  document instance the object belongs to
     */
    public function exposedParseObject($id, $structure, $document)
    {
        $this->parseObject($id, $structure, $document);
    }

    /**
     * Returns the parser's $objects property as is.
     *
     * @return mixed presumably an array keyed by object id (e.g. '17_0'); verify against Parser
     */
    public function getObjects()
    {
        return $this->objects;
    }
}
293