Test Failed
Push — master ( ce434c...c97499 )
by Konrad
02:01
created

ParserTest::testIssue334()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 3
nc 1
nop 0
dl 0
loc 7
rs 10
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 *
8
 * @date    2020-06-01
9
 *
10
 * @author  Sébastien MALOT <[email protected]>
11
 *
12
 * @date    2017-01-03
13
 *
14
 * @license LGPLv3
15
 *
16
 * @url     <https://github.com/smalot/pdfparser>
17
 *
18
 *  PdfParser is a pdf library written in PHP, extraction oriented.
19
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
20
 *
21
 *  This program is free software: you can redistribute it and/or modify
22
 *  it under the terms of the GNU Lesser General Public License as published by
23
 *  the Free Software Foundation, either version 3 of the License, or
24
 *  (at your option) any later version.
25
 *
26
 *  This program is distributed in the hope that it will be useful,
27
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
28
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
29
 *  GNU Lesser General Public License for more details.
30
 *
31
 *  You should have received a copy of the GNU Lesser General Public License
32
 *  along with this program.
33
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
34
 */
35
36
namespace PHPUnitTests\Integration;
37
38
use PHPUnitTests\TestCase;
39
use Smalot\PdfParser\Config;
40
use Smalot\PdfParser\Document;
41
use Smalot\PdfParser\Parser;
42
use Smalot\PdfParser\XObject\Image;
43
44
class ParserTest extends TestCase
45
{
46
    /**
     * Runs the parent set-up and then installs a Parser constructed without
     * arguments as the fixture used by the tests in this class.
     */
    protected function setUp(): void
    {
        parent::setUp();

        $parser = new Parser();
        $this->fixture = $parser;
    }
52
53
    /**
     * Parses every *.pdf file found in samples/bugs and checks that each one
     * yields at least one page and that no page has empty text.
     *
     * Only two kinds of exception are tolerated: the "secured pdf" message and
     * messages starting with "TCPDF_PARSER". Anything else is re-thrown, which
     * includes PHPUnit assertion failures raised inside the try block.
     *
     * Notice: it may fail to run in Scrutinizer because of memory limitations.
     *
     * @group memory-heavy
     */
    public function testParseFile(): void
    {
        $directory = $this->rootDir.'/samples/bugs';

        if (!is_dir($directory)) {
            return;
        }

        // scandir() returns false on failure; treat that like an empty listing.
        $files = scandir($directory) ?: [];

        foreach ($files as $file) {
            if (1 !== preg_match('/^.*\.pdf$/i', $file)) {
                continue;
            }

            try {
                $document = $this->fixture->parseFile($directory.'/'.$file);
                $pages = $document->getPages();
                $this->assertTrue(0 < \count($pages), 'No pages found in '.$file);

                foreach ($pages as $page) {
                    $content = $page->getText();
                    $this->assertTrue('' !== $content, 'Empty page text in '.$file);
                }
            } catch (\Exception $e) {
                $message = $e->getMessage();

                // Strict comparisons are required here: strpos() returns false
                // when the needle is absent, and a loose "0 != false" is false,
                // which used to swallow almost every exception.
                $isTolerated = 'Secured pdf file are currently not supported.' === $message
                    || 0 === strpos($message, 'TCPDF_PARSER');

                if (!$isTolerated) {
                    throw $e;
                }
            }
        }
    }
88
89
    /**
     * Checks that text in several scripts is decoded properly from
     * samples/InternationalChars.pdf.
     *
     * Each expected snippet must appear in the text extracted from the document.
     *
     * @todo the other languages in the test document need work because of issues with UTF-16 decoding (Chinese, Japanese) and missing right-to-left language support
     */
    public function testUnicodeDecoding(): void
    {
        $document = $this->fixture->parseFile($this->rootDir.'/samples/InternationalChars.pdf');

        // Keys are labels for readability only; the values are what gets asserted.
        $expectedSnippets = [
            'cyrillic' => "Лорем ипсум долор сит амет, еу сед либрис долорем инцоррупте. Ут лорем долоре граеце хис, модо \nаппареат сапиентем ут мел. Хис ат лаборе омнесяуе сигниферумяуе, тале анциллае ан еум, ех сед синт \nнобис. Сед модус вивендо цопиосае еа, сапиентем цонцептам хис не, яуандо сплендиде еум те.",
            'greek' => "Λορεμ ιπσθμ δολορ σιτ αμετ, τατιον cονστιτθαμ ομιτταντθρ εα σεα, αθδιαμ μανδαμθσ μελ τε. Διcο μθτατ \nινδοcτθμ εοσ ει, ει vιξ σονετ παρτιενδο ινcορρθπτε. Επιcθρι αντιοπαμ εθ νεc, ναμ λεγιμθσ γθβεργρεν ιν. \nVιξ σολετ ρεcτεqθε εα, ηασ νο αλιqθαμ μινιμθμ. Ιδ προ περcιπιτ περιcθλισ δετερρθισσετ, ιν νεc αππετερε \nομιτταντθρ ελοqθεντιαμ, ορατιο δοcτθσ ναμ αδ. Ετ σιτ σολθμ ρεcθσαβο, vιξ θτ λοβορτισ σπλενδιδε \nρεπθδιανδαε.",
            'armenian' => "լոռեմ իպսում դոլոռ սիթ ամեթ վիս ին իմպեդիթ ադմոդում ծու ապպառեաթ սծռիպսեռիթ մել մել եթ \nդոմինգ ծոնսեքուունթուռ ծիվիբուս վիվենդում պռոդեսսեթ ադ մեի թիբիքուե ապպառեաթ սիմիլիքուե թե \nվիմ վիխ ծասե սեմպեռ դոլոռեմ եխ եամ եա սթեթ մեդիոծռեմ ծոնսեթեթուռ ռաթիոնիբուս ինթելլեգամ \nմել թե",
            'georgian' => "ლორემ იფსუმ დოლორ სით ამეთ ესთ ეთ სონეთ ზრილ მელიუს ელიგენდი თორყუათოს \nელოყუენთიამ ესთ ეხ უსუ ფალლი ალთერა ცეთეროს ინ ეთ ომითთამ თრაცთათოს ჰის ეუ ველ \nალთერუმ ვოლუფთათუმ მაზიმ ფერთინახ ჰენდრერით ინ ფრი ნეც ინ თემფორ ფეთენთიუმ ვერო \nფოსთულანთ ელოყუენთიამ უსუ ნე ან ყუი ლიბერ ეფიცური ასსუევერით იდ ნიბჰ ყუას ჰაბემუს სეა",
            'korean' => "그 임기는 4년으로 하며. 이 경우 그 명령에 의하여 개정 또는 폐지되었던 법률은 그 명령이 승인을 얻지 못한 때부터 당연히 효력을 \n회복한다. 가부동수인 때에는 부결된 것으로 본다. 법률과 적법한 절차에 의하지 아니하고는 처벌·보안처분 또는 강제노역을 받지 \n아니한다.",
            'western' => 'ÄÖÜöäüßẞ Ññ¡¿ øÅå',
        ];

        foreach ($expectedSnippets as $snippet) {
            $this->assertStringContainsString($snippet, $document->getText());
        }
    }
114
115
    /**
     * Tests that xrefs with line breaks between id and position are parsed correctly.
     *
     * A hand-built object structure (a dictionary part followed by a stream part
     * whose entry is "17\n0") is fed to the parser as object 19_0; afterwards an
     * object with the key 17_0 must be present.
     *
     * @see https://github.com/smalot/pdfparser/issues/336
     */
    public function testIssue19(): void
    {
        $parser = new ParserSub();

        $dictionaryPart = [
            '<<',
            [
                ['/', 'Type', 7735],
                ['/', 'ObjStm', 7742],
            ],
        ];

        $streamPart = [
            'stream',
            '',
            7804,
            [
                "17\n0",
                [],
            ],
        ];

        $parser->exposedParseObject('19_0', [$dictionaryPart, $streamPart], new Document());

        $this->assertArrayHasKey('17_0', $parser->getObjects());
    }
156
157
    /**
     * ANSI encoded text must be decoded without producing scrambled UTF-8 characters.
     *
     * The whole text of the sample document has to equal the expected string.
     *
     * @see https://github.com/smalot/pdfparser/issues/202
     * @see https://github.com/smalot/pdfparser/pull/257
     */
    public function testIssue202(): void
    {
        $parsed = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue202.pdf');

        $this->assertEquals('„fööbär“', $parsed->getText());
    }
171
172
    /**
     * The PDF related to this issue must be parseable.
     *
     * Checks the class of object 128_0 and that a known phrase is part of the text.
     *
     * @see https://github.com/smalot/pdfparser/issues/267
     */
    public function testIssue267(): void
    {
        $parsed = $this->fixture->parseFile(
            $this->rootDir.'/samples/bugs/Issue267_array_access_on_int.pdf'
        );

        $classOfObject = \get_class($parsed->getObjectById('128_0'));

        $this->assertEquals(Image::class, $classOfObject);
        $this->assertStringContainsString('4 von 4', $parsed->getText());
    }
186
187
    /**
     * The PDF related to this issue must be parseable.
     *
     * Background: too many slashes were being stripped, which resulted in
     * malformed encoding of parts of the text content.
     *
     * @see https://github.com/smalot/pdfparser/issues/322
     */
    public function testIssue322(): void
    {
        $parsed = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue322.pdf');

        $this->assertStringContainsString(
            'this text isn’t working properly, I’ve edited it in Google Documents',
            $parsed->getText()
        );
    }
202
203
    /**
     * The PDF related to this issue must be parseable.
     *
     * Background: too many slashes were being stripped, which resulted in
     * malformed encoding of parts of the text content.
     *
     * License of the content taken from https://stackoverflow.com in the sample PDF:
     * CC BY-SA 2.5 https://creativecommons.org/licenses/by-sa/2.5/
     *
     * @see https://github.com/smalot/pdfparser/issues/334
     */
    public function testIssue334(): void
    {
        $parsed = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue334.pdf');

        $this->assertStringContainsString(
            'This question already has an answer here',
            $parsed->getText()
        );
    }
221
222
    /**
     * The PDF related to this issue must be parseable.
     *
     * Background: glyphs not in the Postscript lookup table would cause
     * "Notice: Undefined offset".
     *
     * @see https://github.com/smalot/pdfparser/issues/359
     */
    public function testIssue359(): void
    {
        $parsed = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue359.pdf');

        $this->assertStringContainsString('dnia 10 maja 2018 roku o ochronie danych osobowych', $parsed->getText());
        $this->assertStringContainsString(
            'sprawie ochrony osób fizycznych w związku',
            $parsed->getText()
        );

        /*
         * @todo The "ł" in przepływu is decoded as a space character. That was already
         * the case before the PR that caused this issue, and this test does not cover
         * it yet. Once it is fixed, the assertion below can be enabled.
         */
        // $this->assertStringContainsString('sprawie swobodnego przepływu takich danych oraz uchylenia dyrektywy', $document->getText());
    }
247
248
    /**
     * Tests if PDF triggers "Call to undefined method Smalot\PdfParser\Header::__toString()".
     *
     * It happened because there was a check missing in Font.php (~ line 109).
     *
     * The sample PDF was provided by @dhildreth for usage in our test environment,
     * see https://github.com/smalot/pdfparser/issues/391#issuecomment-783504599
     *
     * @see https://github.com/smalot/pdfparser/issues/391
     */
    public function testIssue391(): void
    {
        $parsed = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue391.pdf');

        // The PDF consists of many pages, so only one example string is looked for.
        $needle = '(This Code will be changed while mass production)';

        $this->assertStringContainsString($needle, $parsed->getText());
    }
272
273
    /**
     * Tests if a PDF with null or empty string headers trigger an Exception.
     *
     * It happened because there was a check missing in Parser.php (parseHeaderElement function).
     *
     * The sample PDF was provided by @DogLoc for usage in our test environment,
     * see https://github.com/smalot/pdfparser/pull/560#issue-1461437944
     *
     * @see https://github.com/smalot/pdfparser/issues/557
     */
    public function testIssue557(): void
    {
        $parsed = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue557.pdf');

        $needle = 'Metal Face Inductive Sensor';

        $this->assertStringContainsString($needle, $parsed->getText());
    }
296
297
    /**
     * Tests if an integer overflow triggers a TypeError in Font::uchr.
     *
     * Parsing the sample must succeed and the text must contain a known phrase.
     *
     * @see https://github.com/smalot/pdfparser/issues/621
     */
    public function testIssue621(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue621.pdf';

        $parsed = $this->fixture->parseFile($filename);
        $text = $parsed->getText();

        $this->assertStringContainsString('What is a biological product?', $text);
    }
308
309
    /**
     * Tests behavior when changing default font space limit (-50).
     *
     * Uses the same sample PDF as testIssue359, but parses it with a font space
     * limit of 1 and expects the differently spaced text that results from it.
     */
    public function testChangedFontSpaceLimit(): void
    {
        $customConfig = new Config();
        $customConfig->setFontSpaceLimit(1); // deviates from the default value

        $this->fixture = new Parser([], $customConfig);
        $parsed = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue359.pdf');

        $this->assertStringContainsString('dni a  10  maj a  2018', $parsed->getText());
    }
326
327
    /**
     * Tests if a given Config object is really used,
     * or if a default one is generated when none (or null) was given.
     */
    public function testUsageOfConfigObject(): void
    {
        // no config argument at all: parser config must equal a fresh Config
        $this->fixture = new Parser([]);
        $expectedDefault = new Config();
        $this->assertEquals($expectedDefault, $this->fixture->getConfig());

        // explicit null: parser config must equal a fresh Config as well
        $this->fixture = new Parser([], null);
        $expectedDefault = new Config();
        $this->assertEquals($expectedDefault, $this->fixture->getConfig());

        // custom config: parser must hand back an equal config
        $customConfig = new Config();
        $customConfig->setFontSpaceLimit(1000);
        $this->fixture = new Parser([], $customConfig);
        $this->assertEquals($customConfig, $this->fixture->getConfig());
    }
347
348
    /**
     * Tests the impact of the retainImageContent config setting on memory usage.
     *
     * The same PDF is parsed repeatedly, first with the default config and then
     * with retainImageContent switched off; the memory usage after each phase
     * is compared against a baseline taken at the start of the test.
     *
     * @group memory-heavy
     *
     * @see https://github.com/smalot/pdfparser/issues/104#issuecomment-883422508
     */
    public function testRetainImageContentImpact(): void
    {
        if (version_compare(\PHP_VERSION, '7.3.0', '<')) {
            $this->markTestSkipped('Garbage collection doesn\'t work reliably enough for this test in PHP < 7.3');
        }

        // Collect garbage first so the baseline is not distorted by earlier tests.
        gc_collect_cycles();
        $baselineMemory = memory_get_usage(true);

        $filename = $this->rootDir.'/samples/bugs/Issue104a.pdf';
        $iterations = 2;

        /*
         * Phase 1: default config, retainImageContent is expected to be true.
         */
        $this->fixture = new Parser([]);
        $this->assertTrue($this->fixture->getConfig()->getRetainImageContent());
        $document = null;

        for ($run = 0; $run < $iterations; ++$run) {
            $document = $this->fixture->parseFile($filename);
        }

        $memoryWithImages = memory_get_usage(true);
        $this->assertTrue($memoryWithImages > ($baselineMemory * 1.5), 'Memory is only '.$memoryWithImages);
        $this->assertTrue(null != $document && '' !== $document->getText());

        // Drop all references and force garbage collection before phase 2.
        $document = null;
        $this->fixture = null;
        gc_collect_cycles();

        /*
         * Phase 2: retainImageContent explicitly set to false.
         */
        $config = new Config();
        $config->setRetainImageContent(false);
        $this->fixture = new Parser([], $config);
        $this->assertEquals($config, $this->fixture->getConfig());

        for ($run = 0; $run < $iterations; ++$run) {
            $document = $this->fixture->parseFile($filename);
        }

        $memoryWithoutImages = memory_get_usage(true);
        /*
         * note: the factor below was chosen manually and may differ from system to system.
         *       it must be high enough to not produce a false negative though.
         */
        $this->assertTrue($memoryWithoutImages < ($baselineMemory * 1.05), 'Memory is '.$memoryWithoutImages);
        $this->assertTrue('' !== $document->getText());
    }
406
}
407
408
/**
 * Test helper that extends Parser so that tests can call parseObject() and
 * read the collected objects directly.
 */
class ParserSub extends Parser
{
    /**
     * Forwards its arguments unchanged to Parser::parseObject().
     *
     * Static analysis reported that parseObject() always yields null, so its
     * result is intentionally not returned; inspect getObjects() afterwards.
     *
     * @param mixed $id        object id, e.g. '19_0' (presumably a string - confirm against Parser::parseObject())
     * @param mixed $structure parsed object structure (the tests pass a nested array)
     * @param mixed $document  document the object belongs to (the tests pass a Document instance)
     */
    public function exposedParseObject($id, $structure, $document): void
    {
        $this->parseObject($id, $structure, $document);
    }

    /**
     * Returns the parser's objects property as it is.
     */
    public function getObjects(): array
    {
        return $this->objects;
    }
}
420