Issues (82)

tests/PHPUnit/Integration/PDFObjectTest.php (2 issues)

1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 *
8
 * @date    2020-06-01
9
 *
10
 * @author  Sébastien MALOT <[email protected]>
11
 *
12
 * @date    2017-01-03
13
 *
14
 * @license LGPLv3
15
 *
16
 * @url     <https://github.com/smalot/pdfparser>
17
 *
18
 *  PdfParser is a pdf library written in PHP, extraction oriented.
19
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
20
 *
21
 *  This program is free software: you can redistribute it and/or modify
22
 *  it under the terms of the GNU Lesser General Public License as published by
23
 *  the Free Software Foundation, either version 3 of the License, or
24
 *  (at your option) any later version.
25
 *
26
 *  This program is distributed in the hope that it will be useful,
27
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
28
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
29
 *  GNU Lesser General Public License for more details.
30
 *
31
 *  You should have received a copy of the GNU Lesser General Public License
32
 *  along with this program.
33
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
34
 */
35
36
namespace PHPUnitTests\Integration;
37
38
use PHPUnitTests\TestCase;
39
use Smalot\PdfParser\Document;
40
use Smalot\PdfParser\PDFObject;
41
42
class PDFObjectTest extends TestCase
43
{
44
    /**
     * Key of the type entry in the command arrays returned by
     * PDFObject::getCommandsText() (values seen in these tests: '', '/', '[', '(', '<', 'n').
     */
    public const TYPE = 't';

    /**
     * Key of the operator entry in the command arrays returned by
     * PDFObject::getCommandsText() (e.g. 'BT', 'Tf', 'Tm', 'TJ').
     */
    public const OPERATOR = 'o';

    /**
     * Key of the command entry in the command arrays returned by
     * PDFObject::getCommandsText(); holds either a string or a nested list of command arrays.
     */
    public const COMMAND = 'c';
49
50
    /**
     * Creates the PDFObject that the tests in this class exercise.
     *
     * @param Document $document value handed to the PDFObject constructor; every
     *                           call site in this class passes a fresh Document
     */
    protected function getPdfObjectInstance($document): PDFObject
    {
        $pdfObject = new PDFObject($document);

        return $pdfObject;
    }
54
55
    /**
     * Splits a raw content stream into sections, parses every section with
     * getCommandsText() and checks the first command of each section against
     * the expected type / operator / command triples.
     */
    public function testGetCommandsText(): void
    {
        $content = "BT /R14 30 Tf 0.999016 0 0 1 137.4
342.561 Tm
[(A)-168.854( BC D)-220.905(\\(E\\))20.905<20>]
TJ /R14 17.16 Tf <20> Tj
0.999014 0 0 1 336.84 319.161 Tm T* ( \x00m)Tj
/R14 20.04 Tf
ET Q
q -124.774 124.127 5.64213 5.67154 930.307 4436.95 cm
BI";

        $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content);

        // Only the first parsed command of each section is kept.
        $parts = [];
        foreach ($sections as $section) {
            $parts[] = $this->getPdfObjectInstance(new Document())->getCommandsText($section)[0];
        }

        $reference = [
            [
                self::TYPE => '',
                self::OPERATOR => 'BT',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 30',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Tm',
                self::COMMAND => '0.999016 0 0 1 137.4 342.561',
            ],
            [
                self::TYPE => '[',
                self::OPERATOR => 'TJ',
                self::COMMAND => [
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => 'A',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '-168.854',
                    ],
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => ' BC D',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '-220.905',
                    ],
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => '\\(E\\)',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '20.905',
                    ],
                    [
                        self::TYPE => '<',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => '20',
                    ],
                ],
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 17.16',
            ],
            [
                self::TYPE => '<',
                self::OPERATOR => 'Tj',
                self::COMMAND => '20',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Tm',
                self::COMMAND => '0.999014 0 0 1 336.84 319.161',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'T*',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '(',
                self::OPERATOR => 'Tj',
                self::COMMAND => " \x00m",
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 20.04',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'ET',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Q',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'q',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'cm',
                self::COMMAND => '-124.774 124.127 5.64213 5.67154 930.307 4436.95',
            ],
        ];

        // Expected value goes first so a failure diff is labelled correctly.
        $this->assertEquals($reference, $parts);
    }
186
187
    /**
     * Checks that cleanContent() masks dictionaries, string contents, array
     * contents and the EMC marker with the given replacement character while
     * leaving the remaining operators and line structure untouched.
     */
    public function testCleanContent(): void
    {
        $content = '/Shape <</MCID << /Font<8>>> BT >>BDC
Q
/CS0 cs 1 1 0  scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc 0.0018 Tw 0  Ts 100  Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj
EMC
(ABC) Tj

[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TD

ET
/Shape <</MCID 2 >>BDC
q
0.03 841';

        $expected = '_____________________________________
Q
/CS0 cs 1 1 0  scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc 0.0018 Tw 0  Ts 100  Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(________________________________________________)Tj
___
(___) Tj

[_____________________________________] TD

ET
______________________
q
0.03 841';

        // cleanContent() is deprecated upstream; it is called on purpose here
        // so the method stays covered until it is removed.
        $cleaned = /** @scrutinizer ignore-deprecated */ $this->getPdfObjectInstance(new Document())->cleanContent($content, '_');

        // Expected value goes first so a failure diff is labelled correctly.
        $this->assertEquals($expected, $cleaned);
    }
231
232
    /**
     * Exercises the non-public PDFObject::formatContent() through reflection:
     * a text stream is expected to come back with one command per CRLF-terminated
     * line, and binary input is expected to come back as an empty string.
     */
    public function testFormatContent(): void
    {
        $method = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent');

        // TODO: remove this guard when dropping 8.0.x support
        // ReflectionMethod::setAccessible() is only required before PHP 8.1.0;
        // from 8.1.0 on, calling it has no effect.
        $needsSetAccessible = version_compare(PHP_VERSION, '8.1.0', '<');
        if ($needsSetAccessible) {
            $method->setAccessible(true);
        }

        // Every invocation runs against a brand-new PDFObject.
        $format = function ($stream) use ($method) {
            return $method->invoke($this->getPdfObjectInstance(new Document()), $stream);
        };

        $rawStream = '/Shape <</MCID << /Font<8>>> BT >>BDC Q /CS0 cs 1 1 0  scn 1 i
/GS0 gs BT /TT0 1 Tf 0.0007 Tc 0.0018 Tw 0  Ts 100  Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj EMC (ABC) Tj
[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ ET /Shape <</MCID 2 >>BDC q 0.03 841';

        $wanted = '/Shape <</MCID << /Font<8>>> BT >>BDC
Q
/CS0 cs
1 1 0 scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc
0.0018 Tw
0 Ts
100 Tz
0 Tr
24 0 0 24 51.3 639.26025 Tm
(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj
EMC
(ABC) Tj
[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ
ET
/Shape <</MCID 2 >>BDC
q
0.03 841';

        // Normalize line-endings: collapse any CRLF to LF first, then turn
        // every LF into CRLF, whatever this file was checked out with.
        $wanted = str_replace(["\r\n", "\n"], ["\n", "\r\n"], $wanted);

        $this->assertEquals($wanted, $format($rawStream));

        // Check that binary data is rejected
        $binaryStream = hex2bin('a670c89d4a324e47');
        $this->assertEquals('', $format($binaryStream));

        // See: https://github.com/smalot/pdfparser/issues/668
        $issue668Path = $this->rootDir.'/samples/bugs/Issue668.pdf';
        $issue668Pages = $this->getParserInstance()->parseFile($issue668Path)->getPages();

        // Binary check is done before a regexp that causes an error
        $this->assertStringContainsString('Marko Nestorović PR', $issue668Pages[0]->getText());

        // mb_check_encoding(..., 'UTF-8') returns true here,
        // necessitating a test for UTF-8 that's more strict
        $controlBytes = hex2bin('0101010101010101');
        $this->assertEquals('', $format($controlBytes));
    }
302
303
    /**
     * Check that escaped slashes and parentheses are accounted for,
     * formatContent would emit a PHP Warning for "regular expression
     * is too large" here without fix for issue #709
     *
     * The input is one string command containing escaped backslashes and an
     * escaped parenthesis, followed by 4500 short string commands.
     *
     * @see https://github.com/smalot/pdfparser/issues/709
     */
    public function testFormatContentIssue709(): void
    {
        $formatContent = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent');

        // TODO: remove this if-clause when dropping 8.0.x support
        // ReflectionMethod::setAccessible() is only required before PHP 8.1.0;
        // from 8.1.0 on, calling it has no effect.
        if (version_compare(PHP_VERSION, '8.1.0', '<')) {
            $formatContent->setAccessible(true);
        }

        $content = '(String \\\\\\(string)Tj '.str_repeat('(Test)Tj ', 4500);
        $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content);

        // The first command must survive intact and be terminated by CRLF.
        $this->assertStringContainsString('(String \\\\\\(string)Tj'."\r\n", $cleaned);
    }
326
327
    /**
     * Verifies that inline image data (BI ... ID ... EI) does not corrupt the
     * formatted stream, and that the BI / ID / EI keywords inside string
     * operands are not mistaken for inline image commands.
     *
     * @see https://github.com/smalot/pdfparser/issues/691
     */
    public function testFormatContentInlineImages(): void
    {
        $method = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent');

        // TODO: remove this guard when dropping 8.0.x support
        // ReflectionMethod::setAccessible() is only required before PHP 8.1.0;
        // from 8.1.0 on, calling it has no effect.
        $needsSetAccessible = version_compare(PHP_VERSION, '8.1.0', '<');
        if ($needsSetAccessible) {
            $method->setAccessible(true);
        }

        // Every invocation runs against a brand-new PDFObject.
        $format = function ($stream) use ($method) {
            return $method->invoke($this->getPdfObjectInstance(new Document()), $stream);
        };

        $withInlineImage = $format(
            'BT (This BI /W 258 /H 51 /should not trigger /as a /PDF command) TD ET q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150
/BPC 1 /IM true /F [/A85 /Fl] ID Gb"0F_$L6!$j/a\$:ma&h\'JnJJ9S?O_EA-W+%D^ClCH=FP3s5M-gStQm\'5/hc`C?<Q)riWgtEe:Po0dY_-er6$jM@#?n`E+#(sa"0Gk3&K>CqL(^pV$_-er6Ik`"-1]Q ;~> EI Q /F002 10.00 Tf 0.00 Tw 0 g'
        );

        // PdfParser should not be fooled by Q's in inline image data;
        // Only one 'Q' command should be found
        $qCommandCount = preg_match_all('/Q\r\n/', $withInlineImage);
        $this->assertEquals(1, $qCommandCount);

        // The 'BI' inside a string should not be interpreted as the
        // beginning of an inline image command
        $this->assertStringContainsString(
            '(This BI /W 258 /H 51 /should not trigger /as a /PDF command) TD',
            $withInlineImage
        );

        $textOnly = $format(
            'BT (This BI /W 258 /H 51 /should not () \) trigger /as a /PDF command) TD (There is no ID inline image in this data) TD (Nothing but text EI should be found) TD ET'
        );

        // One command per line, joined with CRLF and no trailing terminator.
        $expectedLines = [
            'BT',
            '(This BI /W 258 /H 51 /should not () \) trigger /as a /PDF command) TD',
            '(There is no ID inline image in this data) TD',
            '(Nothing but text EI should be found) TD',
            'ET',
        ];

        $this->assertEquals(implode("\r\n", $expectedLines), $textOnly);
    }
369
370
    /**
     * Checks the list of sections getSectionsText() extracts from a content
     * stream, including streams where 'BT' / 'ET' also occur inside strings,
     * dictionaries and names.
     */
    public function testGetSectionsText(): void
    {
        $stream = '/Shape <</MCID 1 >>BDC
Q
/CS0 cs 1 1 0  scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc 0.0018 Tw 0  Ts 100  Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj
EMC
(ABC) Tj

[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD

ET
/Shape <</MCID [BT] >>BDC BT /TT1 1.5 Tf (BT )Tj ET
q
0.03 841';

        $expectedSections = [
            '/Shape <</MCID 1 >>BDC',
            'Q',
            'BT',
            '/TT0 1 Tf',
            '0.0007 Tc',
            '0.0018 Tw',
            '0 Ts',
            '100 Tz',
            '0 Tr',
            '24 0 0 24 51.3 639.26025 Tm',
            '(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj',
            'EMC',
            '(ABC) Tj',
            '[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD',
            'ET',
            '/Shape <</MCID [BT] >>BDC',
            'BT',
            '/TT1 1.5 Tf',
            '(BT )Tj',
            'ET',
            'q',
        ];

        $pdfObject = $this->getPdfObjectInstance(new Document());
        $actualSections = $pdfObject->getSectionsText($stream);

        $this->assertEquals($expectedSections, $actualSections);

        // Test that a Name containing 'ET' doesn't close a 'BT' block
        // See: https://github.com/smalot/pdfparser/issues/474
        $streamWithEtInName = 'BT
/FTxkPETkkj 8 Tf
1 0 0 1 535.55 627.4 Tm
(Hello World)TJ
ET';

        $pdfObject = $this->getPdfObjectInstance(new Document());
        $actualSections = $pdfObject->getSectionsText($streamWithEtInName);

        // Neither of the first two sections may be the name cut off before 'ET'.
        $this->assertNotEquals('/FTxkP', $actualSections[0]);
        $this->assertNotEquals('/FTxkP', $actualSections[1]);
    }
433
434
    /**
     * Checks that parseDictionary() exposes every top-level key of a
     * dictionary string, parses an array value into its three elements and
     * returns an empty hex string value as '<>'.
     */
    public function testParseDictionary(): void
    {
        $rawDictionary = '<</ActualText(text)/XObject<</F2 6 0 R /F3 [/Sub /Array]>> /Array[/Parsed /Data/Actual]/Silent<>>>';

        $pdfObject = $this->getPdfObjectInstance(new Document());
        $parsed = $pdfObject->parseDictionary($rawDictionary);

        foreach (['ActualText', 'XObject', 'Array', 'Silent'] as $topLevelKey) {
            $this->assertArrayHasKey($topLevelKey, $parsed);
        }

        $this->assertCount(3, $parsed['Array']);

        $this->assertEquals('<>', $parsed['Silent']);
    }
449
450
    /**
     * Checks that the graphics position set by 'cm' commands is honoured
     * when text is positioned.
     *
     * @see https://github.com/smalot/pdfparser/issues/608
     */
    public function testGraphicsPositioning(): void
    {
        $pdfPath = $this->rootDir.'/samples/bugs/Issue608.pdf';

        $pages = $this->getParserInstance()->parseFile($pdfPath)->getPages();
        $firstPageText = $pages[0]->getText();

        // The \n is not added if 'cm' commands are ignored
        $this->assertStringContainsString("Heading 1 \nLorem ipsum", $firstPageText);
    }
467
468
    /**
     * Checks that the ActualText of a block is output in place of the
     * contents of the Tj or TJ commands inside that block.
     *
     * @see https://github.com/smalot/pdfparser/issues/464
     */
    public function testActualText(): void
    {
        $pdfPath = $this->rootDir.'/samples/bugs/Issue608.pdf';

        $pages = $this->getParserInstance()->parseFile($pdfPath)->getPages();
        $firstPageText = $pages[0]->getText();

        // An ActualText command substitutes the three literal characters
        // 'ffi' for the single-character ligature here.
        // In addition, if $last_written_position isn't used to store
        // the position to insert, \n's would be erroneously inserted
        // on either side of the 'ffi'
        $this->assertStringContainsString('efficitur', $firstPageText);
    }
489
490
    /**
     * Checks that an em-dash character is decoded correctly in certain
     * font contexts.
     *
     * @see https://github.com/smalot/pdfparser/issues/585
     */
    public function testDecodeEmDash(): void
    {
        $pdfPath = $this->rootDir.'/samples/bugs/Issue585.pdf';

        $pages = $this->getParserInstance()->parseFile($pdfPath)->getPages();
        $firstPageText = $pages[0]->getText();

        $this->assertStringContainsString('слева по ходу — веревка', $firstPageText);
    }
506
507
    /**
     * Checks text extraction for a document that uses the reversed chars
     * instruction.
     *
     * @see https://github.com/smalot/pdfparser/issues/398
     */
    public function testReversedChars(): void
    {
        $pdfPath = $this->rootDir.'/samples/bugs/Issue398.pdf';

        $pages = $this->getParserInstance()->parseFile($pdfPath)->getPages();
        $firstPageText = $pages[0]->getText();

        // Both phrases must appear in reading order in the extracted text.
        $this->assertStringContainsString('שלומי טסט', $firstPageText);
        $this->assertStringContainsString('בנמל מספנות ישראל.', $firstPageText);
    }
525
526
    /**
     * Checks that a text stream whose selected font code page is improper
     * falls back to one that maps all characters.
     *
     * @see https://github.com/smalot/pdfparser/issues/586
     */
    public function testImproperFontFallback(): void
    {
        $pdfPath = $this->rootDir.'/samples/ImproperFontFallback.pdf';

        $pages = $this->getParserInstance()->parseFile($pdfPath)->getPages();
        $firstPageText = $pages[0]->getText();

        $this->assertStringContainsString('сделал', $firstPageText);
    }
542
543
    /**
     * Tests that a font ID containing a hyphen / dash character was
     * correctly parsed
     *
     * @see https://github.com/smalot/pdfparser/issues/145
     */
    public function testFontIDWithHyphen(): void
    {
        $pdfObject = $this->getPdfObjectInstance(new Document());

        $fontCommandHyphen = $pdfObject->getCommandsText('/FID-01 15.00 Tf');

        // Address the parsed command through the class constants rather than
        // repeating the raw 't' / 'o' / 'c' keys.
        $this->assertEquals('/', $fontCommandHyphen[0][self::TYPE]);
        $this->assertEquals('Tf', $fontCommandHyphen[0][self::OPERATOR]);
        $this->assertEquals('FID-01 15.00', $fontCommandHyphen[0][self::COMMAND]);
    }
559
560
    /**
     * Tests that an invalid command does not cause an error, but just
     * returns an empty array
     */
    public function testInvalidCommand(): void
    {
        $pdfObject = $this->getPdfObjectInstance(new Document());

        // Sanity check: operands followed by an operator parse into one command.
        $validCommand = $pdfObject->getCommandsText('75 rg');

        // Address the parsed command through the class constants rather than
        // repeating the raw 't' / 'o' / 'c' keys.
        $this->assertEquals('', $validCommand[0][self::TYPE]);
        $this->assertEquals('rg', $validCommand[0][self::OPERATOR]);
        $this->assertEquals('75', $validCommand[0][self::COMMAND]);

        // A bare operand without an operator yields no commands at all.
        $invalidCommand = $pdfObject->getCommandsText('75');

        $this->assertEquals([], $invalidCommand);
    }
578
}
579