Test Failed
Pull Request — master (#634)
by
unknown
01:58
created

PDFObjectTest::testParseDictionary()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 14
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 8
c 0
b 0
f 0
nc 1
nop 0
dl 0
loc 14
rs 10
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 *
8
 * @date    2020-06-01
9
 *
10
 * @author  Sébastien MALOT <[email protected]>
11
 *
12
 * @date    2017-01-03
13
 *
14
 * @license LGPLv3
15
 *
16
 * @url     <https://github.com/smalot/pdfparser>
17
 *
18
 *  PdfParser is a pdf library written in PHP, extraction oriented.
19
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
20
 *
21
 *  This program is free software: you can redistribute it and/or modify
22
 *  it under the terms of the GNU Lesser General Public License as published by
23
 *  the Free Software Foundation, either version 3 of the License, or
24
 *  (at your option) any later version.
25
 *
26
 *  This program is distributed in the hope that it will be useful,
27
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
28
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
29
 *  GNU Lesser General Public License for more details.
30
 *
31
 *  You should have received a copy of the GNU Lesser General Public License
32
 *  along with this program.
33
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
34
 */
35
36
namespace PHPUnitTests\Integration;
37
38
use PHPUnitTests\TestCase;
39
use Smalot\PdfParser\Document;
40
use Smalot\PdfParser\PDFObject;
41
42
/**
 * Integration tests for PDFObject: content-stream sectioning, command
 * parsing, content cleaning/formatting, dictionary parsing, and a set of
 * regression tests that extract text from sample PDF files.
 */
class PDFObjectTest extends TestCase
{
    /** Key under which a parsed command stores its type marker. */
    public const TYPE = 't';

    /** Key under which a parsed command stores its operator. */
    public const OPERATOR = 'o';

    /** Key under which a parsed command stores its operand(s). */
    public const COMMAND = 'c';

    /**
     * Builds the PDFObject under test.
     *
     * @param Document $document document handed to the PDFObject constructor
     */
    protected function getPdfObjectInstance($document): PDFObject
    {
        return new PDFObject($document);
    }

    /**
     * Parses the given sample file and returns the text of its first page.
     *
     * @param string $relativePath path of the sample, relative to the root directory
     */
    private function getFirstPageText(string $relativePath): string
    {
        $document = $this->getParserInstance()->parseFile($this->rootDir.$relativePath);
        $pages = $document->getPages();

        return $pages[0]->getText();
    }

    /**
     * Splits a content stream into sections, parses the first command of
     * every section and compares the result with a hand-written reference.
     */
    public function testGetCommandsText(): void
    {
        $content = "BT /R14 30 Tf 0.999016 0 0 1 137.4
342.561 Tm
[(A)-168.854( BC D)-220.905(\\(E\\))20.905<20>]
TJ /R14 17.16 Tf <20> Tj
0.999014 0 0 1 336.84 319.161 Tm T* ( \x00m)Tj
/R14 20.04 Tf
ET Q
q -124.774 124.127 5.64213 5.67154 930.307 4436.95 cm
BI";

        $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content);

        // Only the first parsed command of each section is collected.
        $parts = [];
        foreach ($sections as $section) {
            $parts[] = $this->getPdfObjectInstance(new Document())->getCommandsText($section)[0];
        }

        $reference = [
            [
                self::TYPE => '',
                self::OPERATOR => 'BT',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 30',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Tm',
                self::COMMAND => '0.999016 0 0 1 137.4 342.561',
            ],
            [
                self::TYPE => '[',
                self::OPERATOR => 'TJ',
                self::COMMAND => [
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => 'A',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '-168.854',
                    ],
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => ' BC D',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '-220.905',
                    ],
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => '\\(E\\)',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '20.905',
                    ],
                    [
                        self::TYPE => '<',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => '20',
                    ],
                ],
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 17.16',
            ],
            [
                self::TYPE => '<',
                self::OPERATOR => 'Tj',
                self::COMMAND => '20',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Tm',
                self::COMMAND => '0.999014 0 0 1 336.84 319.161',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'T*',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '(',
                self::OPERATOR => 'Tj',
                self::COMMAND => " \x00m",
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 20.04',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'ET',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Q',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'q',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'cm',
                self::COMMAND => '-124.774 124.127 5.64213 5.67154 930.307 4436.95',
            ],
        ];

        // Expected value first, so PHPUnit labels a failure diff correctly.
        $this->assertEquals($reference, $parts);
    }

    /**
     * Checks that cleanContent() masks dictionaries, strings, arrays and
     * marked-content operators with the given replacement character while
     * leaving the remaining operators untouched.
     */
    public function testCleanContent(): void
    {
        $content = '/Shape <</MCID << /Font<8>>> BT >>BDC
Q
/CS0 cs 1 1 0  scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc 0.0018 Tw 0  Ts 100  Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj
EMC
(ABC) Tj

[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TD

ET
/Shape <</MCID 2 >>BDC
q
0.03 841';

        $expected = '_____________________________________
Q
/CS0 cs 1 1 0  scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc 0.0018 Tw 0  Ts 100  Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(________________________________________________)Tj
___
(___) Tj

[_____________________________________] TD

ET
______________________
q
0.03 841';

        $cleaned = $this->getPdfObjectInstance(new Document())->cleanContent($content, '_');

        // Expected value first, so PHPUnit labels a failure diff correctly.
        $this->assertEquals($expected, $cleaned);
    }

    /**
     * Checks that formatContent() puts every command on its own line and
     * returns an empty string for binary input.
     */
    public function testFormatContent(): void
    {
        $content = '/Shape <</MCID << /Font<8>>> BT >>BDC Q /CS0 cs 1 1 0  scn 1 i
/GS0 gs BT /TT0 1 Tf 0.0007 Tc 0.0018 Tw 0  Ts 100  Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj EMC (ABC) Tj
[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ ET /Shape <</MCID 2 >>BDC q 0.03 841';

        $expected = '/Shape <</MCID << /Font<8>>> BT >>BDC
Q
/CS0 cs
1 1 0 scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc
0.0018 Tw
0 Ts
100 Tz
0 Tr
24 0 0 24 51.3 639.26025 Tm
(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj
EMC
(ABC) Tj
[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ
ET
/Shape <</MCID 2 >>BDC
q
0.03 841';

        // Normalize line-endings: the two replacements run in order, so any
        // "\r\n" first collapses to "\n" and then every "\n" becomes "\r\n",
        // regardless of how this file itself was checked out.
        $expected = str_replace(["\r\n", "\n"], ["\n", "\r\n"], $expected);

        $cleaned = $this->getPdfObjectInstance(new Document())->formatContent($content);

        $this->assertEquals($expected, $cleaned);

        // Check that binary data is rejected
        $content = hex2bin('a670c89d4a324e47');

        $cleaned = $this->getPdfObjectInstance(new Document())->formatContent($content);

        $this->assertEquals('', $cleaned);
    }

    /**
     * Checks how getSectionsText() splits a content stream, including
     * 'BT'/'ET' tokens that occur inside strings, arrays and names.
     */
    public function testGetSectionsText(): void
    {
        $content = '/Shape <</MCID 1 >>BDC
Q
/CS0 cs 1 1 0  scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc 0.0018 Tw 0  Ts 100  Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj
EMC
(ABC) Tj

[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD

ET
/Shape <</MCID [BT] >>BDC BT /TT1 1.5 Tf (BT )Tj ET
q
0.03 841';

        $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content);

        $this->assertEquals(
            [
                '/Shape <</MCID 1 >>BDC',
                'Q',
                'BT',
                '/TT0 1 Tf',
                '0.0007 Tc',
                '0.0018 Tw',
                '0 Ts',
                '100 Tz',
                '0 Tr',
                '24 0 0 24 51.3 639.26025 Tm',
                '(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj',
                'EMC',
                '(ABC) Tj',
                '[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD',
                'ET',
                '/Shape <</MCID [BT] >>BDC',
                'BT',
                '/TT1 1.5 Tf',
                '(BT )Tj',
                'ET',
                'q',
            ],
            $sections
        );

        // Test that a Name containing 'ET' doesn't close a 'BT' block
        // See: https://github.com/smalot/pdfparser/issues/474
        $content = 'BT
/FTxkPETkkj 8 Tf
1 0 0 1 535.55 627.4 Tm
(Hello World)TJ
ET';

        $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content);

        // '/FTxkP' is what the name would be cut down to if its embedded
        // 'ET' were treated as the end of the text block.
        $this->assertNotEquals('/FTxkP', $sections[0]);
        $this->assertNotEquals('/FTxkP', $sections[1]);
    }

    /**
     * Checks that parseDictionary() exposes the top-level keys of a
     * dictionary containing a string, a nested dictionary, an array and an
     * empty hex string.
     */
    public function testParseDictionary(): void
    {
        $data = '<</ActualText(text)/XObject<</F2 6 0 R /F3 [/Sub /Array]>> /Array[/Parsed /Data/Actual]/Silent<>>>';

        $dictionary = $this->getPdfObjectInstance(new Document())->parseDictionary($data);

        $this->assertArrayHasKey('ActualText', $dictionary);
        $this->assertArrayHasKey('XObject', $dictionary);
        $this->assertArrayHasKey('Array', $dictionary);
        $this->assertArrayHasKey('Silent', $dictionary);

        $this->assertCount(3, $dictionary['Array']);

        $this->assertEquals('<>', $dictionary['Silent']);
    }

    /**
     * Tests that graphics position (cm) is taken into account when
     * positioning text
     *
     * @see: https://github.com/smalot/pdfparser/issues/608
     */
    public function testGraphicsPositioning(): void
    {
        // The \n is not added if 'cm' commands are ignored
        $this->assertStringContainsString(
            "Heading 1 \nLorem ipsum",
            $this->getFirstPageText('/samples/bugs/Issue608.pdf')
        );
    }

    /**
     * Tests that ActualText text is printed for a block instead of the
     * contents of the Tj or TJ commands in the block.
     *
     * @see: https://github.com/smalot/pdfparser/issues/464
     */
    public function testActualText(): void
    {
        // An ActualText command subs in the three literal characters
        // 'ffi' for the single character ligature here
        // In addition, if $last_written_position isn't used to store
        // the position to insert, \n's would be erroneously inserted
        // on either side of the 'ffi'
        $this->assertStringContainsString(
            'efficitur',
            $this->getFirstPageText('/samples/bugs/Issue608.pdf')
        );
    }

    /**
     * Tests for the correct decoding of an Em-dash character in
     * certain font contexts
     *
     * See: https://github.com/smalot/pdfparser/issues/585
     */
    public function testDecodeEmDash(): void
    {
        $this->assertStringContainsString(
            'слева по ходу — веревка',
            $this->getFirstPageText('/samples/bugs/Issue585.pdf')
        );
    }

    /**
     * Tests behavior with reversed chars instruction.
     *
     * @see: https://github.com/smalot/pdfparser/issues/398
     */
    public function testReversedChars(): void
    {
        $pageText = $this->getFirstPageText('/samples/bugs/Issue398.pdf');

        $this->assertStringContainsString('שלומי טסט', $pageText);
        $this->assertStringContainsString('בנמל מספנות ישראל.', $pageText);
    }

    /**
     * Tests that a text stream with an improperly selected font code
     * page falls back to one that maps all characters.
     *
     * @see: https://github.com/smalot/pdfparser/issues/586
     */
    public function testImproperFontFallback(): void
    {
        $this->assertStringContainsString(
            'сделал',
            $this->getFirstPageText('/samples/ImproperFontFallback.pdf')
        );
    }

    /**
     * Tests that a font ID containing a hyphen / dash character was
     * correctly parsed
     *
     * @see: https://github.com/smalot/pdfparser/issues/145
     */
    public function testFontIDWithHyphen(): void
    {
        $pdfObject = $this->getPdfObjectInstance(new Document());

        $fontCommandHyphen = $pdfObject->getCommandsText('/FID-01 15.00 Tf');

        // Use the class constants rather than bare 't'/'o'/'c' keys so this
        // test stays in sync with the reference arrays in the other tests.
        $this->assertEquals('/', $fontCommandHyphen[0][self::TYPE]);
        $this->assertEquals('Tf', $fontCommandHyphen[0][self::OPERATOR]);
        $this->assertEquals('FID-01 15.00', $fontCommandHyphen[0][self::COMMAND]);
    }
}
467