Test Failed
Pull Request — master (#634)
by
unknown
02:08
created

PDFObjectTest::testParseDictionary()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 14
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 8
c 0
b 0
f 0
nc 1
nop 0
dl 0
loc 14
rs 10
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 *
8
 * @date    2020-06-01
9
 *
10
 * @author  Sébastien MALOT <[email protected]>
11
 *
12
 * @date    2017-01-03
13
 *
14
 * @license LGPLv3
15
 *
16
 * @url     <https://github.com/smalot/pdfparser>
17
 *
18
 *  PdfParser is a pdf library written in PHP, extraction oriented.
19
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
20
 *
21
 *  This program is free software: you can redistribute it and/or modify
22
 *  it under the terms of the GNU Lesser General Public License as published by
23
 *  the Free Software Foundation, either version 3 of the License, or
24
 *  (at your option) any later version.
25
 *
26
 *  This program is distributed in the hope that it will be useful,
27
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
28
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
29
 *  GNU Lesser General Public License for more details.
30
 *
31
 *  You should have received a copy of the GNU Lesser General Public License
32
 *  along with this program.
33
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
34
 */
35
36
namespace PHPUnitTests\Integration;
37
38
use PHPUnitTests\TestCase;
39
use Smalot\PdfParser\Document;
40
use Smalot\PdfParser\PDFObject;
41
42
/**
 * Integration tests for PDFObject: splitting and parsing of content-stream
 * text (getSectionsText, getCommandsText, cleanContent, parseDictionary) and
 * text extraction from sample PDF files.
 */
class PDFObjectTest extends TestCase
{
    /**
     * Key under which a parsed command array stores its type marker
     * (e.g. '/', '(', '[', '<', 'n' in the expectations below).
     */
    public const TYPE = 't';

    /**
     * Key under which a parsed command array stores its operator
     * (e.g. 'Tf', 'Tm', 'TJ').
     */
    public const OPERATOR = 'o';

    /**
     * Key under which a parsed command array stores its operand(s).
     */
    public const COMMAND = 'c';

    /**
     * Creates the PDFObject under test.
     *
     * @param Document $document document handed to the PDFObject constructor
     */
    protected function getPdfObjectInstance($document): PDFObject
    {
        return new PDFObject($document);
    }

    /**
     * Splits a content stream into sections and checks that the first
     * command parsed from each section matches the expected structure.
     */
    public function testGetCommandsText(): void
    {
        $content = "BT /R14 30 Tf 0.999016 0 0 1 137.4
342.561 Tm
[(A)-168.854( BC D)-220.905(\\(E\\))20.905<20>]
TJ /R14 17.16 Tf <20> Tj
0.999014 0 0 1 336.84 319.161 Tm T* ( \x00m)Tj
/R14 20.04 Tf
ET Q
q -124.774 124.127 5.64213 5.67154 930.307 4436.95 cm
BI";

        $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content);

        // Only the first command of every section is compared.
        $parts = [];
        foreach ($sections as $section) {
            $parts[] = $this->getPdfObjectInstance(new Document())->getCommandsText($section)[0];
        }

        $reference = [
            [
                self::TYPE => '',
                self::OPERATOR => 'BT',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 30',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Tm',
                self::COMMAND => '0.999016 0 0 1 137.4 342.561',
            ],
            [
                self::TYPE => '[',
                self::OPERATOR => 'TJ',
                self::COMMAND => [
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => 'A',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '-168.854',
                    ],
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => ' BC D',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '-220.905',
                    ],
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => '\\(E\\)',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '20.905',
                    ],
                    [
                        self::TYPE => '<',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => '20',
                    ],
                ],
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 17.16',
            ],
            [
                self::TYPE => '<',
                self::OPERATOR => 'Tj',
                self::COMMAND => '20',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Tm',
                self::COMMAND => '0.999014 0 0 1 336.84 319.161',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'T*',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '(',
                self::OPERATOR => 'Tj',
                self::COMMAND => " \x00m",
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 20.04',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'ET',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Q',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'q',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'cm',
                self::COMMAND => '-124.774 124.127 5.64213 5.67154 930.307 4436.95',
            ],
        ];

        // Expected value first, so PHPUnit labels the diff correctly on failure.
        $this->assertEquals($reference, $parts);
    }

    /**
     * Checks that cleanContent() puts each command of a content stream on
     * its own line, and that binary input yields an empty string.
     */
    public function testCleanContent(): void
    {
        $content = '/Shape <</MCID << /Font<8>>> BT >>BDC Q /CS0 cs 1 1 0  scn 1 i
/GS0 gs BT /TT0 1 Tf 0.0007 Tc 0.0018 Tw 0  Ts 100  Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj EMC (ABC) Tj
[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ ET /Shape <</MCID 2 >>BDC q 0.03 841';

        $expected = '/Shape <</MCID << /Font<8>>> BT >>BDC
Q
/CS0 cs
1 1 0 scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc
0.0018 Tw
0 Ts
100 Tz
0 Tr
24 0 0 24 51.3 639.26025 Tm
(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj
EMC
(ABC) Tj
[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ
ET
/Shape <</MCID 2 >>BDC
q
0.03 841';

        // Normalize line-endings: str_replace() applies the pairs in order,
        // so CRLF is first collapsed to LF and then every LF becomes CRLF.
        // The expected text therefore uses CRLF however this file is stored.
        $expected = str_replace(["\r\n", "\n"], ["\n", "\r\n"], $expected);

        $cleaned = $this->getPdfObjectInstance(new Document())->cleanContent($content);

        $this->assertEquals($expected, $cleaned);

        // Check that binary data is rejected
        $content = hex2bin('a670c89d4a324e47');

        $cleaned = $this->getPdfObjectInstance(new Document())->cleanContent($content);

        $this->assertEquals('', $cleaned);
    }

    /**
     * Checks the list of sections getSectionsText() extracts from a content
     * stream, including 'BT'/'ET' tokens that appear inside strings, arrays
     * and names.
     */
    public function testGetSectionsText(): void
    {
        $content = '/Shape <</MCID 1 >>BDC
Q
/CS0 cs 1 1 0  scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc 0.0018 Tw 0  Ts 100  Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj
EMC
(ABC) Tj

[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD

ET
/Shape <</MCID [BT] >>BDC BT /TT1 1.5 Tf (BT )Tj ET
q
0.03 841';

        $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content);

        $this->assertEquals(
            [
                '/Shape <</MCID 1 >>BDC',
                'Q',
                'BT',
                '/TT0 1 Tf',
                '0.0007 Tc',
                '0.0018 Tw',
                '0 Ts',
                '100 Tz',
                '0 Tr',
                '24 0 0 24 51.3 639.26025 Tm',
                '(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj',
                'EMC',
                '(ABC) Tj',
                '[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD',
                'ET',
                '/Shape <</MCID [BT] >>BDC',
                'BT',
                '/TT1 1.5 Tf',
                '(BT )Tj',
                'ET',
                'q',
            ],
            $sections
        );

        // Test that a Name containing 'ET' doesn't close a 'BT' block
        // See: https://github.com/smalot/pdfparser/issues/474
        $content = 'BT
/FTxkPETkkj 8 Tf
1 0 0 1 535.55 627.4 Tm
(Hello World)TJ
ET';

        $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content);

        // '/FTxkP' is the part of the name that precedes the embedded 'ET';
        // neither of the first two sections may consist of just that part.
        $this->assertNotEquals('/FTxkP', $sections[0]);
        $this->assertNotEquals('/FTxkP', $sections[1]);
    }

    /**
     * Checks that parseDictionary() exposes every top-level key of a
     * dictionary string, counts array entries, and keeps an empty '<>' value.
     */
    public function testParseDictionary(): void
    {
        $data = '<</ActualText(text)/XObject<</F2 6 0 R /F3 [/Sub /Array]>> /Array[/Parsed /Data/Actual]/Silent<>>>';

        $dictionary = $this->getPdfObjectInstance(new Document())->parseDictionary($data);

        $this->assertArrayHasKey('ActualText', $dictionary);
        $this->assertArrayHasKey('XObject', $dictionary);
        $this->assertArrayHasKey('Array', $dictionary);
        $this->assertArrayHasKey('Silent', $dictionary);

        $this->assertCount(3, $dictionary['Array']);

        $this->assertEquals('<>', $dictionary['Silent']);
    }

    /**
     * Tests that graphics position (cm) is taken into account when
     * positioning text
     *
     * @see https://github.com/smalot/pdfparser/issues/608
     */
    public function testGraphicsPositioning(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue608.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        // The \n is not added if 'cm' commands are ignored
        $this->assertStringContainsString("Heading 1 \nLorem ipsum", $pages[0]->getText());
    }

    /**
     * Tests that ActualText text is printed for a block instead of the
     * contents of the Tj or TJ commands in the block.
     *
     * @see https://github.com/smalot/pdfparser/issues/464
     */
    public function testActualText(): void
    {
        // NOTE(review): this loads the Issue608 sample although the test
        // references issue #464 - presumably the same file covers both cases;
        // confirm that no dedicated Issue464 sample was intended.
        $filename = $this->rootDir.'/samples/bugs/Issue608.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        // An ActualText command subs in the three literal characters
        // 'ffi' for the single character ligature here
        // In addition, if $last_written_position isn't used to store
        // the position to insert, \n's would be erroneously inserted
        // on either side of the 'ffi'
        $this->assertStringContainsString('efficitur', $pages[0]->getText());
    }

    /**
     * Tests for the correct decoding of an Em-dash character in
     * certain font contexts
     *
     * @see https://github.com/smalot/pdfparser/issues/585
     */
    public function testDecodeEmDash(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue585.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        $this->assertStringContainsString('слева по ходу — веревка', $pages[0]->getText());
    }

    /**
     * Tests behavior with reversed chars instruction.
     *
     * @see https://github.com/smalot/pdfparser/issues/398
     */
    public function testReversedChars(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue398.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        $pageText = $pages[0]->getText();

        $this->assertStringContainsString('שלומי טסט', $pageText);
        $this->assertStringContainsString('בנמל מספנות ישראל.', $pageText);
    }

    /**
     * Tests that a text stream with an improperly selected font code
     * page falls back to one that maps all characters.
     *
     * @see https://github.com/smalot/pdfparser/issues/586
     */
    public function testImproperFontFallback(): void
    {
        $filename = $this->rootDir.'/samples/ImproperFontFallback.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        $this->assertStringContainsString('сделал', $pages[0]->getText());
    }

    /**
     * Tests that a font ID containing a hyphen / dash character was
     * correctly parsed
     *
     * @see https://github.com/smalot/pdfparser/issues/145
     */
    public function testFontIDWithHyphen(): void
    {
        $pdfObject = $this->getPdfObjectInstance(new Document());

        $fontCommandHyphen = $pdfObject->getCommandsText('/FID-01 15.00 Tf');

        // Use the class constants for the command keys, as the other tests do.
        $this->assertEquals('/', $fontCommandHyphen[0][self::TYPE]);
        $this->assertEquals('Tf', $fontCommandHyphen[0][self::OPERATOR]);
        $this->assertEquals('FID-01 15.00', $fontCommandHyphen[0][self::COMMAND]);
    }
}
422