Test Failed
Pull Request — master (#634)
by
unknown
02:03
created

PDFObjectTest::testFontIDWithHyphen()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 5
c 1
b 0
f 0
nc 1
nop 0
dl 0
loc 9
rs 10
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 *
8
 * @date    2020-06-01
9
 *
10
 * @author  Sébastien MALOT <[email protected]>
11
 *
12
 * @date    2017-01-03
13
 *
14
 * @license LGPLv3
15
 *
16
 * @url     <https://github.com/smalot/pdfparser>
17
 *
18
 *  PdfParser is a pdf library written in PHP, extraction oriented.
19
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
20
 *
21
 *  This program is free software: you can redistribute it and/or modify
22
 *  it under the terms of the GNU Lesser General Public License as published by
23
 *  the Free Software Foundation, either version 3 of the License, or
24
 *  (at your option) any later version.
25
 *
26
 *  This program is distributed in the hope that it will be useful,
27
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
28
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
29
 *  GNU Lesser General Public License for more details.
30
 *
31
 *  You should have received a copy of the GNU Lesser General Public License
32
 *  along with this program.
33
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
34
 */
35
36
namespace PHPUnitTests\Integration;
37
38
use PHPUnitTests\TestCase;
39
use Smalot\PdfParser\Document;
40
use Smalot\PdfParser\PDFObject;
41
42
class PDFObjectTest extends TestCase
43
{
44
    // Key of an entry's type marker in the expectations of this class
    // (values used in the tests: '', '/', '(', '[', '<' and 'n').
    public const TYPE = 't';
45
46
    // Key of an entry's operator in the expectations of this class
    // (values used in the tests include 'BT', 'Tf', 'Tm', 'TJ' and 'Tj').
    public const OPERATOR = 'o';
47
48
    // Key of an entry's operand text; for the '[' ... 'TJ' entry the tests
    // expect a nested list of entries under this key instead of a string.
    public const COMMAND = 'c';
49
50
    /**
     * Builds the PDFObject used by the tests of this class.
     *
     * @param Document $document document handed to the PDFObject constructor
     *                           (every caller in this class passes a fresh Document)
     */
    protected function getPdfObjectInstance($document): PDFObject
    {
        $pdfObject = new PDFObject($document);

        return $pdfObject;
    }
54
55
    /**
     * Checks that getCommandsText() turns each section produced by
     * getSectionsText() into the expected type / operator / command entry.
     */
    public function testGetCommandsText(): void
    {
        $content = "BT /R14 30 Tf 0.999016 0 0 1 137.4
342.561 Tm
[(A)-168.854( BC D)-220.905(\\(E\\))20.905<20>]
TJ /R14 17.16 Tf <20> Tj
0.999014 0 0 1 336.84 319.161 Tm T* ( \x00m)Tj
/R14 20.04 Tf
ET Q
q -124.774 124.127 5.64213 5.67154 930.307 4436.95 cm
BI";

        $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content);

        // Each section is parsed on its own; only its first command is kept.
        $parts = [];
        foreach ($sections as $section) {
            $parts[] = $this->getPdfObjectInstance(new Document())->getCommandsText($section)[0];
        }

        $reference = [
            [
                self::TYPE => '',
                self::OPERATOR => 'BT',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 30',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Tm',
                self::COMMAND => '0.999016 0 0 1 137.4 342.561',
            ],
            [
                self::TYPE => '[',
                self::OPERATOR => 'TJ',
                self::COMMAND => [
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => 'A',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '-168.854',
                    ],
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => ' BC D',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '-220.905',
                    ],
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => '\\(E\\)',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '20.905',
                    ],
                    [
                        self::TYPE => '<',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => '20',
                    ],
                ],
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 17.16',
            ],
            [
                self::TYPE => '<',
                self::OPERATOR => 'Tj',
                self::COMMAND => '20',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Tm',
                self::COMMAND => '0.999014 0 0 1 336.84 319.161',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'T*',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '(',
                self::OPERATOR => 'Tj',
                self::COMMAND => " \x00m",
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 20.04',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'ET',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Q',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'q',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'cm',
                self::COMMAND => '-124.774 124.127 5.64213 5.67154 930.307 4436.95',
            ],
        ];

        // Expected value first, so a failure diff labels both sides correctly.
        $this->assertEquals($reference, $parts);
    }
186
187
    /**
     * Checks the output of cleanContent() for a sample content stream, then
     * checks that getSectionsText() does not cut a Name that contains 'ET'.
     */
    public function testCleanContent(): void
    {
        $rawStream = '/Shape <</MCID << /Font<8>>> BT >>BDC Q /CS0 cs 1 1 0  scn 1 i
/GS0 gs BT /TT0 1 Tf 0.0007 Tc 0.0018 Tw 0  Ts 100  Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj EMC (ABC) Tj
[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ ET /Shape <</MCID 2 >>BDC q 0.03 841';

        $expectedStream = '/Shape <</MCID << /Font<8>>> BT >>BDC
Q
/CS0 cs
1 1 0 scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc
0.0018 Tw
0 Ts
100 Tz
0 Tr
24 0 0 24 51.3 639.26025 Tm
(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj
EMC
(ABC) Tj
[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ
ET
/Shape <</MCID 2 >>BDC
q
0.03 841';

        // The two replacements run in order: first every "\r\n" becomes "\n",
        // then every "\n" becomes "\r\n", so the expectation ends up with
        // "\r\n" line endings regardless of how this file was checked out.
        $expectedStream = str_replace(["\r\n", "\n"], ["\n", "\r\n"], $expectedStream);

        $this->assertEquals(
            $expectedStream,
            $this->getPdfObjectInstance(new Document())->cleanContent($rawStream)
        );

        // A Name containing 'ET' must not close a 'BT' block.
        // See: https://github.com/smalot/pdfparser/issues/474
        $streamWithEtInName = 'BT
/FTxkPETkkj 8 Tf
1 0 0 1 535.55 627.4 Tm
(Hello World)TJ
ET';

        $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($streamWithEtInName);

        // Neither of the first two sections may be the truncated font name.
        foreach ([0, 1] as $index) {
            $this->assertNotEquals('/FTxkP', $sections[$index]);
        }
    }
237
238
    /**
     * Checks the list of sections getSectionsText() returns for a stream in
     * which 'BT' and 'ET' also appear inside strings and dictionaries.
     */
    public function testGetSectionText(): void
    {
        $stream = '/Shape <</MCID 1 >>BDC
Q
/CS0 cs 1 1 0  scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc 0.0018 Tw 0  Ts 100  Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj
EMC
(ABC) Tj

[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD

ET
/Shape <</MCID [BT] >>BDC BT /TT1 1.5 Tf (BT )Tj ET
q
0.03 841';

        $expectedSections = [
            '/Shape <</MCID 1 >>BDC',
            'Q',
            'BT',
            '/TT0 1 Tf',
            '0.0007 Tc',
            '0.0018 Tw',
            '0 Ts',
            '100 Tz',
            '0 Tr',
            '24 0 0 24 51.3 639.26025 Tm',
            '(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj',
            'EMC',
            '(ABC) Tj',
            '[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD',
            'ET',
            '/Shape <</MCID [BT] >>BDC',
            'BT',
            '/TT1 1.5 Tf',
            '(BT )Tj',
            'ET',
            'q',
        ];

        $actualSections = $this->getPdfObjectInstance(new Document())->getSectionsText($stream);

        $this->assertEquals($expectedSections, $actualSections);
    }
288
289
    /**
     * Checks that parseDictionary() exposes the top-level keys of a
     * dictionary string, the size of its array entry and its empty entry.
     */
    public function testParseDictionary(): void
    {
        $rawDictionary = '<</ActualText(text)/XObject<</F2 6 0 R /F3 [/Sub /Array]>> /Array[/Parsed /Data/Actual]/Silent<>>>';

        $parsed = $this->getPdfObjectInstance(new Document())->parseDictionary($rawDictionary);

        // All four top-level names must be present as keys.
        foreach (['ActualText', 'XObject', 'Array', 'Silent'] as $expectedKey) {
            $this->assertArrayHasKey($expectedKey, $parsed);
        }

        $this->assertCount(3, $parsed['Array']);

        $this->assertEquals('<>', $parsed['Silent']);
    }
304
305
    /**
     * Verifies that the graphics position set by 'cm' commands is honoured
     * when text is positioned.
     *
     * @see: https://github.com/smalot/pdfparser/issues/608
     */
    public function testGraphicsPositioning(): void
    {
        $samplePath = $this->rootDir.'/samples/bugs/Issue608.pdf';

        $pages = $this->getParserInstance()
            ->parseFile($samplePath)
            ->getPages();

        $firstPageText = $pages[0]->getText();

        // When 'cm' commands are ignored the \n is missing from the text
        $this->assertStringContainsString("Heading 1 \nLorem ipsum", $firstPageText);
    }
322
323
    /**
     * Verifies that the ActualText of a block is printed in place of the
     * contents of the Tj or TJ commands inside that block.
     *
     * @see: https://github.com/smalot/pdfparser/issues/464
     */
    public function testActualText(): void
    {
        $samplePath = $this->rootDir.'/samples/bugs/Issue608.pdf';

        $pages = $this->getParserInstance()
            ->parseFile($samplePath)
            ->getPages();

        $firstPageText = $pages[0]->getText();

        // An ActualText command substitutes the three literal characters
        // 'ffi' for the single-character ligature here.
        // In addition, if $last_written_position isn't used to store the
        // position to insert, \n's would be erroneously inserted on either
        // side of the 'ffi'.
        $this->assertStringContainsString('efficitur', $firstPageText);
    }
344
345
    /**
     * Verifies that an Em-dash character is decoded correctly in certain
     * font contexts.
     *
     * See: https://github.com/smalot/pdfparser/issues/585
     */
    public function testDecodeEmDash(): void
    {
        $samplePath = $this->rootDir.'/samples/bugs/Issue585.pdf';

        $pages = $this->getParserInstance()
            ->parseFile($samplePath)
            ->getPages();

        $firstPageText = $pages[0]->getText();

        $this->assertStringContainsString('слева по ходу — веревка', $firstPageText);
    }
361
362
    /**
     * Verifies the extracted text when the document uses a reversed chars
     * instruction.
     *
     * @see: https://github.com/smalot/pdfparser/issues/398
     */
    public function testReversedChars(): void
    {
        $samplePath = $this->rootDir.'/samples/bugs/Issue398.pdf';

        $pages = $this->getParserInstance()
            ->parseFile($samplePath)
            ->getPages();

        $firstPageText = $pages[0]->getText();

        // Both phrases must appear in the text of the first page, in this order of checks.
        foreach (['שלומי טסט', 'בנמל מספנות ישראל.'] as $expectedPhrase) {
            $this->assertStringContainsString($expectedPhrase, $firstPageText);
        }
    }
380
381
    /**
     * Verifies that a text stream whose selected font code page is improper
     * falls back to one that maps all characters.
     *
     * @see: https://github.com/smalot/pdfparser/issues/586
     */
    public function testImproperFontFallback(): void
    {
        $samplePath = $this->rootDir.'/samples/ImproperFontFallback.pdf';

        $pages = $this->getParserInstance()
            ->parseFile($samplePath)
            ->getPages();

        $firstPageText = $pages[0]->getText();

        $this->assertStringContainsString('сделал', $firstPageText);
    }
397
398
    /**
     * Tests that a font ID containing a hyphen / dash character is
     * correctly parsed.
     *
     * @see: https://github.com/smalot/pdfparser/issues/145
     */
    public function testFontIDWithHyphen(): void
    {
        $pdfObject = $this->getPdfObjectInstance(new Document());

        $fontCommandHyphen = $pdfObject->getCommandsText('/FID-01 15.00 Tf');

        // Only the first parsed command is inspected. The entry keys come from
        // the class constants, as in the other expectations of this class, and
        // assertSame pins both value and type of each string.
        $firstCommand = $fontCommandHyphen[0];

        $this->assertSame('/', $firstCommand[self::TYPE]);
        $this->assertSame('Tf', $firstCommand[self::OPERATOR]);
        $this->assertSame('FID-01 15.00', $firstCommand[self::COMMAND]);
    }
414
}
415