Passed
Push — fix/corrupted-pdf ( dfc539 )
by Jeremy
15:22 queued 06:51
created

PageTest::testExtractRawData()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 20
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 14
c 1
b 0
f 0
dl 0
loc 20
rs 9.7998
cc 1
nc 1
nop 0
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 * @date    2020-06-01
8
 *
9
 * @author  Sébastien MALOT <[email protected]>
10
 * @date    2017-01-03
11
 *
12
 * @license LGPLv3
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Tests\Smalot\PdfParser\Integration;
34
35
use Smalot\PdfParser\Font;
36
use Tests\Smalot\PdfParser\TestCase;
37
38
class PageTest extends TestCase
39
{
40
    /**
     * Checks Page::getFonts() on a page with text (at least one Font is
     * returned) and on a page without text (nothing is returned).
     *
     * Each page is queried twice; the second call is intended to exercise the
     * repeated-lookup path and must report the same outcome as the first.
     */
    public function testGetFonts()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        // First call: loads the data.
        $fonts = $page->getFonts();
        $this->assertGreaterThan(0, \count($fonts));
        foreach ($fonts as $font) {
            // assertInstanceOf reports the actual type on failure.
            $this->assertInstanceOf(Font::class, $font);
        }

        // Second call: intended to be answered from the cache.
        $fonts = $page->getFonts();
        $this->assertGreaterThan(0, \count($fonts));

        // ------------------------------------------------------
        // Document without text.
        $filename = $this->rootDir.'/samples/Document3_pdfcreator_nocompressed.pdf';
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        // First call: loads the data.
        $fonts = $page->getFonts();
        $this->assertCount(0, $fonts);

        // Second call: intended to be answered from the cache.
        $fonts = $page->getFonts();
        $this->assertCount(0, $fonts);
    }
73
74
    /**
     * Checks that Page::getFont() returns a Font instance for the ids 'R7'
     * and 'ABC7' on the first page of the sample document.
     */
    public function testGetFont()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        // First lookup: loads the data.
        $font = $page->getFont('R7');
        $this->assertInstanceOf(Font::class, $font);

        // NOTE(review): 'ABC7' looks like an id that is not defined in the
        // sample, which would mean getFont() falls back to a generic Font
        // rather than returning null - confirm against Page::getFont().
        $font = $page->getFont('ABC7');
        $this->assertInstanceOf(Font::class, $font);
    }
90
91
    /**
     * Checks that Page::getText() on the first page of the sample document
     * yields more than 150 bytes of text containing the title, the filler
     * text and every font name listed below.
     */
    public function testGetText()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $text = $page->getText();

        // assertGreaterThan reports the actual length on failure, unlike
        // assertTrue on a pre-evaluated comparison.
        $this->assertGreaterThan(150, \strlen($text));
        $this->assertContains('Document title', $text);
        $this->assertContains('Lorem ipsum', $text);

        // Font names that appear in the extracted text.
        $this->assertContains('Calibri', $text);
        $this->assertContains('Arial', $text);
        $this->assertContains('Times', $text);
        $this->assertContains('Courier New', $text);
        $this->assertContains('Verdana', $text);
    }
111
112
    /**
     * Checks the structure returned by Page::extractRawData() for the first
     * page of the sample document: 172 entries, of which entry 1 is a 'Tm'
     * command carrying the keys 't', 'o' and 'c'.
     */
    public function testExtractRawData()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $extractedRawData = $page->extractRawData();

        // Verify the size before indexing, so a short result fails with a
        // count mismatch instead of an undefined-offset error.
        $this->assertCount(172, $extractedRawData);

        $tmItem = $extractedRawData[1];
        $this->assertCount(3, $tmItem);

        $this->assertArrayHasKey('t', $tmItem);
        $this->assertArrayHasKey('o', $tmItem);
        $this->assertArrayHasKey('c', $tmItem);

        $this->assertContains('Tm', $tmItem['o']);
        $this->assertContains('0.999429 0 0 1 201.96 720.68', $tmItem['c']);
    }
133
134
    /**
     * Checks the structure returned by Page::extractDecodedRawData() for the
     * first page of the sample document: 172 entries, entry 1 being a 'Tm'
     * command and entry 2 a 'TJ' command whose 'c' value is a list of
     * sub-items that each carry 't' and 'c'.
     */
    public function testExtractDecodedRawData()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $extractedDecodedRawData = $page->extractDecodedRawData();

        // Verify the size before indexing, so a short result fails with a
        // count mismatch instead of an undefined-offset error.
        $this->assertCount(172, $extractedDecodedRawData);

        // Entry 1: the 'Tm' command. (The original repeated this group of
        // assertions twice; once is enough.)
        $tmItem = $extractedDecodedRawData[1];
        $this->assertCount(3, $tmItem);

        $this->assertArrayHasKey('t', $tmItem);
        $this->assertArrayHasKey('o', $tmItem);
        $this->assertArrayHasKey('c', $tmItem);

        $this->assertContains('Tm', $tmItem['o']);
        $this->assertContains('0.999429 0 0 1 201.96 720.68', $tmItem['c']);

        // Entry 2: the 'TJ' command and its first three sub-items.
        $tjItem = $extractedDecodedRawData[2];
        $this->assertContains('TJ', $tjItem['o']);
        $this->assertContains('(', $tjItem['c'][0]['t']);
        $this->assertContains('D', $tjItem['c'][0]['c']);
        $this->assertContains('n', $tjItem['c'][1]['t']);
        $this->assertContains('0.325008', $tjItem['c'][1]['c']);
        $this->assertContains('(', $tjItem['c'][2]['t']);
        $this->assertContains('o', $tjItem['c'][2]['c']);
    }
168
169
    /**
     * Processing the corrupted sample file is expected to raise an exception
     * whose message reports that the xref could not be found.
     */
    public function testExtractRawDataWithCorruptedPdf()
    {
        $this->expectException(\Exception::class);
        $this->expectExceptionMessage('Unable to find xref (PDF corrupted?)');

        $corruptedFile = $this->rootDir.'/samples/corrupted.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($corruptedFile);
        $document->getPages();
    }
179
180
    /**
     * Checks Page::getDataCommands() for the first page of the sample
     * document: 166 commands, the first being a 'Tm' command and the second
     * a 'TJ' command, each carrying the keys 't', 'o' and 'c'.
     */
    public function testGetDataCommands()
    {
        // Document with text.
        $samplePath = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $pdfParser = $this->getParserInstance();
        $allPages = $pdfParser->parseFile($samplePath)->getPages();
        $firstPage = $allPages[0];

        $commands = $firstPage->getDataCommands();
        $this->assertCount(166, $commands);

        $expectedKeys = ['t', 'o', 'c'];

        // First command: 'Tm'.
        $tmCommand = $commands[0];
        $this->assertCount(3, $tmCommand);
        foreach ($expectedKeys as $key) {
            $this->assertArrayHasKey($key, $tmCommand);
        }

        $this->assertContains('Tm', $tmCommand['o']);
        $this->assertContains('0.999429 0 0 1 201.96 720.68', $tmCommand['c']);

        // Second command: 'TJ'.
        $tjCommand = $commands[1];
        $this->assertCount(3, $tjCommand);
        foreach ($expectedKeys as $key) {
            $this->assertArrayHasKey($key, $tjCommand);
        }

        $this->assertContains('TJ', $tjCommand['o']);

        // 'c' is known to exist at this point; inspect its first three parts.
        $parts = $tjCommand['c'];
        $this->assertContains('(', $parts[0]['t']);
        $this->assertContains('D', $parts[0]['c']);
        $this->assertContains('n', $parts[1]['t']);
        $this->assertContains('0.325008', $parts[1]['c']);
        $this->assertContains('(', $parts[2]['t']);
        $this->assertContains('o', $parts[2]['c']);
    }
214
215
    /**
     * Checks the entries returned by Page::getDataTm() for a plain text
     * document and for two filled-in invoice forms.
     *
     * Every inspected entry is a pair: index 0 holds six values (compared
     * below as strings) and index 1 holds the associated text.
     *
     * In each section the total number of entries is asserted before any
     * entry is read by index, so a shorter-than-expected result is reported
     * as a count mismatch rather than as an undefined-offset error.
     */
    public function testGetDataTm()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        $dataTm = $page->getDataTm();
        $this->assertCount(81, $dataTm);

        $item = $dataTm[0];
        $this->assertCount(2, $item);
        $this->assertCount(6, $item[0]);
        $this->assertEquals(
            ['0.999429', '0', '0', '1', '201.96', '720.68'],
            $item[0]
        );
        $this->assertContains('Document title', $item[1]);

        $item = $dataTm[2];
        $this->assertEquals(
            ['0.999402', '0', '0', '1', '70.8', '673.64'],
            $item[0]
        );
        $this->assertContains('Calibri : Lorem ipsum dolor sit amet, consectetur a', $item[1]);

        // Last entry of the page.
        $item = $dataTm[80];
        $this->assertEquals(
            ['0.999402', '0', '0', '1', '343.003', '81.44'],
            $item[0]
        );
        $this->assertContains('nenatis.', $item[1]);

        // ------------------------------------------------------
        // Document is a form
        $filename = $this->rootDir.'/samples/SimpleInvoiceFilledExample1.pdf';
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        $dataTm = $page->getDataTm();
        $this->assertCount(105, $dataTm);

        $item = $dataTm[2];
        $this->assertCount(2, $item);
        $this->assertCount(6, $item[0]);
        $this->assertEquals(
            ['1', '0', '0', '1', '167.3', '894.58'],
            $item[0]
        );
        $this->assertContains('MyName  MyLastName', $item[1]);

        $item = $dataTm[6];
        $this->assertEquals(
            ['1', '0', '0', '1', '681.94', '877.42'],
            $item[0]
        );
        $this->assertContains('1/1/2020', $item[1]);

        $item = $dataTm[8];
        $this->assertEquals(
            ['1', '0', '0', '1', '174.86', '827.14'],
            $item[0]
        );
        $this->assertContains('Purchase 1', $item[1]);

        // ------------------------------------------------------
        // Document is another form of the same type
        $filename = $this->rootDir.'/samples/SimpleInvoiceFilledExample2.pdf';
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        $dataTm = $page->getDataTm();
        $this->assertCount(105, $dataTm);

        // Same positions as in the first form; only the texts differ.
        $item = $dataTm[2];
        $this->assertCount(2, $item);
        $this->assertCount(6, $item[0]);
        $this->assertEquals(
            ['1', '0', '0', '1', '167.3', '894.58'],
            $item[0]
        );
        $this->assertContains("Other'sName  Other'sLastName", $item[1]);

        $item = $dataTm[6];
        $this->assertEquals(
            ['1', '0', '0', '1', '681.94', '877.42'],
            $item[0]
        );
        $this->assertContains('2/2/2020', $item[1]);

        $item = $dataTm[8];
        $this->assertEquals(
            ['1', '0', '0', '1', '174.86', '827.14'],
            $item[0]
        );
        $this->assertContains('Purchase 2', $item[1]);
    }
377
378
    /**
     * Checks Page::getTextXY() against a plain text document and two
     * filled-in invoice forms.
     *
     * Each hit is a pair: index 0 holds six values (compared as strings) and
     * index 1 holds the text found at that position.
     *
     * NOTE(review): the optional third and fourth arguments (1, 1) are
     * presumably x/y tolerances, since the rounded coordinates only match
     * when they are supplied - confirm against Page::getTextXY().
     */
    public function testGetTextXY()
    {
        // Expected six-value entries, shared by several assertions below.
        $titleMatrix = ['0.999429', '0', '0', '1', '201.96', '720.68'];
        $nameMatrix = ['1', '0', '0', '1', '167.3', '894.58'];

        // Document with text.
        $samplePath = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $pdfParser = $this->getParserInstance();
        $allPages = $pdfParser->parseFile($samplePath)->getPages();
        $currentPage = $allPages[0];

        // Exact coordinates: one hit.
        $hits = $currentPage->getTextXY(201.96, 720.68);
        $this->assertCount(1, $hits);
        $this->assertCount(2, $hits[0]);
        $this->assertEquals($titleMatrix, $hits[0][0]);
        $this->assertContains('Document title', $hits[0][1]);

        // Rounded coordinates without the extra arguments: no hit.
        $hits = $currentPage->getTextXY(201, 720);
        $this->assertCount(0, $hits);

        // Rounded coordinates with the extra arguments: the same hit again.
        $hits = $currentPage->getTextXY(201, 720, 1, 1);
        $this->assertCount(1, $hits);
        $this->assertCount(2, $hits[0]);
        $this->assertEquals($titleMatrix, $hits[0][0]);
        $this->assertContains('Document title', $hits[0][1]);

        // ------------------------------------------------------
        // Document is a form
        $samplePath = $this->rootDir.'/samples/SimpleInvoiceFilledExample1.pdf';
        $allPages = $pdfParser->parseFile($samplePath)->getPages();
        $currentPage = $allPages[0];

        $hits = $currentPage->getTextXY(167, 894, 1, 1);
        $this->assertCount(1, $hits);
        $this->assertCount(2, $hits[0]);
        $this->assertEquals($nameMatrix, $hits[0][0]);
        $this->assertContains('MyName  MyLastName', $hits[0][1]);

        $hits = $currentPage->getTextXY(681, 877, 1, 1);
        $this->assertContains('1/1/2020', $hits[0][1]);

        $hits = $currentPage->getTextXY(174, 827, 1, 1);
        $this->assertContains('Purchase 1', $hits[0][1]);

        // ------------------------------------------------------
        // Document is another form of the same type
        $samplePath = $this->rootDir.'/samples/SimpleInvoiceFilledExample2.pdf';
        $allPages = $pdfParser->parseFile($samplePath)->getPages();
        $currentPage = $allPages[0];

        $hits = $currentPage->getTextXY(167, 894, 1, 1);
        $this->assertEquals($nameMatrix, $hits[0][0]);
        $this->assertContains("Other'sName  Other'sLastName", $hits[0][1]);

        $hits = $currentPage->getTextXY(681, 877, 1, 1);
        $this->assertContains('2/2/2020', $hits[0][1]);

        $hits = $currentPage->getTextXY(174, 827, 1, 1);
        $this->assertContains('Purchase 2', $hits[0][1]);
    }
475
}
476