Completed
Push — master ( 72a877...c9c2be )
by Konrad
08:04
created

PageTest::testGetPages()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 15
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
eloc 6
c 1
b 0
f 0
nc 2
nop 0
dl 0
loc 15
rs 10
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 * @date    2020-06-01
8
 *
9
 * @author  Sébastien MALOT <[email protected]>
10
 * @date    2017-01-03
11
 *
12
 * @license LGPLv3
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Tests\Smalot\PdfParser\Integration;
34
35
use Smalot\PdfParser\Document;
36
use Smalot\PdfParser\Element\ElementMissing;
37
use Smalot\PdfParser\Font;
38
use Smalot\PdfParser\Page;
39
use Tests\Smalot\PdfParser\TestCase;
40
41
class PageTest extends TestCase
42
{
43
    /**
     * Checks Page::getFonts() on a page with text and on a page without text.
     *
     * getFonts() is called twice on each page: per the original author's
     * notes, the first call loads the data and the second one is meant to be
     * answered from the cache. Both calls must agree.
     */
    public function testGetFonts()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        // the first to load data.
        $fonts = $page->getFonts();
        // Dedicated assertions report the actual count / type on failure
        // instead of a bare "false is not true".
        $this->assertGreaterThan(0, \count($fonts));
        foreach ($fonts as $font) {
            $this->assertInstanceOf(Font::class, $font);
        }
        // the second to use cache.
        $fonts = $page->getFonts();
        $this->assertGreaterThan(0, \count($fonts));

        // ------------------------------------------------------
        // Document without text.
        $filename = $this->rootDir.'/samples/Document3_pdfcreator_nocompressed.pdf';
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        // the first to load data.
        $fonts = $page->getFonts();
        $this->assertCount(0, $fonts);
        // the second to use cache.
        $fonts = $page->getFonts();
        $this->assertCount(0, $fonts);
    }
76
77
    /**
     * getFonts() must return an empty array when the page header resolves to
     * a nested header whose has() answers true but whose get() hands back an
     * ElementMissing instance.
     */
    public function testGetFontsElementMissing()
    {
        // Nested header: claims the requested entry exists, then returns
        // ElementMissing for it. Each method must be called exactly once.
        $resources = $this->getMockBuilder('Smalot\PdfParser\Header')
            ->disableOriginalConstructor()
            ->getMock();
        $resources->expects($this->once())->method('has')->willReturn(true);
        $resources->expects($this->once())->method('get')->willReturn(new ElementMissing());

        // Page header: its single get() call yields the nested header above.
        $pageHeader = $this->getMockBuilder('Smalot\PdfParser\Header')
            ->disableOriginalConstructor()
            ->getMock();
        $pageHeader->expects($this->once())->method('get')->willReturn($resources);

        $subject = new Page(new Document(), $pageHeader);
        $result = $subject->getFonts();

        $this->assertEmpty($result);
        $this->assertEquals([], $result);
    }
105
106
    /**
     * Checks that Page::getFont() returns a Font instance for both font
     * identifiers requested below.
     */
    public function testGetFont()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        // the first to load data.
        $font = $page->getFont('R7');
        // assertInstanceOf names the actual type when the check fails.
        $this->assertInstanceOf(Font::class, $font);

        // NOTE(review): 'ABC7' is presumably an id that does not exist in the
        // sample, so this exercises a fallback path - confirm against Page::getFont().
        $font = $page->getFont('ABC7');
        $this->assertInstanceOf(Font::class, $font);
    }
122
123
    /**
     * Checks that Page::getText() on the first page of the sample document
     * returns more than 150 bytes and contains the expected title, filler
     * text and font names.
     */
    public function testGetText()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $text = $page->getText();

        // Reports the actual length on failure instead of "false is not true".
        $this->assertGreaterThan(150, \strlen($text));
        $this->assertContains('Document title', $text);
        $this->assertContains('Lorem ipsum', $text);

        $this->assertContains('Calibri', $text);
        $this->assertContains('Arial', $text);
        $this->assertContains('Times', $text);
        $this->assertContains('Courier New', $text);
        $this->assertContains('Verdana', $text);
    }
143
144
    /**
     * Checks the structure of Page::extractRawData(): total number of entries
     * plus the shape and content of the BT entry (index 0) and the Tm entry
     * (index 2).
     */
    public function testExtractRawData()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $extractedRawData = $page->extractRawData();

        // Verify the overall size before indexing into the list, so a short
        // result fails with a readable count mismatch rather than an
        // undefined-offset error.
        $this->assertCount(174, $extractedRawData);

        $btItem = $extractedRawData[0];
        $this->assertCount(3, $btItem);
        $this->assertArrayHasKey('t', $btItem);
        $this->assertArrayHasKey('o', $btItem);
        $this->assertArrayHasKey('c', $btItem);

        $this->assertEquals('BT', $btItem['o']);

        $tmItem = $extractedRawData[2];
        $this->assertCount(3, $tmItem);

        $this->assertArrayHasKey('t', $tmItem);
        $this->assertArrayHasKey('o', $tmItem);
        $this->assertArrayHasKey('c', $tmItem);

        $this->assertContains('Tm', $tmItem['o']);
        $this->assertContains('0.999429 0 0 1 201.96 720.68', $tmItem['c']);
    }
174
175
    /**
     * Checks the structure of Page::extractDecodedRawData(): total number of
     * entries, the Tm entry (index 2) and the TJ entry (index 3) including the
     * first three parts of its command list.
     */
    public function testExtractDecodedRawData()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $extractedDecodedRawData = $page->extractDecodedRawData();

        // Verify the overall size before indexing into the list.
        $this->assertCount(174, $extractedDecodedRawData);

        $tmItem = $extractedDecodedRawData[2];
        $this->assertCount(3, $tmItem);

        $this->assertArrayHasKey('t', $tmItem);
        $this->assertArrayHasKey('o', $tmItem);
        $this->assertArrayHasKey('c', $tmItem);

        $this->assertContains('Tm', $tmItem['o']);
        $this->assertContains('0.999429 0 0 1 201.96 720.68', $tmItem['c']);

        // The structure checks below used to be run a second time on $tmItem;
        // they are applied to the TJ entry instead, mirroring
        // testGetDataCommands().
        $tjItem = $extractedDecodedRawData[3];
        $this->assertCount(3, $tjItem);
        $this->assertArrayHasKey('t', $tjItem);
        $this->assertArrayHasKey('o', $tjItem);
        $this->assertArrayHasKey('c', $tjItem);

        $this->assertContains('TJ', $tjItem['o']);
        $this->assertContains('(', $tjItem['c'][0]['t']);
        $this->assertContains('D', $tjItem['c'][0]['c']);
        $this->assertContains('n', $tjItem['c'][1]['t']);
        $this->assertContains('0.325008', $tjItem['c'][1]['c']);
        $this->assertContains('(', $tjItem['c'][2]['t']);
        $this->assertContains('o', $tjItem['c'][2]['c']);
    }
209
210
    /**
     * Parsing the corrupted sample and asking for its pages must raise an
     * exception carrying the "Unable to find xref" message.
     */
    public function testExtractRawDataWithCorruptedPdf()
    {
        // Expectations have to be registered before the failing call chain runs.
        $this->expectException(\Exception::class);
        $this->expectExceptionMessage('Unable to find xref (PDF corrupted?)');

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($this->rootDir.'/samples/corrupted.pdf');
        $document->getPages();
    }
220
221
    /**
     * Checks Page::getDataCommands(): total number of commands, plus the shape
     * and content of the Tm command (index 1) and the TJ command (index 2).
     */
    public function testGetDataCommands()
    {
        // Document with text.
        $samplePath = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $firstPage = $this->getParserInstance()->parseFile($samplePath)->getPages()[0];

        $commands = $firstPage->getDataCommands();
        $this->assertCount(168, $commands);

        // Every command is expected to expose exactly these three keys.
        $expectedKeys = ['t', 'o', 'c'];

        $tm = $commands[1];
        $this->assertCount(3, $tm);
        foreach ($expectedKeys as $key) {
            $this->assertArrayHasKey($key, $tm);
        }
        $this->assertContains('Tm', $tm['o']);
        $this->assertContains('0.999429 0 0 1 201.96 720.68', $tm['c']);

        $tj = $commands[2];
        $this->assertCount(3, $tj);
        foreach ($expectedKeys as $key) {
            $this->assertArrayHasKey($key, $tj);
        }
        $this->assertContains('TJ', $tj['o']);

        // First three parts of the TJ command list.
        $parts = $tj['c'];
        $this->assertContains('(', $parts[0]['t']);
        $this->assertContains('D', $parts[0]['c']);
        $this->assertContains('n', $parts[1]['t']);
        $this->assertContains('0.325008', $parts[1]['c']);
        $this->assertContains('(', $parts[2]['t']);
        $this->assertContains('o', $parts[2]['c']);
    }
255
256
    /**
     * Checks Page::getDataTm() against three samples: a plain text document
     * and two filled-in invoice forms of the same type.
     *
     * Each inspected entry is a pair: six values (compared as strings) at
     * index 0 and the associated text at index 1.
     */
    public function testGetDataTm()
    {
        $parser = $this->getParserInstance();

        // Document with text.
        $firstPage = $parser
            ->parseFile($this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf')
            ->getPages()[0];

        $entries = $firstPage->getDataTm();
        $this->assertCount(81, $entries);

        $entry = $entries[0];
        $this->assertCount(2, $entry);
        $this->assertCount(6, $entry[0]);
        $this->assertEquals(['0.999429', '0', '0', '1', '201.96', '720.68'], $entry[0]);
        $this->assertContains('Document title', $entry[1]);

        $entry = $entries[2];
        $this->assertEquals(['0.999402', '0', '0', '1', '70.8', '673.64'], $entry[0]);
        $this->assertContains('Calibri : Lorem ipsum dolor sit amet, consectetur a', $entry[1]);

        $entry = $entries[80];
        $this->assertEquals(['0.999402', '0', '0', '1', '343.003', '81.44'], $entry[0]);
        $this->assertContains('nenatis.', $entry[1]);

        // ------------------------------------------------------
        // Document is a form
        $firstPage = $parser
            ->parseFile($this->rootDir.'/samples/SimpleInvoiceFilledExample1.pdf')
            ->getPages()[0];
        $entries = $firstPage->getDataTm();

        $entry = $entries[2];
        $this->assertCount(105, $entries);
        $this->assertCount(2, $entry);
        $this->assertCount(6, $entry[0]);
        $this->assertEquals(['1', '0', '0', '1', '167.3', '894.58'], $entry[0]);
        $this->assertContains('MyName  MyLastName', $entry[1]);

        $entry = $entries[6];
        $this->assertEquals(['1', '0', '0', '1', '681.94', '877.42'], $entry[0]);
        $this->assertContains('1/1/2020', $entry[1]);

        $entry = $entries[8];
        $this->assertEquals(['1', '0', '0', '1', '174.86', '827.14'], $entry[0]);
        $this->assertContains('Purchase 1', $entry[1]);

        // ------------------------------------------------------
        // Document is another form of the same type
        $firstPage = $parser
            ->parseFile($this->rootDir.'/samples/SimpleInvoiceFilledExample2.pdf')
            ->getPages()[0];
        $entries = $firstPage->getDataTm();

        // Same positions as in the first form; only the filled-in text differs.
        $entry = $entries[2];
        $this->assertCount(105, $entries);
        $this->assertCount(2, $entry);
        $this->assertCount(6, $entry[0]);
        $this->assertEquals(['1', '0', '0', '1', '167.3', '894.58'], $entry[0]);
        $this->assertContains("Other'sName  Other'sLastName", $entry[1]);

        $entry = $entries[6];
        $this->assertEquals(['1', '0', '0', '1', '681.94', '877.42'], $entry[0]);
        $this->assertContains('2/2/2020', $entry[1]);

        $entry = $entries[8];
        $this->assertEquals(['1', '0', '0', '1', '174.86', '827.14'], $entry[0]);
        $this->assertContains('Purchase 2', $entry[1]);
    }
418
419
    /**
     * Runs getDataTm() on a document whose text is hexadecimal encoded and
     * verifies that the third entry decodes to "Lorem".
     *
     * @see https://github.com/smalot/pdfparser/issues/336
     */
    public function testGetDataTmIssue336()
    {
        $samplePath = $this->rootDir.'/samples/bugs/Issue336_decode_hexadecimal.pdf';
        $firstPage = $this->getParserInstance()->parseFile($samplePath)->getPages()[0];
        $entries = $firstPage->getDataTm();

        $entry = $entries[2];
        $this->assertCount(13, $entries);
        $this->assertCount(2, $entry);
        $this->assertCount(6, $entry[0]);
        $this->assertEquals(['1', '0', '0', '1', '318.185', '665.044'], $entry[0]);
        $this->assertEquals('Lorem', $entry[1]);
    }
449
450
    /**
     * Tests that getPages() only returns Page objects.
     *
     * Sample pdf file provided by @Reqrefusion, see
     * https://github.com/smalot/pdfparser/pull/350#issuecomment-703195220
     *
     * @see https://github.com/smalot/pdfparser/issues/331
     */
    public function testGetPages()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue331.pdf';
        $document = $this->getParserInstance()->parseFile($filename);
        $pages = $document->getPages();

        // This should actually be 3 pages, but as long as the cause for issue #331
        // has not been found and the issue is not fixed, we'll settle for 2 here.
        // We still test for the count, so in case the bug should be fixed
        // unknowingly, we don't forget to resolve the issue as well and make sure
        // this assertion is present.
        $this->assertCount(2, $pages);

        foreach ($pages as $page) {
            // assertInstanceOf names the offending type when a non-Page slips in.
            $this->assertInstanceOf(Page::class, $page);
        }
    }
475
476
    /**
     * Checks Page::getTextXY() against three samples: a plain text document
     * and two filled-in invoice forms of the same type.
     *
     * NOTE(review): the optional third and fourth arguments look like search
     * tolerances around the given coordinates - confirm against Page::getTextXY().
     */
    public function testGetTextXY()
    {
        $parser = $this->getParserInstance();

        // Document with text.
        $firstPage = $parser
            ->parseFile($this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf')
            ->getPages()[0];

        // Values expected for the title entry in both lookups that find it.
        $titleValues = ['0.999429', '0', '0', '1', '201.96', '720.68'];

        // Exact coordinates: a single hit.
        $hits = $firstPage->getTextXY(201.96, 720.68);
        $this->assertCount(1, $hits);
        $this->assertCount(2, $hits[0]);
        $this->assertEquals($titleValues, $hits[0][0]);
        $this->assertContains('Document title', $hits[0][1]);

        // Truncated coordinates without the extra arguments: nothing found.
        $hits = $firstPage->getTextXY(201, 720);
        $this->assertCount(0, $hits);

        // Truncated coordinates with the extra arguments: the title is found again.
        $hits = $firstPage->getTextXY(201, 720, 1, 1);
        $this->assertCount(1, $hits);
        $this->assertCount(2, $hits[0]);
        $this->assertEquals($titleValues, $hits[0][0]);
        $this->assertContains('Document title', $hits[0][1]);

        // ------------------------------------------------------
        // Document is a form
        $firstPage = $parser
            ->parseFile($this->rootDir.'/samples/SimpleInvoiceFilledExample1.pdf')
            ->getPages()[0];

        $hits = $firstPage->getTextXY(167, 894, 1, 1);
        $this->assertCount(1, $hits);
        $this->assertCount(2, $hits[0]);
        $this->assertEquals(['1', '0', '0', '1', '167.3', '894.58'], $hits[0][0]);
        $this->assertContains('MyName  MyLastName', $hits[0][1]);

        $hits = $firstPage->getTextXY(681, 877, 1, 1);
        $this->assertContains('1/1/2020', $hits[0][1]);

        $hits = $firstPage->getTextXY(174, 827, 1, 1);
        $this->assertContains('Purchase 1', $hits[0][1]);

        // ------------------------------------------------------
        // Document is another form of the same type
        $firstPage = $parser
            ->parseFile($this->rootDir.'/samples/SimpleInvoiceFilledExample2.pdf')
            ->getPages()[0];

        $hits = $firstPage->getTextXY(167, 894, 1, 1);
        $this->assertEquals(['1', '0', '0', '1', '167.3', '894.58'], $hits[0][0]);
        $this->assertContains("Other'sName  Other'sLastName", $hits[0][1]);

        $hits = $firstPage->getTextXY(681, 877, 1, 1);
        $this->assertContains('2/2/2020', $hits[0][1]);

        $hits = $firstPage->getTextXY(174, 827, 1, 1);
        $this->assertContains('Purchase 2', $hits[0][1]);
    }
573
}
574