Test Failed
Pull Request — master (#457)
by
unknown
02:29
created

PageTest::testGetTextPullRequest457()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 18
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 1
eloc 14
c 1
b 1
f 0
nc 1
nop 0
dl 0
loc 18
rs 9.7998
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 * @date    2020-06-01
8
 *
9
 * @author  Sébastien MALOT <[email protected]>
10
 * @date    2017-01-03
11
 *
12
 * @license LGPLv3
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Tests\Smalot\PdfParser\Integration;
34
35
use Smalot\PdfParser\Document;
36
use Smalot\PdfParser\Element\ElementMissing;
37
use Smalot\PdfParser\Font;
38
use Smalot\PdfParser\Page;
39
use Tests\Smalot\PdfParser\TestCase;
40
41
class PageTest extends TestCase
42
{
43
    /**
     * Checks Page::getFonts() for a page with text and for a page without text.
     *
     * getFonts() is called twice on each page: per the inline notes, the first
     * call loads the data and the second one is served from the cache.
     */
    public function testGetFonts()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        // the first to load data.
        $fonts = $page->getFonts();
        // Dedicated assertions report the actual value/type when they fail,
        // unlike assertTrue() on a pre-evaluated boolean.
        $this->assertGreaterThan(0, \count($fonts));
        foreach ($fonts as $font) {
            $this->assertInstanceOf(Font::class, $font);
        }
        // the second to use cache.
        $fonts = $page->getFonts();
        $this->assertGreaterThan(0, \count($fonts));

        // ------------------------------------------------------
        // Document without text.
        $filename = $this->rootDir.'/samples/Document3_pdfcreator_nocompressed.pdf';
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        // the first to load data.
        $fonts = $page->getFonts();
        $this->assertCount(0, $fonts);
        // the second to use cache.
        $fonts = $page->getFonts();
        $this->assertCount(0, $fonts);
    }
76
77
    /**
     * getFonts() must return an empty array when the nested header claims to
     * hold the requested entry (has() returns true) but get() only delivers an
     * ElementMissing instance.
     */
    public function testGetFontsElementMissing()
    {
        // Nested header: has() says yes, get() hands back ElementMissing.
        // Each of the two methods must be called exactly once.
        $resources = $this->getMockBuilder('Smalot\PdfParser\Header')
            ->disableOriginalConstructor()
            ->getMock();
        $resources->expects($this->once())->method('has')->willReturn(true);
        $resources->expects($this->once())->method('get')->willReturn(new ElementMissing());

        // Header given to the page: its single get() call yields the nested header.
        $pageHeader = $this->getMockBuilder('Smalot\PdfParser\Header')
            ->disableOriginalConstructor()
            ->getMock();
        $pageHeader->expects($this->once())->method('get')->willReturn($resources);

        $fonts = (new Page(new Document(), $pageHeader))->getFonts();

        $this->assertEmpty($fonts);
        $this->assertEquals([], $fonts);
    }
105
106
    /**
     * Checks that Page::getFont() returns a Font instance for the ids 'R7'
     * and 'ABC7' on the first page of the sample document.
     */
    public function testGetFont()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        // the first to load data.
        $font = $page->getFont('R7');
        // assertInstanceOf() names the actual type on failure.
        $this->assertInstanceOf(Font::class, $font);

        // NOTE(review): 'ABC7' looks like an id that does not exist in this
        // document, i.e. this presumably covers a fallback Font - confirm.
        $font = $page->getFont('ABC7');
        $this->assertInstanceOf(Font::class, $font);
    }
122
123
    /**
     * Checks that Page::getText() on the first page of the sample document
     * returns more than 150 bytes and contains the expected title, filler
     * text and font names.
     */
    public function testGetText()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $text = $page->getText();

        // assertGreaterThan() prints the actual length when the check fails.
        $this->assertGreaterThan(150, \strlen($text));
        $this->assertStringContainsString('Document title', $text);
        $this->assertStringContainsString('Lorem ipsum', $text);

        $this->assertStringContainsString('Calibri', $text);
        $this->assertStringContainsString('Arial', $text);
        $this->assertStringContainsString('Times', $text);
        $this->assertStringContainsString('Courier New', $text);
        $this->assertStringContainsString('Verdana', $text);
    }
143
144
    /**
     * Regression test for pull request #457: the first page of the sample
     * file must yield more than 1000 bytes of text containing a set of known
     * words.
     */
    public function testGetTextPullRequest457()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/bugs/PullRequest457.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $text = $page->getText();

        // assertGreaterThan() prints the actual length when the check fails,
        // which assertTrue(1000 < ...) cannot do.
        $this->assertGreaterThan(1000, \strlen($text));
        $this->assertStringContainsString('SUPER', $text);
        $this->assertStringContainsString('VOORDEEL', $text);
        $this->assertStringContainsString('KRANT', $text);
        $this->assertStringContainsString('DINSDAG', $text);
        $this->assertStringContainsString('Snelfilterkoffie', $text);
        $this->assertStringContainsString('AardappelenZak', $text);
        $this->assertStringContainsString('ALL', $text);
    }
163
164
    /**
     * Checks Page::extractRawData() on the first page of the sample document:
     * 174 entries in total, each inspected entry being an array with the keys
     * 't', 'o' and 'c'. Entry 0 must carry the operator 'BT', entry 2 a 'Tm'
     * operator with the expected operand string.
     */
    public function testExtractRawData()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $extractedRawData = $page->extractRawData();

        // Check the overall size first so that a short result fails with a
        // clear count message instead of an undefined-offset error below.
        $this->assertCount(174, $extractedRawData);

        $btItem = $extractedRawData[0];
        $this->assertCount(3, $btItem);
        $this->assertArrayHasKey('t', $btItem);
        $this->assertArrayHasKey('o', $btItem);
        $this->assertArrayHasKey('c', $btItem);

        $this->assertEquals('BT', $btItem['o']);

        $tmItem = $extractedRawData[2];
        $this->assertCount(3, $tmItem);

        $this->assertArrayHasKey('t', $tmItem);
        $this->assertArrayHasKey('o', $tmItem);
        $this->assertArrayHasKey('c', $tmItem);

        $this->assertStringContainsString('Tm', $tmItem['o']);
        $this->assertStringContainsString('0.999429 0 0 1 201.96 720.68', $tmItem['c']);
    }
194
195
    /**
     * Checks Page::extractDecodedRawData() on the first page of the sample
     * document: 174 entries, entry 2 being the 'Tm' command and entry 3 the
     * 'TJ' command whose 'c' value is a list of sub-items with 't' and 'c'.
     */
    public function testExtractDecodedRawData()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $extractedDecodedRawData = $page->extractDecodedRawData();

        // Check the overall size before indexing into the result.
        $this->assertCount(174, $extractedDecodedRawData);

        $tmItem = $extractedDecodedRawData[2];
        $this->assertCount(3, $tmItem);

        $this->assertArrayHasKey('t', $tmItem);
        $this->assertArrayHasKey('o', $tmItem);
        $this->assertArrayHasKey('c', $tmItem);

        $this->assertStringContainsString('Tm', $tmItem['o']);
        $this->assertStringContainsString('0.999429 0 0 1 201.96 720.68', $tmItem['c']);

        // The structural checks below used to repeat the $tmItem assertions
        // verbatim; they now target the TJ item, mirroring the equivalent
        // checks in testGetDataCommands().
        $tjItem = $extractedDecodedRawData[3];
        $this->assertCount(3, $tjItem);
        $this->assertArrayHasKey('t', $tjItem);
        $this->assertArrayHasKey('o', $tjItem);
        $this->assertArrayHasKey('c', $tjItem);

        $this->assertStringContainsString('TJ', $tjItem['o']);
        $this->assertStringContainsString('(', $tjItem['c'][0]['t']);
        $this->assertStringContainsString('D', $tjItem['c'][0]['c']);
        $this->assertStringContainsString('n', $tjItem['c'][1]['t']);
        $this->assertStringContainsString('0.325008', $tjItem['c'][1]['c']);
        $this->assertStringContainsString('(', $tjItem['c'][2]['t']);
        $this->assertStringContainsString('o', $tjItem['c'][2]['c']);
    }
229
230
    /**
     * Parsing the corrupted sample file and asking for its pages must raise
     * an exception carrying the "Unable to find xref" message.
     */
    public function testExtractRawDataWithCorruptedPdf()
    {
        $corruptedFile = $this->rootDir.'/samples/corrupted.pdf';

        // Expectations have to be registered before the failing call is made.
        $this->expectException(\Exception::class);
        $this->expectExceptionMessage('Unable to find xref (PDF corrupted?)');

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($corruptedFile);
        $document->getPages();
    }
240
241
    /**
     * Checks Page::getDataCommands() on the first page of the sample
     * document: 168 commands, command 1 being 'Tm' and command 2 being 'TJ',
     * each an array with the keys 't', 'o' and 'c'.
     */
    public function testGetDataCommands()
    {
        // Document with text.
        $samplePath = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $firstPage = $this->getParserInstance()->parseFile($samplePath)->getPages()[0];

        $commands = $firstPage->getDataCommands();
        $this->assertCount(168, $commands);

        $expectedKeys = ['t', 'o', 'c'];

        // Command 1: the Tm operator with its operand string.
        $tm = $commands[1];
        $this->assertCount(3, $tm);
        foreach ($expectedKeys as $key) {
            $this->assertArrayHasKey($key, $tm);
        }

        $this->assertStringContainsString('Tm', $tm['o']);
        $this->assertStringContainsString('0.999429 0 0 1 201.96 720.68', $tm['c']);

        // Command 2: the TJ operator, whose 'c' is a list of sub-items.
        $tj = $commands[2];
        $this->assertCount(3, $tj);
        foreach ($expectedKeys as $key) {
            $this->assertArrayHasKey($key, $tj);
        }

        $this->assertStringContainsString('TJ', $tj['o']);

        // Expected [type, content] fragments of the first three sub-items.
        $expectedParts = [
            ['(', 'D'],
            ['n', '0.325008'],
            ['(', 'o'],
        ];
        foreach ($expectedParts as $index => $expected) {
            $this->assertStringContainsString($expected[0], $tj['c'][$index]['t']);
            $this->assertStringContainsString($expected[1], $tj['c'][$index]['c']);
        }
    }
275
276
    /**
     * Checks Page::getDataTm() against three sample files: a plain text
     * document and two filled-in invoice forms of the same type.
     *
     * Every inspected entry is a pair: index 0 holds six numeric strings
     * (presumably the text matrix set by the PDF Tm operator), index 1 holds
     * the text found there.
     */
    public function testGetDataTm()
    {
        // Document with text.
        $samplePath = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $firstPage = $parser->parseFile($samplePath)->getPages()[0];

        $tmData = $firstPage->getDataTm();
        $this->assertCount(81, $tmData);

        $entry = $tmData[0];
        $this->assertCount(2, $entry);
        $this->assertCount(6, $entry[0]);
        $this->assertEquals(['0.999429', '0', '0', '1', '201.96', '720.68'], $entry[0]);
        $this->assertStringContainsString('Document title', $entry[1]);

        $entry = $tmData[2];
        $this->assertEquals(['0.999402', '0', '0', '1', '70.8', '673.64'], $entry[0]);
        $this->assertStringContainsString('Calibri : Lorem ipsum dolor sit amet, consectetur a', $entry[1]);

        $entry = $tmData[80];
        $this->assertEquals(['0.999402', '0', '0', '1', '343.003', '81.44'], $entry[0]);
        $this->assertStringContainsString('nenatis.', $entry[1]);

        // ------------------------------------------------------
        // Document is a form
        $samplePath = $this->rootDir.'/samples/SimpleInvoiceFilledExample1.pdf';
        $firstPage = $parser->parseFile($samplePath)->getPages()[0];
        $tmData = $firstPage->getDataTm();

        $entry = $tmData[2];
        $this->assertCount(105, $tmData);
        $this->assertCount(2, $entry);
        $this->assertCount(6, $entry[0]);
        $this->assertEquals(['1', '0', '0', '1', '167.3', '894.58'], $entry[0]);
        $this->assertStringContainsString('MyName  MyLastName', $entry[1]);

        $entry = $tmData[6];
        $this->assertEquals(['1', '0', '0', '1', '681.94', '877.42'], $entry[0]);
        $this->assertStringContainsString('1/1/2020', $entry[1]);

        $entry = $tmData[8];
        $this->assertEquals(['1', '0', '0', '1', '174.86', '827.14'], $entry[0]);
        $this->assertStringContainsString('Purchase 1', $entry[1]);

        // ------------------------------------------------------
        // Document is another form of the same type
        $samplePath = $this->rootDir.'/samples/SimpleInvoiceFilledExample2.pdf';
        $firstPage = $parser->parseFile($samplePath)->getPages()[0];
        $tmData = $firstPage->getDataTm();

        // Same positions as in the first form, different field contents.
        $entry = $tmData[2];
        $this->assertCount(105, $tmData);
        $this->assertCount(2, $entry);
        $this->assertCount(6, $entry[0]);
        $this->assertEquals(['1', '0', '0', '1', '167.3', '894.58'], $entry[0]);
        $this->assertStringContainsString("Other'sName  Other'sLastName", $entry[1]);

        $entry = $tmData[6];
        $this->assertEquals(['1', '0', '0', '1', '681.94', '877.42'], $entry[0]);
        $this->assertStringContainsString('2/2/2020', $entry[1]);

        $entry = $tmData[8];
        $this->assertEquals(['1', '0', '0', '1', '174.86', '827.14'], $entry[0]);
        $this->assertStringContainsString('Purchase 2', $entry[1]);
    }
438
439
    /**
     * Covers getDataTm() for a document whose text is hexadecimal encoded.
     *
     * The sample page must produce 13 entries; entry 2 is expected to be the
     * pair of six numeric strings and the decoded word 'Lorem'.
     *
     * @see https://github.com/smalot/pdfparser/issues/336
     */
    public function testGetDataTmIssue336()
    {
        $samplePath = $this->rootDir.'/samples/bugs/Issue336_decode_hexadecimal.pdf';
        $firstPage = $this->getParserInstance()->parseFile($samplePath)->getPages()[0];
        $tmData = $firstPage->getDataTm();

        $entry = $tmData[2];
        $this->assertCount(13, $tmData);
        $this->assertCount(2, $entry);
        $this->assertCount(6, $entry[0]);
        $this->assertEquals(['1', '0', '0', '1', '318.185', '665.044'], $entry[0]);
        $this->assertEquals('Lorem', $entry[1]);
    }
469
470
    /**
     * Tests that getPages() only returns Page objects
     *
     * @see https://github.com/smalot/pdfparser/issues/331
     *
     * Sample pdf file provided by @Reqrefusion, see
     * https://github.com/smalot/pdfparser/pull/350#issuecomment-703195220
     */
    public function testGetPages()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue331.pdf';
        $document = $this->getParserInstance()->parseFile($filename);
        $pages = $document->getPages();

        // This should actually be 3 pages, but as long as the cause for issue #331
        // has not been found and the issue is not fixed, we'll settle for 2 here.
        // We still test for the count, so in case the bug should be fixed
        // unknowingly, we don't forget to resolve the issue as well and make sure
        // this assertion is present.
        $this->assertCount(2, $pages);

        foreach ($pages as $page) {
            // assertInstanceOf() reports the offending type on failure,
            // unlike assertTrue() on a pre-evaluated instanceof check.
            $this->assertInstanceOf(Page::class, $page);
        }
    }
495
496
    /**
     * Checks Page::getTextXY() against three sample files: a plain text
     * document and two filled-in invoice forms of the same type.
     *
     * The third and fourth arguments appear to act as tolerances around the
     * requested coordinates: (201, 720) alone finds nothing, while
     * (201, 720, 1, 1) finds the title located at 201.96 / 720.68.
     */
    public function testGetTextXY()
    {
        // Document with text.
        $samplePath = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $firstPage = $parser->parseFile($samplePath)->getPages()[0];

        // Exact coordinates of the title.
        $matches = $firstPage->getTextXY(201.96, 720.68);
        $this->assertCount(1, $matches);
        $this->assertCount(2, $matches[0]);
        $this->assertEquals(['0.999429', '0', '0', '1', '201.96', '720.68'], $matches[0][0]);
        $this->assertStringContainsString('Document title', $matches[0][1]);

        // Truncated coordinates without the two extra arguments: no hit.
        $matches = $firstPage->getTextXY(201, 720);
        $this->assertCount(0, $matches);

        // Truncated coordinates with 1, 1 as extra arguments: title is found.
        $matches = $firstPage->getTextXY(201, 720, 1, 1);
        $this->assertCount(1, $matches);
        $this->assertCount(2, $matches[0]);
        $this->assertEquals(['0.999429', '0', '0', '1', '201.96', '720.68'], $matches[0][0]);
        $this->assertStringContainsString('Document title', $matches[0][1]);

        // ------------------------------------------------------
        // Document is a form
        $samplePath = $this->rootDir.'/samples/SimpleInvoiceFilledExample1.pdf';
        $firstPage = $parser->parseFile($samplePath)->getPages()[0];

        $matches = $firstPage->getTextXY(167, 894, 1, 1);
        $this->assertCount(1, $matches);
        $this->assertCount(2, $matches[0]);
        $this->assertEquals(['1', '0', '0', '1', '167.3', '894.58'], $matches[0][0]);
        $this->assertStringContainsString('MyName  MyLastName', $matches[0][1]);

        $matches = $firstPage->getTextXY(681, 877, 1, 1);
        $this->assertStringContainsString('1/1/2020', $matches[0][1]);

        $matches = $firstPage->getTextXY(174, 827, 1, 1);
        $this->assertStringContainsString('Purchase 1', $matches[0][1]);

        // ------------------------------------------------------
        // Document is another form of the same type
        $samplePath = $this->rootDir.'/samples/SimpleInvoiceFilledExample2.pdf';
        $firstPage = $parser->parseFile($samplePath)->getPages()[0];

        $matches = $firstPage->getTextXY(167, 894, 1, 1);
        $this->assertEquals(['1', '0', '0', '1', '167.3', '894.58'], $matches[0][0]);
        $this->assertStringContainsString("Other'sName  Other'sLastName", $matches[0][1]);

        $matches = $firstPage->getTextXY(681, 877, 1, 1);
        $this->assertStringContainsString('2/2/2020', $matches[0][1]);

        $matches = $firstPage->getTextXY(174, 827, 1, 1);
        $this->assertStringContainsString('Purchase 2', $matches[0][1]);
    }
593
594
    /**
     * Regression test for issue #450: entry 3 of the decoded raw data must be
     * a 'TJ' command whose first sub-item carries the signature placeholder
     * text unchanged.
     */
    public function testExtractDecodedRawDataIssue450()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue450.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $extractedDecodedRawData = $page->extractDecodedRawData();
        $this->assertIsArray($extractedDecodedRawData);
        // Guarantees that index 3 exists before it is dereferenced below.
        $this->assertGreaterThan(3, \count($extractedDecodedRawData));
        $this->assertIsArray($extractedDecodedRawData[3]);
        $this->assertEquals('TJ', $extractedDecodedRawData[3]['o']);
        $this->assertIsArray($extractedDecodedRawData[3]['c']);
        $this->assertIsArray($extractedDecodedRawData[3]['c'][0]);
        // assertCount() reports the actual element count on failure.
        $this->assertCount(3, $extractedDecodedRawData[3]['c'][0]);
        $this->assertEquals('{signature:signer505906:Please+Sign+Here}', $extractedDecodedRawData[3]['c'][0]['c']);
    }
611
612
    /**
     * Regression test for issue #450: getDataTm() must return exactly one
     * entry, made of six numeric values and the signature placeholder text.
     */
    public function testGetDataTmIssue450()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue450.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $dataTm = $page->getDataTm();
        // assertCount() replaces assertEquals(n, count(...)) so that a
        // failure reports the actual number of elements.
        $this->assertIsArray($dataTm);
        $this->assertCount(1, $dataTm);
        $this->assertIsArray($dataTm[0]);
        $this->assertCount(2, $dataTm[0]);
        $this->assertIsArray($dataTm[0][0]);
        $this->assertCount(6, $dataTm[0][0]);
        // Loose comparison on purpose: other tests in this class show these
        // values being returned as numeric strings.
        $this->assertEquals(1, $dataTm[0][0][0]);
        $this->assertEquals(0, $dataTm[0][0][1]);
        $this->assertEquals(0, $dataTm[0][0][2]);
        $this->assertEquals(1, $dataTm[0][0][3]);
        $this->assertEquals(67.5, $dataTm[0][0][4]);
        $this->assertEquals(756.25, $dataTm[0][0][5]);
        $this->assertEquals('{signature:signer505906:Please+Sign+Here}', $dataTm[0][1]);
    }
634
}
635