Test Failed
Pull Request — master (#457)
by
unknown
02:52
created

PageTest::testGetTextPullRequest457()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 27
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 1 Features 0
Metric Value
cc 1
eloc 21
c 3
b 1
f 0
nc 1
nop 0
dl 0
loc 27
rs 9.584
1
<?php
2
3
/**
4
 * @file This file is part of the PdfParser library.
5
 *
6
 * @author  Konrad Abicht <[email protected]>
7
 * @date    2020-06-01
8
 *
9
 * @author  Sébastien MALOT <[email protected]>
10
 * @date    2017-01-03
11
 *
12
 * @license LGPLv3
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Tests\Smalot\PdfParser\Integration;
34
35
use Smalot\PdfParser\Document;
36
use Smalot\PdfParser\Element\ElementMissing;
37
use Smalot\PdfParser\Font;
38
use Smalot\PdfParser\Page;
39
use Tests\Smalot\PdfParser\TestCase;
40
41
class PageTest extends TestCase
42
{
43
    /**
     * Checks Page::getFonts() on a page with text and on a page without text.
     *
     * getFonts() is called twice per page: once to load the data and once
     * more, which is expected to be answered from the cache.
     */
    public function testGetFonts()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        // the first to load data.
        $fonts = $page->getFonts();
        $this->assertGreaterThan(0, \count($fonts));
        foreach ($fonts as $font) {
            // assertInstanceOf reports the actual type on failure.
            $this->assertInstanceOf(Font::class, $font);
        }
        // the second to use cache.
        $fonts = $page->getFonts();
        $this->assertGreaterThan(0, \count($fonts));

        // ------------------------------------------------------
        // Document without text.
        $filename = $this->rootDir.'/samples/Document3_pdfcreator_nocompressed.pdf';
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        // the first to load data.
        $fonts = $page->getFonts();
        $this->assertCount(0, $fonts);
        // the second to use cache.
        $fonts = $page->getFonts();
        $this->assertCount(0, $fonts);
    }
76
77
    /**
     * getFonts() must yield an empty array when the resources header mock
     * answers has() with true but get() with an ElementMissing instance.
     */
    public function testGetFontsElementMissing()
    {
        $headerClass = 'Smalot\PdfParser\Header';

        // Inner header: claims to have the entry, but returns ElementMissing.
        $resources = $this->getMockBuilder($headerClass)->disableOriginalConstructor()->getMock();
        $resources->expects($this->once())->method('has')->willReturn(true);
        $resources->expects($this->once())->method('get')->willReturn(new ElementMissing());

        // Outer header: hands out the inner header exactly once.
        $pageHeader = $this->getMockBuilder($headerClass)->disableOriginalConstructor()->getMock();
        $pageHeader->expects($this->once())->method('get')->willReturn($resources);

        $subject = new Page(new Document(), $pageHeader);
        $fonts = $subject->getFonts();

        $this->assertEmpty($fonts);
        $this->assertEquals([], $fonts);
    }
105
106
    /**
     * Checks that Page::getFont() returns a Font instance for the ids 'R7'
     * and 'ABC7' on the first page of the sample document.
     */
    public function testGetFont()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];

        // the first to load data.
        $font = $page->getFont('R7');
        $this->assertInstanceOf(Font::class, $font);

        // NOTE(review): 'ABC7' presumably exercises a lookup fallback inside
        // getFont() - confirm against Page::getFont().
        $font = $page->getFont('ABC7');
        $this->assertInstanceOf(Font::class, $font);
    }
122
123
    /**
     * Checks that Page::getText() returns more than 150 bytes of text and
     * that it contains the title, the filler text and the font names that
     * are expected in the sample document.
     */
    public function testGetText()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $text = $page->getText();

        // assertGreaterThan reports the actual length on failure.
        $this->assertGreaterThan(150, \strlen($text));
        $this->assertStringContainsString('Document title', $text);
        $this->assertStringContainsString('Lorem ipsum', $text);

        $this->assertStringContainsString('Calibri', $text);
        $this->assertStringContainsString('Arial', $text);
        $this->assertStringContainsString('Times', $text);
        $this->assertStringContainsString('Courier New', $text);
        $this->assertStringContainsString('Verdana', $text);
    }
143
144
    /**
     * Checks text extraction for the sample PDF of pull request #457.
     *
     * The local references are released and a garbage collection cycle is
     * forced in a finally block, so the cleanup also happens when one of
     * the assertions fails.
     *
     * @see https://github.com/smalot/pdfparser/pull/457
     */
    public function testGetTextPullRequest457()
    {
        try {
            // Document with text.
            $filename = $this->rootDir.'/samples/bugs/PullRequest457.pdf';
            $parser = $this->getParserInstance();
            $document = $parser->parseFile($filename);
            $pages = $document->getPages();
            $page = $pages[0];
            $text = $page->getText();

            // assertGreaterThan reports the actual length on failure.
            $this->assertGreaterThan(1000, \strlen($text));
            $this->assertStringContainsString('SUPER', $text);
            $this->assertStringContainsString('VOORDEEL', $text);
            $this->assertStringContainsString('KRANT', $text);
            $this->assertStringContainsString('DINSDAG', $text);
            $this->assertStringContainsString('Snelfilterkoffie', $text);
            $this->assertStringContainsString('AardappelenZak', $text);
            $this->assertStringContainsString('ALL', $text);
        } finally {
            // Force garbage collection to not interfere with other tests.
            // unset() is silent for variables that were never assigned, so
            // this is safe even if parsing threw early.
            unset($filename, $parser, $document, $pages, $page, $text);
            gc_collect_cycles();
        }
    }
175
176
    /**
     * Checks the structure of Page::extractRawData(): the number of items,
     * the keys 't', 'o' and 'c' of an item, and the operator and command
     * of the first (BT) and third (Tm) item.
     */
    public function testExtractRawData()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $extractedRawData = $page->extractRawData();

        // Check the overall size first, so a short result fails with a
        // clear assertion instead of an undefined-offset error below.
        $this->assertCount(174, $extractedRawData);

        $btItem = $extractedRawData[0];
        $this->assertCount(3, $btItem);
        $this->assertArrayHasKey('t', $btItem);
        $this->assertArrayHasKey('o', $btItem);
        $this->assertArrayHasKey('c', $btItem);

        $this->assertEquals('BT', $btItem['o']);

        $tmItem = $extractedRawData[2];
        $this->assertCount(3, $tmItem);

        $this->assertArrayHasKey('t', $tmItem);
        $this->assertArrayHasKey('o', $tmItem);
        $this->assertArrayHasKey('c', $tmItem);

        $this->assertStringContainsString('Tm', $tmItem['o']);
        $this->assertStringContainsString('0.999429 0 0 1 201.96 720.68', $tmItem['c']);
    }
206
207
    /**
     * Checks the structure of Page::extractDecodedRawData(): the number of
     * items, the Tm item at index 2 and the decoded parts of the TJ item at
     * index 3.
     */
    public function testExtractDecodedRawData()
    {
        // Document with text.
        $filename = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $extractedDecodedRawData = $page->extractDecodedRawData();

        // Check the overall size first, so a short result fails with a
        // clear assertion instead of an undefined-offset error below.
        $this->assertCount(174, $extractedDecodedRawData);

        $tmItem = $extractedDecodedRawData[2];
        $this->assertCount(3, $tmItem);

        $this->assertArrayHasKey('t', $tmItem);
        $this->assertArrayHasKey('o', $tmItem);
        $this->assertArrayHasKey('c', $tmItem);

        $this->assertStringContainsString('Tm', $tmItem['o']);
        $this->assertStringContainsString('0.999429 0 0 1 201.96 720.68', $tmItem['c']);

        // Each entry of the TJ command carries its own type ('t') and
        // content ('c').
        $tjItem = $extractedDecodedRawData[3];
        $this->assertStringContainsString('TJ', $tjItem['o']);
        $this->assertStringContainsString('(', $tjItem['c'][0]['t']);
        $this->assertStringContainsString('D', $tjItem['c'][0]['c']);
        $this->assertStringContainsString('n', $tjItem['c'][1]['t']);
        $this->assertStringContainsString('0.325008', $tjItem['c'][1]['c']);
        $this->assertStringContainsString('(', $tjItem['c'][2]['t']);
        $this->assertStringContainsString('o', $tjItem['c'][2]['c']);
    }
241
242
    /**
     * Parsing the corrupted sample file and asking for its pages is expected
     * to raise an exception with the "Unable to find xref" message.
     */
    public function testExtractRawDataWithCorruptedPdf()
    {
        $this->expectException(\Exception::class);
        $this->expectExceptionMessage('Unable to find xref (PDF corrupted?)');

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($this->rootDir.'/samples/corrupted.pdf');
        $document->getPages();
    }
252
253
    /**
     * Checks Page::getDataCommands(): the number of commands, plus the
     * structure and content of the Tm command at index 1 and the TJ command
     * at index 2.
     */
    public function testGetDataCommands()
    {
        // Document with text.
        $pdfPath = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $firstPage = $this->getParserInstance()->parseFile($pdfPath)->getPages()[0];
        $commands = $firstPage->getDataCommands();
        $this->assertCount(168, $commands);

        // Tm command.
        $tm = $commands[1];
        $this->assertCount(3, $tm);
        foreach (['t', 'o', 'c'] as $key) {
            $this->assertArrayHasKey($key, $tm);
        }

        $this->assertStringContainsString('Tm', $tm['o']);
        $this->assertStringContainsString('0.999429 0 0 1 201.96 720.68', $tm['c']);

        // TJ command.
        $tj = $commands[2];
        $this->assertCount(3, $tj);
        foreach (['t', 'o', 'c'] as $key) {
            $this->assertArrayHasKey($key, $tj);
        }

        $this->assertStringContainsString('TJ', $tj['o']);
        $tjParts = $tj['c'];
        $this->assertStringContainsString('(', $tjParts[0]['t']);
        $this->assertStringContainsString('D', $tjParts[0]['c']);
        $this->assertStringContainsString('n', $tjParts[1]['t']);
        $this->assertStringContainsString('0.325008', $tjParts[1]['c']);
        $this->assertStringContainsString('(', $tjParts[2]['t']);
        $this->assertStringContainsString('o', $tjParts[2]['c']);
    }
287
288
    /**
     * Checks Page::getDataTm() against three sample documents: a plain text
     * document and two filled-in invoice forms of the same type.
     *
     * Each checked entry is a pair: index 0 holds the six matrix values as
     * strings, index 1 holds the text.
     */
    public function testGetDataTm()
    {
        // Document with text.
        $parser = $this->getParserInstance();
        $pdfPath = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $pageList = $parser->parseFile($pdfPath)->getPages();
        $firstPage = $pageList[0];

        $tmData = $firstPage->getDataTm();
        $this->assertCount(81, $tmData);

        $entry = $tmData[0];
        $this->assertCount(2, $entry);
        $this->assertCount(6, $entry[0]);
        $this->assertEquals(['0.999429', '0', '0', '1', '201.96', '720.68'], $entry[0]);

        $this->assertStringContainsString('Document title', $entry[1]);
        $entry = $tmData[2];
        $this->assertEquals(['0.999402', '0', '0', '1', '70.8', '673.64'], $entry[0]);

        $this->assertStringContainsString('Calibri : Lorem ipsum dolor sit amet, consectetur a', $entry[1]);

        $entry = $tmData[80];
        $this->assertEquals(['0.999402', '0', '0', '1', '343.003', '81.44'], $entry[0]);
        $this->assertStringContainsString('nenatis.', $entry[1]);

        // ------------------------------------------------------
        // Document is a form
        $pdfPath = $this->rootDir.'/samples/SimpleInvoiceFilledExample1.pdf';
        $pageList = $parser->parseFile($pdfPath)->getPages();
        $firstPage = $pageList[0];
        $tmData = $firstPage->getDataTm();
        $entry = $tmData[2];
        $this->assertCount(105, $tmData);
        $this->assertCount(2, $entry);
        $this->assertCount(6, $entry[0]);
        $this->assertEquals(['1', '0', '0', '1', '167.3', '894.58'], $entry[0]);
        $this->assertStringContainsString('MyName  MyLastName', $entry[1]);

        $entry = $tmData[6];
        $this->assertEquals(['1', '0', '0', '1', '681.94', '877.42'], $entry[0]);
        $this->assertStringContainsString('1/1/2020', $entry[1]);

        $entry = $tmData[8];
        $this->assertEquals(['1', '0', '0', '1', '174.86', '827.14'], $entry[0]);
        $this->assertStringContainsString('Purchase 1', $entry[1]);

        // ------------------------------------------------------
        // Document is another form of the same type
        $pdfPath = $this->rootDir.'/samples/SimpleInvoiceFilledExample2.pdf';
        $pageList = $parser->parseFile($pdfPath)->getPages();
        $firstPage = $pageList[0];
        $tmData = $firstPage->getDataTm();

        $entry = $tmData[2];
        $this->assertCount(105, $tmData);
        $this->assertCount(2, $entry);
        $this->assertCount(6, $entry[0]);
        $this->assertEquals(['1', '0', '0', '1', '167.3', '894.58'], $entry[0]);
        $this->assertStringContainsString("Other'sName  Other'sLastName", $entry[1]);

        $entry = $tmData[6];
        $this->assertEquals(['1', '0', '0', '1', '681.94', '877.42'], $entry[0]);
        $this->assertStringContainsString('2/2/2020', $entry[1]);

        $entry = $tmData[8];
        $this->assertEquals(['1', '0', '0', '1', '174.86', '827.14'], $entry[0]);
        $this->assertStringContainsString('Purchase 2', $entry[1]);
    }
450
451
    /**
     * Verifies getDataTm() for a document whose text is hexadecimal encoded.
     *
     * @see https://github.com/smalot/pdfparser/issues/336
     */
    public function testGetDataTmIssue336()
    {
        $pdfPath = $this->rootDir.'/samples/bugs/Issue336_decode_hexadecimal.pdf';
        $pageList = $this->getParserInstance()->parseFile($pdfPath)->getPages();
        $tmData = $pageList[0]->getDataTm();

        $entry = $tmData[2];
        $this->assertCount(13, $tmData);
        $this->assertCount(2, $entry);
        $this->assertCount(6, $entry[0]);

        $expectedMatrix = ['1', '0', '0', '1', '318.185', '665.044'];
        $this->assertEquals($expectedMatrix, $entry[0]);
        $this->assertEquals('Lorem', $entry[1]);
    }
481
482
    /**
     * Tests that getPages() only returns Page objects
     *
     * @see https://github.com/smalot/pdfparser/issues/331
     *
     * Sample pdf file provided by @Reqrefusion, see
     * https://github.com/smalot/pdfparser/pull/350#issuecomment-703195220
     */
    public function testGetPages()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue331.pdf';
        $document = $this->getParserInstance()->parseFile($filename);
        $pages = $document->getPages();

        // This should actually be 3 pages, but as long as the cause of issue
        // #331 has not been found and fixed, we settle for 2 here.
        // The count is still asserted so that, should the bug get fixed
        // unknowingly, this test fails and reminds us to resolve the issue
        // and update the expected value.
        $this->assertCount(2, $pages);

        foreach ($pages as $page) {
            // assertInstanceOf reports the actual type on failure.
            $this->assertInstanceOf(Page::class, $page);
        }
    }
507
508
    /**
     * Checks Page::getTextXY() for exact coordinates, for coordinates that
     * match nothing, and for coordinates passed with two extra arguments of
     * 1 (presumably x/y tolerances - confirm against Page::getTextXY()),
     * using a plain text document and two filled-in invoice forms.
     */
    public function testGetTextXY()
    {
        // Document with text.
        $parser = $this->getParserInstance();
        $pdfPath = $this->rootDir.'/samples/Document1_pdfcreator_nocompressed.pdf';
        $pageList = $parser->parseFile($pdfPath)->getPages();
        $firstPage = $pageList[0];

        // Exact coordinates.
        $matches = $firstPage->getTextXY(201.96, 720.68);
        $this->assertCount(1, $matches);
        $this->assertCount(2, $matches[0]);
        $this->assertEquals(['0.999429', '0', '0', '1', '201.96', '720.68'], $matches[0][0]);
        $this->assertStringContainsString('Document title', $matches[0][1]);

        // Truncated coordinates without the extra arguments match nothing.
        $matches = $firstPage->getTextXY(201, 720);
        $this->assertCount(0, $matches);

        // Truncated coordinates with the extra arguments match again.
        $matches = $firstPage->getTextXY(201, 720, 1, 1);
        $this->assertCount(1, $matches);
        $this->assertCount(2, $matches[0]);
        $this->assertEquals(['0.999429', '0', '0', '1', '201.96', '720.68'], $matches[0][0]);
        $this->assertStringContainsString('Document title', $matches[0][1]);

        // ------------------------------------------------------
        // Document is a form
        $pdfPath = $this->rootDir.'/samples/SimpleInvoiceFilledExample1.pdf';
        $pageList = $parser->parseFile($pdfPath)->getPages();
        $firstPage = $pageList[0];
        $matches = $firstPage->getTextXY(167, 894, 1, 1);
        $this->assertCount(1, $matches);
        $this->assertCount(2, $matches[0]);
        $this->assertEquals(['1', '0', '0', '1', '167.3', '894.58'], $matches[0][0]);
        $this->assertStringContainsString('MyName  MyLastName', $matches[0][1]);

        $matches = $firstPage->getTextXY(681, 877, 1, 1);
        $this->assertStringContainsString('1/1/2020', $matches[0][1]);

        $matches = $firstPage->getTextXY(174, 827, 1, 1);
        $this->assertStringContainsString('Purchase 1', $matches[0][1]);

        // ------------------------------------------------------
        // Document is another form of the same type
        $pdfPath = $this->rootDir.'/samples/SimpleInvoiceFilledExample2.pdf';
        $pageList = $parser->parseFile($pdfPath)->getPages();
        $firstPage = $pageList[0];
        $matches = $firstPage->getTextXY(167, 894, 1, 1);
        $this->assertEquals(['1', '0', '0', '1', '167.3', '894.58'], $matches[0][0]);
        $this->assertStringContainsString("Other'sName  Other'sLastName", $matches[0][1]);

        $matches = $firstPage->getTextXY(681, 877, 1, 1);
        $this->assertStringContainsString('2/2/2020', $matches[0][1]);

        $matches = $firstPage->getTextXY(174, 827, 1, 1);
        $this->assertStringContainsString('Purchase 2', $matches[0][1]);
    }
605
606
    /**
     * Checks extractDecodedRawData() for the Issue450 sample: the item at
     * index 3 must be a TJ command whose first part carries the signature
     * placeholder text.
     */
    public function testExtractDecodedRawDataIssue450()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue450.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $extractedDecodedRawData = $page->extractDecodedRawData();
        $this->assertIsArray($extractedDecodedRawData);
        // At least four items are required, because index 3 is read below.
        $this->assertGreaterThan(3, \count($extractedDecodedRawData));
        $this->assertIsArray($extractedDecodedRawData[3]);
        $this->assertEquals('TJ', $extractedDecodedRawData[3]['o']);
        $this->assertIsArray($extractedDecodedRawData[3]['c']);
        $this->assertIsArray($extractedDecodedRawData[3]['c'][0]);
        $this->assertCount(3, $extractedDecodedRawData[3]['c'][0]);
        $this->assertEquals('{signature:signer505906:Please+Sign+Here}', $extractedDecodedRawData[3]['c'][0]['c']);
    }
623
624
    /**
     * Checks getDataTm() for the Issue450 sample: exactly one entry, made
     * of a six-value matrix and the signature placeholder text.
     */
    public function testGetDataTmIssue450()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue450.pdf';
        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();
        $page = $pages[0];
        $dataTm = $page->getDataTm();
        $this->assertIsArray($dataTm);
        $this->assertCount(1, $dataTm);
        $this->assertIsArray($dataTm[0]);
        $this->assertCount(2, $dataTm[0]);
        $this->assertIsArray($dataTm[0][0]);
        $this->assertCount(6, $dataTm[0][0]);
        // assertEquals compares loosely, so numeric strings in the matrix
        // would match these numbers as well.
        $this->assertEquals(1, $dataTm[0][0][0]);
        $this->assertEquals(0, $dataTm[0][0][1]);
        $this->assertEquals(0, $dataTm[0][0][2]);
        $this->assertEquals(1, $dataTm[0][0][3]);
        $this->assertEquals(67.5, $dataTm[0][0][4]);
        $this->assertEquals(756.25, $dataTm[0][0][5]);
        $this->assertEquals('{signature:signer505906:Please+Sign+Here}', $dataTm[0][1]);
    }
646
}
647