1 | <?php |
||||
2 | |||||
3 | /** |
||||
4 | * @file This file is part of the PdfParser library. |
||||
5 | * |
||||
6 | * @author Konrad Abicht <[email protected]> |
||||
7 | * |
||||
8 | * @date 2020-06-01 |
||||
9 | * |
||||
10 | * @author Sébastien MALOT <[email protected]> |
||||
11 | * |
||||
12 | * @date 2017-01-03 |
||||
13 | * |
||||
14 | * @license LGPLv3 |
||||
15 | * |
||||
16 | * @url <https://github.com/smalot/pdfparser> |
||||
17 | * |
||||
18 | * PdfParser is a pdf library written in PHP, extraction oriented. |
||||
19 | * Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
||||
20 | * |
||||
21 | * This program is free software: you can redistribute it and/or modify |
||||
22 | * it under the terms of the GNU Lesser General Public License as published by |
||||
23 | * the Free Software Foundation, either version 3 of the License, or |
||||
24 | * (at your option) any later version. |
||||
25 | * |
||||
26 | * This program is distributed in the hope that it will be useful, |
||||
27 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
28 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
29 | * GNU Lesser General Public License for more details. |
||||
30 | * |
||||
31 | * You should have received a copy of the GNU Lesser General Public License |
||||
32 | * along with this program. |
||||
33 | * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
||||
34 | */ |
||||
35 | |||||
36 | namespace PHPUnitTests\Integration; |
||||
37 | |||||
38 | use PHPUnitTests\TestCase; |
||||
39 | use Smalot\PdfParser\Document; |
||||
40 | use Smalot\PdfParser\PDFObject; |
||||
41 | |||||
/**
 * Integration tests for Smalot\PdfParser\PDFObject.
 *
 * Covers splitting a content stream into sections and commands, the
 * (protected) formatContent() normalisation step, dictionary parsing and a
 * number of regression checks that parse sample PDF files from the
 * samples/ directory.
 */
class PDFObjectTest extends TestCase
{
    /** Array key under which a parsed command stores its type marker. */
    public const TYPE = 't';

    /** Array key under which a parsed command stores its operator. */
    public const OPERATOR = 'o';

    /** Array key under which a parsed command stores its operand(s). */
    public const COMMAND = 'c';

    /**
     * Builds the object under test.
     *
     * @param mixed $document passed straight to the PDFObject constructor;
     *                        every caller in this class passes a Document
     *
     * @return PDFObject a fresh instance bound to $document
     */
    protected function getPdfObjectInstance($document): PDFObject
    {
        return new PDFObject($document);
    }

    /**
     * Splits a content stream into sections, parses the first command of
     * each section and compares the result with the expected command list.
     */
    public function testGetCommandsText(): void
    {
        $content = "BT /R14 30 Tf 0.999016 0 0 1 137.4
342.561 Tm
[(A)-168.854( BC D)-220.905(\\(E\\))20.905<20>]
TJ /R14 17.16 Tf <20> Tj
0.999014 0 0 1 336.84 319.161 Tm T* ( \x00m)Tj
/R14 20.04 Tf
ET Q
q -124.774 124.127 5.64213 5.67154 930.307 4436.95 cm
BI";

        $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content);

        // Only the first command of every section is kept for comparison.
        $parts = [];
        foreach ($sections as $section) {
            $parts[] = $this->getPdfObjectInstance(new Document())->getCommandsText($section)[0];
        }

        $reference = [
            [
                self::TYPE => '',
                self::OPERATOR => 'BT',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 30',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Tm',
                self::COMMAND => '0.999016 0 0 1 137.4 342.561',
            ],
            [
                self::TYPE => '[',
                self::OPERATOR => 'TJ',
                self::COMMAND => [
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => 'A',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '-168.854',
                    ],
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => ' BC D',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '-220.905',
                    ],
                    [
                        self::TYPE => '(',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => '\\(E\\)',
                    ],
                    [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => '20.905',
                    ],
                    [
                        self::TYPE => '<',
                        self::OPERATOR => 'TJ',
                        self::COMMAND => '20',
                    ],
                ],
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 17.16',
            ],
            [
                self::TYPE => '<',
                self::OPERATOR => 'Tj',
                self::COMMAND => '20',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Tm',
                self::COMMAND => '0.999014 0 0 1 336.84 319.161',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'T*',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '(',
                self::OPERATOR => 'Tj',
                self::COMMAND => " \x00m",
            ],
            [
                self::TYPE => '/',
                self::OPERATOR => 'Tf',
                self::COMMAND => 'R14 20.04',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'ET',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'Q',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'q',
                self::COMMAND => '',
            ],
            [
                self::TYPE => '',
                self::OPERATOR => 'cm',
                self::COMMAND => '-124.774 124.127 5.64213 5.67154 930.307 4436.95',
            ],
        ];

        // PHPUnit expects (expected, actual); the order matters for the
        // labels shown in a failure diff.
        $this->assertEquals($reference, $parts);
    }

    /**
     * Checks that cleanContent() masks string, array and dictionary content
     * with the given replacement character while leaving operators intact.
     */
    public function testCleanContent(): void
    {
        $content = '/Shape <</MCID << /Font<8>>> BT >>BDC
Q
/CS0 cs 1 1 0 scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc 0.0018 Tw 0 Ts 100 Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj
EMC
(ABC) Tj

[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TD

ET
/Shape <</MCID 2 >>BDC
q
0.03 841';

        $expected = '_____________________________________
Q
/CS0 cs 1 1 0 scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc 0.0018 Tw 0 Ts 100 Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(________________________________________________)Tj
___
(___) Tj

[_____________________________________] TD

ET
______________________
q
0.03 841';

        // PDFObject::cleanContent() is deprecated (no longer used, to be
        // removed in a future release); this test only pins its current
        // behaviour until then.
        $cleaned = $this->getPdfObjectInstance(new Document())->cleanContent($content, '_');

        // PHPUnit expects (expected, actual).
        $this->assertEquals($expected, $cleaned);
    }

    /**
     * Checks that formatContent() puts one command per line, rejects binary
     * input, and that the binary check also holds for a real sample file.
     */
    public function testFormatContent(): void
    {
        $content = '/Shape <</MCID << /Font<8>>> BT >>BDC Q /CS0 cs 1 1 0 scn 1 i
/GS0 gs BT /TT0 1 Tf 0.0007 Tc 0.0018 Tw 0 Ts 100 Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj EMC (ABC) Tj
[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ ET /Shape <</MCID 2 >>BDC q 0.03 841';

        $expected = '/Shape <</MCID << /Font<8>>> BT >>BDC
Q
/CS0 cs
1 1 0 scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc
0.0018 Tw
0 Ts
100 Tz
0 Tr
24 0 0 24 51.3 639.26025 Tm
(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj
EMC
(ABC) Tj
[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ
ET
/Shape <</MCID 2 >>BDC
q
0.03 841';

        // Normalize line-endings: collapse any CRLF to LF first, then turn
        // every LF into CRLF, so the literal above matches regardless of how
        // this file was checked out.
        $expected = str_replace(["\r\n", "\n"], ["\n", "\r\n"], $expected);

        // formatContent() is not public, so it is invoked through reflection.
        $formatContent = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent');
        $formatContent->setAccessible(true);
        $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content);

        $this->assertEquals($expected, $cleaned);

        // Check that binary data is rejected
        $content = hex2bin('a670c89d4a324e47');

        $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content);

        $this->assertEquals('', $cleaned);

        // See: https://github.com/smalot/pdfparser/issues/668
        $filename = $this->rootDir.'/samples/bugs/Issue668.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        // Binary check is done before a regexp that causes an error
        $this->assertStringContainsString('Marko Nestorović PR', $pages[0]->getText());

        // mb_check_encoding(..., 'UTF-8') returns true here,
        // necessitating a test for UTF-8 that's more strict
        $content = hex2bin('0101010101010101');
        $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content);

        $this->assertEquals('', $cleaned);
    }

    /**
     * Check that escaped slashes and parentheses are accounted for,
     * formatContent would emit a PHP Warning for "regular expression
     * is too large" here without fix for issue #709
     *
     * @see https://github.com/smalot/pdfparser/issues/709
     */
    public function testFormatContentIssue709(): void
    {
        $formatContent = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent');
        $formatContent->setAccessible(true);

        // One string with escaped backslashes and an escaped parenthesis,
        // followed by 4500 short text commands.
        $content = '(String \\\\\\(string)Tj '.str_repeat('(Test)Tj ', 4500);
        $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content);

        $this->assertStringContainsString('(String \\\\\\(string)Tj'."\r\n", $cleaned);
    }

    /**
     * Check that inline image data does not corrupt the stream
     *
     * @see https://github.com/smalot/pdfparser/issues/691
     */
    public function testFormatContentInlineImages(): void
    {
        $formatContent = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent');
        $formatContent->setAccessible(true);

        $cleaned = $formatContent->invoke(
            $this->getPdfObjectInstance(new Document()),
            'BT (This BI /W 258 /H 51 /should not trigger /as a /PDF command) TD ET q 65.30 0 0 18.00 412 707 cm BI /W 544 /H 150
/BPC 1 /IM true /F [/A85 /Fl] ID Gb"0F_$L6!$j/a\$:ma&h\'JnJJ9S?O_EA-W+%D^ClCH=FP3s5M-gStQm\'5/hc`C?<Q)riWgtEe:Po0dY_-er6$jM@#?n`E+#(sa"0Gk3&K>CqL(^pV$_-er6Ik`"-1]Q ;~> EI Q /F002 10.00 Tf 0.00 Tw 0 g'
        );

        // PdfParser should not be fooled by Q's in inline image data;
        // Only one 'Q' command should be found
        $commandQ = preg_match_all('/Q\r\n/', $cleaned);
        $this->assertEquals(1, $commandQ);

        // The 'BI' inside a string should not be interpreted as the
        // beginning of an inline image command
        $this->assertStringContainsString('(This BI /W 258 /H 51 /should not trigger /as a /PDF command) TD', $cleaned);

        $cleaned = $formatContent->invoke(
            $this->getPdfObjectInstance(new Document()),
            'BT (This BI /W 258 /H 51 /should not () \) trigger /as a /PDF command) TD (There is no ID inline image in this data) TD (Nothing but text EI should be found) TD ET'
        );

        $this->assertEquals('BT'."\r\n".
            '(This BI /W 258 /H 51 /should not () \) trigger /as a /PDF command) TD'."\r\n".
            '(There is no ID inline image in this data) TD'."\r\n".
            '(Nothing but text EI should be found) TD'."\r\n".
            'ET', $cleaned);
    }

    /**
     * Checks the list of sections getSectionsText() extracts from a content
     * stream, including 'BT'/'ET' occurrences inside strings and names that
     * must not be treated as block delimiters.
     */
    public function testGetSectionsText(): void
    {
        $content = '/Shape <</MCID 1 >>BDC
Q
/CS0 cs 1 1 0 scn
1 i
/GS0 gs
BT
/TT0 1 Tf
0.0007 Tc 0.0018 Tw 0 Ts 100 Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm
(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj
EMC
(ABC) Tj

[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD

ET
/Shape <</MCID [BT] >>BDC BT /TT1 1.5 Tf (BT )Tj ET
q
0.03 841';

        $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content);

        $this->assertEquals(
            [
                '/Shape <</MCID 1 >>BDC',
                'Q',
                'BT',
                '/TT0 1 Tf',
                '0.0007 Tc',
                '0.0018 Tw',
                '0 Ts',
                '100 Tz',
                '0 Tr',
                '24 0 0 24 51.3 639.26025 Tm',
                '(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj',
                'EMC',
                '(ABC) Tj',
                '[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD',
                'ET',
                '/Shape <</MCID [BT] >>BDC',
                'BT',
                '/TT1 1.5 Tf',
                '(BT )Tj',
                'ET',
                'q',
            ],
            $sections
        );

        // Test that a Name containing 'ET' doesn't close a 'BT' block
        // See: https://github.com/smalot/pdfparser/issues/474
        $content = 'BT
/FTxkPETkkj 8 Tf
1 0 0 1 535.55 627.4 Tm
(Hello World)TJ
ET';

        $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content);

        // If the 'ET' inside the font name had closed the block, one of the
        // first two sections would be the truncated name '/FTxkP'.
        $this->assertNotEquals('/FTxkP', $sections[0]);
        $this->assertNotEquals('/FTxkP', $sections[1]);
    }

    /**
     * Checks that parseDictionary() returns the top-level keys of a
     * dictionary string, a three-element array value and an empty hex string.
     */
    public function testParseDictionary(): void
    {
        $data = '<</ActualText(text)/XObject<</F2 6 0 R /F3 [/Sub /Array]>> /Array[/Parsed /Data/Actual]/Silent<>>>';

        $dictionary = $this->getPdfObjectInstance(new Document())->parseDictionary($data);

        $this->assertArrayHasKey('ActualText', $dictionary);
        $this->assertArrayHasKey('XObject', $dictionary);
        $this->assertArrayHasKey('Array', $dictionary);
        $this->assertArrayHasKey('Silent', $dictionary);

        $this->assertCount(3, $dictionary['Array']);

        $this->assertEquals('<>', $dictionary['Silent']);
    }

    /**
     * Tests that graphics position (cm) is taken into account when
     * positioning text
     *
     * @see https://github.com/smalot/pdfparser/issues/608
     */
    public function testGraphicsPositioning(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue608.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        // The \n is not added if 'cm' commands are ignored
        $this->assertStringContainsString("Heading 1 \nLorem ipsum", $pages[0]->getText());
    }

    /**
     * Tests that ActualText text is printed for a block instead of the
     * contents of the Tj or TJ commands in the block.
     *
     * NOTE(review): the sample parsed here is Issue608.pdf although the
     * linked issue is #464 - confirm that this is intentional.
     *
     * @see https://github.com/smalot/pdfparser/issues/464
     */
    public function testActualText(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue608.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        // An ActualText command subs in the three literal characters
        // 'ffi' for the single character ligature here
        // In addition, if $last_written_position isn't used to store
        // the position to insert, \n's would be erroneously inserted
        // on either side of the 'ffi'
        $this->assertStringContainsString('efficitur', $pages[0]->getText());
    }

    /**
     * Tests for the correct decoding of an Em-dash character in
     * certain font contexts
     *
     * @see https://github.com/smalot/pdfparser/issues/585
     */
    public function testDecodeEmDash(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue585.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        $this->assertStringContainsString('слева по ходу — веревка', $pages[0]->getText());
    }

    /**
     * Tests behavior with reversed chars instruction.
     *
     * @see https://github.com/smalot/pdfparser/issues/398
     */
    public function testReversedChars(): void
    {
        $filename = $this->rootDir.'/samples/bugs/Issue398.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        $pageText = $pages[0]->getText();

        $this->assertStringContainsString('שלומי טסט', $pageText);
        $this->assertStringContainsString('בנמל מספנות ישראל.', $pageText);
    }

    /**
     * Tests that a text stream with an improperly selected font code
     * page falls back to one that maps all characters.
     *
     * @see https://github.com/smalot/pdfparser/issues/586
     */
    public function testImproperFontFallback(): void
    {
        $filename = $this->rootDir.'/samples/ImproperFontFallback.pdf';

        $parser = $this->getParserInstance();
        $document = $parser->parseFile($filename);
        $pages = $document->getPages();

        $this->assertStringContainsString('сделал', $pages[0]->getText());
    }

    /**
     * Tests that a font ID containing a hyphen / dash character was
     * correctly parsed
     *
     * @see https://github.com/smalot/pdfparser/issues/145
     */
    public function testFontIDWithHyphen(): void
    {
        $pdfObject = $this->getPdfObjectInstance(new Document());

        $fontCommandHyphen = $pdfObject->getCommandsText('/FID-01 15.00 Tf');

        $this->assertEquals('/', $fontCommandHyphen[0][self::TYPE]);
        $this->assertEquals('Tf', $fontCommandHyphen[0][self::OPERATOR]);
        $this->assertEquals('FID-01 15.00', $fontCommandHyphen[0][self::COMMAND]);
    }

    /**
     * Tests that an invalid command does not cause an error, but just
     * returns an empty array
     */
    public function testInvalidCommand(): void
    {
        $pdfObject = $this->getPdfObjectInstance(new Document());

        // A well-formed command is parsed into type / operator / operand.
        $validCommand = $pdfObject->getCommandsText('75 rg');

        $this->assertEquals('', $validCommand[0][self::TYPE]);
        $this->assertEquals('rg', $validCommand[0][self::OPERATOR]);
        $this->assertEquals('75', $validCommand[0][self::COMMAND]);

        // An operand without any operator yields no commands at all.
        $invalidCommand = $pdfObject->getCommandsText('75');

        $this->assertEquals([], $invalidCommand);
    }
}
||||
560 |