smalot /
pdfparser
| 1 | <?php |
||
| 2 | |||
| 3 | /** |
||
| 4 | * @file This file is part of the PdfParser library. |
||
| 5 | * |
||
| 6 | * @author Konrad Abicht <[email protected]> |
||
| 7 | * |
||
| 8 | * @date 2020-06-01 |
||
| 9 | * |
||
| 10 | * @author Sébastien MALOT <[email protected]> |
||
| 11 | * |
||
| 12 | * @date 2017-01-03 |
||
| 13 | * |
||
| 14 | * @license LGPLv3 |
||
| 15 | * |
||
| 16 | * @url <https://github.com/smalot/pdfparser> |
||
| 17 | * |
||
| 18 | * PdfParser is a pdf library written in PHP, extraction oriented. |
||
| 19 | * Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
||
| 20 | * |
||
| 21 | * This program is free software: you can redistribute it and/or modify |
||
| 22 | * it under the terms of the GNU Lesser General Public License as published by |
||
| 23 | * the Free Software Foundation, either version 3 of the License, or |
||
| 24 | * (at your option) any later version. |
||
| 25 | * |
||
| 26 | * This program is distributed in the hope that it will be useful, |
||
| 27 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
| 28 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
| 29 | * GNU Lesser General Public License for more details. |
||
| 30 | * |
||
| 31 | * You should have received a copy of the GNU Lesser General Public License |
||
| 32 | * along with this program. |
||
| 33 | * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
||
| 34 | */ |
||
| 35 | |||
| 36 | namespace PHPUnitTests\Integration; |
||
| 37 | |||
| 38 | use PHPUnitTests\TestCase; |
||
| 39 | use Smalot\PdfParser\Document; |
||
| 40 | use Smalot\PdfParser\Header; |
||
| 41 | use Smalot\PdfParser\Page; |
||
| 42 | use Smalot\PdfParser\Pages; |
||
| 43 | use Smalot\PdfParser\Parser; |
||
| 44 | use Smalot\PdfParser\PDFObject; |
||
| 45 | |||
/**
 * Integration tests for Document: object storage, the type dictionary,
 * page discovery, text extraction and metadata details.
 */
class DocumentTest extends TestCase
{
    /**
     * Creates an empty Document to be populated by the individual tests.
     */
    protected function getDocumentInstance(): Document
    {
        return new Document();
    }

    /**
     * Creates a generic PDFObject bound to the given document.
     *
     * The header type is declared explicitly nullable; relying on the
     * "= null" default to make it nullable is deprecated as of PHP 8.4.
     *
     * @param Document    $document document the object is attached to
     * @param Header|null $header   optional header; null is passed through to the PDFObject constructor
     */
    protected function getPDFObjectInstance(Document $document, ?Header $header = null): PDFObject
    {
        return new PDFObject($document, $header);
    }

    /**
     * Creates a Page object bound to the given document.
     */
    protected function getPageInstance(Document $document, Header $header): PDFObject
    {
        return new Page($document, $header);
    }

    /**
     * Creates a Pages object bound to the given document.
     */
    protected function getPagesInstance(Document $document, Header $header): PDFObject
    {
        return new Pages($document, $header);
    }

    /**
     * Builds a Page whose header is parsed from a minimal "/Type/Page" dictionary.
     *
     * Goes through getPageInstance() so that an overridden factory is still honoured.
     */
    private function createPage(Document $document): PDFObject
    {
        $content = '<</Type/Page>>';

        return $this->getPageInstance($document, Header::parse($content, $document));
    }

    /**
     * Checks that setObjects() makes objects retrievable by ID and that a
     * second call drops the objects registered by the first one.
     */
    public function testSetObjects(): void
    {
        $document = $this->getDocumentInstance();
        $object = $this->getPDFObjectInstance($document);

        // Obj #1 is missing
        self::assertNull($document->getObjectById(1));
        $document->setObjects([1 => $object]);

        // Obj #1 exists
        self::assertInstanceOf(PDFObject::class, $document->getObjectById(1));

        $content = '<</Type/Page>>';
        $header = Header::parse($content, $document);
        $object = $this->getPDFObjectInstance($document, $header);
        $document->setObjects([2 => $object]);

        // Obj #1 is missing
        self::assertNull($document->getObjectById(1));

        // Obj #2 exists
        self::assertInstanceOf(PDFObject::class, $document->getObjectById(2));
    }

    /**
     * Checks that getObjects() returns every registered object under its ID
     * and keeps the concrete class of each object.
     */
    public function testGetObjects(): void
    {
        $document = $this->getDocumentInstance();
        $object1 = $this->getPDFObjectInstance($document);

        // Trailing text after the dictionary must not prevent header parsing.
        $content = '<</Type/Page>>unparsed content';
        $header = Header::parse($content, $document);
        $object2 = $this->getPageInstance($document, $header);

        $document->setObjects([1 => $object1, 2 => $object2]);

        $objects = $document->getObjects();
        self::assertCount(2, $objects);
        self::assertInstanceOf(PDFObject::class, $objects[1]);
        self::assertInstanceOf(PDFObject::class, $objects[2]);
        self::assertInstanceOf(Page::class, $objects[2]);
    }

    /**
     * Checks that the dictionary starts empty and afterwards lists only the
     * object that carries a type, grouped under that type.
     */
    public function testDictionary(): void
    {
        $document = $this->getDocumentInstance();
        self::assertCount(0, $document->getDictionary());

        $object1 = $this->getPDFObjectInstance($document);
        $object2 = $this->createPage($document);
        $document->setObjects([1 => $object1, 2 => $object2]);

        $dictionary = $document->getDictionary();
        self::assertCount(1, $dictionary);
        self::assertCount(1, $dictionary['Page']['all']);
        self::assertEquals($object2, $dictionary['Page']['all'][2]);
    }

    /**
     * Checks that getObjectsByType('Page') returns only the Page object,
     * indexed by its object ID.
     */
    public function testGetObjectsByType(): void
    {
        $document = $this->getDocumentInstance();
        $object1 = $this->getPDFObjectInstance($document);
        $object2 = $this->createPage($document);
        $document->setObjects([1 => $object1, 2 => $object2]);

        $objects = $document->getObjectsByType('Page');
        self::assertCount(1, $objects);
        self::assertInstanceOf(PDFObject::class, $objects[2]);
        self::assertInstanceOf(Page::class, $objects[2]);
    }

    /**
     * Checks getPages() for three document layouts: plain Page objects,
     * Pages objects referencing their kids, and a Catalog pointing to a
     * nested Pages tree.
     */
    public function testGetPages(): void
    {
        $document = $this->getDocumentInstance();

        // Listing pages from type Page
        $object1 = $this->createPage($document);
        $object2 = $this->createPage($document);
        $document->setObjects([1 => $object1, 2 => $object2]);
        $pages = $document->getPages();

        self::assertCount(2, $pages);
        self::assertInstanceOf(Page::class, $pages[0]);
        self::assertInstanceOf(Page::class, $pages[1]);

        // Listing pages from type Pages (kids)
        $object1 = $this->createPage($document);
        $object2 = $this->createPage($document);
        $object3 = $this->createPage($document);

        $content = '<</Type/Pages/Kids[1 0 R 2 0 R]>>';
        $object4 = $this->getPagesInstance($document, Header::parse($content, $document));

        $content = '<</Type/Pages/Kids[3 0 R]>>';
        $object5 = $this->getPagesInstance($document, Header::parse($content, $document));

        $document->setObjects([
            '1_0' => $object1,
            '2_0' => $object2,
            '3_0' => $object3,
            '4_0' => $object4,
            '5_0' => $object5,
        ]);
        $pages = $document->getPages();

        self::assertCount(3, $pages);
        self::assertInstanceOf(Page::class, $pages[0]);
        self::assertInstanceOf(Page::class, $pages[1]);
        self::assertInstanceOf(Page::class, $pages[2]);

        // Listing pages from type Catalog
        $object1 = $this->createPage($document);
        $object2 = $this->createPage($document);
        $object3 = $this->createPage($document);

        $content = '<</Type/Pages/Kids[1 0 R 2 0 R]>>';
        $object4 = $this->getPagesInstance($document, Header::parse($content, $document));

        // Nested tree: this node references the Pages node above plus page #3.
        $content = '<</Type/Pages/Kids[4 0 R 3 0 R]>>';
        $object5 = $this->getPagesInstance($document, Header::parse($content, $document));

        // The catalog header is deliberately wrapped in a Pages instance, as before.
        $content = '<</Type/Catalog/Pages 5 0 R >>';
        $object6 = $this->getPagesInstance($document, Header::parse($content, $document));

        $document->setObjects([
            '1_0' => $object1,
            '2_0' => $object2,
            '3_0' => $object3,
            '4_0' => $object4,
            '5_0' => $object5,
            '6_0' => $object6,
        ]);
        $pages = $document->getPages();

        self::assertCount(3, $pages);
        self::assertInstanceOf(Page::class, $pages[0]);
        self::assertInstanceOf(Page::class, $pages[1]);
        self::assertInstanceOf(Page::class, $pages[2]);
    }

    /**
     * Checks that getPages() on an empty document throws "Missing catalog.".
     */
    public function testGetPagesMissingCatalog(): void
    {
        $this->expectException(\Exception::class);
        $this->expectExceptionMessage('Missing catalog.');

        // Missing catalog
        $document = $this->getDocumentInstance();
        $document->getPages();
    }

    /**
     * Tests getText method without a given page limit.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextNoPageLimit(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');

        self::assertStringContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText());
    }

    /**
     * Tests getText method with a given page limit.
     *
     * @see https://github.com/smalot/pdfparser/pull/562
     */
    public function testGetTextWithPageLimit(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf');

        // given text is on page 2, it has to be ignored because of that
        self::assertStringNotContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText(1));
    }

    /**
     * Tests extraction of XMP Metadata vs. getHeader() data.
     *
     * @see https://github.com/smalot/pdfparser/pull/606
     */
    public function testExtractXMPMetadata(): void
    {
        $document = (new Parser())->parseFile($this->rootDir.'/samples/XMP_Metadata.pdf');

        // Get the original parsed details from getHeader(). The trailer is
        // read through reflection; setAccessible() is kept because it is
        // required on PHP versions before 8.1.
        $ref = new \ReflectionClass(Document::class);
        $prop = $ref->getProperty('trailer');
        $prop->setAccessible(true);
        $trailer = $prop->getValue($document);

        // Start from an empty array so the variable is defined on every
        // path, even when the trailer exposes no usable Info dictionary.
        $details = [];
        if ($trailer->has('Info')) {
            $info = $trailer->get('Info');
            if (null !== $info && method_exists($info, 'getHeader')) {
                $details = $info->getHeader()->getDetails();
            }
        }

        // Fail with a clear message instead of an undefined-index error if
        // the Info dictionary could not be read.
        self::assertArrayHasKey('Title', $details);
        self::assertArrayHasKey('Creator', $details);

        // Check that the Title does not contain a UTF-8 Right Single
        // Quotation Mark, and that the Creator does not contain a UTF-8
        // Registered Trademark symbol, an indication that getHeader()
        // did not find the correct values.
        self::assertStringNotContainsString("\u{2019}", $details['Title']);
        self::assertStringNotContainsString("\u{00AE}", $details['Creator']);

        $detailsXMP = $document->getDetails();

        // Test two fields for special characters that getHeader() does
        // not handle properly.
        self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $detailsXMP['Title']);
        self::assertStringContainsString("Microsoft\u{00AE} Word for Microsoft 365", $detailsXMP['Creator']);

        // Test that getDetails() data NOT contained in the XMP Metadata
        // is still accessible and not discarded/overwritten.
        self::assertEquals(1, $detailsXMP['Pages']);
    }
}
||
| 300 |