| Total Complexity | 143 |
| Total Lines | 716 |
| Duplicated Lines | 0 % |
| Changes | 3 | ||
| Bugs | 2 | Features | 0 |
Complex classes like PDFObject often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields or methods that share the same prefix or suffix.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use PDFObject, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 39 | class PDFObject |
||
| 40 | { |
||
/**
 * Key of a parsed command entry holding the token type
 * (entries are produced by getCommandsText()).
 */
const TYPE = 't';

/**
 * Key of a parsed command entry holding the operator name.
 */
const OPERATOR = 'o';

/**
 * Key of a parsed command entry holding the operand(s).
 */
const COMMAND = 'c';

/**
 * Unique ids of the objects whose text extraction is currently running.
 *
 * The stack is static, hence shared by every instance. getText() consults
 * it before descending into an XObject so that circular references are
 * not followed.
 *
 * @var array
 */
public static $recursionStack = [];

/**
 * Document this object was created for.
 *
 * @var Document
 */
protected $document;

/**
 * Header attached to this object.
 *
 * @var Header
 */
protected $header;

/**
 * Content handed to the constructor (null when none was given).
 *
 * @var string
 */
protected $content;
||
| 68 | |||
/**
 * Creates an object bound to the given document.
 *
 * @param Document    $document document the object belongs to
 * @param Header|null $header   header of the object; `new Header()` is used when null
 * @param string|null $content  content of the object
 */
public function __construct(Document $document, Header $header = null, $content = null)
{
    if (null === $header) {
        $header = new Header();
    }

    $this->document = $document;
    $this->header = $header;
    $this->content = $content;
}
||
| 79 | |||
/**
 * Initialization step of the object; this implementation does nothing.
 *
 * NOTE(review): presumably meant to be overridden by subclasses - confirm.
 */
public function init()
{
    // Nothing to initialize here.
}
||
| 83 | |||
/**
 * Returns the header attached to this object.
 *
 * @return Header|null
 */
public function getHeader()
{
    $header = $this->header;

    return $header;
}
||
| 91 | |||
/**
 * Looks up a header entry by name; the lookup is delegated to the header.
 *
 * @param string $name name of the header entry
 *
 * @return Element|PDFObject
 */
public function get($name)
{
    $header = $this->header;

    return $header->get($name);
}
||
| 101 | |||
/**
 * Tells whether the header has an entry with the given name; the check is
 * delegated to the header.
 *
 * @param string $name name of the header entry
 *
 * @return bool
 */
public function has($name)
{
    $header = $this->header;

    return $header->has($name);
}
||
| 111 | |||
/**
 * Returns the details reported by the header.
 *
 * @param bool $deep forwarded unchanged to Header::getDetails()
 *
 * @return array
 */
public function getDetails($deep = true)
{
    $header = $this->header;

    return $header->getDetails($deep);
}
||
| 121 | |||
| 122 | /** |
||
| 123 | * @return string|null |
||
| 124 | */ |
||
| 125 | public function getContent() |
||
| 128 | } |
||
| 129 | |||
| 130 | /** |
||
| 131 | * @param string $content |
||
| 132 | */ |
||
| 133 | public function cleanContent($content, $char = 'X') |
||
| 190 | } |
||
| 191 | |||
| 192 | /** |
||
| 193 | * @param string $content |
||
| 194 | * |
||
| 195 | * @return array |
||
| 196 | */ |
||
| 197 | public function getSectionsText($content) |
||
| 232 | } |
||
| 233 | |||
/**
 * Picks the font to start text decoding with.
 *
 * The fonts of the page (when a page is given) are listed first, followed by
 * the fonts of the document; the first entry of that combined list is
 * returned. When the list is empty a new Font bound to the document is
 * returned instead.
 *
 * @param Page|null $page page whose fonts take precedence, if any
 *
 * @return Font
 */
private function getDefaultFont(Page $page = null)
{
    $pageFonts = null === $page ? [] : $page->getFonts();
    $documentFonts = array_values($this->document->getFonts());
    $candidates = array_merge($pageFonts, $documentFonts);

    if (0 === count($candidates)) {
        return new Font($this->document);
    }

    return reset($candidates);
}
||
| 250 | |||
/**
 * Extracts the text of this object as a single string.
 *
 * The content is split into sections, each section into commands, and the
 * text-related operators are interpreted: text-showing operators append the
 * decoded string, positioning operators append a space, tab or newline
 * depending on how the position changed, and `Do` descends into the
 * referenced XObject of the page.
 *
 * While the extraction runs, the unique id of this object sits on the shared
 * static recursion stack so that XObjects referring back to an object that
 * is already being processed are skipped. The id is removed again in a
 * `finally` block: without it, an exception thrown during extraction would
 * leave a stale id on the shared stack and later calls would wrongly treat
 * that object as a circular reference.
 *
 * @param Page|null $page page used to resolve fonts (`Tf`) and XObjects (`Do`);
 *                        when null those operators are ignored
 *
 * @return string extracted text followed by a single trailing space
 *
 * @throws \Exception
 */
public function getText(Page $page = null)
{
    $text = '';
    $sections = $this->getSectionsText($this->content);
    $current_font = $this->getDefaultFont($page);

    // false means "no position seen yet" for the respective operator.
    $current_position_td = ['x' => false, 'y' => false];
    $current_position_tm = ['x' => false, 'y' => false];

    self::$recursionStack[] = $this->getUniqueId();

    try {
        foreach ($sections as $section) {
            $commands = $this->getCommandsText($section);

            foreach ($commands as $command) {
                switch ($command[self::OPERATOR]) {
                    // move text current point
                    case 'Td':
                        $args = preg_split('/\s/s', $command[self::COMMAND]);
                        $y = array_pop($args);
                        $x = array_pop($args);
                        if (((float) $x <= 0) ||
                            (false !== $current_position_td['y'] && (float) $y < (float) ($current_position_td['y']))
                        ) {
                            // vertical offset
                            $text .= "\n";
                        } elseif (false !== $current_position_td['x'] && (float) $x > (float) ($current_position_td['x'])) {
                            // horizontal offset
                            $text .= ' ';
                        }
                        $current_position_td = ['x' => $x, 'y' => $y];
                        break;

                    // move text current point and set leading
                    case 'TD':
                        $args = preg_split('/\s/s', $command[self::COMMAND]);
                        $y = array_pop($args);
                        $x = array_pop($args);
                        if ((float) $y < 0) {
                            $text .= "\n";
                        } elseif ((float) $x <= 0) {
                            $text .= ' ';
                        }
                        break;

                    // select font
                    case 'Tf':
                        list($id) = preg_split('/\s/s', $command[self::COMMAND]);
                        $id = trim($id, '/');
                        if (null !== $page) {
                            $current_font = $page->getFont($id);
                        }
                        break;

                    // show a single string: wrap the command so that it can be
                    // decoded the same way as the elements of a TJ array
                    case "'":
                    case 'Tj':
                        $command[self::COMMAND] = [$command];
                        // no break
                    case 'TJ':
                        $text .= $current_font->decodeText($command[self::COMMAND]);
                        break;

                    // set leading
                    case 'TL':
                        $text .= ' ';
                        break;

                    // set text matrix
                    case 'Tm':
                        $args = preg_split('/\s/s', $command[self::COMMAND]);
                        $y = array_pop($args);
                        $x = array_pop($args);
                        if (false !== $current_position_tm['x']) {
                            $delta = abs((float) $x - (float) ($current_position_tm['x']));
                            if ($delta > 10) {
                                $text .= "\t";
                            }
                        }
                        if (false !== $current_position_tm['y']) {
                            $delta = abs((float) $y - (float) ($current_position_tm['y']));
                            if ($delta > 10) {
                                $text .= "\n";
                            }
                        }
                        $current_position_tm = ['x' => $x, 'y' => $y];
                        break;

                    // set horizontal scaling
                    case 'Tz':
                    // move to start of next line
                    case 'T*':
                        $text .= "\n";
                        break;

                    // paint an XObject
                    case 'Do':
                        if (null !== $page) {
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $id = trim(array_pop($args), '/ ');
                            $xobject = $page->getXObject($id);

                            // @todo $xobject could be a ElementXRef object, which would then throw an error
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
                                // Not a circular reference.
                                $text .= $xobject->getText($page);
                            }
                        }
                        break;

                    // Operators that are recognized but contribute no text:
                    // character spacing, text rise, word spacing and the
                    // remaining operators listed below.
                    case 'Tc':
                    case 'Ts':
                    case 'Tw':
                    case 'Da':
                    case 'rg':
                    case 'RG':
                    case 're':
                    case 'co':
                    case 'cs':
                    case 'gs':
                    case 'en':
                    case 'sc':
                    case 'SC':
                    case 'g':
                    case 'G':
                    case 'V':
                    case 'vo':
                    case 'Vo':
                        break;

                    default:
                        // Unknown operators are ignored.
                        break;
                }
            }
        }
    } finally {
        // Always unregister this object, even when decoding failed.
        array_pop(self::$recursionStack);
    }

    return $text.' ';
}
||
| 429 | |||
/**
 * Extracts the text of this object as a list of decoded fragments.
 *
 * One entry is appended per text-showing operator (`'`, `Tj`, `TJ`) and one
 * per XObject painted with `Do`; positioning operators add nothing.
 *
 * XObjects are handled the same way getText() handles them: only instances
 * of this class are followed, and only when they are not already being
 * processed. To make that check effective for this entry point as well, the
 * unique id of this object is kept on the shared static recursion stack
 * while the extraction runs and is removed in a `finally` block.
 *
 * @param Page|null $page page used to resolve fonts (`Tf`) and XObjects (`Do`);
 *                        when null those operators are ignored
 *
 * @return array list of strings
 *
 * @throws \Exception
 */
public function getTextArray(Page $page = null)
{
    $text = [];
    $sections = $this->getSectionsText($this->content);
    $current_font = $this->getDefaultFont($page);

    self::$recursionStack[] = $this->getUniqueId();

    try {
        foreach ($sections as $section) {
            $commands = $this->getCommandsText($section);

            foreach ($commands as $command) {
                switch ($command[self::OPERATOR]) {
                    // select font
                    case 'Tf':
                        if (null !== $page) {
                            list($id) = preg_split('/\s/s', $command[self::COMMAND]);
                            $id = trim($id, '/');
                            $current_font = $page->getFont($id);
                        }
                        break;

                    // show a single string: wrap the command so that it can be
                    // decoded the same way as the elements of a TJ array
                    case "'":
                    case 'Tj':
                        $command[self::COMMAND] = [$command];
                        // no break
                    case 'TJ':
                        $text[] = $current_font->decodeText($command[self::COMMAND]);
                        break;

                    // paint an XObject
                    case 'Do':
                        if (null !== $page) {
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $id = trim(array_pop($args), '/ ');
                            $xobject = $page->getXObject($id);

                            // Anything that is not a PDFObject has no getText();
                            // objects already on the stack are circular references.
                            if ($xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
                                $text[] = $xobject->getText($page);
                            }
                        }
                        break;

                    // Operators that are recognized but contribute no fragment.
                    case 'Tc':
                    case 'Td':
                    case 'TD':
                    case 'TL':
                    case 'Tm':
                    case 'Ts':
                    case 'Tw':
                    case 'Tz':
                    case 'T*':
                    case 'Da':
                    case 'rg':
                    case 'RG':
                    case 're':
                    case 'co':
                    case 'cs':
                    case 'gs':
                    case 'en':
                    case 'vo':
                    case 'Vo':
                        break;

                    default:
                        // Unknown operators are ignored.
                        break;
                }
            }
        }
    } finally {
        // Always unregister this object, even when decoding failed.
        array_pop(self::$recursionStack);
    }

    return $text;
}
||
| 545 | |||
/**
 * Splits a piece of content into a list of commands.
 *
 * Every entry of the result is an array with three keys:
 *  - self::TYPE: the character that introduced the operand (`/`, `[`, `<`,
 *    `(`), 'n' for a bare number, or '' otherwise;
 *  - self::OPERATOR: the operator found after the operand ('' when none);
 *  - self::COMMAND: the operand as a string, or - for a `[...]` array - the
 *    list of entries returned by a nested call.
 *
 * Scanning stops at the end of the input or at the first position where no
 * operand can be read (for instance at `ET` or at a closing `]`, `>` or `)`).
 *
 * Malformed input is handled without warnings or endless loops: trailing
 * whitespace no longer triggers a read past the end of the string, and a
 * `<` without a matching `>` consumes the remainder of the input instead of
 * resetting the offset.
 *
 * @param string $text_part content to scan
 * @param int    $offset    position to start at; updated (by reference) to
 *                          the position where scanning stopped
 *
 * @return array
 */
public function getCommandsText($text_part, &$offset = 0)
{
    $commands = $matches = [];
    $length = \strlen($text_part);

    while ($offset < $length) {
        $offset += strspn($text_part, "\x00\x09\x0a\x0c\x0d\x20", $offset);
        if ($offset >= $length) {
            // Only whitespace was left: nothing more to read.
            break;
        }
        $char = $text_part[$offset];

        $operator = '';
        $type = '';
        $command = false;

        switch ($char) {
            case '/':
                $type = $char;
                if (preg_match(
                    '/^\/([A-Z0-9\._,\+]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si',
                    substr($text_part, $offset),
                    $matches
                )
                ) {
                    // name followed by a number and an operator
                    $operator = $matches[2];
                    $command = $matches[1];
                    $offset += \strlen($matches[0]);
                } elseif (preg_match(
                    '/^\/([A-Z0-9\._,\+]+)\s+([A-Z]+)\s*/si',
                    substr($text_part, $offset),
                    $matches
                )
                ) {
                    // name followed directly by an operator
                    $operator = $matches[2];
                    $command = $matches[1];
                    $offset += \strlen($matches[0]);
                }
                break;

            case '[':
            case ']':
                // array object
                $type = $char;
                ++$offset;
                if ('[' == $char) {
                    // get elements; the nested call advances $offset and
                    // returns when it meets the closing bracket
                    $command = $this->getCommandsText($text_part, $offset);

                    if (preg_match('/^\s*[A-Z]{1,2}\s*/si', substr($text_part, $offset), $matches)) {
                        $operator = trim($matches[0]);
                        $offset += \strlen($matches[0]);
                    }
                }
                // For ']' $command stays false, which ends this scan.
                break;

            case '<':
            case '>':
                $type = $char;
                ++$offset;
                if ('<' == $char) {
                    $strpos = strpos($text_part, '>', $offset);
                    if (false === $strpos) {
                        // No closing '>': take the remainder as the operand.
                        // Using false in the arithmetic below would move the
                        // offset back to 1 and could loop forever.
                        $command = (string) substr($text_part, $offset);
                        $offset = $length;
                    } else {
                        $command = substr($text_part, $offset, ($strpos - $offset));
                        $offset = $strpos + 1;
                    }
                }

                if (preg_match('/^\s*[A-Z]{1,2}\s*/si', substr($text_part, $offset), $matches)) {
                    $operator = trim($matches[0]);
                    $offset += \strlen($matches[0]);
                }
                break;

            case '(':
            case ')':
                ++$offset;
                $type = $char;
                $strpos = $offset;
                if ('(' == $char) {
                    // Find the matching closing parenthesis, honouring nested
                    // parentheses and backslash escapes.
                    $open_bracket = 1;
                    while ($open_bracket > 0) {
                        if (!isset($text_part[$strpos])) {
                            break;
                        }
                        $ch = $text_part[$strpos];
                        switch ($ch) {
                            case '\\':
                                // REVERSE SOLIDUS (5Ch) (Backslash)
                                // skip next character
                                ++$strpos;
                                break;

                            case '(':
                                // LEFT PARENTHESIS (28h)
                                ++$open_bracket;
                                break;

                            case ')':
                                // RIGHT PARENTHESIS (29h)
                                --$open_bracket;
                                break;
                        }
                        ++$strpos;
                    }
                    $command = substr($text_part, $offset, ($strpos - $offset - 1));
                    $offset = $strpos;

                    if (preg_match('/^\s*([A-Z\']{1,2})\s*/si', substr($text_part, $offset), $matches)) {
                        $operator = $matches[1];
                        $offset += \strlen($matches[0]);
                    }
                }
                break;

            default:
                if ('ET' == substr($text_part, $offset, 2)) {
                    // End of text object: $command stays false and ends the scan.
                    break;
                } elseif (preg_match(
                    '/^\s*(?P<data>([0-9\.\-]+\s*?)+)\s+(?P<id>[A-Z]{1,3})\s*/si',
                    substr($text_part, $offset),
                    $matches
                )
                ) {
                    // numbers followed by an operator
                    $operator = trim($matches['id']);
                    $command = trim($matches['data']);
                    $offset += \strlen($matches[0]);
                } elseif (preg_match('/^\s*([0-9\.\-]+\s*?)+\s*/si', substr($text_part, $offset), $matches)) {
                    // numbers without an operator
                    $type = 'n';
                    $command = trim($matches[0]);
                    $offset += \strlen($matches[0]);
                } elseif (preg_match('/^\s*([A-Z\*]+)\s*/si', substr($text_part, $offset), $matches)) {
                    // operator without operand
                    $type = '';
                    $operator = $matches[1];
                    $command = '';
                    $offset += \strlen($matches[0]);
                }
        }

        if (false === $command) {
            // Nothing could be read at this position.
            break;
        }

        $commands[] = [
            self::TYPE => $type,
            self::OPERATOR => $operator,
            self::COMMAND => $command,
        ];
    }

    return $commands;
}
||
| 703 | |||
/**
 * Creates the object class matching the `Type` (and, where relevant, the
 * `Subtype`) entry of the header.
 *
 * - `XObject`: Image or Form depending on the subtype, otherwise a plain PDFObject;
 * - `Pages`, `Page`, `Encoding`: the class of the same name;
 * - `Font`: the class `\Smalot\PdfParser\Font\Font<Subtype>` when it exists,
 *   otherwise Font;
 * - anything else: a plain PDFObject.
 *
 * @param Document $document document the object belongs to
 * @param Header   $header   header of the object
 * @param string   $content  content of the object
 *
 * @return PDFObject
 */
public static function factory(Document $document, Header $header, $content)
{
    $type = $header->get('Type')->getContent();

    if ('XObject' == $type) {
        $subtype = $header->get('Subtype')->getContent();

        if ('Image' == $subtype) {
            return new Image($document, $header, $content);
        }

        if ('Form' == $subtype) {
            return new Form($document, $header, $content);
        }

        return new self($document, $header, $content);
    }

    if ('Pages' == $type) {
        return new Pages($document, $header, $content);
    }

    if ('Page' == $type) {
        return new Page($document, $header, $content);
    }

    if ('Encoding' == $type) {
        return new Encoding($document, $header, $content);
    }

    if ('Font' == $type) {
        $fontClass = '\Smalot\PdfParser\Font\Font'.$header->get('Subtype')->getContent();

        // Fall back to the generic Font when no dedicated class exists.
        return class_exists($fontClass)
            ? new $fontClass($document, $header, $content)
            : new Font($document, $header, $content);
    }

    return new self($document, $header, $content);
}
||
| 746 | |||
| 747 | /** |
||
| 748 | * Returns unique id identifying the object. |
||
| 749 | * |
||
| 750 | * @return string |
||
| 751 | */ |
||
| 752 | protected function getUniqueId() |
||
| 755 | } |
||
| 756 | } |
||
| 757 |