Total Complexity | 143 |
Total Lines | 716 |
Duplicated Lines | 0 % |
Changes | 3 | ||
Bugs | 2 | Features | 0 |
Complex classes like PDFObject often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use PDFObject, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
39 | class PDFObject |
||
40 | { |
||
41 | const TYPE = 't'; |
||
42 | |||
43 | const OPERATOR = 'o'; |
||
44 | |||
45 | const COMMAND = 'c'; |
||
46 | |||
47 | /** |
||
48 | * The recursion stack. |
||
49 | * |
||
50 | * @var array |
||
51 | */ |
||
52 | public static $recursionStack = []; |
||
53 | |||
54 | /** |
||
55 | * @var Document |
||
56 | */ |
||
57 | protected $document = null; |
||
58 | |||
59 | /** |
||
60 | * @var Header |
||
61 | */ |
||
62 | protected $header = null; |
||
63 | |||
64 | /** |
||
65 | * @var string |
||
66 | */ |
||
67 | protected $content = null; |
||
68 | |||
69 | /** |
||
70 | * @param Header $header |
||
71 | * @param string $content |
||
72 | */ |
||
73 | public function __construct(Document $document, Header $header = null, $content = null) |
||
74 | { |
||
75 | $this->document = $document; |
||
76 | $this->header = null !== $header ? $header : new Header(); |
||
77 | $this->content = $content; |
||
78 | } |
||
79 | |||
80 | public function init() |
||
81 | { |
||
82 | } |
||
83 | |||
84 | /** |
||
85 | * @return Header|null |
||
86 | */ |
||
87 | public function getHeader() |
||
88 | { |
||
89 | return $this->header; |
||
90 | } |
||
91 | |||
92 | /** |
||
93 | * @param string $name |
||
94 | * |
||
95 | * @return Element|PDFObject |
||
96 | */ |
||
97 | public function get($name) |
||
98 | { |
||
99 | return $this->header->get($name); |
||
100 | } |
||
101 | |||
102 | /** |
||
103 | * @param string $name |
||
104 | * |
||
105 | * @return bool |
||
106 | */ |
||
107 | public function has($name) |
||
108 | { |
||
109 | return $this->header->has($name); |
||
110 | } |
||
111 | |||
112 | /** |
||
113 | * @param bool $deep |
||
114 | * |
||
115 | * @return array |
||
116 | */ |
||
117 | public function getDetails($deep = true) |
||
118 | { |
||
119 | return $this->header->getDetails($deep); |
||
120 | } |
||
121 | |||
122 | /** |
||
123 | * @return string|null |
||
124 | */ |
||
125 | public function getContent() |
||
128 | } |
||
129 | |||
130 | /** |
||
131 | * @param string $content |
||
132 | */ |
||
133 | public function cleanContent($content, $char = 'X') |
||
190 | } |
||
191 | |||
192 | /** |
||
193 | * @param string $content |
||
194 | * |
||
195 | * @return array |
||
196 | */ |
||
197 | public function getSectionsText($content) |
||
232 | } |
||
233 | |||
234 | private function getDefaultFont(Page $page = null) |
||
235 | { |
||
236 | $fonts = []; |
||
237 | if (!is_null($page)) { |
||
238 | $fonts = $page->getFonts(); |
||
239 | } |
||
240 | |||
241 | $fonts = array_merge($fonts, array_values($this->document->getFonts())); |
||
242 | |||
243 | if (count($fonts) > 0) |
||
244 | { |
||
245 | return reset($fonts); |
||
246 | } |
||
247 | |||
248 | return new Font($this->document); |
||
249 | } |
||
250 | |||
251 | /** |
||
252 | * @param Page $page |
||
253 | * |
||
254 | * @return string |
||
255 | * |
||
256 | * @throws \Exception |
||
257 | */ |
||
258 | public function getText(Page $page = null) |
||
259 | { |
||
260 | $text = ''; |
||
261 | $sections = $this->getSectionsText($this->content); |
||
262 | $current_font = $this->getDefaultFont($page); |
||
263 | |||
264 | $current_position_td = ['x' => false, 'y' => false]; |
||
265 | $current_position_tm = ['x' => false, 'y' => false]; |
||
266 | |||
267 | array_push(self::$recursionStack, $this->getUniqueId()); |
||
268 | |||
269 | foreach ($sections as $section) { |
||
270 | $commands = $this->getCommandsText($section); |
||
271 | |||
272 | foreach ($commands as $command) { |
||
273 | switch ($command[self::OPERATOR]) { |
||
274 | // set character spacing |
||
275 | case 'Tc': |
||
276 | break; |
||
277 | |||
278 | // move text current point |
||
279 | case 'Td': |
||
280 | $args = preg_split('/\s/s', $command[self::COMMAND]); |
||
281 | $y = array_pop($args); |
||
|
|||
282 | $x = array_pop($args); |
||
283 | if (((float) $x <= 0) || |
||
284 | (false !== $current_position_td['y'] && (float) $y < (float) ($current_position_td['y'])) |
||
285 | ) { |
||
286 | // vertical offset |
||
287 | $text .= "\n"; |
||
288 | } elseif (false !== $current_position_td['x'] && (float) $x > (float) ( |
||
289 | $current_position_td['x'] |
||
290 | ) |
||
291 | ) { |
||
292 | // horizontal offset |
||
293 | $text .= ' '; |
||
294 | } |
||
295 | $current_position_td = ['x' => $x, 'y' => $y]; |
||
296 | break; |
||
297 | |||
298 | // move text current point and set leading |
||
299 | case 'TD': |
||
300 | $args = preg_split('/\s/s', $command[self::COMMAND]); |
||
301 | $y = array_pop($args); |
||
302 | $x = array_pop($args); |
||
303 | if ((float) $y < 0) { |
||
304 | $text .= "\n"; |
||
305 | } elseif ((float) $x <= 0) { |
||
306 | $text .= ' '; |
||
307 | } |
||
308 | break; |
||
309 | |||
310 | case 'Tf': |
||
311 | list($id) = preg_split('/\s/s', $command[self::COMMAND]); |
||
312 | $id = trim($id, '/'); |
||
313 | if (null !== $page) { |
||
314 | $current_font = $page->getFont($id); |
||
315 | } |
||
316 | break; |
||
317 | |||
318 | case "'": |
||
319 | case 'Tj': |
||
320 | $command[self::COMMAND] = [$command]; |
||
321 | // no break |
||
322 | case 'TJ': |
||
323 | $sub_text = $current_font->decodeText($command[self::COMMAND]); |
||
324 | $text .= $sub_text; |
||
325 | break; |
||
326 | |||
327 | // set leading |
||
328 | case 'TL': |
||
329 | $text .= ' '; |
||
330 | break; |
||
331 | |||
332 | case 'Tm': |
||
333 | $args = preg_split('/\s/s', $command[self::COMMAND]); |
||
334 | $y = array_pop($args); |
||
335 | $x = array_pop($args); |
||
336 | if (false !== $current_position_tm['x']) { |
||
337 | $delta = abs((float) $x - (float) ($current_position_tm['x'])); |
||
338 | if ($delta > 10) { |
||
339 | $text .= "\t"; |
||
340 | } |
||
341 | } |
||
342 | if (false !== $current_position_tm['y']) { |
||
343 | $delta = abs((float) $y - (float) ($current_position_tm['y'])); |
||
344 | if ($delta > 10) { |
||
345 | $text .= "\n"; |
||
346 | } |
||
347 | } |
||
348 | $current_position_tm = ['x' => $x, 'y' => $y]; |
||
349 | break; |
||
350 | |||
351 | // set super/subscripting text rise |
||
352 | case 'Ts': |
||
353 | break; |
||
354 | |||
355 | // set word spacing |
||
356 | case 'Tw': |
||
357 | break; |
||
358 | |||
359 | // set horizontal scaling |
||
360 | case 'Tz': |
||
361 | $text .= "\n"; |
||
362 | break; |
||
363 | |||
364 | // move to start of next line |
||
365 | case 'T*': |
||
366 | $text .= "\n"; |
||
367 | break; |
||
368 | |||
369 | case 'Da': |
||
370 | break; |
||
371 | |||
372 | case 'Do': |
||
373 | if (null !== $page) { |
||
374 | $args = preg_split('/\s/s', $command[self::COMMAND]); |
||
375 | $id = trim(array_pop($args), '/ '); |
||
376 | $xobject = $page->getXObject($id); |
||
377 | |||
378 | // @todo $xobject could be a ElementXRef object, which would then throw an error |
||
379 | if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack)) { |
||
380 | // Not a circular reference. |
||
381 | $text .= $xobject->getText($page); |
||
382 | } |
||
383 | } |
||
384 | break; |
||
385 | |||
386 | case 'rg': |
||
387 | case 'RG': |
||
388 | break; |
||
389 | |||
390 | case 're': |
||
391 | break; |
||
392 | |||
393 | case 'co': |
||
394 | break; |
||
395 | |||
396 | case 'cs': |
||
397 | break; |
||
398 | |||
399 | case 'gs': |
||
400 | break; |
||
401 | |||
402 | case 'en': |
||
403 | break; |
||
404 | |||
405 | case 'sc': |
||
406 | case 'SC': |
||
407 | break; |
||
408 | |||
409 | case 'g': |
||
410 | case 'G': |
||
411 | break; |
||
412 | |||
413 | case 'V': |
||
414 | break; |
||
415 | |||
416 | case 'vo': |
||
417 | case 'Vo': |
||
418 | break; |
||
419 | |||
420 | default: |
||
421 | } |
||
422 | } |
||
423 | } |
||
424 | |||
425 | array_pop(self::$recursionStack); |
||
426 | |||
427 | return $text.' '; |
||
428 | } |
||
429 | |||
430 | /** |
||
431 | * @param Page |
||
432 | * |
||
433 | * @return array |
||
434 | * @throws \Exception |
||
435 | */ |
||
436 | public function getTextArray(Page $page = null) |
||
437 | { |
||
438 | $text = array(); |
||
439 | $sections = $this->getSectionsText($this->content); |
||
440 | $current_font = $this->getDefaultFont($page); |
||
441 | |||
442 | foreach ($sections as $section) { |
||
443 | |||
444 | $commands = $this->getCommandsText($section); |
||
445 | |||
446 | foreach ($commands as $command) { |
||
447 | |||
448 | switch ($command[self::OPERATOR]) { |
||
449 | // set character spacing |
||
450 | case 'Tc': |
||
451 | break; |
||
452 | |||
453 | // move text current point |
||
454 | case 'Td': |
||
455 | break; |
||
456 | |||
457 | // move text current point and set leading |
||
458 | case 'TD': |
||
459 | break; |
||
460 | |||
461 | case 'Tf': |
||
462 | if (!is_null($page)) { |
||
463 | list($id,) = preg_split('/\s/s', $command[self::COMMAND]); |
||
464 | $id = trim($id, '/'); |
||
465 | $current_font = $page->getFont($id); |
||
466 | } |
||
467 | break; |
||
468 | |||
469 | case "'": |
||
470 | case 'Tj': |
||
471 | $command[self::COMMAND] = array($command); |
||
472 | case 'TJ': |
||
473 | $sub_text = $current_font->decodeText($command[self::COMMAND]); |
||
474 | $text[] = $sub_text; |
||
475 | break; |
||
476 | |||
477 | // set leading |
||
478 | case 'TL': |
||
479 | break; |
||
480 | |||
481 | case 'Tm': |
||
482 | break; |
||
483 | |||
484 | // set super/subscripting text rise |
||
485 | case 'Ts': |
||
486 | break; |
||
487 | |||
488 | // set word spacing |
||
489 | case 'Tw': |
||
490 | break; |
||
491 | |||
492 | // set horizontal scaling |
||
493 | case 'Tz': |
||
494 | //$text .= "\n"; |
||
495 | break; |
||
496 | |||
497 | // move to start of next line |
||
498 | case 'T*': |
||
499 | //$text .= "\n"; |
||
500 | break; |
||
501 | |||
502 | case 'Da': |
||
503 | break; |
||
504 | |||
505 | case 'Do': |
||
506 | if (!is_null($page)) { |
||
507 | $args = preg_split('/\s/s', $command[self::COMMAND]); |
||
508 | $id = trim(array_pop($args), '/ '); |
||
509 | if ($xobject = $page->getXObject($id)) { |
||
510 | $text[] = $xobject->getText($page); |
||
511 | } |
||
512 | } |
||
513 | break; |
||
514 | |||
515 | case 'rg': |
||
516 | case 'RG': |
||
517 | break; |
||
518 | |||
519 | case 're': |
||
520 | break; |
||
521 | |||
522 | case 'co': |
||
523 | break; |
||
524 | |||
525 | case 'cs': |
||
526 | break; |
||
527 | |||
528 | case 'gs': |
||
529 | break; |
||
530 | |||
531 | case 'en': |
||
532 | break; |
||
533 | |||
534 | case 'vo': |
||
535 | case 'Vo': |
||
536 | break; |
||
537 | |||
538 | default: |
||
539 | } |
||
540 | } |
||
541 | } |
||
542 | |||
543 | return $text; |
||
544 | } |
||
545 | |||
546 | /** |
||
547 | * @param string $text_part |
||
548 | * @param int $offset |
||
549 | * |
||
550 | * @return array |
||
551 | */ |
||
552 | public function getCommandsText($text_part, &$offset = 0) |
||
553 | { |
||
554 | $commands = $matches = []; |
||
555 | |||
556 | while ($offset < \strlen($text_part)) { |
||
557 | $offset += strspn($text_part, "\x00\x09\x0a\x0c\x0d\x20", $offset); |
||
558 | $char = $text_part[$offset]; |
||
559 | |||
560 | $operator = ''; |
||
561 | $type = ''; |
||
562 | $command = false; |
||
563 | |||
564 | switch ($char) { |
||
565 | case '/': |
||
566 | $type = $char; |
||
567 | if (preg_match( |
||
568 | '/^\/([A-Z0-9\._,\+]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si', |
||
569 | substr($text_part, $offset), |
||
570 | $matches |
||
571 | ) |
||
572 | ) { |
||
573 | $operator = $matches[2]; |
||
574 | $command = $matches[1]; |
||
575 | $offset += \strlen($matches[0]); |
||
576 | } elseif (preg_match( |
||
577 | '/^\/([A-Z0-9\._,\+]+)\s+([A-Z]+)\s*/si', |
||
578 | substr($text_part, $offset), |
||
579 | $matches |
||
580 | ) |
||
581 | ) { |
||
582 | $operator = $matches[2]; |
||
583 | $command = $matches[1]; |
||
584 | $offset += \strlen($matches[0]); |
||
585 | } |
||
586 | break; |
||
587 | |||
588 | case '[': |
||
589 | case ']': |
||
590 | // array object |
||
591 | $type = $char; |
||
592 | if ('[' == $char) { |
||
593 | ++$offset; |
||
594 | // get elements |
||
595 | $command = $this->getCommandsText($text_part, $offset); |
||
596 | |||
597 | if (preg_match('/^\s*[A-Z]{1,2}\s*/si', substr($text_part, $offset), $matches)) { |
||
598 | $operator = trim($matches[0]); |
||
599 | $offset += \strlen($matches[0]); |
||
600 | } |
||
601 | } else { |
||
602 | ++$offset; |
||
603 | break; |
||
604 | } |
||
605 | break; |
||
606 | |||
607 | case '<': |
||
608 | case '>': |
||
609 | // array object |
||
610 | $type = $char; |
||
611 | ++$offset; |
||
612 | if ('<' == $char) { |
||
613 | $strpos = strpos($text_part, '>', $offset); |
||
614 | $command = substr($text_part, $offset, ($strpos - $offset)); |
||
615 | $offset = $strpos + 1; |
||
616 | } |
||
617 | |||
618 | if (preg_match('/^\s*[A-Z]{1,2}\s*/si', substr($text_part, $offset), $matches)) { |
||
619 | $operator = trim($matches[0]); |
||
620 | $offset += \strlen($matches[0]); |
||
621 | } |
||
622 | break; |
||
623 | |||
624 | case '(': |
||
625 | case ')': |
||
626 | ++$offset; |
||
627 | $type = $char; |
||
628 | $strpos = $offset; |
||
629 | if ('(' == $char) { |
||
630 | $open_bracket = 1; |
||
631 | while ($open_bracket > 0) { |
||
632 | if (!isset($text_part[$strpos])) { |
||
633 | break; |
||
634 | } |
||
635 | $ch = $text_part[$strpos]; |
||
636 | switch ($ch) { |
||
637 | case '\\': |
||
638 | // REVERSE SOLIDUS (5Ch) (Backslash) |
||
639 | // skip next character |
||
640 | ++$strpos; |
||
641 | break; |
||
642 | |||
643 | case '(': |
||
644 | // LEFT PARENHESIS (28h) |
||
645 | ++$open_bracket; |
||
646 | break; |
||
647 | |||
648 | case ')': |
||
649 | // RIGHT PARENTHESIS (29h) |
||
650 | --$open_bracket; |
||
651 | break; |
||
652 | } |
||
653 | ++$strpos; |
||
654 | } |
||
655 | $command = substr($text_part, $offset, ($strpos - $offset - 1)); |
||
656 | $offset = $strpos; |
||
657 | |||
658 | if (preg_match('/^\s*([A-Z\']{1,2})\s*/si', substr($text_part, $offset), $matches)) { |
||
659 | $operator = $matches[1]; |
||
660 | $offset += \strlen($matches[0]); |
||
661 | } |
||
662 | } |
||
663 | break; |
||
664 | |||
665 | default: |
||
666 | |||
667 | if ('ET' == substr($text_part, $offset, 2)) { |
||
668 | break; |
||
669 | } elseif (preg_match( |
||
670 | '/^\s*(?P<data>([0-9\.\-]+\s*?)+)\s+(?P<id>[A-Z]{1,3})\s*/si', |
||
671 | substr($text_part, $offset), |
||
672 | $matches |
||
673 | ) |
||
674 | ) { |
||
675 | $operator = trim($matches['id']); |
||
676 | $command = trim($matches['data']); |
||
677 | $offset += \strlen($matches[0]); |
||
678 | } elseif (preg_match('/^\s*([0-9\.\-]+\s*?)+\s*/si', substr($text_part, $offset), $matches)) { |
||
679 | $type = 'n'; |
||
680 | $command = trim($matches[0]); |
||
681 | $offset += \strlen($matches[0]); |
||
682 | } elseif (preg_match('/^\s*([A-Z\*]+)\s*/si', substr($text_part, $offset), $matches)) { |
||
683 | $type = ''; |
||
684 | $operator = $matches[1]; |
||
685 | $command = ''; |
||
686 | $offset += \strlen($matches[0]); |
||
687 | } |
||
688 | } |
||
689 | |||
690 | if (false !== $command) { |
||
691 | $commands[] = [ |
||
692 | self::TYPE => $type, |
||
693 | self::OPERATOR => $operator, |
||
694 | self::COMMAND => $command, |
||
695 | ]; |
||
696 | } else { |
||
697 | break; |
||
698 | } |
||
699 | } |
||
700 | |||
701 | return $commands; |
||
702 | } |
||
703 | |||
704 | /** |
||
705 | * @param string $content |
||
706 | * |
||
707 | * @return PDFObject |
||
708 | */ |
||
709 | public static function factory(Document $document, Header $header, $content) |
||
710 | { |
||
711 | switch ($header->get('Type')->getContent()) { |
||
712 | case 'XObject': |
||
713 | switch ($header->get('Subtype')->getContent()) { |
||
714 | case 'Image': |
||
715 | return new Image($document, $header, $content); |
||
716 | |||
717 | case 'Form': |
||
718 | return new Form($document, $header, $content); |
||
719 | } |
||
720 | |||
721 | return new self($document, $header, $content); |
||
722 | |||
723 | case 'Pages': |
||
724 | return new Pages($document, $header, $content); |
||
725 | |||
726 | case 'Page': |
||
727 | return new Page($document, $header, $content); |
||
728 | |||
729 | case 'Encoding': |
||
730 | return new Encoding($document, $header, $content); |
||
731 | |||
732 | case 'Font': |
||
733 | $subtype = $header->get('Subtype')->getContent(); |
||
734 | $classname = '\Smalot\PdfParser\Font\Font'.$subtype; |
||
735 | |||
736 | if (class_exists($classname)) { |
||
737 | return new $classname($document, $header, $content); |
||
738 | } |
||
739 | |||
740 | return new Font($document, $header, $content); |
||
741 | |||
742 | default: |
||
743 | return new self($document, $header, $content); |
||
744 | } |
||
745 | } |
||
746 | |||
747 | /** |
||
748 | * Returns unique id identifying the object. |
||
749 | * |
||
750 | * @return string |
||
751 | */ |
||
752 | protected function getUniqueId() |
||
755 | } |
||
756 | } |
||
757 |