Total Complexity | 180 |
Total Lines | 1161 |
Duplicated Lines | 0 % |
Changes | 5 | ||
Bugs | 3 | Features | 0 |
Complex classes like PDFObject often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use PDFObject, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
42 | class PDFObject |
||
43 | { |
||
44 | public const TYPE = 't'; |
||
45 | |||
46 | public const OPERATOR = 'o'; |
||
47 | |||
48 | public const COMMAND = 'c'; |
||
49 | |||
50 | /** |
||
51 | * The recursion stack. |
||
52 | * |
||
53 | * @var array |
||
54 | */ |
||
55 | public static $recursionStack = []; |
||
56 | |||
57 | /** |
||
58 | * @var Document|null |
||
59 | */ |
||
60 | protected $document; |
||
61 | |||
62 | /** |
||
63 | * @var Header |
||
64 | */ |
||
65 | protected $header; |
||
66 | |||
67 | /** |
||
68 | * @var string |
||
69 | */ |
||
70 | protected $content; |
||
71 | |||
72 | /** |
||
73 | * @var Config|null |
||
74 | */ |
||
75 | protected $config; |
||
76 | |||
77 | /** |
||
78 | * @var bool |
||
79 | */ |
||
80 | protected $addPositionWhitespace = false; |
||
81 | |||
/**
 * Stores the owning document, the object's header, its raw content and
 * the parser configuration on the instance.
 *
 * When no header is passed, a freshly constructed Header is stored instead.
 */
public function __construct(
    Document $document,
    ?Header $header = null,
    ?string $content = null,
    ?Config $config = null
) {
    if (null === $header) {
        $header = new Header();
    }

    $this->document = $document;
    $this->header = $header;
    $this->content = $content;
    $this->config = $config;
}
||
93 | |||
/**
 * Does nothing in this class; the method body is empty.
 */
public function init()
{
    // No work to perform here.
}
||
97 | |||
/**
 * Returns the document that was passed to the constructor.
 */
public function getDocument(): Document
{
    $document = $this->document;

    return $document;
}
||
102 | |||
/**
 * Returns the header stored on this object.
 */
public function getHeader(): ?Header
{
    $header = $this->header;

    return $header;
}
||
107 | |||
/**
 * Returns the configuration passed to the constructor, or null when none
 * was given.
 */
public function getConfig(): ?Config
{
    $config = $this->config;

    return $config;
}
||
112 | |||
113 | /** |
||
114 | * @return Element|PDFObject|Header |
||
115 | */ |
||
public function get(string $name)
{
    // Pure delegation: the lookup is performed by the header.
    $header = $this->header;

    return $header->get($name);
}
||
120 | |||
/**
 * Reports whether the header has an entry called $name; the check itself
 * is delegated to the header.
 */
public function has(string $name): bool
{
    $header = $this->header;

    return $header->has($name);
}
||
125 | |||
/**
 * Returns the header's details array; $deep is forwarded unchanged to
 * the header's getDetails().
 */
public function getDetails(bool $deep = true): array
{
    $header = $this->header;

    return $header->getDetails($deep);
}
||
130 | |||
131 | public function getContent(): ?string |
||
134 | } |
||
135 | |||
136 | /** |
||
137 | * Creates a duplicate of the document stream with |
||
138 | * strings and other items replaced by $char. Formerly |
||
139 | * getSectionsText() used this output to more easily gather offset |
||
140 | * values to extract text from the *actual* document stream. |
||
141 | * |
||
142 | * @deprecated function is no longer used and will be removed in a future release |
||
143 | * |
||
144 | * @internal |
||
145 | */ |
||
public function cleanContent(string $content, string $char = 'X')
{
    // Only the first byte of $char is used as the mask. An empty string
    // would produce zero-length masks and shift every captured offset, so
    // fall back to the default mask character in that case.
    $char = '' === $char ? 'X' : $char[0];

    // Escaped backslashes and parentheses are two bytes long; mask both.
    $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

    // Masks capture group $group of every match of $pattern in $subject.
    // Each mask has the same byte length as the text it replaces, so the
    // offsets captured up front remain valid while the string is rewritten.
    $mask = static function (string $pattern, int $group, string $subject) use ($char): string {
        if (!preg_match_all($pattern, $subject, $matches, \PREG_OFFSET_CAPTURE)) {
            return $subject;
        }

        foreach ($matches[$group] as $part) {
            $length = \strlen($part[0]);
            if ($part[1] < 0 || 0 === $length) {
                // Nothing captured for this match
                continue;
            }

            $subject = substr_replace($subject, str_repeat($char, $length), $part[1], $length);
        }

        return $subject;
    };

    // Remove image bloc with binary content
    $content = $mask('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', 0, $content);

    // Clean content in square brackets [.....]
    $content = $mask('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', 1, $content);

    // Clean content in round brackets (.....)
    $content = $mask('/\((.*?)\)/s', 1, $content);

    // Clean structure: everything between '<' and its closing '>' is
    // masked, including the angle brackets themselves
    $parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
    if ($parts) {
        $content = '';
        $level = 0;
        foreach ($parts as $part) {
            if ('<' == $part) {
                ++$level;
            }

            $content .= (0 == $level ? $part : str_repeat($char, \strlen($part)));

            if ('>' == $part) {
                --$level;
            }
        }
    }

    // Clean BDC and EMC markup. The mask character is quoted with the
    // pattern delimiter so that a '/' mask cannot terminate the pattern.
    $content = $mask('/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s', 1, $content);
    $content = $mask('/\s(EMC)\s/s', 1, $content);

    return $content;
}
||
204 | |||
205 | /** |
||
206 | * Takes a string of PDF document stream text and formats |
||
207 | * it into a multi-line string with one PDF command on each line, |
||
208 | * separated by \r\n. If the given string is null, or binary data |
||
209 | * is detected instead of a document stream then return an empty |
||
210 | * string. |
||
211 | */ |
||
private function formatContent(?string $content): string
{
    if (null === $content) {
        return '';
    }

    // Outside of (String) and inline image content in PDF document
    // streams, all text should conform to UTF-8. Test for binary
    // content by deleting everything after the first open-
    // parenthesis ( which indicates the beginning of a string, or
    // the first ID command which indicates the beginning of binary
    // inline image content. Then test what remains for valid
    // UTF-8. If it's not UTF-8, return an empty string as this
    // $content is most likely binary. Unfortunately, using
    // mb_check_encoding(..., 'UTF-8') is not strict enough, so the
    // following regexp, adapted from the W3, is used. See:
    // https://www.w3.org/International/questions/qa-forms-utf-8.en
    // We use preg_replace() instead of preg_match() to avoid "JIT
    // stack limit exhausted" errors on larger files.
    $utf8Filter = preg_replace('/(
        [\x09\x0A\x0D\x20-\x7E] | # ASCII
        [\xC2-\xDF][\x80-\xBF] | # non-overlong 2-byte
        \xE0[\xA0-\xBF][\x80-\xBF] | # excluding overlongs
        [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} | # straight 3-byte
        \xED[\x80-\x9F][\x80-\xBF] | # excluding surrogates
        \xF0[\x90-\xBF][\x80-\xBF]{2} | # planes 1-3
        [\xF1-\xF3][\x80-\xBF]{3} | # planes 4-15
        \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
    )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content));

    if ('' !== $utf8Filter) {
        return '';
    }

    // Find all inline image content and replace them so they aren't
    // affected by the next steps
    $pdfInlineImages = [];
    $offsetBI = 0;
    while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) {
        // Attempt to determine if this instance of the 'BI' command
        // actually occurred within a (string) using the following
        // steps:

        // Step 1: Remove any escaped slashes and parentheses from
        // the alleged image characteristics data
        $para = str_replace(['\\\\', '\\(', '\\)'], '', $text[1][0]);

        // Step 2: Remove all correctly ordered and balanced
        // parentheses from (strings)
        do {
            $paraTest = $para;
            $para = preg_replace('/\(([^()]*)\)/', '$1', $paraTest);
        } while ($para != $paraTest);

        $paraOpen = strpos($para, '(');
        $paraClose = strpos($para, ')');

        // Check: If the remaining text contains a close parenthesis
        // ')' AND it occurs before any open parenthesis, then we
        // are almost certain to be inside a (string)
        if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) {
            // Bump the search offset forward and match again
            $offsetBI = (int) $text[1][1];
            continue;
        }

        // Step 3: Double check that this is actually inline image
        // data by parsing the alleged image characteristics as a
        // dictionary
        $dict = $this->parseDictionary('<<'.$text[1][0].'>>');

        // Check if an image Width and Height are set in the dict
        if ((isset($dict['W']) || isset($dict['Width']))
            && (isset($dict['H']) || isset($dict['Height']))) {
            $id = uniqid('IMAGE_', true);
            $pdfInlineImages[$id] = [
                preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]),
                preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]),
            ];

            // Swap in the placeholder at the exact offset of this match.
            // Building a regex from the matched bytes would compile
            // arbitrary binary image data into a pattern, and would hit
            // the first textual occurrence, which may be an identical
            // candidate that was skipped earlier.
            $content = substr_replace(
                $content,
                '^^^'.$id.'^^^',
                $text[0][1],
                \strlen($text[0][0])
            );
        } else {
            // If there was no valid dictionary, or a height and width
            // weren't specified, then we don't know what this is, so
            // just leave it alone; bump the search offset forward and
            // match again
            $offsetBI = (int) $text[1][1];
        }
    }

    // Find all strings () and replace them so they aren't affected
    // by the next steps
    $pdfstrings = [];
    $attempt = '(';
    while (preg_match('/'.preg_quote($attempt, '/').'.*?\)/s', $content, $text, \PREG_OFFSET_CAPTURE)) {
        $candidate = $text[0][0];

        // Remove all escaped slashes and parentheses from the target text
        $para = str_replace(['\\\\', '\\(', '\\)'], '', $candidate);

        // PDF strings can contain unescaped parentheses as long as
        // they're balanced, so check for balanced parentheses
        $left = substr_count($para, '(');
        $right = substr_count($para, ')');

        if (')' == $para[-1] && $left == $right) {
            // Replace the string with a unique placeholder, in place at
            // the offset where it was matched
            $id = uniqid('STRING_', true);
            $pdfstrings[$id] = $candidate;
            $content = substr_replace(
                $content,
                '@@@'.$id.'@@@',
                $text[0][1],
                \strlen($candidate)
            );

            // Reset to search for the next string
            $attempt = '(';
        } else {
            // We had unbalanced parentheses, so use the current
            // match as a base to find a longer string
            $attempt = $candidate;
        }
    }

    // Remove all carriage returns and line-feeds from the document stream
    $content = str_replace(["\r", "\n"], ' ', trim($content));

    // Find all dictionary << >> commands and replace them so they
    // aren't affected by the next steps
    $dictstore = [];
    while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext, \PREG_OFFSET_CAPTURE)) {
        $dictid = uniqid('DICT_', true);
        $dictstore[$dictid] = $dicttext[1][0];

        // Keep the operator, replace only the dictionary part
        $content = substr_replace(
            $content,
            ' ###'.$dictid.'###'.$dicttext[2][0],
            $dicttext[0][1],
            \strlen($dicttext[0][0])
        );
    }

    // Normalize white-space in the document stream
    $content = preg_replace('/\s{2,}/', ' ', $content);

    // Find all valid PDF operators and add \r\n after each; this
    // ensures there is just one command on every line
    // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
    // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
    // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
    // PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions
    // appear here in the list for completeness.
    $operators = [
        'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
        'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
        'g', 'G', 'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
        'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
        'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
        'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
    ];
    foreach ($operators as $operator) {
        $content = preg_replace(
            '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/',
            $operator."\r\n",
            $content
        );
    }

    // Restore the original content of the dictionary << >> commands
    $dictstore = array_reverse($dictstore, true);
    foreach ($dictstore as $id => $dict) {
        $content = str_replace('###'.$id.'###', $dict, $content);
    }

    // Restore the original string content
    $pdfstrings = array_reverse($pdfstrings, true);
    foreach ($pdfstrings as $id => $text) {
        // Strings may contain escaped newlines, or literal newlines
        // and we should clean these up before replacing the string
        // back into the content stream; this ensures no strings are
        // split between two lines (every command must be on one line)
        $text = str_replace(
            ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
            ['', '', '', '\r', '\n'],
            $text
        );

        $content = str_replace('@@@'.$id.'@@@', $text, $content);
    }

    // Restore the original content of any inline images
    $pdfInlineImages = array_reverse($pdfInlineImages, true);
    foreach ($pdfInlineImages as $id => $image) {
        $content = str_replace(
            '^^^'.$id.'^^^',
            "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." EI\r\n",
            $content
        );
    }

    // Collapse repeated line breaks and strip leading spaces on each line
    $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));

    return $content;
}
||
418 | |||
419 | /** |
||
420 | * getSectionsText() now takes an entire, unformatted |
||
421 | * document stream as a string, cleans it, then filters out |
||
422 | * commands that aren't needed for text positioning/extraction. It |
||
423 | * returns an array of unprocessed PDF commands, one command per |
||
424 | * element. |
||
425 | * |
||
426 | * @internal |
||
427 | */ |
||
public function getSectionsText(?string $content): array
{
    // formatContent() puts one command on every line, so splitting on any
    // newline style (and dropping empty pieces) yields one command each.
    $commandLines = preg_split(
        '/(\r\n|\n|\r)/',
        $this->formatContent($content),
        -1,
        \PREG_SPLIT_NO_EMPTY
    );

    // Commands kept when they appear outside a BT ... ET text block. Each
    // pattern is anchored so it only matches that exact command (e.g. a
    // plain search for 'c' would also match 'sc'). Add more commands here
    // as you find them in weird PDFs!
    $keptOutsideText = [
        '/(?<!\w)B[DM]C$/', // Begin marked content sequence
        '/(?<!\w)[DM]P$/',  // Marked content point
        '/(?<!\w)EMC$/',    // End marked content sequence
        '/(?<!\w)cm$/',     // Graphics position change commands
        '/(?<!\w)Tf$/',     // Font change commands
        '/(?<!\w)Do$/',     // Invoke named XObject command
    ];

    $kept = [];
    $insideTextBlock = false;

    foreach ($commandLines as $rawLine) {
        $line = trim($rawLine);

        // Skip empty lines
        if ('' === $line) {
            continue;
        }

        // A 'BT' opens a text block
        if (preg_match('/BT$/', $line)) {
            $insideTextBlock = true;
            $kept[] = $line;
            continue;
        }

        // An 'ET' closes the text block
        if ('ET' === $line) {
            $insideTextBlock = false;
            $kept[] = $line;
            continue;
        }

        // Inside a BT ... ET text block every line is saved
        if ($insideTextBlock) {
            $kept[] = $line;
            continue;
        }

        // Save and restore graphics state commands
        $lastChar = $line[-1];
        if ('q' === $lastChar || 'Q' === $lastChar) {
            $kept[] = $line;
            continue;
        }

        foreach ($keptOutsideText as $pattern) {
            if (preg_match($pattern, $line)) {
                $kept[] = $line;
                break;
            }
        }
    }

    return $kept;
}
||
498 | |||
/**
 * Picks the font to start with: the first of the page's fonts when a page
 * is given and has any, otherwise the document's first font, otherwise a
 * newly constructed Font bound to this document and config.
 */
private function getDefaultFont(?Page $page = null): Font
{
    $candidates = null !== $page ? $page->getFonts() : [];

    // The document's first font is appended after any page fonts, so it
    // only wins when the page contributed none.
    $documentFont = $this->document->getFirstFont();
    if (null !== $documentFont) {
        $candidates[] = $documentFont;
    }

    if (0 === \count($candidates)) {
        return new Font($this->document, null, null, $this->config);
    }

    return reset($candidates);
}
||
517 | |||
518 | /** |
||
519 | * Decode a '[]TJ' command and attempt to use alternate |
||
520 | * fonts if the current font results in output that contains |
||
521 | * Unicode control characters. |
||
522 | * |
||
523 | * @internal |
||
524 | * |
||
525 | * @param array<int,array<string,string|bool>> $command |
||
526 | */ |
||
527 | private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string |
||
555 | } |
||
556 | |||
557 | /** |
||
558 | * Expects a string that is a full PDF dictionary object, |
||
559 | * including the outer enclosing << >> angle brackets |
||
560 | * |
||
561 | * @internal |
||
562 | * |
||
563 | * @throws InvalidDictionaryObjectException |
||
564 | */ |
||
public function parseDictionary(string $dictionary): array
{
    // Normalize whitespace
    $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

    if ('<<' != substr($dictionary, 0, 2)) {
        throw new InvalidDictionaryObjectException('Not a valid dictionary object.');
    }

    $parsed = [];
    $stack = [];
    $currentName = '';
    $arrayTypeNumeric = false;

    // Remove outer layer of dictionary, and split on tokens
    $split = preg_split(
        '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
        trim(preg_replace('/^<<|>>$/', '', $dictionary)),
        -1,
        \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
    );

    foreach ($split as $token) {
        $token = trim($token);
        switch ($token) {
            case '':
                break;

            // Open numeric array ('[') or hashed array ('<<')
            case '[':
            case '<<':
                $parsed[$currentName] = [];
                $arrayTypeNumeric = ('[' === $token);

                // Move up one level in the stack: remember the parent
                // container and make $parsed point at the new child
                $stack[\count($stack)] = &$parsed;
                $parsed = &$parsed[$currentName];
                $currentName = '';
                break;

            // Close numeric array (']') or hashed array ('>>')
            case ']':
            case '>>':
                // A closer without a matching opener has no parent to
                // return to. Popping here would bind $parsed to a fresh
                // null (or collapse the root into a string below) and
                // break the array return type, so ignore the stray token.
                if ([] === $stack) {
                    break;
                }

                // Revert string type arrays back to a single element
                if (']' === $token
                    && \is_array($parsed) && 1 == \count($parsed)
                    && isset($parsed[0]) && \is_string($parsed[0])
                    && '' !== $parsed[0] && '/' != $parsed[0][0]) {
                    $parsed = '['.$parsed[0].']';
                }

                $arrayTypeNumeric = false;

                // Move down one level in the stack
                $parsed = &$stack[\count($stack) - 1];
                unset($stack[\count($stack) - 1]);
                break;

            default:
                // If value begins with a slash, then this is a name
                // Add it to the appropriate array
                if ('/' == substr($token, 0, 1)) {
                    $currentName = substr($token, 1);
                    if (true == $arrayTypeNumeric) {
                        $parsed[] = $currentName;
                        $currentName = '';
                    }
                } elseif ('' != $currentName) {
                    // A value for the pending name; inside a numeric
                    // array the value is dropped and the name is cleared
                    if (false == $arrayTypeNumeric) {
                        $parsed[$currentName] = $token;
                    }
                    $currentName = '';
                } elseif ('' == $currentName) {
                    // A value with no pending name is appended as-is
                    $parsed[] = $token;
                }
        }
    }

    // NOTE: if the input leaves containers unclosed, $parsed still refers
    // to the innermost open container and that is what is returned.
    return $parsed;
}
||
655 | |||
656 | /** |
||
657 | * Returns the text content of a PDF as a string. Attempts to add |
||
658 | * whitespace for spacing and line-breaks where appropriate. |
||
659 | * |
||
660 | * getText() leverages getTextArray() to get the content |
||
661 | * of the document, setting the addPositionWhitespace flag to true |
||
662 | * so whitespace is inserted in a logical way for reading by |
||
663 | * humans. |
||
664 | */ |
||
665 | public function getText(?Page $page = null): string |
||
672 | } |
||
673 | |||
674 | /** |
||
675 | * Returns the text content of a PDF as an array of strings. No |
||
676 | * extra whitespace is inserted besides what is actually encoded in |
||
677 | * the PDF text. |
||
678 | * |
||
679 | * @throws \Exception |
||
680 | */ |
||
681 | public function getTextArray(?Page $page = null): array |
||
682 | { |
||
683 | $result = []; |
||
684 | $text = []; |
||
685 | |||
686 | $marked_stack = []; |
||
687 | $last_written_position = false; |
||
688 | |||
689 | $sections = $this->getSectionsText($this->content); |
||
690 | $current_font = $this->getDefaultFont($page); |
||
691 | $current_font_size = 1; |
||
692 | $current_text_leading = 0; |
||
693 | |||
694 | $current_position = ['x' => false, 'y' => false]; |
||
695 | $current_position_tm = [ |
||
696 | 'a' => 1, 'b' => 0, 'c' => 0, |
||
697 | 'i' => 0, 'j' => 1, 'k' => 0, |
||
698 | 'x' => 0, 'y' => 0, 'z' => 1, |
||
699 | ]; |
||
700 | $current_position_td = ['x' => 0, 'y' => 0]; |
||
701 | $current_position_cm = [ |
||
702 | 'a' => 1, 'b' => 0, 'c' => 0, |
||
703 | 'i' => 0, 'j' => 1, 'k' => 0, |
||
704 | 'x' => 0, 'y' => 0, 'z' => 1, |
||
705 | ]; |
||
706 | |||
707 | $clipped_font = []; |
||
708 | $clipped_position_cm = []; |
||
709 | |||
710 | self::$recursionStack[] = $this->getUniqueId(); |
||
711 | |||
712 | foreach ($sections as $section) { |
||
713 | $commands = $this->getCommandsText($section); |
||
714 | foreach ($commands as $command) { |
||
715 | switch ($command[self::OPERATOR]) { |
||
716 | // Begin text object |
||
717 | case 'BT': |
||
718 | // Reset text positioning matrices |
||
719 | $current_position_tm = [ |
||
720 | 'a' => 1, 'b' => 0, 'c' => 0, |
||
721 | 'i' => 0, 'j' => 1, 'k' => 0, |
||
722 | 'x' => 0, 'y' => 0, 'z' => 1, |
||
723 | ]; |
||
724 | $current_position_td = ['x' => 0, 'y' => 0]; |
||
725 | $current_text_leading = 0; |
||
726 | break; |
||
727 | |||
728 | // Begin marked content sequence with property list |
||
729 | case 'BDC': |
||
730 | if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) { |
||
731 | $dict = $this->parseDictionary($match[1]); |
||
732 | |||
733 | // Check for ActualText block |
||
734 | if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) { |
||
735 | if ('[' == $dict['ActualText'][0]) { |
||
736 | // Simulate a 'TJ' command on the stack |
||
737 | $marked_stack[] = [ |
||
738 | 'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0], |
||
739 | ]; |
||
740 | } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) { |
||
741 | // Simulate a 'Tj' command on the stack |
||
742 | $marked_stack[] = [ |
||
743 | 'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0], |
||
744 | ]; |
||
745 | } |
||
746 | } |
||
747 | } |
||
748 | break; |
||
749 | |||
750 | // Begin marked content sequence |
||
751 | case 'BMC': |
||
752 | if ('ReversedChars' == $command[self::COMMAND]) { |
||
753 | // Upon encountering a ReversedChars command, |
||
754 | // add the characters we've built up so far to |
||
755 | // the result array |
||
756 | $result = array_merge($result, $text); |
||
757 | |||
758 | // Start a fresh $text array that will contain |
||
759 | // reversed characters |
||
760 | $text = []; |
||
761 | |||
762 | // Add the reversed text flag to the stack |
||
763 | $marked_stack[] = ['ReversedChars' => true]; |
||
764 | } |
||
765 | break; |
||
766 | |||
767 | // set graphics position matrix |
||
768 | case 'cm': |
||
769 | $args = preg_split('/\s+/s', $command[self::COMMAND]); |
||
770 | $current_position_cm = [ |
||
771 | 'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0, |
||
772 | 'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0, |
||
773 | 'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1, |
||
774 | ]; |
||
775 | break; |
||
776 | |||
777 | case 'Do': |
||
778 | if (is_null($page)) { |
||
779 | break; |
||
780 | } |
||
781 | |||
782 | $args = preg_split('/\s/s', $command[self::COMMAND]); |
||
783 | $id = trim(array_pop($args), '/ '); |
||
784 | $xobject = $page->getXObject($id); |
||
785 | |||
786 | // Check we got a PDFObject back. |
||
787 | if (!$xobject instanceof self) { |
||
788 | break; |
||
789 | } |
||
790 | |||
791 | // If the PDFObject is an image, do nothing, as images aren't text. |
||
792 | if ($xobject instanceof Image) { |
||
793 | break; |
||
794 | } |
||
795 | |||
796 | // Check this is not a circular reference. |
||
797 | if (!\in_array($xobject->getUniqueId(), self::$recursionStack, true)) { |
||
798 | $text[] = $xobject->getText($page); |
||
799 | } |
||
800 | break; |
||
801 | |||
802 | // Marked content point with (DP) & without (MP) property list |
||
803 | case 'DP': |
||
804 | case 'MP': |
||
805 | break; |
||
806 | |||
807 | // End text object |
||
808 | case 'ET': |
||
809 | break; |
||
810 | |||
811 | // Store current selected font and graphics matrix |
||
812 | case 'q': |
||
813 | $clipped_font[] = [$current_font, $current_font_size]; |
||
814 | $clipped_position_cm[] = $current_position_cm; |
||
815 | break; |
||
816 | |||
817 | // Restore previous selected font and graphics matrix |
||
818 | case 'Q': |
||
819 | list($current_font, $current_font_size) = array_pop($clipped_font); |
||
820 | $current_position_cm = array_pop($clipped_position_cm); |
||
821 | break; |
||
822 | |||
823 | // End marked content sequence |
||
824 | case 'EMC': |
||
825 | $data = false; |
||
826 | if (\count($marked_stack)) { |
||
827 | $marked = array_pop($marked_stack); |
||
828 | $action = key($marked); |
||
829 | $data = $marked[$action]; |
||
830 | |||
831 | switch ($action) { |
||
832 | // If we are in ReversedChars mode... |
||
833 | case 'ReversedChars': |
||
834 | // Reverse the characters we've built up so far |
||
835 | foreach ($text as $key => $t) { |
||
836 | $text[$key] = implode('', array_reverse( |
||
837 | mb_str_split($t, 1, mb_internal_encoding()) |
||
838 | )); |
||
839 | } |
||
840 | |||
841 | // Add these characters to the result array |
||
842 | $result = array_merge($result, $text); |
||
843 | |||
844 | // Start a fresh $text array that will contain |
||
845 | // non-reversed characters |
||
846 | $text = []; |
||
847 | break; |
||
848 | |||
849 | case 'ActualText': |
||
850 | // Use the content of the ActualText as a command |
||
851 | $command = $data; |
||
852 | break; |
||
853 | } |
||
854 | } |
||
855 | |||
856 | // If this EMC command has been transformed into a 'Tj' |
||
857 | // or 'TJ' command because of being ActualText, then bypass |
||
858 | // the break to proceed to the writing section below. |
||
859 | if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) { |
||
860 | break; |
||
861 | } |
||
862 | |||
863 | // no break |
||
864 | case "'": |
||
865 | case '"': |
||
866 | if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) { |
||
867 | // Move to next line and write text |
||
868 | $current_position['x'] = 0; |
||
869 | $current_position_td['x'] = 0; |
||
870 | $current_position_td['y'] += $current_text_leading; |
||
871 | } |
||
872 | // no break |
||
873 | case 'Tj': |
||
874 | $command[self::COMMAND] = [$command]; |
||
875 | // no break |
||
876 | case 'TJ': |
||
877 | // Check the marked content stack for flags |
||
878 | $actual_text = false; |
||
879 | $reverse_text = false; |
||
880 | foreach ($marked_stack as $marked) { |
||
881 | if (isset($marked['ActualText'])) { |
||
882 | $actual_text = true; |
||
883 | } |
||
884 | if (isset($marked['ReversedChars'])) { |
||
885 | $reverse_text = true; |
||
886 | } |
||
887 | } |
||
888 | |||
889 | // Account for text position ONLY just before we write text |
||
890 | if (false === $actual_text && \is_array($last_written_position)) { |
||
891 | // If $last_written_position is an array, that |
||
892 | // means we have stored text position coordinates |
||
893 | // for placing an ActualText |
||
894 | $currentX = $last_written_position[0]; |
||
895 | $currentY = $last_written_position[1]; |
||
896 | $last_written_position = false; |
||
897 | } else { |
||
898 | $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x']; |
||
899 | $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y']; |
||
900 | } |
||
901 | $whiteSpace = ''; |
||
902 | |||
903 | $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i']; |
||
904 | $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j']; |
||
905 | |||
906 | if (true === $this->addPositionWhitespace && false !== $current_position['x']) { |
||
907 | $curY = $currentY - $current_position['y']; |
||
908 | if (abs($curY) >= abs($factorY) / 4) { |
||
909 | $whiteSpace = "\n"; |
||
910 | } else { |
||
911 | if (true === $reverse_text) { |
||
912 | $curX = $current_position['x'] - $currentX; |
||
913 | } else { |
||
914 | $curX = $currentX - $current_position['x']; |
||
915 | } |
||
916 | |||
917 | // In abs($factorX * 7) below, the 7 is chosen arbitrarily |
||
918 | // as the number of apparent "spaces" in a document we |
||
919 | // would need before considering them a "tab". In the |
||
920 | // future, we might offer this value to users as a config |
||
921 | // option. |
||
922 | if ($curX >= abs($factorX * 7)) { |
||
923 | $whiteSpace = "\t"; |
||
924 | } elseif ($curX >= abs($factorX * 2)) { |
||
925 | $whiteSpace = ' '; |
||
926 | } |
||
927 | } |
||
928 | } |
||
929 | |||
930 | $newtext = $this->getTJUsingFontFallback( |
||
931 | $current_font, |
||
932 | $command[self::COMMAND], |
||
933 | $page, |
||
934 | $factorX |
||
935 | ); |
||
936 | |||
937 | // If there is no ActualText pending then write |
||
938 | if (false === $actual_text) { |
||
939 | $newtext = str_replace(["\r", "\n"], '', $newtext); |
||
940 | if (false !== $reverse_text) { |
||
941 | // If we are in ReversedChars mode, add the whitespace last |
||
942 | $text[] = preg_replace('/ $/', ' ', $newtext.$whiteSpace); |
||
943 | } else { |
||
944 | // Otherwise add the whitespace first |
||
945 | if (' ' === $whiteSpace && isset($text[\count($text) - 1])) { |
||
946 | $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]); |
||
947 | } |
||
948 | $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext); |
||
949 | } |
||
950 | |||
951 | // Record the position of this inserted text for comparison |
||
952 | // with the next text block. |
||
953 | // Provide a 'fudge' factor guess on how wide this text block |
||
954 | // is based on the number of characters. This helps limit the |
||
955 | // number of tabs inserted, but isn't perfect. |
||
956 | $factor = $factorX / 2; |
||
957 | $current_position = [ |
||
958 | 'x' => $currentX - mb_strlen($newtext) * $factor, |
||
959 | 'y' => $currentY, |
||
960 | ]; |
||
961 | } elseif (false === $last_written_position) { |
||
962 | // If there is an ActualText in the pipeline |
||
963 | // store the position this undisplayed text |
||
964 | // *would* have been written to, so the |
||
965 | // ActualText is displayed in the right spot |
||
966 | $last_written_position = [$currentX, $currentY]; |
||
967 | $current_position['x'] = $currentX; |
||
968 | } |
||
969 | break; |
||
970 | |||
971 | // move to start of next line |
||
972 | case 'T*': |
||
973 | $current_position['x'] = 0; |
||
974 | $current_position_td['x'] = 0; |
||
975 | $current_position_td['y'] += $current_text_leading; |
||
976 | break; |
||
977 | |||
978 | // set character spacing |
||
979 | case 'Tc': |
||
980 | break; |
||
981 | |||
982 | // move text current point and set leading |
||
983 | case 'Td': |
||
984 | case 'TD': |
||
985 | // move text current point |
||
986 | $args = preg_split('/\s+/s', $command[self::COMMAND]); |
||
987 | $y = (float) array_pop($args); |
||
988 | $x = (float) array_pop($args); |
||
989 | |||
990 | if ('TD' == $command[self::OPERATOR]) { |
||
991 | $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j']; |
||
992 | } |
||
993 | |||
994 | $current_position_td = [ |
||
995 | 'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'], |
||
996 | 'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'], |
||
997 | ]; |
||
998 | break; |
||
999 | |||
1000 | case 'Tf': |
||
1001 | $args = preg_split('/\s/s', $command[self::COMMAND]); |
||
1002 | $size = (float) array_pop($args); |
||
1003 | $id = trim(array_pop($args), '/'); |
||
1004 | if (null !== $page) { |
||
1005 | $new_font = $page->getFont($id); |
||
1006 | // If an invalid font ID is given, do not update the font. |
||
1007 | // This should theoretically never happen, as the PDF spec states for the Tf operator: |
||
1008 | // "The specified font value shall match a resource name in the Font entry of the default resource dictionary" |
||
1009 | // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435) |
||
1010 | // But we want to make sure that malformed PDFs do not simply crash. |
||
1011 | if (null !== $new_font) { |
||
1012 | $current_font = $new_font; |
||
1013 | $current_font_size = $size; |
||
1014 | } |
||
1015 | } |
||
1016 | break; |
||
1017 | |||
1018 | // set leading |
||
1019 | case 'TL': |
||
1020 | $y = (float) $command[self::COMMAND]; |
||
1021 | $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j']; |
||
1022 | break; |
||
1023 | |||
1024 | // set text position matrix |
||
1025 | case 'Tm': |
||
1026 | $args = preg_split('/\s+/s', $command[self::COMMAND]); |
||
1027 | $current_position_tm = [ |
||
1028 | 'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0, |
||
1029 | 'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0, |
||
1030 | 'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1, |
||
1031 | ]; |
||
1032 | break; |
||
1033 | |||
1034 | // set text rendering mode |
||
1035 | case 'Ts': |
||
1036 | break; |
||
1037 | |||
1038 | // set super/subscripting text rise |
||
1039 | case 'Ts': |
||
1040 | break; |
||
1041 | |||
1042 | // set word spacing |
||
1043 | case 'Tw': |
||
1044 | break; |
||
1045 | |||
1046 | // set horizontal scaling |
||
1047 | case 'Tz': |
||
1048 | break; |
||
1049 | |||
1050 | default: |
||
1051 | } |
||
1052 | } |
||
1053 | } |
||
1054 | |||
1055 | $result = array_merge($result, $text); |
||
1056 | |||
1057 | return $result; |
||
1058 | } |
||
1059 | |||
/**
 * Splits one pre-formatted, single-line content-stream command into
 * its type, operator and operand.
 *
 * The companion function getSectionsText() returns a document stream
 * as an array of single commands, which is the input this method
 * expects. Because it only ever handles one command, the $offset
 * argument is not read at all; it is kept for backward compatibility
 * and may be removed in a future PdfParser release. For the same
 * reason getCommandText() would be a more accurate name.
 *
 * @param string $text_part a single command, operands first and operator last
 * @param int    $offset    unused
 *
 * @return array an empty array when no operator can be recognised,
 *               otherwise a one-element list whose entry is keyed by
 *               self::TYPE (leading delimiter of the operand, or ''),
 *               self::OPERATOR and self::COMMAND. For the TJ operator,
 *               self::COMMAND is itself a list of entries of the same
 *               shape: one per string, plus one of type 'n' for each
 *               numeric adjustment that follows a string.
 */
public function getCommandsText(string $text_part, int &$offset = 0): array
{
    $parts = [];
    preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $parts);

    // Without all three capture groups there is no usable command.
    if (!isset($parts[1], $parts[2], $parts[3])) {
        return [];
    }

    $operand = trim($parts[1]);
    $type = $parts[2];
    $operator = $parts[3];

    if ('TJ' === $operator) {
        // Tried in this order on every pass: literal () strings first,
        // hexadecimal <> strings second.
        $stringPatterns = [
            '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
            '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
        ];

        $pieces = [];
        $remaining = trim($operand, '[]');

        do {
            $before = $remaining;

            foreach ($stringPatterns as $delimiter => $pattern) {
                if (!preg_match($pattern, $remaining, $hit)) {
                    continue;
                }

                // Whitespace is removed from hexadecimal strings only.
                $payload = $hit[1];
                if ('<' === $delimiter) {
                    $payload = preg_replace('/\s/', '', $payload);
                }

                $pieces[] = [
                    self::TYPE => $delimiter,
                    self::OPERATOR => 'TJ',
                    self::COMMAND => $payload,
                ];

                // Optional numeric adjustment directly after the string.
                if (isset($hit[2]) && trim($hit[2])) {
                    $pieces[] = [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => $hit[2],
                    ];
                }

                // Drop what was just consumed from the front.
                $remaining = substr($remaining, \strlen($hit[0]));
            }
            // Stop as soon as a full pass consumed nothing.
        } while ($remaining != $before);

        $operand = $pieces;
    } elseif (\in_array($operator, ['Tj', "'", '"'], true)) {
        if ('(' === $type) {
            // Remove exactly one parenthesis from each end. trim() is
            // unsuitable because a () string may legitimately end with
            // a balanced or escaped right parenthesis, e.g. (String())
            // or (String\)), and trim() would strip both.
            $operand = preg_replace('/^\(|\)$/', '', $operand);
        } elseif ('<' === $type) {
            $operand = trim($operand, '<>');
        }
    } elseif ('/' === $type) {
        // Name operand: drop the leading slash.
        $operand = substr($operand, 1);
    }

    return [
        [
            self::TYPE => $type,
            self::OPERATOR => $operator,
            self::COMMAND => $operand,
        ],
    ];
}
||
1154 | |||
/**
 * Creates the most specific object class for the given header.
 *
 * The header's "Type" entry selects the class:
 *  - XObject: Image or Form depending on "Subtype", otherwise a plain
 *    instance of this class;
 *  - Pages, Page, Encoding: the class of the same name;
 *  - Font: the class \Smalot\PdfParser\Font\Font<Subtype> when such a
 *    class exists, otherwise the generic Font class;
 *  - anything else: a plain instance of this class.
 *
 * @param Document    $document document handed to the created object
 * @param Header      $header   header whose "Type"/"Subtype" entries drive the dispatch
 * @param string|null $content  raw content handed to the created object
 * @param Config|null $config   optional configuration; may be null
 *
 * @return self the created object
 */
public static function factory(
    Document $document,
    Header $header,
    ?string $content,
    ?Config $config = null
): self {
    switch ($header->get('Type')->getContent()) {
        case 'XObject':
            switch ($header->get('Subtype')->getContent()) {
                case 'Image':
                    // $config is nullable, so it must not be dereferenced
                    // unconditionally. Without a config the image content
                    // is kept; it is only dropped when a config explicitly
                    // asks for that.
                    // NOTE(review): confirm that keeping the content matches
                    // the default of Config::getRetainImageContent().
                    $retainContent = null === $config || $config->getRetainImageContent();

                    return new Image($document, $header, $retainContent ? $content : null, $config);

                case 'Form':
                    return new Form($document, $header, $content, $config);
            }

            // Unknown XObject subtype: fall back to a generic object.
            return new self($document, $header, $content, $config);

        case 'Pages':
            return new Pages($document, $header, $content, $config);

        case 'Page':
            return new Page($document, $header, $content, $config);

        case 'Encoding':
            return new Encoding($document, $header, $content, $config);

        case 'Font':
            $subtype = $header->get('Subtype')->getContent();
            $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

            // Use the subtype-specific font class when one exists.
            if (class_exists($classname)) {
                return new $classname($document, $header, $content, $config);
            }

            return new Font($document, $header, $content, $config);

        default:
            return new self($document, $header, $content, $config);
    }
}
||
1196 | |||
1197 | /** |
||
1198 | * Returns unique id identifying the object. |
||
1199 | */ |
||
1200 | protected function getUniqueId(): string |
||
1203 | } |
||
1204 | } |
||
1205 |