1 | <?php |
||||
2 | |||||
3 | /** |
||||
4 | * @file |
||||
5 | * This file is part of the PdfParser library. |
||||
6 | * |
||||
7 | * @author Sébastien MALOT <[email protected]> |
||||
8 | * |
||||
9 | * @date 2017-01-03 |
||||
10 | * |
||||
11 | * @license LGPLv3 |
||||
12 | * |
||||
13 | * @url <https://github.com/smalot/pdfparser> |
||||
14 | * |
||||
15 | * PdfParser is a pdf library written in PHP, extraction oriented. |
||||
16 | * Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
||||
17 | * |
||||
18 | * This program is free software: you can redistribute it and/or modify |
||||
19 | * it under the terms of the GNU Lesser General Public License as published by |
||||
20 | * the Free Software Foundation, either version 3 of the License, or |
||||
21 | * (at your option) any later version. |
||||
22 | * |
||||
23 | * This program is distributed in the hope that it will be useful, |
||||
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||
26 | * GNU Lesser General Public License for more details. |
||||
27 | * |
||||
28 | * You should have received a copy of the GNU Lesser General Public License |
||||
29 | * along with this program. |
||||
30 | * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
||||
31 | */ |
||||
32 | |||||
33 | namespace Smalot\PdfParser; |
||||
34 | |||||
35 | use Smalot\PdfParser\Exception\InvalidDictionaryObjectException; |
||||
36 | use Smalot\PdfParser\XObject\Form; |
||||
37 | use Smalot\PdfParser\XObject\Image; |
||||
38 | |||||
39 | /** |
||||
40 | * Class PDFObject |
||||
41 | */ |
||||
42 | class PDFObject |
||||
43 | { |
||||
44 | public const TYPE = 't'; |
||||
45 | |||||
46 | public const OPERATOR = 'o'; |
||||
47 | |||||
48 | public const COMMAND = 'c'; |
||||
49 | |||||
50 | /** |
||||
51 | * The recursion stack. |
||||
52 | * |
||||
53 | * @var array |
||||
54 | */ |
||||
55 | public static $recursionStack = []; |
||||
56 | |||||
57 | /** |
||||
58 | * @var Document|null |
||||
59 | */ |
||||
60 | protected $document; |
||||
61 | |||||
62 | /** |
||||
63 | * @var Header |
||||
64 | */ |
||||
65 | protected $header; |
||||
66 | |||||
67 | /** |
||||
68 | * @var string |
||||
69 | */ |
||||
70 | protected $content; |
||||
71 | |||||
72 | /** |
||||
73 | * @var Config|null |
||||
74 | */ |
||||
75 | protected $config; |
||||
76 | |||||
77 | /** |
||||
78 | * @var bool |
||||
79 | */ |
||||
80 | protected $addPositionWhitespace = false; |
||||
81 | |||||
/**
 * Stores the collaborators of this PDF object.
 *
 * @param Document    $document the document this object is attached to
 * @param Header|null $header   header of the object; a new, empty Header is created when null
 * @param string|null $content  content of the object, stored exactly as given
 * @param Config|null $config   configuration, stored exactly as given
 */
public function __construct(
    Document $document,
    ?Header $header = null,
    ?string $content = null,
    ?Config $config = null
) {
    $this->config = $config;
    $this->content = $content;
    $this->document = $document;

    // Guarantee that $this->header is always usable by get()/has()/getDetails().
    $this->header = null === $header ? new Header() : $header;
}
||||
93 | |||||
/**
 * Initialisation hook. The body is intentionally empty in this class.
 */
public function init()
{
    // Nothing to initialise here.
}
|||
97 | |||||
/**
 * Returns the document stored by the constructor.
 */
public function getDocument(): Document
{
    $document = $this->document;

    return $document;
}
||||
102 | |||||
/**
 * Returns the header stored by the constructor.
 */
public function getHeader(): ?Header
{
    $header = $this->header;

    return $header;
}
||||
107 | |||||
/**
 * Returns the configuration stored by the constructor, or null when none
 * was given.
 */
public function getConfig(): ?Config
{
    $config = $this->config;

    return $config;
}
||||
112 | |||||
/**
 * Looks up a header entry by name; the lookup is delegated to the header.
 *
 * @param string $name name of the header entry
 *
 * @return Element|PDFObject|Header
 */
public function get(string $name)
{
    $header = $this->header;

    return $header->get($name);
}
||||
120 | |||||
/**
 * Tells whether the header has an entry with the given name; the check is
 * delegated to the header.
 *
 * @param string $name name of the header entry
 */
public function has(string $name): bool
{
    $header = $this->header;

    return $header->has($name);
}
||||
125 | |||||
/**
 * Returns the details of the header; the work is delegated to the header.
 *
 * @param bool $deep forwarded unchanged to the header's getDetails()
 */
public function getDetails(bool $deep = true): array
{
    $header = $this->header;

    return $header->getDetails($deep);
}
||||
130 | |||||
/**
 * Returns the content stored by the constructor, or null when none was
 * given.
 */
public function getContent(): ?string
{
    $content = $this->content;

    return $content;
}
||||
135 | |||||
/**
 * Creates a duplicate of the document stream with strings and other items
 * replaced by $char. Every replacement has the same byte length as the
 * text it overwrites, so byte offsets in the result line up with the
 * original stream. Formerly getSectionsText() used this output to more
 * easily gather offset values to extract text from the *actual* document
 * stream.
 *
 * @deprecated function is no longer used and will be removed in a future release
 *
 * @internal
 *
 * @param string $content the document stream to mask
 * @param string $char    masking character; only its first byte is used,
 *                        and 'X' is used when an empty string is given
 *
 * @return string the masked stream
 */
public function cleanContent(string $content, string $char = 'X')
{
    // A single byte is required to keep the masking length-preserving. An
    // empty string would otherwise turn every mask into '' and shift all
    // offsets, so fall back to the default in that case.
    $char = '' === $char ? 'X' : $char[0];

    // Overwrites each [text, offset] pair (as produced by
    // PREG_OFFSET_CAPTURE) with the same number of masking characters.
    $mask = static function (string $subject, array $parts) use ($char): string {
        foreach ($parts as $part) {
            $length = \strlen($part[0]);
            $subject = substr_replace($subject, str_repeat($char, $length), $part[1], $length);
        }

        return $subject;
    };

    // Neutralise escaped backslashes and parentheses first so they cannot
    // be mistaken for string delimiters below
    $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

    // Remove image bloc with binary content
    preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
    $content = $mask($content, $matches[0]);

    // Clean content in square brackets [.....]
    preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE);
    $content = $mask($content, $matches[1]);

    // Clean content in round brackets (.....)
    preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE);
    $content = $mask($content, $matches[1]);

    // Clean structure: everything between < and > (at any nesting depth),
    // including the angle brackets themselves, is masked
    if ($parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE)) {
        $content = '';
        $level = 0;
        foreach ($parts as $part) {
            if ('<' == $part) {
                ++$level;
            }

            $content .= (0 == $level ? $part : str_repeat($char, \strlen($part)));

            if ('>' == $part) {
                --$level;
            }
        }
    }

    // Clean BDC and EMC markup. The masking character is quoted with the
    // pattern delimiter so that a $char of '/' cannot terminate the
    // pattern early.
    preg_match_all(
        '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
        $content,
        $matches,
        \PREG_OFFSET_CAPTURE
    );
    $content = $mask($content, $matches[1]);

    preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
    $content = $mask($content, $matches[1]);

    return $content;
}
||||
204 | |||||
/**
 * Takes a string of PDF document stream text and formats it into a
 * multi-line string with one PDF command on each line, separated by \r\n.
 * If the given string is null, or binary data is detected instead of a
 * document stream, an empty string is returned.
 *
 * Inline images, (strings) and << >> dictionaries are swapped for unique
 * placeholders while the operators are split onto separate lines, and are
 * restored afterwards. Placeholders are spliced in at the byte offset of
 * the match rather than through a regular expression built from the
 * matched text: such a pattern fails to compile once the text is large
 * (making preg_replace() return null and discarding the stream), and it
 * replaces the first identical occurrence instead of the one matched.
 *
 * @param string|null $content raw document stream
 *
 * @return string the formatted stream, or '' for null/binary input
 */
private function formatContent(?string $content): string
{
    if (null === $content) {
        return '';
    }

    // Outside of (String) and inline image content in PDF document
    // streams, all text should conform to UTF-8. Test for binary
    // content by deleting everything after the first open-
    // parenthesis ( which indicates the beginning of a string, or
    // the first ID command which indicates the beginning of binary
    // inline image content. Then test what remains for valid
    // UTF-8. If it's not UTF-8, return an empty string as this
    // $content is most likely binary. Unfortunately, using
    // mb_check_encoding(..., 'UTF-8') is not strict enough, so the
    // following regexp, adapted from the W3, is used. See:
    // https://www.w3.org/International/questions/qa-forms-utf-8.en
    // We use preg_replace() instead of preg_match() to avoid "JIT
    // stack limit exhausted" errors on larger files.
    $utf8Filter = preg_replace('/(
        [\x09\x0A\x0D\x20-\x7E] | # ASCII
        [\xC2-\xDF][\x80-\xBF] | # non-overlong 2-byte
        \xE0[\xA0-\xBF][\x80-\xBF] | # excluding overlongs
        [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} | # straight 3-byte
        \xED[\x80-\x9F][\x80-\xBF] | # excluding surrogates
        \xF0[\x90-\xBF][\x80-\xBF]{2} | # planes 1-3
        [\xF1-\xF3][\x80-\xBF]{3} | # planes 4-15
        \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
    )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content));

    // Anything left over (or a PCRE failure, which yields null) means the
    // prefix was not clean UTF-8
    if ('' !== $utf8Filter) {
        return '';
    }

    // Find all inline image content and replace them so they aren't
    // affected by the next steps
    $pdfInlineImages = [];
    $offsetBI = 0;
    while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) {
        // Attempt to determine if this instance of the 'BI' command
        // actually occurred within a (string) using the following
        // steps:

        // Step 1: Remove any escaped slashes and parentheses from
        // the alleged image characteristics data
        $para = str_replace(['\\\\', '\\(', '\\)'], '', $text[1][0]);

        // Step 2: Remove all correctly ordered and balanced
        // parentheses from (strings)
        do {
            $paraTest = $para;
            $para = preg_replace('/\(([^()]*)\)/', '$1', $paraTest);
        } while ($para != $paraTest);

        $paraOpen = strpos($para, '(');
        $paraClose = strpos($para, ')');

        // Check: If the remaining text contains a close parenthesis
        // ')' AND it occurs before any open parenthesis, then we
        // are almost certain to be inside a (string)
        if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) {
            // Bump the search offset forward and match again
            $offsetBI = (int) $text[1][1];
            continue;
        }

        // Step 3: Double check that this is actually inline image
        // data by parsing the alleged image characteristics as a
        // dictionary
        $dict = $this->parseDictionary('<<'.$text[1][0].'>>');

        // Check if an image Width and Height are set in the dict
        if ((isset($dict['W']) || isset($dict['Width']))
            && (isset($dict['H']) || isset($dict['Height']))) {
            $id = uniqid('IMAGE_', true);
            $pdfInlineImages[$id] = [
                preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]),
                preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]),
            ];

            // Replace exactly the span that was matched. Binary image
            // data must never be turned into a regex pattern.
            $content = substr_replace(
                $content,
                '^^^'.$id.'^^^',
                $text[0][1],
                \strlen($text[0][0])
            );
        } else {
            // If there was no valid dictionary, or a height and width
            // weren't specified, then we don't know what this is, so
            // just leave it alone; bump the search offset forward and
            // match again
            $offsetBI = (int) $text[1][1];
        }
    }

    // Find all strings () and replace them so they aren't affected
    // by the next steps. The candidate always starts at the first '('
    // left in the stream and ends at a ')'; it is extended to the next
    // ')' until its parentheses balance.
    $pdfstrings = [];
    $open = strpos($content, '(');
    $close = false === $open ? false : strpos($content, ')', $open + 1);
    while (false !== $open && false !== $close) {
        $candidate = substr($content, $open, $close - $open + 1);

        // Remove all escaped slashes and parentheses from the target text
        $para = str_replace(['\\\\', '\\(', '\\)'], '', $candidate);

        // PDF strings can contain unescaped parentheses as long as
        // they're balanced, so check for balanced parentheses
        $left = substr_count($para, '(');
        $right = substr_count($para, ')');

        if (')' == $para[-1] && $left == $right) {
            // Replace the string with a unique placeholder
            $id = uniqid('STRING_', true);
            $pdfstrings[$id] = $candidate;
            $content = substr_replace($content, '@@@'.$id.'@@@', $open, \strlen($candidate));

            // Reset to search for the next string
            $open = strpos($content, '(');
            $close = false === $open ? false : strpos($content, ')', $open + 1);
        } else {
            // We had unbalanced parentheses (or the closing one was
            // escaped), so extend the candidate to the next ')'
            $close = strpos($content, ')', $close + 1);
        }
    }

    // Remove all carriage returns and line-feeds from the document stream
    $content = str_replace(["\r", "\n"], ' ', trim($content));

    // Find all dictionary << >> commands and replace them so they
    // aren't affected by the next steps
    $dictstore = [];
    while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext, \PREG_OFFSET_CAPTURE)) {
        $dictid = uniqid('DICT_', true);
        $dictstore[$dictid] = $dicttext[1][0];
        $content = substr_replace(
            $content,
            ' ###'.$dictid.'###'.$dicttext[2][0],
            $dicttext[0][1],
            \strlen($dicttext[0][0])
        );
    }

    // Normalize white-space in the document stream
    $content = preg_replace('/\s{2,}/', ' ', $content);

    // Find all valid PDF operators and add \r\n after each; this
    // ensures there is just one command on every line
    // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
    // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
    // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
    // PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions
    // appear here in the list for completeness.
    $operators = [
        'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
        'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
        'g', 'G', 'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
        'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
        'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
        'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
    ];
    foreach ($operators as $operator) {
        $content = preg_replace(
            '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/',
            $operator."\r\n",
            $content
        );
    }

    // Restore the original content of the dictionary << >> commands
    $dictstore = array_reverse($dictstore, true);
    foreach ($dictstore as $id => $dict) {
        $content = str_replace('###'.$id.'###', $dict, $content);
    }

    // Restore the original string content
    $pdfstrings = array_reverse($pdfstrings, true);
    foreach ($pdfstrings as $id => $text) {
        // Strings may contain escaped newlines, or literal newlines
        // and we should clean these up before replacing the string
        // back into the content stream; this ensures no strings are
        // split between two lines (every command must be on one line)
        $text = str_replace(
            ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
            ['', '', '', '\r', '\n'],
            $text
        );

        $content = str_replace('@@@'.$id.'@@@', $text, $content);
    }

    // Restore the original content of any inline images
    $pdfInlineImages = array_reverse($pdfInlineImages, true);
    foreach ($pdfInlineImages as $id => $image) {
        $content = str_replace(
            '^^^'.$id.'^^^',
            "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." EI\r\n",
            $content
        );
    }

    // Collapse empty lines and strip the spaces left at line starts
    $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));

    return $content;
}
||||
418 | |||||
/**
 * Takes an entire, unformatted document stream as a string, formats it
 * with formatContent(), then filters out commands that aren't needed for
 * text positioning/extraction. Returns an array of unprocessed PDF
 * commands, one command per element.
 *
 * Every line from a 'BT' line up to and including the 'ET' line is kept.
 * Outside of such a block only the commands listed in $outsidePatterns
 * and the q/Q graphics state commands are kept.
 *
 * @internal
 *
 * @param string|null $content raw document stream
 *
 * @return array the retained commands, in stream order
 */
public function getSectionsText(?string $content): array
{
    // A formatted stream has one command on every line, so split it on
    // line breaks into an array
    $lines = preg_split(
        '/(\r\n|\n|\r)/',
        $this->formatContent($content),
        -1,
        \PREG_SPLIT_NO_EMPTY
    );

    // Commands worth keeping outside of a text block. Each pattern is
    // anchored so that it *only* matches that command (a plain search for
    // 'c' would also match 'sc', for instance). Add more commands here as
    // you find them in weird PDFs!
    $outsidePatterns = [
        '/(?<!\w)B[DM]C$/', // Begin marked content sequence
        '/(?<!\w)[DM]P$/',  // Marked content point
        '/(?<!\w)EMC$/',    // End marked content sequence
        '/(?<!\w)cm$/',     // Graphics position change commands
        '/(?<!\w)Tf$/',     // Font change commands
        '/(?<!\w)Do$/',     // Invoke named XObject command
    ];

    $kept = [];
    $insideTextBlock = false;

    foreach ($lines as $rawLine) {
        $line = trim($rawLine);

        // Skip empty lines
        if ('' === $line) {
            continue;
        }

        // A 'BT' opens a text block
        if (preg_match('/BT$/', $line)) {
            $insideTextBlock = true;
            $kept[] = $line;
            continue;
        }

        // An 'ET' closes it again
        if ('ET' == $line) {
            $insideTextBlock = false;
            $kept[] = $line;
            continue;
        }

        // Inside a BT ... ET text block every line is saved
        if ($insideTextBlock) {
            $kept[] = $line;
            continue;
        }

        // Save and restore graphics state commands end in q / Q
        $lastChar = $line[-1];
        $wanted = 'q' == $lastChar || 'Q' == $lastChar;

        foreach ($outsidePatterns as $pattern) {
            if ($wanted) {
                break;
            }
            $wanted = 1 === preg_match($pattern, $line);
        }

        if ($wanted) {
            $kept[] = $line;
        }
    }

    return $kept;
}
||||
498 | |||||
/**
 * Picks the font to start text decoding with: the first font of $page
 * when the page has fonts, otherwise the document's first font, otherwise
 * a new Font built from this object's document and config.
 *
 * @param Page|null $page page whose fonts take precedence, if given
 */
private function getDefaultFont(?Page $page = null): Font
{
    $candidates = null === $page ? [] : $page->getFonts();

    // The document font is appended so that page fonts win when present
    $documentFont = $this->document->getFirstFont();
    if (null !== $documentFont) {
        $candidates[] = $documentFont;
    }

    if (0 === \count($candidates)) {
        return new Font($this->document, null, null, $this->config);
    }

    return reset($candidates);
}
||||
517 | |||||
/**
 * Decode a '[]TJ' command and attempt to use alternate
 * fonts if the current font results in output that contains
 * Unicode control characters or NUL bytes.
 *
 * The fonts of $page are tried in the order returned by getFonts(). If
 * none of them yields clean output, the text decoded with the font that
 * was passed in is returned. Without a page no fallback is attempted.
 *
 * @internal
 *
 * @param Font                                 $font       font to try first
 * @param array<int,array<string,string|bool>> $command
 * @param Page|null                            $page       page providing the alternate fonts
 * @param float                                $fontFactor forwarded unchanged to Font::decodeText()
 *
 * @return string the decoded text
 */
private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string
{
    $orig_text = $font->decodeText($command, $fontFactor);
    $text = $orig_text;

    // If we make this a Config option, we can add a check if it's
    // enabled here.
    if (null !== $page) {
        $font_ids = array_keys($page->getFonts());

        // If the decoded text contains UTF-8 control characters
        // then the font page being used is probably the wrong one.
        // Loop through the rest of the fonts to see if we can get
        // a good decode. Allow x09 to x0d which are whitespace.
        //
        // The second test looks for a raw NUL byte, which the first
        // test cannot report when $text is not valid UTF-8 (preg_match()
        // fails under the /u modifier). It searches the bytes directly:
        // looking for '00' in the hex dump also matches across byte
        // boundaries (e.g. " \n" is "200a"), flagging legitimate text.
        while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
            // If we're out of font IDs, then give up and use the
            // original string
            if (0 == \count($font_ids)) {
                return $orig_text;
            }

            // Try the next font ID
            $font = $page->getFont(array_shift($font_ids));
            $text = $font->decodeText($command, $fontFactor);
        }
    }

    return $text;
}
||||
556 | |||||
/**
 * Expects a string that is a full PDF dictionary object,
 * including the outer enclosing << >> angle brackets, and converts it
 * into a nested PHP array.
 *
 * The text is split on <<, >>, [, ] and /Name tokens:
 *  - inside a dictionary, a /Name becomes the pending key and the next
 *    non-name token becomes its value; a /Name that directly follows
 *    another /Name replaces the pending key;
 *  - inside [ ], names (without the slash) and other tokens are appended;
 *  - an array that ends up holding a single non-name string is collapsed
 *    back into the string '[...]'.
 * Values are kept as trimmed strings; no type conversion is done.
 *
 * If an array or nested dictionary is never closed, the innermost open
 * container is what gets returned. If closing tokens outnumber the opening
 * ones and nothing array-like is left to return, an empty array is
 * returned.
 *
 * @internal
 *
 * @param string $dictionary dictionary source text
 *
 * @return array the parsed entries
 *
 * @throws InvalidDictionaryObjectException when the text does not start with <<
 */
public function parseDictionary(string $dictionary): array
{
    // Normalize whitespace
    $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

    if ('<<' != substr($dictionary, 0, 2)) {
        throw new InvalidDictionaryObjectException('Not a valid dictionary object.');
    }

    $parsed = [];
    $stack = [];
    $currentName = '';
    $arrayTypeNumeric = false;

    // Remove outer layer of dictionary, and split on tokens
    $split = preg_split(
        '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
        trim(preg_replace('/^<<|>>$/', '', $dictionary)),
        -1,
        \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
    );

    foreach ($split as $token) {
        $token = trim($token);
        switch ($token) {
            case '':
                break;

                // Open numeric array
            case '[':
                $parsed[$currentName] = [];
                $arrayTypeNumeric = true;

                // Move up one level in the stack: remember the current
                // container and make $parsed point at the new child
                $stack[\count($stack)] = &$parsed;
                $parsed = &$parsed[$currentName];
                $currentName = '';
                break;

                // Open hashed array
            case '<<':
                $parsed[$currentName] = [];
                $arrayTypeNumeric = false;

                // Move up one level in the stack
                $stack[\count($stack)] = &$parsed;
                $parsed = &$parsed[$currentName];
                $currentName = '';
                break;

                // Close numeric array
            case ']':
                // Revert string type arrays back to a single element.
                // $parsed is a reference into the parent container, so
                // this rewrites the parent's entry in place.
                if (\is_array($parsed) && 1 == \count($parsed)
                    && isset($parsed[0]) && \is_string($parsed[0])
                    && '' !== $parsed[0] && '/' != $parsed[0][0]) {
                    $parsed = '['.$parsed[0].']';
                }
                // Close hashed array
                // no break
            case '>>':
                $arrayTypeNumeric = false;

                // Move down one level in the stack
                $parsed = &$stack[\count($stack) - 1];
                unset($stack[\count($stack) - 1]);
                break;

            default:
                // If value begins with a slash, then this is a name
                // Add it to the appropriate array
                if ('/' == substr($token, 0, 1)) {
                    $currentName = substr($token, 1);
                    if (true == $arrayTypeNumeric) {
                        $parsed[] = $currentName;
                        $currentName = '';
                    }
                } elseif ('' != $currentName) {
                    if (false == $arrayTypeNumeric) {
                        $parsed[$currentName] = $token;
                    }
                    $currentName = '';
                } elseif ('' == $currentName) {
                    $parsed[] = $token;
                }
        }
    }

    // A closing token met while the stack is empty rebinds $parsed to a
    // null slot. Returning that would violate the declared return type
    // and abort the caller with a TypeError, so malformed input of that
    // kind yields an empty result instead.
    return \is_array($parsed) ? $parsed : [];
}
||||
655 | |||||
/**
 * Returns the text content of a PDF as a string. Attempts to add
 * whitespace for spacing and line-breaks where appropriate.
 *
 * getText() leverages getTextArray() to get the content
 * of the document, setting the addPositionWhitespace flag to true
 * so whitespace is inserted in a logical way for reading by
 * humans. The flag is switched off again even when getTextArray()
 * throws, so a failed extraction cannot change the output of later
 * getTextArray() calls on this object.
 *
 * @param Page|null $page forwarded unchanged to getTextArray()
 *
 * @return string the extracted pieces joined together, followed by one space
 */
public function getText(?Page $page = null): string
{
    $this->addPositionWhitespace = true;

    try {
        $result = $this->getTextArray($page);
    } finally {
        $this->addPositionWhitespace = false;
    }

    return implode('', $result).' ';
}
||||
673 | |||||
674 | /** |
||||
675 | * Returns the text content of a PDF as an array of strings. No |
||||
676 | * extra whitespace is inserted besides what is actually encoded in |
||||
677 | * the PDF text. |
||||
678 | * |
||||
679 | * @throws \Exception |
||||
680 | */ |
||||
681 | 48 | public function getTextArray(?Page $page = null): array |
|||
682 | { |
||||
683 | 48 | $result = []; |
|||
684 | 48 | $text = []; |
|||
685 | |||||
686 | 48 | $marked_stack = []; |
|||
687 | 48 | $last_written_position = false; |
|||
688 | |||||
689 | 48 | $sections = $this->getSectionsText($this->content); |
|||
690 | 48 | $current_font = $this->getDefaultFont($page); |
|||
691 | 48 | $current_font_size = 1; |
|||
692 | 48 | $current_text_leading = 0; |
|||
693 | |||||
694 | 48 | $current_position = ['x' => false, 'y' => false]; |
|||
695 | 48 | $current_position_tm = [ |
|||
696 | 48 | 'a' => 1, 'b' => 0, 'c' => 0, |
|||
697 | 48 | 'i' => 0, 'j' => 1, 'k' => 0, |
|||
698 | 48 | 'x' => 0, 'y' => 0, 'z' => 1, |
|||
699 | 48 | ]; |
|||
700 | 48 | $current_position_td = ['x' => 0, 'y' => 0]; |
|||
701 | 48 | $current_position_cm = [ |
|||
702 | 48 | 'a' => 1, 'b' => 0, 'c' => 0, |
|||
703 | 48 | 'i' => 0, 'j' => 1, 'k' => 0, |
|||
704 | 48 | 'x' => 0, 'y' => 0, 'z' => 1, |
|||
705 | 48 | ]; |
|||
706 | |||||
707 | 48 | $clipped_font = []; |
|||
708 | 48 | $clipped_position_cm = []; |
|||
709 | |||||
710 | 48 | self::$recursionStack[] = $this->getUniqueId(); |
|||
711 | |||||
712 | 48 | foreach ($sections as $section) { |
|||
713 | 45 | $commands = $this->getCommandsText($section); |
|||
714 | 45 | foreach ($commands as $command) { |
|||
715 | 45 | switch ($command[self::OPERATOR]) { |
|||
716 | // Begin text object |
||||
717 | 45 | case 'BT': |
|||
718 | // Reset text positioning matrices |
||||
719 | 44 | $current_position_tm = [ |
|||
720 | 44 | 'a' => 1, 'b' => 0, 'c' => 0, |
|||
721 | 44 | 'i' => 0, 'j' => 1, 'k' => 0, |
|||
722 | 44 | 'x' => 0, 'y' => 0, 'z' => 1, |
|||
723 | 44 | ]; |
|||
724 | 44 | $current_position_td = ['x' => 0, 'y' => 0]; |
|||
725 | 44 | $current_text_leading = 0; |
|||
726 | 44 | break; |
|||
727 | |||||
728 | // Begin marked content sequence with property list |
||||
729 | 45 | case 'BDC': |
|||
730 | 16 | if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) { |
|||
731 | 16 | $dict = $this->parseDictionary($match[1]); |
|||
732 | |||||
733 | // Check for ActualText block |
||||
734 | 16 | if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) { |
|||
735 | 4 | if ('[' == $dict['ActualText'][0]) { |
|||
736 | // Simulate a 'TJ' command on the stack |
||||
737 | $marked_stack[] = [ |
||||
738 | 'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0], |
||||
739 | ]; |
||||
740 | 4 | } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) { |
|||
741 | // Simulate a 'Tj' command on the stack |
||||
742 | 4 | $marked_stack[] = [ |
|||
743 | 4 | 'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0], |
|||
744 | 4 | ]; |
|||
745 | } |
||||
746 | } |
||||
747 | } |
||||
748 | 16 | break; |
|||
749 | |||||
750 | // Begin marked content sequence |
||||
751 | 45 | case 'BMC': |
|||
752 | 2 | if ('ReversedChars' == $command[self::COMMAND]) { |
|||
753 | // Upon encountering a ReversedChars command, |
||||
754 | // add the characters we've built up so far to |
||||
755 | // the result array |
||||
756 | 1 | $result = array_merge($result, $text); |
|||
757 | |||||
758 | // Start a fresh $text array that will contain |
||||
759 | // reversed characters |
||||
760 | 1 | $text = []; |
|||
761 | |||||
762 | // Add the reversed text flag to the stack |
||||
763 | 1 | $marked_stack[] = ['ReversedChars' => true]; |
|||
764 | } |
||||
765 | 2 | break; |
|||
766 | |||||
767 | // set graphics position matrix |
||||
768 | 45 | case 'cm': |
|||
769 | 31 | $args = preg_split('/\s+/s', $command[self::COMMAND]); |
|||
770 | 31 | $current_position_cm = [ |
|||
771 | 31 | 'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0, |
|||
772 | 31 | 'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0, |
|||
773 | 31 | 'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1, |
|||
774 | 31 | ]; |
|||
775 | 31 | break; |
|||
776 | |||||
777 | 45 | case 'Do': |
|||
778 | 16 | if (null !== $page) { |
|||
779 | 16 | $args = preg_split('/\s/s', $command[self::COMMAND]); |
|||
780 | 16 | $id = trim(array_pop($args), '/ '); |
|||
781 | 16 | $xobject = $page->getXObject($id); |
|||
782 | |||||
783 | // @todo $xobject could be a ElementXRef object, which would then throw an error |
||||
784 | 16 | if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) { |
|||
785 | // Not a circular reference. |
||||
786 | 16 | $text[] = $xobject->getText($page); |
|||
787 | } |
||||
788 | } |
||||
789 | 16 | break; |
|||
790 | |||||
791 | // Marked content point with (DP) & without (MP) property list |
||||
792 | 45 | case 'DP': |
|||
793 | 45 | case 'MP': |
|||
794 | 1 | break; |
|||
795 | |||||
796 | // End text object |
||||
797 | 45 | case 'ET': |
|||
798 | 44 | break; |
|||
799 | |||||
800 | // Store current selected font and graphics matrix |
||||
801 | 45 | case 'q': |
|||
802 | 39 | $clipped_font[] = [$current_font, $current_font_size]; |
|||
803 | 39 | $clipped_position_cm[] = $current_position_cm; |
|||
804 | 39 | break; |
|||
805 | |||||
806 | // Restore previous selected font and graphics matrix |
||||
807 | 45 | case 'Q': |
|||
808 | 39 | list($current_font, $current_font_size) = array_pop($clipped_font); |
|||
809 | 39 | $current_position_cm = array_pop($clipped_position_cm); |
|||
810 | 39 | break; |
|||
811 | |||||
812 | // End marked content sequence |
||||
813 | 44 | case 'EMC': |
|||
814 | 17 | $data = false; |
|||
815 | 17 | if (\count($marked_stack)) { |
|||
816 | 5 | $marked = array_pop($marked_stack); |
|||
817 | 5 | $action = key($marked); |
|||
818 | 5 | $data = $marked[$action]; |
|||
819 | |||||
820 | switch ($action) { |
||||
821 | // If we are in ReversedChars mode... |
||||
822 | 5 | case 'ReversedChars': |
|||
823 | // Reverse the characters we've built up so far |
||||
824 | 1 | foreach ($text as $key => $t) { |
|||
825 | 1 | $text[$key] = implode('', array_reverse( |
|||
826 | 1 | mb_str_split($t, 1, mb_internal_encoding()) |
|||
827 | 1 | )); |
|||
828 | } |
||||
829 | |||||
830 | // Add these characters to the result array |
||||
831 | 1 | $result = array_merge($result, $text); |
|||
832 | |||||
833 | // Start a fresh $text array that will contain |
||||
834 | // non-reversed characters |
||||
835 | 1 | $text = []; |
|||
836 | 1 | break; |
|||
837 | |||||
838 | 4 | case 'ActualText': |
|||
839 | // Use the content of the ActualText as a command |
||||
840 | 4 | $command = $data; |
|||
841 | 4 | break; |
|||
842 | } |
||||
843 | } |
||||
844 | |||||
845 | // If this EMC command has been transformed into a 'Tj' |
||||
846 | // or 'TJ' command because of being ActualText, then bypass |
||||
847 | // the break to proceed to the writing section below. |
||||
848 | 17 | if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) { |
|||
849 | 17 | break; |
|||
850 | } |
||||
851 | |||||
852 | // no break |
||||
853 | 44 | case "'": |
|||
854 | 44 | case '"': |
|||
855 | 4 | if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) { |
|||
856 | // Move to next line and write text |
||||
857 | $current_position['x'] = 0; |
||||
858 | $current_position_td['x'] = 0; |
||||
859 | $current_position_td['y'] += $current_text_leading; |
||||
860 | } |
||||
861 | // no break |
||||
862 | 44 | case 'Tj': |
|||
863 | 35 | $command[self::COMMAND] = [$command]; |
|||
864 | // no break |
||||
865 | 44 | case 'TJ': |
|||
866 | // Check the marked content stack for flags |
||||
867 | 44 | $actual_text = false; |
|||
868 | 44 | $reverse_text = false; |
|||
869 | 44 | foreach ($marked_stack as $marked) { |
|||
870 | 5 | if (isset($marked['ActualText'])) { |
|||
871 | 4 | $actual_text = true; |
|||
872 | } |
||||
873 | 5 | if (isset($marked['ReversedChars'])) { |
|||
874 | 1 | $reverse_text = true; |
|||
875 | } |
||||
876 | } |
||||
877 | |||||
878 | // Account for text position ONLY just before we write text |
||||
879 | 44 | if (false === $actual_text && \is_array($last_written_position)) { |
|||
880 | // If $last_written_position is an array, that |
||||
881 | // means we have stored text position coordinates |
||||
882 | // for placing an ActualText |
||||
883 | 4 | $currentX = $last_written_position[0]; |
|||
884 | 4 | $currentY = $last_written_position[1]; |
|||
885 | 4 | $last_written_position = false; |
|||
886 | } else { |
||||
887 | 44 | $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x']; |
|||
888 | 44 | $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y']; |
|||
889 | } |
||||
890 | 44 | $whiteSpace = ''; |
|||
891 | |||||
892 | 44 | $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i']; |
|||
893 | 44 | $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j']; |
|||
894 | |||||
895 | 44 | if (true === $this->addPositionWhitespace && false !== $current_position['x']) { |
|||
896 | 31 | $curY = $currentY - $current_position['y']; |
|||
897 | 31 | if (abs($curY) >= abs($factorY) / 4) { |
|||
898 | 30 | $whiteSpace = "\n"; |
|||
899 | } else { |
||||
900 | 30 | if (true === $reverse_text) { |
|||
901 | 1 | $curX = $current_position['x'] - $currentX; |
|||
902 | } else { |
||||
903 | 30 | $curX = $currentX - $current_position['x']; |
|||
904 | } |
||||
905 | |||||
906 | // In abs($factorX * 7) below, the 7 is chosen arbitrarily |
||||
907 | // as the number of apparent "spaces" in a document we |
||||
908 | // would need before considering them a "tab". In the |
||||
909 | // future, we might offer this value to users as a config |
||||
910 | // option. |
||||
911 | 30 | if ($curX >= abs($factorX * 7)) { |
|||
912 | 20 | $whiteSpace = "\t"; |
|||
913 | 29 | } elseif ($curX >= abs($factorX * 2)) { |
|||
914 | 19 | $whiteSpace = ' '; |
|||
915 | } |
||||
916 | } |
||||
917 | } |
||||
918 | |||||
919 | 44 | $newtext = $this->getTJUsingFontFallback( |
|||
920 | 44 | $current_font, |
|||
921 | 44 | $command[self::COMMAND], |
|||
922 | 44 | $page, |
|||
923 | 44 | $factorX |
|||
924 | 44 | ); |
|||
925 | |||||
926 | // If there is no ActualText pending then write |
||||
927 | 44 | if (false === $actual_text) { |
|||
928 | 44 | $newtext = str_replace(["\r", "\n"], '', $newtext); |
|||
929 | 44 | if (false !== $reverse_text) { |
|||
930 | // If we are in ReversedChars mode, add the whitespace last |
||||
931 | 1 | $text[] = preg_replace('/ $/', ' ', $newtext.$whiteSpace); |
|||
932 | } else { |
||||
933 | // Otherwise add the whitespace first |
||||
934 | 44 | if (' ' === $whiteSpace && isset($text[\count($text) - 1])) { |
|||
935 | 18 | $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]); |
|||
936 | } |
||||
937 | 44 | $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext); |
|||
938 | } |
||||
939 | |||||
940 | // Record the position of this inserted text for comparison |
||||
941 | // with the next text block. |
||||
942 | // Provide a 'fudge' factor guess on how wide this text block |
||||
943 | // is based on the number of characters. This helps limit the |
||||
944 | // number of tabs inserted, but isn't perfect. |
||||
945 | 44 | $factor = $factorX / 2; |
|||
946 | 44 | $current_position = [ |
|||
947 | 44 | 'x' => $currentX - mb_strlen($newtext) * $factor, |
|||
948 | 44 | 'y' => $currentY, |
|||
949 | 44 | ]; |
|||
950 | 4 | } elseif (false === $last_written_position) { |
|||
951 | // If there is an ActualText in the pipeline |
||||
952 | // store the position this undisplayed text |
||||
953 | // *would* have been written to, so the |
||||
954 | // ActualText is displayed in the right spot |
||||
955 | 4 | $last_written_position = [$currentX, $currentY]; |
|||
956 | 4 | $current_position['x'] = $currentX; |
|||
957 | } |
||||
958 | 44 | break; |
|||
959 | |||||
960 | // move to start of next line |
||||
961 | 44 | case 'T*': |
|||
962 | 13 | $current_position['x'] = 0; |
|||
963 | 13 | $current_position_td['x'] = 0; |
|||
964 | 13 | $current_position_td['y'] += $current_text_leading; |
|||
965 | 13 | break; |
|||
966 | |||||
967 | // set character spacing |
||||
968 | 44 | case 'Tc': |
|||
969 | 13 | break; |
|||
970 | |||||
971 | // move text current point and set leading |
||||
972 | 44 | case 'Td': |
|||
973 | 44 | case 'TD': |
|||
974 | // move text current point |
||||
975 | 32 | $args = preg_split('/\s+/s', $command[self::COMMAND]); |
|||
976 | 32 | $y = (float) array_pop($args); |
|||
977 | 32 | $x = (float) array_pop($args); |
|||
978 | |||||
979 | 32 | if ('TD' == $command[self::OPERATOR]) { |
|||
980 | 7 | $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j']; |
|||
981 | } |
||||
982 | |||||
983 | 32 | $current_position_td = [ |
|||
984 | 32 | 'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'], |
|||
985 | 32 | 'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'], |
|||
986 | 32 | ]; |
|||
987 | 32 | break; |
|||
988 | |||||
989 | 44 | case 'Tf': |
|||
990 | 44 | $args = preg_split('/\s/s', $command[self::COMMAND]); |
|||
991 | 44 | $size = (float) array_pop($args); |
|||
992 | 44 | $id = trim(array_pop($args), '/'); |
|||
993 | 44 | if (null !== $page) { |
|||
994 | 44 | $new_font = $page->getFont($id); |
|||
995 | // If an invalid font ID is given, do not update the font. |
||||
996 | // This should theoretically never happen, as the PDF spec states for the Tf operator: |
||||
997 | // "The specified font value shall match a resource name in the Font entry of the default resource dictionary" |
||||
998 | // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435) |
||||
999 | // But we want to make sure that malformed PDFs do not simply crash. |
||||
1000 | 44 | if (null !== $new_font) { |
|||
1001 | 44 | $current_font = $new_font; |
|||
1002 | 44 | $current_font_size = $size; |
|||
1003 | } |
||||
1004 | } |
||||
1005 | 44 | break; |
|||
1006 | |||||
1007 | // set leading |
||||
1008 | 38 | case 'TL': |
|||
1009 | 6 | $y = (float) $command[self::COMMAND]; |
|||
1010 | 6 | $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j']; |
|||
1011 | 6 | break; |
|||
1012 | |||||
1013 | // set text position matrix |
||||
1014 | 38 | case 'Tm': |
|||
1015 | 35 | $args = preg_split('/\s+/s', $command[self::COMMAND]); |
|||
1016 | 35 | $current_position_tm = [ |
|||
1017 | 35 | 'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0, |
|||
1018 | 35 | 'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0, |
|||
1019 | 35 | 'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1, |
|||
1020 | 35 | ]; |
|||
1021 | 35 | break; |
|||
1022 | |||||
1023 | // set text rendering mode |
||||
1024 | 23 | case 'Ts': |
|||
1025 | break; |
||||
1026 | |||||
1027 | // set super/subscripting text rise |
||||
1028 | 23 | case 'Ts': |
|||
1029 | break; |
||||
1030 | |||||
1031 | // set word spacing |
||||
1032 | 23 | case 'Tw': |
|||
1033 | 9 | break; |
|||
1034 | |||||
1035 | // set horizontal scaling |
||||
1036 | 23 | case 'Tz': |
|||
1037 | break; |
||||
1038 | |||||
1039 | default: |
||||
1040 | } |
||||
1041 | } |
||||
1042 | } |
||||
1043 | |||||
1044 | 48 | $result = array_merge($result, $text); |
|||
1045 | |||||
1046 | 48 | return $result; |
|||
1047 | } |
||||
1048 | |||||
/**
 * Splits one content-stream command into its type, operator and operand.
 *
 * $text_part has to be a single, already formatted command line (the
 * existing documentation names getSectionsText() as the companion that
 * produces such lines). Because only one command is handled per call,
 * $offset is neither read nor written here; it only remains in the
 * signature for backward compatibility and may be removed in a future
 * PdfParser release. getCommandText() would be the more fitting name.
 *
 * @param string $text_part a single command line, operands first and
 *                          the operator last
 * @param int    $offset    unused, kept for backward compatibility
 *
 * @return array an empty array when no operator is recognised, otherwise
 *               a list holding exactly one entry keyed by self::TYPE,
 *               self::OPERATOR and self::COMMAND; for the 'TJ' operator
 *               the self::COMMAND value is itself a list of string and
 *               numeric ('n') sub-entries
 */
public function getCommandsText(string $text_part, int &$offset = 0): array
{
    $parts = [];
    preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $parts);

    // Without a full "operands + operator" match there is nothing to report
    if (!isset($parts[1], $parts[2], $parts[3])) {
        return [];
    }

    $type = $parts[2];
    $operator = $parts[3];
    $command = trim($parts[1]);

    if ('TJ' === $operator) {
        // Operand formats of a TJ array, tried in this order on each pass:
        // literal () strings first, hexadecimal <> strings second
        $operandPatterns = [
            '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
            '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
        ];

        $pieces = [];
        $remaining = trim($command, '[]');
        do {
            $before = $remaining;

            foreach ($operandPatterns as $delimiter => $pattern) {
                if (!preg_match($pattern, $remaining, $found)) {
                    continue;
                }

                // Hexadecimal strings may contain whitespace, which is
                // stripped; literal strings are kept as matched
                $pieces[] = [
                    self::TYPE => $delimiter,
                    self::OPERATOR => 'TJ',
                    self::COMMAND => '<' === $delimiter ? preg_replace('/\s/', '', $found[1]) : $found[1],
                ];

                // A number following the string becomes its own 'n' entry
                if (isset($found[2]) && trim($found[2])) {
                    $pieces[] = [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => $found[2],
                    ];
                }

                // Consume what was just matched
                $remaining = substr($remaining, \strlen($found[0]));
            }

            // Stop as soon as a full pass consumed nothing
        } while ($remaining !== $before);

        $command = $pieces;
    } elseif (\in_array($operator, ['Tj', "'", '"'], true)) {
        if ('(' === $type) {
            // Remove only the outermost parentheses. trim() is avoided
            // because a () string may end with a balanced or escaped
            // right parenthesis, and trim() would delete both:
            // eg. (String())
            // eg. (String\))
            $command = preg_replace('/^\(|\)$/', '', $command);
        } elseif ('<' === $type) {
            $command = trim($command, '<>');
        }
    } elseif ('/' === $type) {
        // Drop the leading slash
        $command = substr($command, 1);
    }

    return [
        [
            self::TYPE => $type,
            self::OPERATOR => $operator,
            self::COMMAND => $command,
        ],
    ];
}
||||
1143 | |||||
/**
 * Creates the object class that matches the header's Type entry.
 *
 * The 'Type' (and, for XObject and Font, 'Subtype') entries of $header
 * select the class: XObject/Image => Image, XObject/Form => Form,
 * Pages => Pages, Page => Page, Encoding => Encoding, and Font => the
 * class \Smalot\PdfParser\Font\Font<Subtype> when it exists, else Font.
 * Every other combination yields a plain instance of this class.
 *
 * @param Document    $document passed through to the created object
 * @param Header      $header   header whose Type/Subtype drive the choice
 * @param string|null $content  raw content passed to the created object
 * @param Config|null $config   optional configuration; when given, its
 *                              getRetainImageContent() decides whether an
 *                              Image receives $content or null
 */
public static function factory(
    Document $document,
    Header $header,
    ?string $content,
    ?Config $config = null
): self {
    switch ($header->get('Type')->getContent()) {
        case 'XObject':
            switch ($header->get('Subtype')->getContent()) {
                case 'Image':
                    // $config is nullable, so it must not be dereferenced
                    // blindly. Without a Config nothing asks for the image
                    // data to be discarded, hence the content is kept.
                    $retainContent = null === $config || $config->getRetainImageContent();

                    return new Image($document, $header, $retainContent ? $content : null, $config);

                case 'Form':
                    return new Form($document, $header, $content, $config);
            }

            // XObject with any other subtype
            return new self($document, $header, $content, $config);

        case 'Pages':
            return new Pages($document, $header, $content, $config);

        case 'Page':
            return new Page($document, $header, $content, $config);

        case 'Encoding':
            return new Encoding($document, $header, $content, $config);

        case 'Font':
            $subtype = $header->get('Subtype')->getContent();
            $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

            // Prefer the subtype-specific font class when one is available
            if (class_exists($classname)) {
                return new $classname($document, $header, $content, $config);
            }

            return new Font($document, $header, $content, $config);

        default:
            return new self($document, $header, $content, $config);
    }
}
||||
1185 | |||||
/**
 * Gives the identifier used to tell this object instance apart from
 * others, e.g. when it is looked up in self::$recursionStack.
 *
 * @return string the value of spl_object_hash() for this instance
 */
protected function getUniqueId(): string
{
    $objectHash = spl_object_hash($this);

    return $objectHash;
}
||||
1193 | } |
||||
1194 |
This check compares calls to functions or methods with their respective definitions. If a call passes more arguments than the definition declares, it raises an issue.
If a function is defined several times with a different number of parameters, the check may pick up the wrong definition and report false positives. One codebase where this is known to happen is WordPress. Please note the @ignore annotation hint above.