1 | <?php |
||||||
2 | |||||||
3 | /** |
||||||
4 | * @file |
||||||
5 | * This file is part of the PdfParser library. |
||||||
6 | * |
||||||
7 | * @author Sébastien MALOT <[email protected]> |
||||||
8 | * |
||||||
9 | * @date 2017-01-03 |
||||||
10 | * |
||||||
11 | * @license LGPLv3 |
||||||
12 | * |
||||||
13 | * @url <https://github.com/smalot/pdfparser> |
||||||
14 | * |
||||||
15 | * PdfParser is a pdf library written in PHP, extraction oriented. |
||||||
16 | * Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
||||||
17 | * |
||||||
18 | * This program is free software: you can redistribute it and/or modify |
||||||
19 | * it under the terms of the GNU Lesser General Public License as published by |
||||||
20 | * the Free Software Foundation, either version 3 of the License, or |
||||||
21 | * (at your option) any later version. |
||||||
22 | * |
||||||
23 | * This program is distributed in the hope that it will be useful, |
||||||
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||||
26 | * GNU Lesser General Public License for more details. |
||||||
27 | * |
||||||
28 | * You should have received a copy of the GNU Lesser General Public License |
||||||
29 | * along with this program. |
||||||
30 | * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
||||||
31 | */ |
||||||
32 | |||||||
33 | namespace Smalot\PdfParser; |
||||||
34 | |||||||
35 | use Smalot\PdfParser\XObject\Form; |
||||||
36 | use Smalot\PdfParser\XObject\Image; |
||||||
37 | |||||||
38 | /** |
||||||
39 | * Class PDFObject |
||||||
40 | */ |
||||||
41 | class PDFObject |
||||||
42 | { |
||||||
43 | public const TYPE = 't'; |
||||||
44 | |||||||
45 | public const OPERATOR = 'o'; |
||||||
46 | |||||||
47 | public const COMMAND = 'c'; |
||||||
48 | |||||||
49 | /** |
||||||
50 | * The recursion stack. |
||||||
51 | * |
||||||
52 | * @var array |
||||||
53 | */ |
||||||
54 | public static $recursionStack = []; |
||||||
55 | |||||||
56 | /** |
||||||
57 | * @var Document|null |
||||||
58 | */ |
||||||
59 | protected $document; |
||||||
60 | |||||||
61 | /** |
||||||
62 | * @var Header |
||||||
63 | */ |
||||||
64 | protected $header; |
||||||
65 | |||||||
66 | /** |
||||||
67 | * @var string |
||||||
68 | */ |
||||||
69 | protected $content; |
||||||
70 | |||||||
71 | /** |
||||||
72 | * @var Config|null |
||||||
73 | */ |
||||||
74 | protected $config; |
||||||
75 | |||||||
76 | /** |
||||||
77 | * @var bool |
||||||
78 | */ |
||||||
79 | protected $addPositionWhitespace = false; |
||||||
80 | |||||||
/**
 * Stores the collaborators handed to this object.
 *
 * @param Document    $document document kept on the object and later returned by getDocument()
 * @param Header|null $header   header to use; when null a new, empty Header is created
 * @param string|null $content  content stored as given (null is kept as null)
 * @param Config|null $config   configuration stored as given (null is kept as null)
 */
public function __construct(
    Document $document,
    ?Header $header = null,
    ?string $content = null,
    ?Config $config = null
) {
    if (null === $header) {
        $header = new Header();
    }

    $this->document = $document;
    $this->header = $header;
    $this->content = $content;
    $this->config = $config;
}
||||||
92 | |||||||
/**
 * Does nothing in this class.
 *
 * NOTE(review): presumably an initialization hook for subclasses to
 * override - confirm against the classes extending PDFObject.
 */
public function init()
{
    // Intentionally empty.
}
|||||
96 | |||||||
/**
 * Returns the document stored on this object by the constructor.
 */
public function getDocument(): Document
{
    $document = $this->document;

    return $document;
}
||||||
101 | |||||||
/**
 * Returns the header stored on this object.
 */
public function getHeader(): ?Header
{
    $header = $this->header;

    return $header;
}
||||||
106 | |||||||
/**
 * Returns the configuration stored on this object, or null when
 * none was given to the constructor.
 */
public function getConfig(): ?Config
{
    $config = $this->config;

    return $config;
}
||||||
111 | |||||||
/**
 * Looks up $name in this object's header by delegating to the
 * header's own get() method.
 *
 * @param string $name name of the header entry to fetch
 *
 * @return Element|PDFObject|Header
 */
public function get(string $name)
{
    $header = $this->header;

    return $header->get($name);
}
||||||
119 | |||||||
/**
 * Reports whether this object's header has an entry called $name,
 * as answered by the header's own has() method.
 *
 * @param string $name name of the header entry to test
 */
public function has(string $name): bool
{
    $header = $this->header;

    return $header->has($name);
}
||||||
124 | |||||||
/**
 * Returns the details array produced by the header's getDetails().
 *
 * @param bool $deep forwarded unchanged to the header
 */
public function getDetails(bool $deep = true): array
{
    $header = $this->header;

    return $header->getDetails($deep);
}
||||||
129 | |||||||
/**
 * Returns the content stored on this object, or null when there is
 * none.
 */
public function getContent(): ?string
{
    $content = $this->content;

    return $content;
}
||||||
134 | |||||||
/**
 * Creates a duplicate of the document stream with
 * strings and other items replaced by $char. Formerly
 * getSectionsText() used this output to more easily gather offset
 * values to extract text from the *actual* document stream.
 *
 * Every substitution keeps the byte length of what it replaces, so
 * offsets in the returned string line up with offsets in $content.
 *
 * @deprecated function is no longer used and will be removed in a future release
 *
 * @internal
 *
 * @param string $content raw document stream
 * @param string $char    mask character; only its first byte is used,
 *                        and an empty string falls back to 'X'
 *
 * @return string the masked copy of $content
 */
public function cleanContent(string $content, string $char = 'X')
{
    // An empty mask would make every replacement shrink the stream
    // and break the offset guarantee, so fall back to the default.
    $char = '' === $char ? 'X' : $char[0];

    // Overwrites each [text, offset] capture with the mask character.
    $maskCaptures = static function (string $subject, array $captures) use ($char): string {
        foreach ($captures as [$captured, $offset]) {
            $length = \strlen($captured);
            $subject = substr_replace($subject, str_repeat($char, $length), $offset, $length);
        }

        return $subject;
    };

    // Neutralize escaped backslashes and parentheses first so they
    // cannot be mistaken for string delimiters below
    $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

    // Remove image bloc with binary content
    preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
    $content = $maskCaptures($content, $matches[0]);

    // Clean content in square brackets [.....]
    preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE);
    $content = $maskCaptures($content, $matches[1]);

    // Clean content in round brackets (.....)
    preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE);
    $content = $maskCaptures($content, $matches[1]);

    // Clean structure: everything from an opening '<' up to and
    // including its closing '>' is masked, nesting included
    if ($parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE)) {
        $content = '';
        $level = 0;
        foreach ($parts as $part) {
            if ('<' == $part) {
                ++$level;
            }

            $content .= (0 == $level ? $part : str_repeat($char, \strlen($part)));

            if ('>' == $part) {
                --$level;
            }
        }
    }

    // Clean BDC and EMC markup. The mask character is quoted with the
    // pattern delimiter so that a '/' mask cannot terminate the regex.
    preg_match_all(
        '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
        $content,
        $matches,
        \PREG_OFFSET_CAPTURE
    );
    $content = $maskCaptures($content, $matches[1]);

    preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
    $content = $maskCaptures($content, $matches[1]);

    return $content;
}
||||||
203 | |||||||
/**
 * Takes a string of PDF document stream text and formats
 * it into a multi-line string with one PDF command on each line,
 * separated by \r\n. If the given string is null, or binary data
 * is detected instead of a document stream then return an empty
 * string.
 *
 * Inline images, (strings) and << >> dictionaries are swapped for
 * unique placeholders while operators are split onto separate lines,
 * and are put back once the splitting is done.
 *
 * @param string|null $content raw document stream
 *
 * @return string the stream with one command per line, or '' for
 *                null or binary input
 */
private function formatContent(?string $content): string
{
    if (null === $content) {
        return '';
    }

    // Outside of (String) and inline image content in PDF document
    // streams, all text should conform to UTF-8. Test for binary
    // content by deleting everything after the first open-
    // parenthesis ( which indicates the beginning of a string, or
    // the first ID command which indicates the beginning of binary
    // inline image content. Then test what remains for valid
    // UTF-8. If it's not UTF-8, return an empty string as this
    // $content is most likely binary. Unfortunately, using
    // mb_check_encoding(..., 'UTF-8') is not strict enough, so the
    // following regexp, adapted from the W3, is used. See:
    // https://www.w3.org/International/questions/qa-forms-utf-8.en
    // We use preg_replace() instead of preg_match() to avoid "JIT
    // stack limit exhausted" errors on larger files.
    $utf8Filter = preg_replace('/(
        [\x09\x0A\x0D\x20-\x7E] | # ASCII
        [\xC2-\xDF][\x80-\xBF] | # non-overlong 2-byte
        \xE0[\xA0-\xBF][\x80-\xBF] | # excluding overlongs
        [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} | # straight 3-byte
        \xED[\x80-\x9F][\x80-\xBF] | # excluding surrogates
        \xF0[\x90-\xBF][\x80-\xBF]{2} | # planes 1-3
        [\xF1-\xF3][\x80-\xBF]{3} | # planes 4-15
        \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
    )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content));

    if ('' !== $utf8Filter) {
        return '';
    }

    // Find all inline image content and replace them so they aren't
    // affected by the next steps
    $pdfInlineImages = [];
    $offsetBI = 0;
    while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) {
        // Attempt to determine if this instance of the 'BI' command
        // actually occurred within a (string) using the following
        // steps:

        // Step 1: Remove any escaped slashes and parentheses from
        // the alleged image characteristics data
        $para = str_replace(['\\\\', '\\(', '\\)'], '', $text[1][0]);

        // Step 2: Remove all correctly ordered and balanced
        // parentheses from (strings)
        do {
            $paraTest = $para;
            $para = preg_replace('/\(([^()]*)\)/', '$1', $paraTest);
        } while ($para !== $paraTest);

        $paraOpen = strpos($para, '(');
        $paraClose = strpos($para, ')');

        // Check: If the remaining text contains a close parenthesis
        // ')' AND it occurs before any open parenthesis, then we
        // are almost certain to be inside a (string)
        if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) {
            // Bump the search offset forward and match again
            $offsetBI = (int) $text[1][1];
            continue;
        }

        // Step 3: Double check that this is actually inline image
        // data by parsing the alleged image characteristics as a
        // dictionary
        $dict = $this->parseDictionary('<<'.$text[1][0].'>>');

        // Check if an image Width and Height are set in the dict
        if ((isset($dict['W']) || isset($dict['Width']))
            && (isset($dict['H']) || isset($dict['Height']))) {
            $id = uniqid('IMAGE_', true);
            $pdfInlineImages[$id] = [
                preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]),
                preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]),
            ];

            // Splice the placeholder in at the matched offset. Turning
            // the matched text into a regex instead would fail for
            // image data larger than the pattern size PCRE accepts,
            // and could hit an identical earlier run of bytes that
            // was deliberately skipped.
            $content = substr_replace(
                $content,
                '^^^'.$id.'^^^',
                $text[0][1],
                \strlen($text[0][0])
            );
        } else {
            // If there was no valid dictionary, or a height and width
            // weren't specified, then we don't know what this is, so
            // just leave it alone; bump the search offset forward and
            // match again
            $offsetBI = (int) $text[1][1];
        }
    }

    // Find all strings () and replace them so they aren't affected
    // by the next steps
    $pdfstrings = [];
    $attempt = '(';
    while (preg_match('/'.preg_quote($attempt, '/').'.*?\)/s', $content, $text, \PREG_OFFSET_CAPTURE)) {
        [$candidate, $candidateOffset] = $text[0];

        // Remove all escaped slashes and parentheses from the target text
        $para = str_replace(['\\\\', '\\(', '\\)'], '', $candidate);

        // PDF strings can contain unescaped parentheses as long as
        // they're balanced, so check for balanced parentheses
        $left = preg_match_all('/\(/', $para);
        $right = preg_match_all('/\)/', $para);

        if (')' == $para[-1] && $left == $right) {
            // Replace the string with a unique placeholder, spliced in
            // by offset so that long strings never have to be
            // compiled into a regex
            $id = uniqid('STRING_', true);
            $pdfstrings[$id] = $candidate;
            $content = substr_replace(
                $content,
                '@@@'.$id.'@@@',
                $candidateOffset,
                \strlen($candidate)
            );

            // Reset to search for the next string
            $attempt = '(';
        } else {
            // We had unbalanced parentheses, so use the current
            // match as a base to find a longer string
            $attempt = $candidate;
        }
    }

    // Remove all carriage returns and line-feeds from the document stream
    $content = str_replace(["\r", "\n"], ' ', trim($content));

    // Find all dictionary << >> commands and replace them so they
    // aren't affected by the next steps
    $dictstore = [];
    while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext, \PREG_OFFSET_CAPTURE)) {
        $dictid = uniqid('DICT_', true);
        $dictstore[$dictid] = $dicttext[1][0];
        $content = substr_replace(
            $content,
            ' ###'.$dictid.'###'.$dicttext[2][0],
            $dicttext[0][1],
            \strlen($dicttext[0][0])
        );
    }

    // Normalize white-space in the document stream
    $content = preg_replace('/\s{2,}/', ' ', $content);

    // Find all valid PDF operators and add \r\n after each; this
    // ensures there is just one command on every line
    // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
    // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
    // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
    //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions
    //       appear here in the list for completeness.
    $operators = [
        'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
        'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
        'g', 'G', 'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
        'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
        'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
        'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
    ];
    foreach ($operators as $operator) {
        $content = preg_replace(
            '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/',
            $operator."\r\n",
            $content
        );
    }

    // Restore the original content of the dictionary << >> commands
    $dictstore = array_reverse($dictstore, true);
    foreach ($dictstore as $id => $dict) {
        $content = str_replace('###'.$id.'###', $dict, $content);
    }

    // Restore the original string content
    $pdfstrings = array_reverse($pdfstrings, true);
    foreach ($pdfstrings as $id => $text) {
        // Strings may contain escaped newlines, or literal newlines
        // and we should clean these up before replacing the string
        // back into the content stream; this ensures no strings are
        // split between two lines (every command must be on one line)
        $text = str_replace(
            ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
            ['', '', '', '\r', '\n'],
            $text
        );

        $content = str_replace('@@@'.$id.'@@@', $text, $content);
    }

    // Restore the original content of any inline images
    $pdfInlineImages = array_reverse($pdfInlineImages, true);
    foreach ($pdfInlineImages as $id => $image) {
        $content = str_replace(
            '^^^'.$id.'^^^',
            "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." EI\r\n",
            $content
        );
    }

    // Collapse blank lines and strip leading spaces from each line
    $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));

    return $content;
}
||||||
417 | |||||||
/**
 * Runs an entire, unformatted document stream through
 * formatContent() and keeps only the commands that matter for text
 * positioning/extraction. Returns an array of unprocessed PDF
 * commands, one command per element.
 *
 * Everything between a line ending in 'BT' and an 'ET' line is kept
 * (both markers included). Outside of such a block only a fixed set
 * of commands is kept.
 *
 * @internal
 *
 * @param string|null $content raw document stream
 *
 * @return array the retained commands, in stream order
 */
public function getSectionsText(?string $content): array
{
    // A cleaned stream has one command on every line, so splitting
    // on line breaks yields one command per element
    $lines = preg_split(
        '/(\r\n|\n|\r)/',
        $this->formatContent($content),
        -1,
        \PREG_SPLIT_NO_EMPTY
    );

    // Commands kept outside of a text block. Each one is anchored to
    // the end of the line and must not be preceded by a word
    // character, so that e.g. 'cm' never matches a longer token:
    //   BDC / BMC  begin marked content sequence
    //   DP / MP    marked content point
    //   EMC        end marked content sequence
    //   cm         graphics position change
    //   Tf         font change
    //   Do         invoke named XObject
    // Add more commands here as you find them in weird PDFs!
    $keptOutsideText = '/(?<!\w)(?:B[DM]C|[DM]P|EMC|cm|Tf|Do)$/';

    $kept = [];
    $insideText = false;
    foreach ($lines as $rawLine) {
        $command = trim($rawLine);

        // Skip empty lines
        if ('' === $command) {
            continue;
        }

        // A 'BT' opens a text block
        if (preg_match('/BT$/', $command)) {
            $insideText = true;
            $kept[] = $command;
            continue;
        }

        // An 'ET' closes it
        if ('ET' === $command) {
            $insideText = false;
            $kept[] = $command;
            continue;
        }

        // Inside a BT ... ET text block every line is kept
        if ($insideText) {
            $kept[] = $command;
            continue;
        }

        // Outside of a text block: save/restore graphics state
        // (q / Q) plus the commands listed above
        $lastChar = $command[-1];
        if ('q' === $lastChar || 'Q' === $lastChar || preg_match($keptOutsideText, $command)) {
            $kept[] = $command;
        }
    }

    return $kept;
}
||||||
497 | |||||||
/**
 * Picks the font to start with before any font command is seen.
 *
 * Candidates are the fonts of $page (when a page is given) followed
 * by the document's first font; the first candidate wins. When there
 * is no candidate at all a new Font is built from this object's
 * document and config.
 *
 * @param Page|null $page page whose fonts take precedence, if any
 */
private function getDefaultFont(?Page $page = null): Font
{
    $candidates = null === $page ? [] : $page->getFonts();

    $documentFont = $this->document->getFirstFont();
    if (null !== $documentFont) {
        $candidates[] = $documentFont;
    }

    if (0 === \count($candidates)) {
        return new Font($this->document, null, null, $this->config);
    }

    return reset($candidates);
}
||||||
516 | |||||||
/**
 * Decode a '[]TJ' command and attempt to use alternate
 * fonts if the current font results in output that contains
 * Unicode control characters.
 *
 * @internal
 *
 * @param Font                                 $font       font tried first
 * @param array<int,array<string,string|bool>> $command    the command to decode
 * @param Page|null                            $page       page whose fonts serve as fallbacks; no fallback is attempted when null
 * @param float                                $fontFactor forwarded unchanged to Font::decodeText()
 *
 * @return string the first decode free of control characters, or the
 *                decode from $font when every page font has been tried
 */
private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string
{
    $orig_text = $font->decodeText($command, $fontFactor);
    $text = $orig_text;

    // If we make this a Config option, we can add a check if it's
    // enabled here.
    if (null !== $page) {
        $font_ids = array_keys($page->getFonts());

        // If the decoded text contains UTF-8 control characters
        // then the font page being used is probably the wrong one.
        // Loop through the rest of the fonts to see if we can get
        // a good decode. Allow x09 to x0d which are whitespace.
        //
        // The second test looks for a NUL byte in the raw string; it
        // still works when the /u pattern cannot run because $text is
        // not valid UTF-8. It must inspect the bytes themselves:
        // searching a hex dump for '00' also fires on neighbouring
        // bytes such as 0x20 0x0A ('200a'), i.e. a space followed by
        // a line feed.
        while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
            // If we're out of font IDs, then give up and use the
            // original string
            if (0 === \count($font_ids)) {
                return $orig_text;
            }

            // Try the next font ID
            $font = $page->getFont(array_shift($font_ids));
            $text = $font->decodeText($command, $fontFactor);
        }
    }

    return $text;
}
||||||
555 | |||||||
/**
 * Expects a string that is a full PDF dictionary object,
 * including the outer enclosing << >> angle brackets
 *
 * Values are returned as trimmed strings. Nested << >> dictionaries
 * become nested arrays. A [ ] array holding a single non-name string
 * is collapsed back to the string '[...]'. Closing tokens without a
 * matching opener are ignored, and containers left open at the end
 * of the input are returned as parsed so far.
 *
 * @internal
 *
 * @param string $dictionary the dictionary source text
 *
 * @return array the parsed top-level dictionary
 *
 * @throws \Exception if $dictionary does not start with '<<'
 */
public function parseDictionary(string $dictionary): array
{
    // Normalize whitespace
    $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

    if ('<<' != substr($dictionary, 0, 2)) {
        throw new \Exception('Not a valid dictionary object.');
    }

    // $root is what gets returned; $parsed always points at the
    // container currently being filled and $stack holds references to
    // its ancestors.
    $root = [];
    $parsed = &$root;
    $stack = [];
    $currentName = '';
    $arrayTypeNumeric = false;

    // Remove outer layer of dictionary, and split on tokens
    $split = preg_split(
        '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
        trim(preg_replace('/^<<|>>$/', '', $dictionary)),
        -1,
        \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
    );

    foreach ($split as $token) {
        $token = trim($token);
        switch ($token) {
            case '':
                break;

            // Open numeric array / open hashed array
            case '[':
            case '<<':
                $parsed[$currentName] = [];
                $arrayTypeNumeric = ('[' === $token);

                // Move up one level in the stack
                $stack[\count($stack)] = &$parsed;
                $parsed = &$parsed[$currentName];
                $currentName = '';
                break;

            // Close numeric array / close hashed array
            case ']':
            case '>>':
                // A closer with nothing open is malformed input; skip
                // it, otherwise $parsed would be rebound to a
                // non-existent stack slot and stop being an array
                if (0 === \count($stack)) {
                    break;
                }

                // Revert string type arrays back to a single element
                if (']' === $token
                    && \is_array($parsed) && 1 == \count($parsed)
                    && isset($parsed[0]) && \is_string($parsed[0])
                    && '' !== $parsed[0] && '/' != $parsed[0][0]) {
                    $parsed = '['.$parsed[0].']';
                }

                $arrayTypeNumeric = false;

                // Move down one level in the stack
                $parsed = &$stack[\count($stack) - 1];
                unset($stack[\count($stack) - 1]);
                break;

            default:
                if ('/' == substr($token, 0, 1)) {
                    // A token beginning with a slash is a name: inside
                    // a numeric array it is stored as an element,
                    // otherwise it becomes the key for the next value
                    $currentName = substr($token, 1);
                    if ($arrayTypeNumeric) {
                        $parsed[] = $currentName;
                        $currentName = '';
                    }
                } elseif ('' != $currentName) {
                    // A value for the pending key
                    if (!$arrayTypeNumeric) {
                        $parsed[$currentName] = $token;
                    }
                    $currentName = '';
                } else {
                    // A value with no pending key is appended
                    $parsed[] = $token;
                }
        }
    }

    return $root;
}
||||||
654 | |||||||
/**
 * Returns the text content of a PDF as a string. Attempts to add
 * whitespace for spacing and line-breaks where appropriate.
 *
 * getText() leverages getTextArray() to get the content
 * of the document, setting the addPositionWhitespace flag to true
 * so whitespace is inserted in a logical way for reading by
 * humans. The flag is switched back off afterwards, including when
 * getTextArray() throws, so that a failed call cannot change the
 * output of later getTextArray() calls on this object.
 *
 * @param Page|null $page forwarded unchanged to getTextArray()
 *
 * @return string the pieces from getTextArray() joined together,
 *                followed by a single space
 */
public function getText(?Page $page = null): string
{
    $this->addPositionWhitespace = true;
    try {
        $result = $this->getTextArray($page);
    } finally {
        $this->addPositionWhitespace = false;
    }

    return implode('', $result).' ';
}
||||||
672 | |||||||
/**
 * Returns the text content of a PDF as an array of strings.
 *
 * The content stream of this object is split into sections and single
 * commands, and every text-showing operator (Tj, TJ, ' and ") contributes
 * one entry to the returned array. While $this->addPositionWhitespace is
 * false, no extra whitespace is inserted besides what is actually encoded
 * in the PDF text; while it is true, a newline, tab or space is prepended
 * (appended in ReversedChars mode) depending on how far the text position
 * moved since the previously written string.
 *
 * Form XObjects referenced through 'Do' are rendered recursively. The id
 * of the object being processed is kept on self::$recursionStack for the
 * duration of the call only, so circular references are skipped while the
 * same XObject can still be rendered again by a later, unrelated call.
 *
 * @param Page|null $page page used to resolve fonts ('Tf') and XObjects
 *                        ('Do'); when null those operators are ignored
 *
 * @return array the extracted text fragments, in stream order
 *
 * @throws \Exception
 */
public function getTextArray(?Page $page = null): array
{
    $result = [];
    $text = [];

    $marked_stack = [];
    $last_written_position = false;

    $sections = $this->getSectionsText($this->content);
    $current_font = $this->getDefaultFont($page);
    $current_font_size = 1;
    $current_text_leading = 0;

    $current_position = ['x' => false, 'y' => false];
    $current_position_tm = [
        'a' => 1, 'b' => 0, 'c' => 0,
        'i' => 0, 'j' => 1, 'k' => 0,
        'x' => 0, 'y' => 0, 'z' => 1,
    ];
    $current_position_td = ['x' => 0, 'y' => 0];
    $current_position_cm = [
        'a' => 1, 'b' => 0, 'c' => 0,
        'i' => 0, 'j' => 1, 'k' => 0,
        'x' => 0, 'y' => 0, 'z' => 1,
    ];

    $clipped_font = [];
    $clipped_position_cm = [];

    // Mark this object as "being processed" so that a nested 'Do' pointing
    // back at it is recognised as a circular reference.
    self::$recursionStack[] = $this->getUniqueId();

    try {
        foreach ($sections as $section) {
            $commands = $this->getCommandsText($section);
            foreach ($commands as $command) {
                switch ($command[self::OPERATOR]) {
                    // Begin text object
                    case 'BT':
                        // Reset text positioning matrices
                        $current_position_tm = [
                            'a' => 1, 'b' => 0, 'c' => 0,
                            'i' => 0, 'j' => 1, 'k' => 0,
                            'x' => 0, 'y' => 0, 'z' => 1,
                        ];
                        $current_position_td = ['x' => 0, 'y' => 0];
                        $current_text_leading = 0;
                        break;

                    // Begin marked content sequence with property list
                    case 'BDC':
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
                            $dict = $this->parseDictionary($match[1]);

                            // Check for ActualText block
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
                                if ('[' == $dict['ActualText'][0]) {
                                    // Simulate a 'TJ' command on the stack
                                    $marked_stack[] = [
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
                                    ];
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
                                    // Simulate a 'Tj' command on the stack
                                    $marked_stack[] = [
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
                                    ];
                                }
                            }
                        }
                        break;

                    // Begin marked content sequence
                    case 'BMC':
                        if ('ReversedChars' == $command[self::COMMAND]) {
                            // Upon encountering a ReversedChars command,
                            // add the characters we've built up so far to
                            // the result array
                            $result = array_merge($result, $text);

                            // Start a fresh $text array that will contain
                            // reversed characters
                            $text = [];

                            // Add the reversed text flag to the stack
                            $marked_stack[] = ['ReversedChars' => true];
                        }
                        break;

                    // set graphics position matrix
                    case 'cm':
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
                        $current_position_cm = [
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
                        ];
                        break;

                    case 'Do':
                        if (null !== $page) {
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $id = trim(array_pop($args), '/ ');
                            $xobject = $page->getXObject($id);

                            // @todo $xobject could be a ElementXRef object, which would then throw an error
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
                                // Not a circular reference.
                                $text[] = $xobject->getText($page);
                            }
                        }
                        break;

                    // Marked content point with (DP) & without (MP) property list
                    case 'DP':
                    case 'MP':
                        break;

                    // End text object
                    case 'ET':
                        break;

                    // Store current selected font and graphics matrix
                    case 'q':
                        $clipped_font[] = [$current_font, $current_font_size];
                        $clipped_position_cm[] = $current_position_cm;
                        break;

                    // Restore previous selected font and graphics matrix
                    case 'Q':
                        // Both stacks are always pushed together by 'q'. A
                        // 'Q' without a matching 'q' (malformed stream) is
                        // ignored instead of replacing the current font
                        // and matrix with null.
                        if ([] !== $clipped_font) {
                            list($current_font, $current_font_size) = array_pop($clipped_font);
                            $current_position_cm = array_pop($clipped_position_cm);
                        }
                        break;

                    // End marked content sequence
                    case 'EMC':
                        $data = false;
                        if (\count($marked_stack)) {
                            $marked = array_pop($marked_stack);
                            $action = key($marked);
                            $data = $marked[$action];

                            switch ($action) {
                                // If we are in ReversedChars mode...
                                case 'ReversedChars':
                                    // Reverse the characters we've built up so far
                                    foreach ($text as $key => $t) {
                                        $text[$key] = implode('', array_reverse(
                                            mb_str_split($t, 1, mb_internal_encoding())
                                        ));
                                    }

                                    // Add these characters to the result array
                                    $result = array_merge($result, $text);

                                    // Start a fresh $text array that will contain
                                    // non-reversed characters
                                    $text = [];
                                    break;

                                case 'ActualText':
                                    // Use the content of the ActualText as a command
                                    $command = $data;
                                    break;
                            }
                        }

                        // If this EMC command has been transformed into a 'Tj'
                        // or 'TJ' command because of being ActualText, then bypass
                        // the break to proceed to the writing section below.
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
                            break;
                        }

                        // no break
                    case "'":
                    case '"':
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
                            // Move to next line and write text
                            $current_position['x'] = 0;
                            $current_position_td['x'] = 0;
                            $current_position_td['y'] += $current_text_leading;
                        }
                        // no break
                    case 'Tj':
                        // Wrap the single string so it can be handled like
                        // the list of operands of a 'TJ' command.
                        $command[self::COMMAND] = [$command];
                        // no break
                    case 'TJ':
                        // Check the marked content stack for flags
                        $actual_text = false;
                        $reverse_text = false;
                        foreach ($marked_stack as $marked) {
                            if (isset($marked['ActualText'])) {
                                $actual_text = true;
                            }
                            if (isset($marked['ReversedChars'])) {
                                $reverse_text = true;
                            }
                        }

                        // Account for text position ONLY just before we write text
                        if (false === $actual_text && \is_array($last_written_position)) {
                            // If $last_written_position is an array, that
                            // means we have stored text position coordinates
                            // for placing an ActualText
                            $currentX = $last_written_position[0];
                            $currentY = $last_written_position[1];
                            $last_written_position = false;
                        } else {
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
                        }
                        $whiteSpace = '';

                        $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i'];
                        $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j'];

                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
                            $curY = $currentY - $current_position['y'];
                            if (abs($curY) >= abs($factorY) / 4) {
                                $whiteSpace = "\n";
                            } else {
                                if (true === $reverse_text) {
                                    $curX = $current_position['x'] - $currentX;
                                } else {
                                    $curX = $currentX - $current_position['x'];
                                }

                                // In abs($factorX * 7) below, the 7 is chosen arbitrarily
                                // as the number of apparent "spaces" in a document we
                                // would need before considering them a "tab". In the
                                // future, we might offer this value to users as a config
                                // option.
                                if ($curX >= abs($factorX * 7)) {
                                    $whiteSpace = "\t";
                                } elseif ($curX >= abs($factorX * 2)) {
                                    $whiteSpace = ' ';
                                }
                            }
                        }

                        $newtext = $this->getTJUsingFontFallback(
                            $current_font,
                            $command[self::COMMAND],
                            $page,
                            $factorX
                        );

                        // If there is no ActualText pending then write
                        if (false === $actual_text) {
                            $newtext = str_replace(["\r", "\n"], '', $newtext);
                            if (false !== $reverse_text) {
                                // If we are in ReversedChars mode, add the whitespace last.
                                // Collapse a doubled trailing space (one from the text,
                                // one from $whiteSpace) into a single space.
                                $text[] = preg_replace('/  $/', ' ', $newtext.$whiteSpace);
                            } else {
                                // Otherwise add the whitespace first
                                if (' ' === $whiteSpace && isset($text[\count($text) - 1])) {
                                    $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]);
                                }
                                $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext);
                            }

                            // Record the position of this inserted text for comparison
                            // with the next text block.
                            // Provide a 'fudge' factor guess on how wide this text block
                            // is based on the number of characters. This helps limit the
                            // number of tabs inserted, but isn't perfect.
                            $factor = $factorX / 2;
                            $current_position = [
                                'x' => $currentX - mb_strlen($newtext) * $factor,
                                'y' => $currentY,
                            ];
                        } elseif (false === $last_written_position) {
                            // If there is an ActualText in the pipeline
                            // store the position this undisplayed text
                            // *would* have been written to, so the
                            // ActualText is displayed in the right spot
                            $last_written_position = [$currentX, $currentY];
                            $current_position['x'] = $currentX;
                        }
                        break;

                    // move to start of next line
                    case 'T*':
                        $current_position['x'] = 0;
                        $current_position_td['x'] = 0;
                        $current_position_td['y'] += $current_text_leading;
                        break;

                    // set character spacing
                    case 'Tc':
                        break;

                    // move text current point and set leading
                    case 'Td':
                    case 'TD':
                        // move text current point
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
                        $y = (float) array_pop($args);
                        $x = (float) array_pop($args);

                        if ('TD' == $command[self::OPERATOR]) {
                            $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j'];
                        }

                        $current_position_td = [
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
                        ];
                        break;

                    case 'Tf':
                        $args = preg_split('/\s/s', $command[self::COMMAND]);
                        $size = (float) array_pop($args);
                        $id = trim(array_pop($args), '/');
                        if (null !== $page) {
                            $new_font = $page->getFont($id);
                            // If an invalid font ID is given, do not update the font.
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
                            // But we want to make sure that malformed PDFs do not simply crash.
                            if (null !== $new_font) {
                                $current_font = $new_font;
                                $current_font_size = $size;
                            }
                        }
                        break;

                    // set leading
                    case 'TL':
                        $y = (float) $command[self::COMMAND];
                        $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j'];
                        break;

                    // set text position matrix
                    case 'Tm':
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
                        $current_position_tm = [
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
                        ];
                        break;

                    // set text rendering mode
                    case 'Tr':
                        break;

                    // set super/subscripting text rise
                    case 'Ts':
                        break;

                    // set word spacing
                    case 'Tw':
                        break;

                    // set horizontal scaling
                    case 'Tz':
                        break;

                    default:
                }
            }
        }
    } finally {
        // Always unwind the recursion stack, even when an exception is
        // thrown, so this object is only flagged while it is actually
        // being processed.
        array_pop(self::$recursionStack);
    }

    return array_merge($result, $text);
}
||||||
1047 | |||||||
/**
 * Parses one already formatted, single-line command of a document
 * stream into its type, operator and operand(s).
 *
 * The companion function getSectionsText() returns a document stream
 * as an array of such single commands. Because only one command is
 * handled per call, the argument $offset is not used anymore and may
 * be removed in a future PdfParser release; getCommandText() would be
 * a more fitting name for this function.
 *
 * @return array an empty array when no valid command is detected,
 *               otherwise an array holding exactly one entry with the
 *               keys self::TYPE, self::OPERATOR and self::COMMAND. For
 *               a 'TJ' operator the command is itself a list of such
 *               entries (strings, and 'n' typed numeric offsets).
 */
public function getCommandsText(string $text_part, int &$offset = 0): array
{
    $matches = [];
    preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $matches);

    // Without operand part, type group and operator there is no command
    if (!isset($matches[1], $matches[2], $matches[3])) {
        return [];
    }

    [, $operands, $type, $operator] = $matches;
    $command = trim($operands);

    if ('TJ' === $operator) {
        // Leading-string patterns, tried in this order on every pass:
        // a literal (...) string, then a hexadecimal <...> string, each
        // optionally followed by a numeric offset.
        $stringPatterns = [
            '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
            '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
        ];

        $parts = [];
        $remaining = trim($command, '[]');
        do {
            $before = $remaining;

            foreach ($stringPatterns as $delimiter => $pattern) {
                if (!preg_match($pattern, $remaining, $tjmatch)) {
                    continue;
                }

                // Hexadecimal strings may contain whitespace, drop it
                $payload = '<' === $delimiter
                    ? preg_replace('/\s/', '', $tjmatch[1])
                    : $tjmatch[1];

                $parts[] = [
                    self::TYPE => $delimiter,
                    self::OPERATOR => 'TJ',
                    self::COMMAND => $payload,
                ];

                // Optional numeric offset following the string
                if (isset($tjmatch[2]) && trim($tjmatch[2])) {
                    $parts[] = [
                        self::TYPE => 'n',
                        self::OPERATOR => '',
                        self::COMMAND => $tjmatch[2],
                    ];
                }

                // Consume what has just been parsed
                $remaining = substr($remaining, \strlen($tjmatch[0]));
            }
        } while ($remaining != $before);

        $command = $parts;
    } elseif (\in_array($operator, ['Tj', "'", '"'], true)) {
        // Strip the delimiters that belong to the string type
        if ('(' === $type) {
            // trim() is deliberately avoided: a () string may end with
            // a balanced or escaped right parenthesis, and trim()
            // would remove both. Both strings below are valid:
            // eg. (String())
            // eg. (String\))
            $command = preg_replace('/^\(|\)$/', '', $command);
        } elseif ('<' === $type) {
            $command = trim($command, '<>');
        }
    } elseif ('/' === $type) {
        // Drop the leading slash of a name operand
        $command = substr($command, 1);
    }

    return [
        [
            self::TYPE => $type,
            self::OPERATOR => $operator,
            self::COMMAND => $command,
        ],
    ];
}
||||||
1142 | |||||||
/**
 * Creates the object class matching the 'Type' (and, where relevant,
 * the 'Subtype') entry of the given header.
 *
 * - XObject / Image => Image
 * - XObject / Form  => Form
 * - Pages, Page, Encoding => the class of the same name
 * - Font => the class \Smalot\PdfParser\Font\Font<Subtype> when it
 *   exists, Font otherwise
 * - anything else (including an XObject with another subtype) => PDFObject
 *
 * @param Config|null $config optional configuration; it is passed on to
 *                            the created object unchanged
 */
public static function factory(
    Document $document,
    Header $header,
    ?string $content,
    ?Config $config = null
): self {
    switch ($header->get('Type')->getContent()) {
        case 'XObject':
            switch ($header->get('Subtype')->getContent()) {
                case 'Image':
                    // $config is optional, so it must not be dereferenced
                    // blindly. Without a configuration nothing asked for
                    // the image content to be dropped, hence it is kept.
                    // NOTE(review): assumes "retain" is the intended
                    // default when no Config is given - confirm.
                    $retainContent = null === $config || $config->getRetainImageContent();

                    return new Image($document, $header, $retainContent ? $content : null, $config);

                case 'Form':
                    return new Form($document, $header, $content, $config);
            }

            // XObject with an unhandled subtype
            return new self($document, $header, $content, $config);

        case 'Pages':
            return new Pages($document, $header, $content, $config);

        case 'Page':
            return new Page($document, $header, $content, $config);

        case 'Encoding':
            return new Encoding($document, $header, $content, $config);

        case 'Font':
            $subtype = $header->get('Subtype')->getContent();
            $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

            // Prefer the subtype specific font class when there is one
            if (class_exists($classname)) {
                return new $classname($document, $header, $content, $config);
            }

            return new Font($document, $header, $content, $config);

        default:
            return new self($document, $header, $content, $config);
    }
}
1184 | |||||||
/**
 * Returns the identifier of this object instance, as given by
 * spl_object_hash().
 */
protected function getUniqueId(): string
{
    $objectHash = spl_object_hash($this);

    return $objectHash;
}
||||||
1192 | } |
||||||
1193 |