1 | <?php |
||||||
2 | |||||||
3 | /** |
||||||
4 | * @file |
||||||
5 | * This file is part of the PdfParser library. |
||||||
6 | * |
||||||
7 | * @author Sébastien MALOT <[email protected]> |
||||||
8 | * |
||||||
9 | * @date 2017-01-03 |
||||||
10 | * |
||||||
11 | * @license LGPLv3 |
||||||
12 | * |
||||||
13 | * @url <https://github.com/smalot/pdfparser> |
||||||
14 | * |
||||||
15 | * PdfParser is a pdf library written in PHP, extraction oriented. |
||||||
16 | * Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
||||||
17 | * |
||||||
18 | * This program is free software: you can redistribute it and/or modify |
||||||
19 | * it under the terms of the GNU Lesser General Public License as published by |
||||||
20 | * the Free Software Foundation, either version 3 of the License, or |
||||||
21 | * (at your option) any later version. |
||||||
22 | * |
||||||
23 | * This program is distributed in the hope that it will be useful, |
||||||
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||||
26 | * GNU Lesser General Public License for more details. |
||||||
27 | * |
||||||
28 | * You should have received a copy of the GNU Lesser General Public License |
||||||
29 | * along with this program. |
||||||
30 | * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
||||||
31 | */ |
||||||
32 | |||||||
33 | namespace Smalot\PdfParser; |
||||||
34 | |||||||
35 | use Smalot\PdfParser\XObject\Form; |
||||||
36 | use Smalot\PdfParser\XObject\Image; |
||||||
37 | |||||||
38 | /** |
||||||
39 | * Class PDFObject |
||||||
40 | */ |
||||||
41 | class PDFObject |
||||||
42 | { |
||||||
public const TYPE = 't';

public const OPERATOR = 'o';

public const COMMAND = 'c';

/**
 * The recursion stack.
 *
 * @var array
 */
public static $recursionStack = [];

/**
 * The document handed to the constructor. The constructor only accepts a
 * Document instance and getDocument() declares a non-nullable Document
 * return type, so this is documented as non-nullable.
 *
 * @var Document
 */
protected $document;

/**
 * Header of this object; the constructor substitutes an empty Header when
 * none is given.
 *
 * @var Header
 */
protected $header;

/**
 * Raw content handed to the constructor (may be null).
 *
 * @var string|null
 */
protected $content;

/**
 * @var Config|null
 */
protected $config;

/**
 * Set to true by getText() while it collects the text array.
 *
 * @var bool
 */
protected $addPositionWhitespace = false;
||||||
80 | |||||||
/**
 * Stores the owning document together with the optional header, raw
 * content and configuration. When no header is supplied, a new empty
 * Header instance is used instead.
 */
public function __construct(
    Document $document,
    ?Header $header = null,
    ?string $content = null,
    ?Config $config = null
) {
    if (null === $header) {
        $header = new Header();
    }

    $this->document = $document;
    $this->header = $header;
    $this->content = $content;
    $this->config = $config;
}
||||||
92 | |||||||
/**
 * Initialisation step; this implementation performs no work.
 */
public function init()
{
    // Intentionally empty.
}
|||||
96 | |||||||
/**
 * Returns the document that was handed to the constructor.
 */
public function getDocument(): Document
{
    return $this->document;
}
||||||
101 | |||||||
/**
 * Returns the header stored on this object.
 */
public function getHeader(): ?Header
{
    return $this->header;
}
||||||
106 | |||||||
/**
 * Returns the configuration handed to the constructor, or null when none
 * was given.
 */
public function getConfig(): ?Config
{
    return $this->config;
}
||||||
111 | |||||||
/**
 * Looks up a header entry by name; the call is forwarded unchanged to the
 * header's get() method.
 *
 * @return Element|PDFObject|Header
 */
public function get(string $name)
{
    $header = $this->header;

    return $header->get($name);
}
||||||
119 | |||||||
/**
 * Tells whether the header contains an entry with the given name; the call
 * is forwarded unchanged to the header's has() method.
 */
public function has(string $name): bool
{
    $header = $this->header;

    return $header->has($name);
}
||||||
124 | |||||||
/**
 * Returns the header details; both the call and the $deep flag are
 * forwarded unchanged to the header's getDetails() method.
 */
public function getDetails(bool $deep = true): array
{
    $header = $this->header;

    return $header->getDetails($deep);
}
||||||
129 | |||||||
/**
 * Returns the raw content handed to the constructor, or null when none was
 * given.
 */
public function getContent(): ?string
{
    return $this->content;
}
||||||
134 | |||||||
/**
 * Creates a duplicate of the document stream with
 * strings and other items replaced by $char. Formerly
 * getSectionsText() used this output to more easily gather offset
 * values to extract text from the *actual* document stream.
 *
 * Every replacement has the same byte length as the text it masks, so
 * offsets in the returned string line up with offsets in $content.
 *
 * @deprecated function is no longer used and will be removed in a future release
 *
 * @internal
 *
 * @param string $content document stream to mask
 * @param string $char    mask character; only its first byte is used and an
 *                        empty string falls back to 'X'
 *
 * @return string the masked stream
 */
public function cleanContent(string $content, string $char = 'X')
{
    // Reading offset 0 of an empty string is an error condition, so fall
    // back to the documented default mask character instead.
    $char = '' === $char ? 'X' : $char[0];

    // Overwrites every captured span (as returned with PREG_OFFSET_CAPTURE)
    // with the mask character, keeping the overall length unchanged.
    $mask = static function (string $subject, array $parts) use ($char): string {
        foreach ($parts as $part) {
            $length = \strlen($part[0]);
            $subject = substr_replace($subject, str_repeat($char, $length), $part[1], $length);
        }

        return $subject;
    };

    // Escaped backslashes and parentheses are two bytes long each
    $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);

    // Remove image bloc with binary content
    preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
    $content = $mask($content, $matches[0] ?? []);

    // Clean content in square brackets [.....]
    preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE);
    $content = $mask($content, $matches[1] ?? []);

    // Clean content in round brackets (.....)
    preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE);
    $content = $mask($content, $matches[1] ?? []);

    // Clean structure: everything between (possibly nested) < and >,
    // including the angle brackets themselves, is masked
    if ($parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE)) {
        $content = '';
        $level = 0;
        foreach ($parts as $part) {
            if ('<' === $part) {
                ++$level;
            }

            $content .= (0 === $level ? $part : str_repeat($char, \strlen($part)));

            if ('>' === $part) {
                --$level;
            }
        }
    }

    // Clean BDC and EMC markup. The mask character is quoted together with
    // the pattern delimiter so that a '/' mask cannot terminate the pattern.
    preg_match_all(
        '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char, '/').'*BDC)/s',
        $content,
        $matches,
        \PREG_OFFSET_CAPTURE
    );
    $content = $mask($content, $matches[1] ?? []);

    preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE);
    $content = $mask($content, $matches[1] ?? []);

    return $content;
}
||||||
203 | |||||||
/**
 * Takes a string of PDF document stream text and formats
 * it into a multi-line string with one PDF command on each line,
 * separated by \r\n. If the given string is null, or binary data
 * is detected instead of a document stream then return an empty
 * string.
 *
 * Inline images, (strings) and << >> dictionaries are temporarily swapped
 * for unique placeholders so the operator splitting cannot touch them, and
 * are restored afterwards.
 */
private function formatContent(?string $content): string
{
    if (null === $content) {
        return '';
    }

    // Replaces the first literal occurrence of $needle. This is used
    // instead of preg_replace('/'.preg_quote($needle, '/').'/', ..., 1):
    // the result is the same, but arbitrarily large matches (e.g. binary
    // inline image data) are not compiled into a regular expression, which
    // fails once the pattern exceeds PCRE's size limit and would turn
    // $content into null.
    $replaceFirst = static function (string $haystack, string $needle, string $replacement): string {
        $position = strpos($haystack, $needle);
        if (false === $position) {
            return $haystack;
        }

        return substr_replace($haystack, $replacement, $position, \strlen($needle));
    };

    // Outside of (String) and inline image content in PDF document
    // streams, all text should conform to UTF-8. Test for binary
    // content by deleting everything after the first open-
    // parenthesis ( which indicates the beginning of a string, or
    // the first ID command which indicates the beginning of binary
    // inline image content. Then test what remains for valid
    // UTF-8. If it's not UTF-8, return an empty string as this
    // $content is most likely binary. Unfortunately, using
    // mb_check_encoding(..., 'UTF-8') is not strict enough, so the
    // following regexp, adapted from the W3, is used. See:
    // https://www.w3.org/International/questions/qa-forms-utf-8.en
    // We use preg_replace() instead of preg_match() to avoid "JIT
    // stack limit exhausted" errors on larger files.
    $utf8Filter = preg_replace('/(
        [\x09\x0A\x0D\x20-\x7E] | # ASCII
        [\xC2-\xDF][\x80-\xBF] | # non-overlong 2-byte
        \xE0[\xA0-\xBF][\x80-\xBF] | # excluding overlongs
        [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} | # straight 3-byte
        \xED[\x80-\x9F][\x80-\xBF] | # excluding surrogates
        \xF0[\x90-\xBF][\x80-\xBF]{2} | # planes 1-3
        [\xF1-\xF3][\x80-\xBF]{3} | # planes 4-15
        \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
    )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content));

    if ('' !== $utf8Filter) {
        return '';
    }

    // Find all inline image content and replace them so they aren't
    // affected by the next steps
    $pdfInlineImages = [];
    $offsetBI = 0;
    while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) {
        // Attempt to determine if this instance of the 'BI' command
        // actually occurred within a (string) using the following
        // steps:

        // Step 1: Remove any escaped parentheses from the alleged
        // image characteristics data
        $para = str_replace(['\\(', '\\)'], '', $text[1][0]);

        // Step 2: Remove all correctly ordered and balanced
        // parentheses from (strings)
        do {
            $paraTest = $para;
            $para = preg_replace('/\(([^()]*)\)/', '$1', $paraTest);
        } while ($para != $paraTest);

        $paraOpen = strpos($para, '(');
        $paraClose = strpos($para, ')');

        // Check: If the remaining text contains a close parenthesis
        // ')' AND it occurs before any open parenthesis, then we
        // are almost certain to be inside a (string)
        if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) {
            // Bump the search offset forward and match again
            $offsetBI = (int) $text[1][1];
            continue;
        }

        // Step 3: Double check that this is actually inline image
        // data by parsing the alleged image characteristics as a
        // dictionary
        $dict = $this->parseDictionary('<<'.$text[1][0].'>>');

        // Check if an image Width and Height are set in the dict
        if ((isset($dict['W']) || isset($dict['Width']))
            && (isset($dict['H']) || isset($dict['Height']))) {
            $id = uniqid('IMAGE_', true);
            $pdfInlineImages[$id] = [
                preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]),
                preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]),
            ];
            $content = $replaceFirst($content, $text[0][0], '^^^'.$id.'^^^');
        } else {
            // If there was no valid dictionary, or a height and width
            // weren't specified, then we don't know what this is, so
            // just leave it alone; bump the search offset forward and
            // match again
            $offsetBI = (int) $text[1][1];
        }
    }

    // Find all strings () and replace them so they aren't affected
    // by the next steps
    $pdfstrings = [];
    $attempt = '(';
    while (preg_match('/'.preg_quote($attempt, '/').'.*?(?<![^\\\\]\\\\)\)/s', $content, $text)) {
        // PDF strings can contain unescaped parentheses as long as
        // they're balanced, so check for balanced parentheses
        $left = preg_match_all('/(?<![^\\\\]\\\\)\(/', $text[0]);
        $right = preg_match_all('/(?<![^\\\\]\\\\)\)/', $text[0]);

        if ($left == $right) {
            // Replace the string with a unique placeholder
            $id = uniqid('STRING_', true);
            $pdfstrings[$id] = $text[0];
            $content = $replaceFirst($content, $text[0], '@@@'.$id.'@@@');

            // Reset to search for the next string
            $attempt = '(';
        } else {
            // We had unbalanced parentheses, so use the current
            // match as a base to find a longer string
            $attempt = $text[0];
        }
    }

    // Remove all carriage returns and line-feeds from the document stream
    $content = str_replace(["\r", "\n"], ' ', trim($content));

    // Find all dictionary << >> commands and replace them so they
    // aren't affected by the next steps
    $dictstore = [];
    while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext)) {
        $dictid = uniqid('DICT_', true);
        $dictstore[$dictid] = $dicttext[1];
        $content = $replaceFirst($content, $dicttext[0], ' ###'.$dictid.'###'.$dicttext[2]);
    }

    // Normalize white-space in the document stream
    $content = preg_replace('/\s{2,}/', ' ', $content);

    // Find all valid PDF operators and add \r\n after each; this
    // ensures there is just one command on every line
    // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
    // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
    // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
    // PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions
    // appear here in the list for completeness.
    $operators = [
        'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
        'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
        'g', 'G', 'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
        'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
        'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
        'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
    ];
    foreach ($operators as $operator) {
        $content = preg_replace(
            '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/',
            $operator."\r\n",
            $content
        );
    }

    // Restore the original content of the dictionary << >> commands
    $dictstore = array_reverse($dictstore, true);
    foreach ($dictstore as $id => $dict) {
        $content = str_replace('###'.$id.'###', $dict, $content);
    }

    // Restore the original string content
    $pdfstrings = array_reverse($pdfstrings, true);
    foreach ($pdfstrings as $id => $text) {
        // Strings may contain escaped newlines, or literal newlines
        // and we should clean these up before replacing the string
        // back into the content stream; this ensures no strings are
        // split between two lines (every command must be on one line)
        $text = str_replace(
            ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
            ['', '', '', '\r', '\n'],
            $text
        );

        $content = str_replace('@@@'.$id.'@@@', $text, $content);
    }

    // Restore the original content of any inline images
    $pdfInlineImages = array_reverse($pdfInlineImages, true);
    foreach ($pdfInlineImages as $id => $image) {
        $content = str_replace(
            '^^^'.$id.'^^^',
            "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." EI\r\n",
            $content
        );
    }

    // Collapse repeated line breaks and strip leading spaces on each line
    $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));

    return $content;
}
||||||
414 | |||||||
/**
 * Takes an entire, unformatted document stream as a string, runs it
 * through formatContent(), and filters out commands that aren't needed
 * for text positioning/extraction.
 *
 * @internal
 *
 * @return array unprocessed PDF commands, one command per element
 */
public function getSectionsText(?string $content): array
{
    // formatContent() puts one command on every line, so splitting on
    // line breaks yields one command per element
    $lines = preg_split(
        '/(\r\n|\n|\r)/',
        $this->formatContent($content),
        -1,
        \PREG_SPLIT_NO_EMPTY
    );

    $sections = [];
    $insideTextBlock = false;

    foreach ($lines as $rawLine) {
        $line = trim($rawLine);

        // Nothing to do for lines that only held whitespace
        if ('' === $line) {
            continue;
        }

        // 'BT' opens a text block, 'ET' closes it; both are kept
        if (preg_match('/BT$/', $line)) {
            $insideTextBlock = true;
            $sections[] = $line;
            continue;
        }

        if ('ET' === $line) {
            $insideTextBlock = false;
            $sections[] = $line;
            continue;
        }

        // Every line between BT and ET is kept
        if ($insideTextBlock) {
            $sections[] = $line;
            continue;
        }

        // Outside of a text block only specific, necessary commands are
        // kept. Care should be taken to ensure a command being checked
        // for *only* matches that command; a simple search for 'c' would
        // also match the 'sc' command. Add more commands here as they are
        // found in unusual PDFs:
        //   q / Q      save and restore graphics state
        //   BDC / BMC  begin marked content sequence
        //   DP / MP    marked content point
        //   EMC        end marked content sequence
        //   cm         graphics position change
        //   Tf         font change
        //   Do         invoke named XObject
        $lastChar = substr($line, -1);
        $keep = 'q' === $lastChar
            || 'Q' === $lastChar
            || 1 === preg_match('/(?<!\w)(?:B[DM]C|[DM]P|EMC|cm|Tf|Do)$/', $line);

        if ($keep) {
            $sections[] = $line;
        }
    }

    return $sections;
}
||||||
494 | |||||||
/**
 * Picks the font to start with: the first of the page's fonts when a page
 * is given and has any, otherwise the document's first font, otherwise a
 * newly created Font.
 */
private function getDefaultFont(?Page $page = null): Font
{
    $candidates = null === $page ? [] : $page->getFonts();

    $documentFont = $this->document->getFirstFont();
    if (null !== $documentFont) {
        $candidates[] = $documentFont;
    }

    if (0 === \count($candidates)) {
        return new Font($this->document, null, null, $this->config);
    }

    return reset($candidates);
}
||||||
513 | |||||||
/**
 * Decode a '[]TJ' command and attempt to use alternate
 * fonts if the current font results in output that contains
 * control characters or NUL bytes.
 *
 * Without a page there are no alternate fonts to try, so the text decoded
 * with $font is returned as is.
 *
 * @internal
 *
 * @param array<int,array<string,string|bool>> $command
 *
 * @return string the first decode without control characters, or the
 *                decode made with $font when no page font does better
 */
private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string
{
    $orig_text = $font->decodeText($command, $fontFactor);
    $text = $orig_text;

    // If we make this a Config option, we can add a check if it's
    // enabled here.
    if (null !== $page) {
        $font_ids = array_keys($page->getFonts());

        // If the decoded text contains control characters then the
        // font being used is probably the wrong one. Loop through the
        // rest of the fonts to see if we can get a good decode.
        // Allow x09 to x0d which are whitespace.
        //
        // The second test looks for a real NUL byte; it covers subjects
        // the /u pattern cannot be applied to (preg_match() fails on
        // invalid UTF-8). The bytes are searched directly because a
        // search for '00' in the hex dump also matches across byte
        // boundaries, e.g. space + line-feed is '200a'.
        while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
            // If we're out of font IDs, then give up and use the
            // original string
            if (0 === \count($font_ids)) {
                return $orig_text;
            }

            // Try the next font ID
            $font = $page->getFont(array_shift($font_ids));
            $text = $font->decodeText($command, $fontFactor);
        }
    }

    return $text;
}
||||||
552 | |||||||
/**
 * Parses a full PDF dictionary object, including the outer enclosing
 * << >> angle brackets, into a nested PHP array.
 *
 * Names (tokens starting with '/') become keys and the token following a
 * name becomes its value. Nested << >> dictionaries and [ ] arrays become
 * nested arrays, except that an array holding a single non-name string is
 * folded back into the string '[...]'.
 *
 * @internal
 *
 * @throws \Exception when $dictionary does not start with '<<'
 */
public function parseDictionary(string $dictionary): array
{
    // Normalize whitespace
    $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));

    if ('<<' !== substr($dictionary, 0, 2)) {
        throw new \Exception('Not a valid dictionary object.');
    }

    $parsed = [];
    $stack = [];
    $currentName = '';
    $arrayTypeNumeric = false;

    // Remove outer layer of dictionary, and split on tokens
    $body = trim(preg_replace('/^<<|>>$/', '', $dictionary));
    $tokens = preg_split(
        '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
        $body,
        -1,
        \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
    );

    foreach ($tokens as $token) {
        $token = trim($token);

        // Whitespace-only tokens carry no information
        if ('' === $token) {
            continue;
        }

        // Open a numeric array ('[') or a hashed array ('<<')
        $opensNumeric = '[' === $token;
        if ($opensNumeric || '<<' === $token) {
            $parsed[$currentName] = [];
            $arrayTypeNumeric = $opensNumeric;

            // Move up one level in the stack; $parsed now refers to
            // the freshly created child array
            $stack[\count($stack)] = &$parsed;
            $parsed = &$parsed[$currentName];
            $currentName = '';
            continue;
        }

        // Close a numeric array (']') or a hashed array ('>>')
        $closesNumeric = ']' === $token;
        if ($closesNumeric || '>>' === $token) {
            // Revert string type arrays back to a single element
            if ($closesNumeric
                && \is_array($parsed) && 1 == \count($parsed)
                && isset($parsed[0]) && \is_string($parsed[0])
                && '' !== $parsed[0] && '/' != $parsed[0][0]) {
                $parsed = '['.$parsed[0].']';
            }

            $arrayTypeNumeric = false;

            // Move down one level in the stack
            $parsed = &$stack[\count($stack) - 1];
            unset($stack[\count($stack) - 1]);
            continue;
        }

        if ('/' === $token[0]) {
            // A name: inside a numeric array it is a plain element,
            // otherwise it is remembered as the key for the next value
            $currentName = substr($token, 1);
            if ($arrayTypeNumeric) {
                $parsed[] = $currentName;
                $currentName = '';
            }
        } elseif ('' !== $currentName) {
            // A value for the pending name
            if (!$arrayTypeNumeric) {
                $parsed[$currentName] = $token;
            }
            $currentName = '';
        } else {
            // A value without a pending name is appended as is
            $parsed[] = $token;
        }
    }

    return $parsed;
}
||||||
651 | |||||||
/**
 * Returns the text content of a PDF as a string. Attempts to add
 * whitespace for spacing and line-breaks where appropriate.
 *
 * getText() leverages getTextArray() to get the content
 * of the document, setting the addPositionWhitespace flag to true
 * so whitespace is inserted in a logical way for reading by
 * humans. The flag is reset to false afterwards, including when
 * getTextArray() throws.
 *
 * @throws \Exception propagated from getTextArray()
 */
public function getText(?Page $page = null): string
{
    $this->addPositionWhitespace = true;

    try {
        $result = $this->getTextArray($page);
    } finally {
        // Never leave the flag switched on: later getTextArray() calls on
        // this object must not inherit it after a failure
        $this->addPositionWhitespace = false;
    }

    return implode('', $result).' ';
}
||||||
669 | |||||||
/**
 * Returns the text content of a PDF as an array of strings.
 *
 * Unless the addPositionWhitespace flag is set (as getText() does),
 * no extra whitespace is inserted besides what is actually encoded
 * in the PDF text.
 *
 * The content stream is split into sections and single commands, and
 * the commands are interpreted one by one while tracking the current
 * font, the text/graphics positions and the marked content stack.
 *
 * While this object is being processed its unique id is kept on the
 * static recursion stack, so an XObject that (directly or indirectly)
 * references itself is not expanded again. The id is removed once the
 * processing ends, whether it succeeded or not.
 *
 * @param Page|null $page page used to resolve fonts ('Tf') and XObjects ('Do');
 *                        when null those operators do not change anything
 *
 * @return array the extracted text fragments, in stream order
 *
 * @throws \Exception
 */
public function getTextArray(?Page $page = null): array
{
    $result = [];
    $text = [];

    $marked_stack = [];
    $last_written_position = false;

    $sections = $this->getSectionsText($this->content);
    $current_font = $this->getDefaultFont($page);
    $current_font_size = 1;
    $current_text_leading = 0;

    $current_position = ['x' => false, 'y' => false];
    $current_position_tm = [
        'a' => 1, 'b' => 0, 'c' => 0,
        'i' => 0, 'j' => 1, 'k' => 0,
        'x' => 0, 'y' => 0, 'z' => 1,
    ];
    $current_position_td = ['x' => 0, 'y' => 0];
    $current_position_cm = [
        'a' => 1, 'b' => 0, 'c' => 0,
        'i' => 0, 'j' => 1, 'k' => 0,
        'x' => 0, 'y' => 0, 'z' => 1,
    ];

    $clipped_font = [];
    $clipped_position_cm = [];

    self::$recursionStack[] = $this->getUniqueId();

    try {
        foreach ($sections as $section) {
            $commands = $this->getCommandsText($section);
            foreach ($commands as $command) {
                switch ($command[self::OPERATOR]) {
                    // Begin text object
                    case 'BT':
                        // Reset text positioning matrices
                        $current_position_tm = [
                            'a' => 1, 'b' => 0, 'c' => 0,
                            'i' => 0, 'j' => 1, 'k' => 0,
                            'x' => 0, 'y' => 0, 'z' => 1,
                        ];
                        $current_position_td = ['x' => 0, 'y' => 0];
                        $current_text_leading = 0;
                        break;

                    // Begin marked content sequence with property list
                    case 'BDC':
                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
                            $dict = $this->parseDictionary($match[1]);

                            // Check for ActualText block
                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
                                if ('[' == $dict['ActualText'][0]) {
                                    // Simulate a 'TJ' command on the stack
                                    $marked_stack[] = [
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
                                    ];
                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
                                    // Simulate a 'Tj' command on the stack
                                    $marked_stack[] = [
                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
                                    ];
                                }
                            }
                        }
                        break;

                    // Begin marked content sequence
                    case 'BMC':
                        if ('ReversedChars' == $command[self::COMMAND]) {
                            // Upon encountering a ReversedChars command,
                            // add the characters we've built up so far to
                            // the result array
                            $result = array_merge($result, $text);

                            // Start a fresh $text array that will contain
                            // reversed characters
                            $text = [];

                            // Add the reversed text flag to the stack
                            $marked_stack[] = ['ReversedChars' => true];
                        }
                        break;

                    // set graphics position matrix
                    case 'cm':
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
                        $current_position_cm = [
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
                        ];
                        break;

                    // Paint an XObject: append the text it contains
                    case 'Do':
                        if (null !== $page) {
                            $args = preg_split('/\s/s', $command[self::COMMAND]);
                            $id = trim(array_pop($args), '/ ');
                            $xobject = $page->getXObject($id);

                            // @todo $xobject could be a ElementXRef object, which would then throw an error
                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
                                // Not a circular reference.
                                $text[] = $xobject->getText($page);
                            }
                        }
                        break;

                    // Marked content point with (DP) & without (MP) property list
                    case 'DP':
                    case 'MP':
                        break;

                    // End text object
                    case 'ET':
                        break;

                    // Store current selected font and graphics matrix
                    case 'q':
                        $clipped_font[] = [$current_font, $current_font_size];
                        $clipped_position_cm[] = $current_position_cm;
                        break;

                    // Restore previous selected font and graphics matrix
                    case 'Q':
                        // Both stacks are pushed and popped together. An
                        // unbalanced 'Q' in a malformed stream has nothing
                        // to restore and must not replace the current font
                        // and matrix with null.
                        if ([] !== $clipped_font) {
                            list($current_font, $current_font_size) = array_pop($clipped_font);
                            $current_position_cm = array_pop($clipped_position_cm);
                        }
                        break;

                    // End marked content sequence
                    case 'EMC':
                        $data = false;
                        if (\count($marked_stack)) {
                            $marked = array_pop($marked_stack);
                            $action = key($marked);
                            $data = $marked[$action];

                            switch ($action) {
                                // If we are in ReversedChars mode...
                                case 'ReversedChars':
                                    // Reverse the characters we've built up so far
                                    foreach ($text as $key => $t) {
                                        $text[$key] = implode('', array_reverse(
                                            mb_str_split($t, 1, mb_internal_encoding())
                                        ));
                                    }

                                    // Add these characters to the result array
                                    $result = array_merge($result, $text);

                                    // Start a fresh $text array that will contain
                                    // non-reversed characters
                                    $text = [];
                                    break;

                                case 'ActualText':
                                    // Use the content of the ActualText as a command
                                    $command = $data;
                                    break;
                            }
                        }

                        // If this EMC command has been transformed into a 'Tj'
                        // or 'TJ' command because of being ActualText, then bypass
                        // the break to proceed to the writing section below.
                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
                            break;
                        }

                        // no break
                    case "'":
                    case '"':
                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
                            // Move to next line and write text
                            $current_position['x'] = 0;
                            $current_position_td['x'] = 0;
                            $current_position_td['y'] += $current_text_leading;
                        }
                        // no break
                    case 'Tj':
                        // A single string is wrapped so it has the same shape
                        // as the list of sub-commands of a 'TJ'. A 'TJ' built
                        // from an ActualText reaches this point through the
                        // fall-through above and already is such a list.
                        if ('TJ' != $command[self::OPERATOR]) {
                            $command[self::COMMAND] = [$command];
                        }
                        // no break
                    case 'TJ':
                        // Check the marked content stack for flags
                        $actual_text = false;
                        $reverse_text = false;
                        foreach ($marked_stack as $marked) {
                            if (isset($marked['ActualText'])) {
                                $actual_text = true;
                            }
                            if (isset($marked['ReversedChars'])) {
                                $reverse_text = true;
                            }
                        }

                        // Account for text position ONLY just before we write text
                        if (false === $actual_text && \is_array($last_written_position)) {
                            // If $last_written_position is an array, that
                            // means we have stored text position coordinates
                            // for placing an ActualText
                            $currentX = $last_written_position[0];
                            $currentY = $last_written_position[1];
                            $last_written_position = false;
                        } else {
                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
                        }
                        $whiteSpace = '';

                        $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i'];
                        $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j'];

                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
                            $curY = $currentY - $current_position['y'];
                            if (abs($curY) >= abs($factorY) / 4) {
                                $whiteSpace = "\n";
                            } else {
                                if (true === $reverse_text) {
                                    $curX = $current_position['x'] - $currentX;
                                } else {
                                    $curX = $currentX - $current_position['x'];
                                }

                                // In abs($factorX * 7) below, the 7 is chosen arbitrarily
                                // as the number of apparent "spaces" in a document we
                                // would need before considering them a "tab". In the
                                // future, we might offer this value to users as a config
                                // option.
                                if ($curX >= abs($factorX * 7)) {
                                    $whiteSpace = "\t";
                                } elseif ($curX >= abs($factorX * 2)) {
                                    $whiteSpace = ' ';
                                }
                            }
                        }

                        $newtext = $this->getTJUsingFontFallback(
                            $current_font,
                            $command[self::COMMAND],
                            $page,
                            $factorX
                        );

                        // If there is no ActualText pending then write
                        if (false === $actual_text) {
                            $newtext = str_replace(["\r", "\n"], '', $newtext);
                            if (false !== $reverse_text) {
                                // If we are in ReversedChars mode, add the whitespace
                                // last and collapse a doubled trailing space into one
                                // (mirrors the leading-whitespace handling below)
                                $text[] = preg_replace('/ {2}$/', ' ', $newtext.$whiteSpace);
                            } else {
                                // Otherwise add the whitespace first
                                if (' ' === $whiteSpace && isset($text[\count($text) - 1])) {
                                    $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]);
                                }
                                $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext);
                            }

                            // Record the position of this inserted text for comparison
                            // with the next text block.
                            // Provide a 'fudge' factor guess on how wide this text block
                            // is based on the number of characters. This helps limit the
                            // number of tabs inserted, but isn't perfect.
                            $factor = $factorX / 2;
                            $current_position = [
                                'x' => $currentX - mb_strlen($newtext) * $factor,
                                'y' => $currentY,
                            ];
                        } elseif (false === $last_written_position) {
                            // If there is an ActualText in the pipeline
                            // store the position this undisplayed text
                            // *would* have been written to, so the
                            // ActualText is displayed in the right spot
                            $last_written_position = [$currentX, $currentY];
                            $current_position['x'] = $currentX;
                        }
                        break;

                    // move to start of next line
                    case 'T*':
                        $current_position['x'] = 0;
                        $current_position_td['x'] = 0;
                        $current_position_td['y'] += $current_text_leading;
                        break;

                    // set character spacing
                    case 'Tc':
                        break;

                    // move text current point (Td), and also set leading (TD)
                    case 'Td':
                    case 'TD':
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
                        $y = (float) array_pop($args);
                        $x = (float) array_pop($args);

                        if ('TD' == $command[self::OPERATOR]) {
                            $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j'];
                        }

                        $current_position_td = [
                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
                        ];
                        break;

                    // set font and font size
                    case 'Tf':
                        $args = preg_split('/\s/s', $command[self::COMMAND]);
                        $size = (float) array_pop($args);
                        $id = trim(array_pop($args), '/');
                        if (null !== $page) {
                            $new_font = $page->getFont($id);
                            // If an invalid font ID is given, do not update the font.
                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
                            // But we want to make sure that malformed PDFs do not simply crash.
                            if (null !== $new_font) {
                                $current_font = $new_font;
                                $current_font_size = $size;
                            }
                        }
                        break;

                    // set leading
                    case 'TL':
                        $y = (float) $command[self::COMMAND];
                        $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j'];
                        break;

                    // set text position matrix
                    case 'Tm':
                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
                        $current_position_tm = [
                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
                        ];
                        break;

                    // set text rendering mode
                    case 'Tr':
                        break;

                    // set super/subscripting text rise
                    case 'Ts':
                        break;

                    // set word spacing
                    case 'Tw':
                        break;

                    // set horizontal scaling
                    case 'Tz':
                        break;

                    // every other operator is ignored
                    default:
                        break;
                }
            }
        }
    } finally {
        // This object is no longer being processed: take it off the
        // recursion stack so it can be extracted again later (same
        // XObject painted twice, same page read twice, ...).
        array_pop(self::$recursionStack);
    }

    return array_merge($result, $text);
}
||||||
1044 | |||||||
/**
 * Breaks one single-line command of a document stream down into its
 * type, operator and operand(s).
 *
 * $text_part is expected to be an already formatted, single-line
 * command; the companion function getSectionsText() returns a
 * document stream as an array of single commands for just this
 * purpose. Because of this, the argument $offset is no longer used,
 * and may be removed in a future PdfParser release.
 *
 * A better name for this function would be getCommandText() since it
 * now always works on just one command.
 *
 * @param string $text_part the single command to analyse
 * @param int    $offset    unused, only kept for backward compatibility
 *
 * @return array empty when no valid command is detected, otherwise a list
 *               holding one array with the keys self::TYPE, self::OPERATOR
 *               and self::COMMAND; for a 'TJ' operator self::COMMAND is
 *               itself a list of such arrays (strings and numeric offsets)
 */
public function getCommandsText(string $text_part, int &$offset = 0): array
{
    $parts = [];
    preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $parts);

    // Without the three capture groups there is no valid command
    if (!isset($parts[1], $parts[2], $parts[3])) {
        return [];
    }

    $operand = trim($parts[1]);
    $type = $parts[2];
    $operator = $parts[3];

    switch ($operator) {
        case 'TJ':
            // The two string notations allowed inside the array of a
            // 'TJ', each optionally followed by a numeric offset. They
            // are tried in this order on every pass.
            $string_formats = [
                // parentheses string () format
                '(' => '/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/',
                // hexadecimal <> format
                '<' => '/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i',
            ];

            $pieces = [];
            $remaining = trim($operand, '[]');

            // Consume strings from the front until a pass finds nothing
            do {
                $before = $remaining;

                foreach ($string_formats as $string_type => $pattern) {
                    if (!preg_match($pattern, $remaining, $hit)) {
                        continue;
                    }

                    // Whitespace inside a hexadecimal string carries no data
                    $string = '<' === $string_type ? preg_replace('/\s/', '', $hit[1]) : $hit[1];

                    $pieces[] = [
                        self::TYPE => $string_type,
                        self::OPERATOR => 'TJ',
                        self::COMMAND => $string,
                    ];

                    if (isset($hit[2]) && trim($hit[2])) {
                        $pieces[] = [
                            self::TYPE => 'n',
                            self::OPERATOR => '',
                            self::COMMAND => $hit[2],
                        ];
                    }

                    $remaining = substr($remaining, \strlen($hit[0]));
                }
            } while ($remaining != $before);

            $operand = $pieces;
            break;

        case 'Tj':
        case "'":
        case '"':
            // Depending on the string type, strip the matching delimiters
            if ('(' == $type) {
                // trim() is not an option here: a () string may end with
                // a balanced or escaped right parenthesis, and trim()
                // would delete both. Both strings below are valid:
                //   eg. (String())
                //   eg. (String\))
                $operand = preg_replace('/^\(|\)$/', '', $operand);
            } elseif ('<' == $type) {
                $operand = trim($operand, '<>');
            }
            break;

        default:
            // Name operand: drop the leading slash
            if ('/' == $type) {
                $operand = substr($operand, 1);
            }
    }

    return [
        [
            self::TYPE => $type,
            self::OPERATOR => $operator,
            self::COMMAND => $operand,
        ],
    ];
}
||||||
1139 | |||||||
/**
 * Creates the object whose class matches the 'Type' entry (and, for
 * XObjects and fonts, the 'Subtype' entry) of the given header.
 *
 * - XObject/Image => Image, XObject/Form => Form, other XObjects => PDFObject
 * - Pages, Page, Encoding => the class of the same name
 * - Font => Font\Font<Subtype> when such a class exists, Font otherwise
 * - anything else => PDFObject
 *
 * @param Document    $document document handed to the created object
 * @param Header      $header   header holding the 'Type' and 'Subtype' entries
 * @param string|null $content  raw content handed to the created object
 * @param Config|null $config   optional configuration; for images it decides
 *                              whether the content is kept, and without a
 *                              configuration the content is kept
 *
 * @return self the created object
 */
public static function factory(
    Document $document,
    Header $header,
    ?string $content,
    ?Config $config = null
): self {
    switch ($header->get('Type')->getContent()) {
        case 'XObject':
            switch ($header->get('Subtype')->getContent()) {
                case 'Image':
                    // $config is optional: only drop the image content
                    // when a configuration explicitly asks for it.
                    $retain_content = null === $config || $config->getRetainImageContent();

                    return new Image($document, $header, $retain_content ? $content : null, $config);

                case 'Form':
                    return new Form($document, $header, $content, $config);
            }

            return new self($document, $header, $content, $config);

        case 'Pages':
            return new Pages($document, $header, $content, $config);

        case 'Page':
            return new Page($document, $header, $content, $config);

        case 'Encoding':
            return new Encoding($document, $header, $content, $config);

        case 'Font':
            $subtype = $header->get('Subtype')->getContent();
            $classname = '\Smalot\PdfParser\Font\Font'.$subtype;

            // Use the dedicated font class of this subtype when there is one
            if (class_exists($classname)) {
                return new $classname($document, $header, $content, $config);
            }

            return new Font($document, $header, $content, $config);

        default:
            return new self($document, $header, $content, $config);
    }
}
||||||
1181 | |||||||
/**
 * Returns the identifier of this object instance, as given by
 * spl_object_hash().
 *
 * @return string the hash of this instance
 */
protected function getUniqueId(): string
{
    $object_hash = spl_object_hash($this);

    return $object_hash;
}
||||||
1189 | } |
||||||
1190 |