1 | <?php |
||
2 | |||
3 | /** |
||
4 | * @file |
||
5 | * This file is part of the PdfParser library. |
||
6 | * |
||
7 | * @author Sébastien MALOT <[email protected]> |
||
8 | * |
||
9 | * @date 2017-01-03 |
||
10 | * |
||
11 | * @license LGPLv3 |
||
12 | * |
||
13 | * @url <https://github.com/smalot/pdfparser> |
||
14 | * |
||
15 | * PdfParser is a pdf library written in PHP, extraction oriented. |
||
16 | * Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
||
17 | * |
||
18 | * This program is free software: you can redistribute it and/or modify |
||
19 | * it under the terms of the GNU Lesser General Public License as published by |
||
20 | * the Free Software Foundation, either version 3 of the License, or |
||
21 | * (at your option) any later version. |
||
22 | * |
||
23 | * This program is distributed in the hope that it will be useful, |
||
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
26 | * GNU Lesser General Public License for more details. |
||
27 | * |
||
28 | * You should have received a copy of the GNU Lesser General Public License |
||
29 | * along with this program. |
||
30 | * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
||
31 | */ |
||
32 | |||
33 | namespace Smalot\PdfParser; |
||
34 | |||
35 | use Smalot\PdfParser\Element\ElementArray; |
||
36 | use Smalot\PdfParser\Element\ElementMissing; |
||
37 | use Smalot\PdfParser\Element\ElementNull; |
||
38 | use Smalot\PdfParser\Element\ElementXRef; |
||
39 | |||
40 | class Page extends PDFObject |
||
41 | { |
||
42 | /** |
||
43 | * @var Font[] |
||
44 | */ |
||
45 | protected $fonts; |
||
46 | |||
47 | /** |
||
48 | * @var PDFObject[] |
||
49 | */ |
||
50 | protected $xobjects; |
||
51 | |||
52 | /** |
||
53 | * @var array |
||
54 | */ |
||
55 | protected $dataTm; |
||
56 | |||
57 | /** |
||
58 | * @param array<\Smalot\PdfParser\Font> $fonts |
||
59 | * |
||
60 | * @internal |
||
61 | */ |
||
public function setFonts($fonts)
{
    // Fonts that are already known are never overwritten; only an empty slot is filled.
    if (!empty($this->fonts)) {
        return;
    }

    $this->fonts = $fonts;
}
||
68 | |||
69 | /** |
||
70 | * @return Font[] |
||
71 | */ |
||
public function getFonts()
{
    // Serve the table that was built earlier (or injected through setFonts()).
    if (null !== $this->fonts) {
        return $this->fonts;
    }

    $resources = $this->get('Resources');

    // Without a queryable resources entry that lists fonts there is nothing to collect.
    if (!method_exists($resources, 'has') || !$resources->has('Font')) {
        return [];
    }

    $fontEntry = $resources->get('Font');

    if ($fontEntry instanceof ElementMissing) {
        return [];
    }

    // The entry is either a header itself or an object that carries one.
    $candidates = $fontEntry instanceof Header
        ? $fontEntry->getElements()
        : $fontEntry->getHeader()->getElements();

    $fontsByName = [];

    foreach ($candidates as $name => $candidate) {
        if (!$candidate instanceof Font) {
            continue;
        }

        $fontsByName[$name] = $candidate;

        // Register the font a second time under its name reduced to digits, '.', '-' and '_'.
        $strippedName = preg_replace('/[^0-9\.\-_]/', '', $name);
        if ('' != $strippedName) {
            $fontsByName[$strippedName] = $candidate;
        }
    }

    // Only a table built from the resources is cached; the empty fallbacks above are not.
    $this->fonts = $fontsByName;

    return $fontsByName;
}
||
110 | |||
/**
 * Look up one of the page's fonts by its resource name.
 *
 * The name is tried verbatim first. If it is unknown, it is reduced to digits,
 * '.', '-' and '_' and looked up again; getFonts() registers fonts under that
 * reduced form too when it builds its table.
 *
 * @param string $id font resource name
 *
 * @return Font|null the matching font, or null when neither form of the name is known
 */
public function getFont(string $id): ?Font
{
    $fonts = $this->getFonts();

    if (isset($fonts[$id])) {
        return $fonts[$id];
    }

    // According to the PDF specs (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 238)
    // "The font resource name presented to the Tf operator is arbitrary, as are the names for all kinds of resources".
    // The unfiltered name was therefore searched first; the cleaned name is only a fallback.
    $cleanedId = preg_replace('/[^0-9\.\-_]/', '', $id);

    if (isset($fonts[$cleanedId])) {
        return $fonts[$cleanedId];
    }

    return null;
}
||
134 | |||
135 | /** |
||
136 | * Support for XObject |
||
137 | * |
||
138 | * @return PDFObject[] |
||
139 | */ |
||
public function getXObjects()
{
    // Reuse the table from an earlier call.
    if (null !== $this->xobjects) {
        return $this->xobjects;
    }

    $resources = $this->get('Resources');

    // Without a queryable resources entry that lists XObjects there is nothing to collect.
    if (!method_exists($resources, 'has') || !$resources->has('XObject')) {
        return [];
    }

    $xobjectEntry = $resources->get('XObject');

    // The entry is either a header itself or an object that carries one.
    $elements = $xobjectEntry instanceof Header
        ? $xobjectEntry->getElements()
        : $xobjectEntry->getHeader()->getElements();

    $xobjectsByName = [];

    foreach ($elements as $name => $element) {
        $xobjectsByName[$name] = $element;

        // Register the object a second time under its name reduced to digits, '.', '-' and '_'.
        $strippedName = preg_replace('/[^0-9\.\-_]/', '', $name);
        if ('' != $strippedName) {
            $xobjectsByName[$strippedName] = $element;
        }
    }

    // Only a table built from the resources is cached; the empty fallback above is not.
    $this->xobjects = $xobjectsByName;

    return $xobjectsByName;
}
||
172 | |||
/**
 * Look up one of the page's XObjects by its resource name.
 *
 * Only the name exactly as given is searched; unlike getFont(), no lookup
 * with a cleaned-up name is attempted.
 *
 * @param string $id XObject resource name
 *
 * @return PDFObject|null the matching object, or null when the name is unknown
 */
public function getXObject(string $id): ?PDFObject
{
    return $this->getXObjects()[$id] ?? null;
}
||
190 | |||
/**
 * Extract the text of this page from its 'Contents' entry.
 *
 * When the contents consist of several parts (a numerically keyed header or an
 * ElementArray), the parts are concatenated into one virtual PDFObject first.
 *
 * @param self|null $page not used by this method
 *
 * @return string the extracted text, or '' when the page has no usable contents
 */
public function getText(?self $page = null): string
{
    $contents = $this->get('Contents');

    if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
        return '';
    }

    if ($contents instanceof PDFObject) {
        $elements = $contents->getHeader()->getElements();

        // A numerically keyed header is treated as a list of content parts to be glued together.
        if (is_numeric(key($elements))) {
            $new_content = '';

            foreach ($elements as $element) {
                if ($element instanceof ElementXRef) {
                    $new_content .= $element->getObject()->getContent();
                } else {
                    $new_content .= $element->getContent();
                }
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        }
    } elseif ($contents instanceof ElementArray) {
        // Create a virtual global content.
        $new_content = '';

        foreach ($contents->getContent() as $content) {
            $new_content .= $content->getContent()."\n";
        }

        $header = new Header([], $this->document);
        $contents = new PDFObject($this->document, $header, $new_content, $this->config);
    }

    /*
     * Elements referencing each other on the same page can cause endless loops during text parsing.
     * To combat this we keep a recursionStack containing already parsed elements on the page.
     * The stack is emptied once the page's text has been collected. This happens in a finally
     * block so that a failure during parsing does not leave stale entries behind in the static
     * stack for the next call.
     */
    try {
        return $contents->getText($this);
    } finally {
        PDFObject::$recursionStack = [];
    }
}
||
240 | |||
241 | /** |
||
242 | * Return true if the current page is a (setasign\Fpdi\Fpdi) FPDI/FPDF document |
||
243 | * |
||
244 | * The metadata 'Producer' should have the value of "FPDF" . FPDF_VERSION if the |
||
245 | * pdf file was generated by FPDF/FPDI. |
||
246 | * |
||
247 | * @return bool true if the current page is a FPDI/FPDF document |
||
248 | */ |
||
public function isFpdf(): bool
{
    $details = $this->document->getDetails();

    // No 'Producer' entry at all: cannot be identified as FPDF output.
    if (!\array_key_exists('Producer', $details)) {
        return false;
    }

    $producer = $details['Producer'];

    // Only the first four characters are compared, so any "FPDF..." value matches.
    return \is_string($producer) && 0 === strncmp($producer, 'FPDF', 4);
}
||
259 | |||
260 | /** |
||
261 | * Return the page number of the PDF document of the page object |
||
262 | * |
||
263 | * @return int the page number |
||
264 | */ |
||
public function getPageNumber(): int
{
    $pages = $this->document->getPages();
    $total = \count($pages);

    // Walk the pages until this very instance is found; the result is a zero-based index.
    // NOTE(review): when the page is not among the document's pages, the page count is returned.
    $index = 0;
    while ($index < $total && $pages[$index] !== $this) {
        ++$index;
    }

    return $index;
}
||
277 | |||
278 | /** |
||
279 | * Return the Object of the page if the document is a FPDF/FPDI document |
||
280 | * |
||
281 | * If the document was generated by FPDF/FPDI it returns the |
||
282 | * PDFObject of the given page |
||
283 | * |
||
284 | * @return PDFObject The PDFObject for the page |
||
285 | */ |
||
public function getPDFObjectForFpdf(): PDFObject
{
    // The page's own index, as reported by getPageNumber(), is used as the key into its XObjects.
    $pageIndex = $this->getPageNumber();
    $candidates = $this->getXObjects();

    return $candidates[$pageIndex];
}
||
293 | |||
294 | /** |
||
295 | * Return a new PDFObject of the document created with FPDF/FPDI |
||
296 | * |
||
297 | * For a document generated by FPDF/FPDI, it generates a |
||
298 | * new PDFObject for that document |
||
299 | * |
||
300 | * @return PDFObject The PDFObject |
||
301 | */ |
||
public function createPDFObjectForFpdf(): PDFObject
{
    $source = $this->getPDFObjectForFpdf();

    // Build a fresh PDFObject from the source object's content, header, document and config.
    $content = $source->getContent();
    $sourceHeader = $source->getHeader();

    return new PDFObject($source->document, $sourceHeader, $content, $source->config);
}
||
311 | |||
312 | /** |
||
313 | * Return page if document is a FPDF/FPDI document |
||
314 | * |
||
315 | * @return Page The page |
||
316 | */ |
||
public function createPageForFpdf(): self
{
    $source = $this->getPDFObjectForFpdf();

    // Same construction as createPDFObjectForFpdf(), but the result is a Page.
    $content = $source->getContent();
    $sourceHeader = $source->getHeader();

    return new self($source->document, $sourceHeader, $content, $source->config);
}
||
326 | |||
/**
 * Extract the text of this page as an array of text fragments.
 *
 * For documents produced by FPDF/FPDI (see isFpdf()) the text is taken from
 * the page's FPDF object. Otherwise it is taken from the 'Contents' entry,
 * with multi-part contents concatenated into one virtual PDFObject first.
 *
 * @param self|null $page not used by this method
 *
 * @return array the text fragments, or an empty array when the page has no usable contents
 */
public function getTextArray(?self $page = null): array
{
    if ($this->isFpdf()) {
        $pdfObject = $this->getPDFObjectForFpdf();
        $newPdfObject = $this->createPDFObjectForFpdf();

        return $newPdfObject->getTextArray($pdfObject);
    }

    $contents = $this->get('Contents');

    if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
        return [];
    }

    if ($contents instanceof PDFObject) {
        $elements = $contents->getHeader()->getElements();

        if (is_numeric(key($elements))) {
            // A numerically keyed header is treated as a list of content parts to be glued together.
            $new_content = '';

            /** @var PDFObject $element */
            foreach ($elements as $element) {
                if ($element instanceof ElementXRef) {
                    $new_content .= $element->getObject()->getContent();
                } else {
                    $new_content .= $element->getContent();
                }
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        } else {
            // Parse with the page as context; if that fails, fall back to parsing without it.
            // The first result is returned as is, rather than discarded and computed a second time.
            try {
                return $contents->getTextArray($this);
            } catch (\Throwable $e) {
                return $contents->getTextArray();
            }
        }
    } elseif ($contents instanceof ElementArray) {
        // Create a virtual global content.
        $new_content = '';

        /** @var PDFObject $content */
        foreach ($contents->getContent() as $content) {
            $new_content .= $content->getContent()."\n";
        }

        $header = new Header([], $this->document);
        $contents = new PDFObject($this->document, $header, $new_content, $this->config);
    }

    return $contents->getTextArray($this);
}
||
383 | |||
384 | /** |
||
385 | * Gets all the text data with its internal representation of the page. |
||
386 | * |
||
387 | * Returns an array with the data and the internal representation |
||
388 | */ |
||
public function extractRawData(): array
{
    $content = $this->get('Contents');
    $values = $content->getContent();

    if (\is_array($values)) {
        // Multi-part contents: glue the parts together and let the page itself split them up.
        $text = '';
        foreach ($values as $part) {
            $text .= $part->getContent();
        }
        $splitter = $this;
    } else {
        // Single content object; FPDF/FPDI documents use the page's FPDF object instead.
        if ($this->isFpdf()) {
            $content = $this->getPDFObjectForFpdf();
        }
        $splitter = $content;
        $text = $content->getContent();
    }

    // Flatten the commands of every section into one list, in document order.
    $commands = [];
    foreach ($splitter->getSectionsText($text) as $sectionText) {
        foreach ($splitter->getCommandsText($sectionText) as $command) {
            $commands[] = $command;
        }
    }

    return $commands;
}
||
424 | |||
425 | /** |
||
426 | * Gets all the decoded text data with its internal representation from a page. |
||
427 | * |
||
428 | * @param array $extractedRawData the extracted data returned by extractRawData or |
||
429 | * null if extractRawData should be called |
||
430 | * |
||
431 | * @return array An array with the data and the internal representation |
||
432 | */ |
||
public function extractDecodedRawData(?array $extractedRawData = null): array
{
    // A missing or empty argument means the raw data has to be extracted from the page first.
    if (empty($extractedRawData)) {
        $extractedRawData = $this->extractRawData();
    }

    /*
     * Shared tail of the Tj/TJ decoding: resolve the backslash escapes, convert the
     * result from ISO-8859-1 to UTF-8 and, when a font is active, let it decode the content.
     */
    $finishDecoding = static function ($text, ?Font $font) {
        $text = str_replace(
            ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '],
            ['\\', '(', ')', "\n", "\r", "\t", ' '],
            $text
        );
        $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');

        return null !== $font ? $font->decodeContent($text) : $text;
    };

    /** @var Font|null $currentFont */
    $currentFont = null;
    $clippedFont = null;

    // If the document is a FPDI/FPDF one, the page built from its FPDF object has the correct fonts.
    $fpdfPage = $this->isFpdf() ? $this->createPageForFpdf() : null;

    foreach ($extractedRawData as &$command) {
        $operator = $command['o'];

        if ('Tj' == $operator || 'TJ' == $operator) {
            $data = $command['c'];

            if (!\is_array($data)) {
                // Single string operand. Without an active font the text is dropped ('' is decoded).
                $octalDecoded = isset($currentFont) ? $currentFont->decodeOctal($data) : '';
                $command['c'] = $finishDecoding($octalDecoded, $currentFont);
                continue;
            }

            // Array operand: only the entries at even positions are decoded; odd ones are left untouched.
            $numText = \count($data);
            for ($i = 0; $i < $numText; $i += 2) {
                $rawText = $data[$i]['c'];
                $octalDecoded = isset($currentFont) ? $currentFont->decodeOctal($rawText) : $rawText;
                $command['c'][$i]['c'] = $finishDecoding($octalDecoded, $currentFont);
            }
        } elseif ('Tf' == $operator || 'TF' == $operator) {
            // The font name is the first space-separated token of the operand.
            $fontId = explode(' ', $command['c'])[0];
            $currentFont = isset($fpdfPage) ? $fpdfPage->getFont($fontId) : $this->getFont($fontId);
        } elseif ('Q' == $operator) {
            // NOTE(review): a single slot is used for q/Q, so nested q ... q ... Q ... Q pairs
            // restore the font saved by the innermost q twice - confirm this is acceptable.
            $currentFont = $clippedFont;
        } elseif ('q' == $operator) {
            $clippedFont = $currentFont;
        }
    }
    // Break the reference so that later writes to $command cannot alter the last array entry.
    unset($command);

    return $extractedRawData;
}
||
500 | |||
501 | /** |
||
502 | * Gets just the Text commands that are involved in text positions and |
||
503 | * Text Matrix (Tm) |
||
504 | * |
||
505 | * It extract just the PDF commands that are involved with text positions, and |
||
506 | * the Text Matrix (Tm). These are: BT, ET, TL, Td, TD, Tm, T*, Tj, ', ", and TJ |
||
507 | * |
||
508 | * @param array $extractedDecodedRawData The data extracted by extractDecodedRawData. |
||
509 | * If it is null, the method extractDecodedRawData is called. |
||
510 | * |
||
511 | * @return array An array with the text command of the page |
||
512 | */ |
||
public function getDataCommands(?array $extractedDecodedRawData = null): array
{
    // A missing or empty argument means the decoded data has to be extracted from the page first.
    if (empty($extractedDecodedRawData)) {
        $extractedDecodedRawData = $this->extractDecodedRawData();
    }

    /*
     * Operators that are kept. Everything else is dropped.
     */
    $keptOperators = [
        // BT: begin a text object, initializing Tm and Tlm to the identity matrix.
        'BT',
        // ET: end a text object, discarding the text matrix.
        'ET',
        // leading TL: set the text leading, Tl, used by the T*, ' and " operators. Initial value: 0.
        'TL',
        // tx ty Td: move to the start of the next line, offset from the start of the
        // current line by tx, ty.
        'Td',
        // tx ty TD: same as Td, and as a side effect sets the leading. Same effect as
        // "-ty TL" followed by "tx ty Td".
        'TD',
        // a b c d e f Tm: set the text matrix, Tm, and the text line matrix, Tlm.
        // The initial value of both is the identity matrix [1 0 0 1 0 0].
        'Tm',
        // T*: move to the start of the next line. Same effect as "0 Tl Td", where Tl is
        // the current leading parameter in the text state.
        'T*',
        // string Tj: show a text string.
        'Tj',
        // string ': move to the next line and show a text string. Same effect as "T*"
        // followed by "string Tj".
        "'",
        // aw ac string ": move to the next line and show a text string, using aw as the
        // word spacing and ac as the character spacing. Same effect as "aw Tw", "ac Tc", "string '".
        '"',
        // Tf / TF: font selection.
        'Tf',
        'TF',
        // array TJ: show one or more text strings, allowing individual glyph positioning.
        // Each element of the array can be a string or a number. A string is shown; a number
        // adjusts the text position by translating the text matrix. The amount is subtracted
        // from the current horizontal or vertical coordinate, depending on the writing mode.
        // In the default coordinate system a positive adjustment moves the next glyph to the
        // left or down.
        'TJ',
    ];

    $textCommands = [];

    foreach ($extractedDecodedRawData as $command) {
        // Non-strict in_array() on purpose: it compares the same way a switch statement does.
        if (\in_array($command['o'], $keptOperators)) {
            $textCommands[] = $command;
        }
    }

    return $textCommands;
}
||
649 | |||
650 | /** |
||
651 | * Gets the Text Matrix of the text in the page |
||
652 | * |
||
653 | * Return an array where every item is an array where the first item is the |
||
654 | * Text Matrix (Tm) and the second is a string with the text data. The Text matrix |
||
655 | * is an array of 6 numbers. The last 2 numbers are the coordinates X and Y of the |
||
656 | * text. The first 4 numbers have to do with the scaling, rotation and skew of the text. |
||
657 | * |
||
658 | * @param array $dataCommands the data extracted by getDataCommands |
||
659 | * if null getDataCommands is called |
||
660 | * |
||
661 | * @return array an array with the data of the page including the Tm information |
||
662 | * of any text in the page |
||
663 | */ |
||
public function getDataTm(?array $dataCommands = null): array
{
    // A missing or empty argument means the text commands have to be collected from the page first.
    if (empty($dataCommands)) {
        $dataCommands = $this->getDataCommands();
    }

    // At the beginning of a text object Tm is the identity matrix.
    $identityMatrix = ['1', '0', '0', '1', '0', '0'];

    // Initial text leading, as used by the T*, ' and " operators.
    $initialLeading = 0;

    /*
     * Positions inside the six-element text matrix.
     * For the vertical scaling index see
     * https://github.com/smalot/pdfparser/pull/559#discussion_r1053415500
     */
    $scaleXIdx = 0; // horizontal scaling
    $scaleYIdx = 3; // vertical scaling
    $posXIdx = 4;   // X coordinate
    $posYIdx = 5;   // Y coordinate

    // x,y-coordinates of the text space origin in user units, following the current string.
    $posX = 0;
    $posY = 0;

    $matrix = $identityMatrix;
    $leading = $initialLeading;

    // Font data defaults, reported until the first Tf command is seen.
    $fontId = -1;
    $fontSize = 1;

    $texts = $this->getTextArray();
    $result = [];

    foreach ($dataCommands as $command) {
        // Every emitted entry consumes one text from getTextArray(). Stop once they are
        // used up, so no non-existent array index is accessed (Issues #575, #576).
        $consumed = \count($result);
        if (\count($texts) <= $consumed) {
            break;
        }
        $currentText = $texts[$consumed];

        // ET (end of a text object) needs no handling and falls into the default branch.
        switch ($command['o']) {
            // BT: begin a text object, resetting matrix, leading and position.
            case 'BT':
                $matrix = $identityMatrix;
                $leading = $initialLeading;
                $posX = 0;
                $posY = 0;
                break;

            // leading TL: set the text leading, scaled by the vertical scaling.
            case 'TL':
                $leading = (float) $command['c'] * (float) $matrix[$scaleYIdx];
                break;

            // tx ty Td: move to the start of the next line, offset from the start of the
            // current line by tx, ty.
            case 'Td':
                [$offsetX, $offsetY] = explode(' ', $command['c']);
                $posX += (float) $offsetX * (float) $matrix[$scaleXIdx];
                $posY += (float) $offsetY * (float) $matrix[$scaleYIdx];
                $matrix[$posXIdx] = (string) $posX;
                $matrix[$posYIdx] = (string) $posY;
                break;

            // tx ty TD: like Td, but also sets the leading; same effect as "-ty TL" then "tx ty Td".
            case 'TD':
                [$offsetX, $offsetY] = explode(' ', $command['c']);
                $leading = -((float) $offsetY * (float) $matrix[$scaleYIdx]);
                $posX += (float) $offsetX * (float) $matrix[$scaleXIdx];
                $posY += (float) $offsetY * (float) $matrix[$scaleYIdx];
                $matrix[$posXIdx] = (string) $posX;
                $matrix[$posYIdx] = (string) $posY;
                break;

            // a b c d e f Tm: replace the text matrix and take the position from it.
            case 'Tm':
                $matrix = explode(' ', $command['c']);
                $posX = (float) $matrix[$posXIdx];
                $posY = (float) $matrix[$posYIdx];
                break;

            // T*: move to the start of the next line using the current leading.
            case 'T*':
                $posY -= $leading;
                $matrix[$posYIdx] = (string) $posY;
                break;

            // string Tj: show a text string.
            // array TJ: show one or more text strings with individual glyph positioning.
            // Both emit the current matrix and text, plus font id and size when configured.
            case 'Tj':
            case 'TJ':
                $entry = [$matrix, $currentText];
                if ($this->config->getDataTmFontInfoHasToBeIncluded()) {
                    $entry[] = $fontId;
                    $entry[] = $fontSize;
                }
                $result[] = $entry;
                break;

            // string ': move to the next line and show a text string ("T*" then "string Tj").
            // Font information is not appended here.
            case "'":
                $posY -= $leading;
                $matrix[$posYIdx] = (string) $posY;
                $result[] = [$matrix, $currentText];
                break;

            // aw ac string ": move to the next line and show a text string, using aw as the
            // word spacing and ac as the character spacing.
            // NOTE(review): the third space-separated token of the current text is emitted;
            // the original code marks this as still to be verified.
            case '"':
                $tokens = explode(' ', $currentText);
                $posY -= $leading;
                $matrix[$posYIdx] = (string) $posY;
                $result[] = [$matrix, $tokens[2]];
                break;

            /*
             * From PDF 1.0 specification, page 106:
             *     fontname size Tf Set font and size
             *     Sets the text font and text size in the graphics state. There is no default value for
             *     either fontname or size; they must be selected using Tf before drawing any text.
             *     fontname is a resource name. size is a number expressed in text space units.
             *
             * Source: https://ia902503.us.archive.org/10/items/pdfy-0vt8s-egqFwDl7L2/PDF%20Reference%201.0.pdf
             * Introduced with https://github.com/smalot/pdfparser/pull/516
             */
            case 'Tf':
                [$fontId, $fontSize] = explode(' ', $command['c'], 2);
                break;

            default:
        }
    }

    // Keep the result on the instance as well as returning it.
    $this->dataTm = $result;

    return $result;
}
||
890 | |||
/**
 * Gets the text items that are positioned around the given coordinates (X, Y).
 *
 * The items searched are the ones stored in $this->dataTm; when that cache is
 * empty, getDataTm() is called first to fill it. For every item, elements 4 and 5
 * of its text matrix (Tm) are read as the X and Y position of the text. An item
 * is kept when its position lies within the tolerance of every coordinate that
 * was given. The data returned by getDataTm() can be inspected to find out which
 * coordinates a given text has.
 *
 * @param float|null $x      The X value of the coordinate to search for. If null,
 *                           only the Y value is considered (same row).
 * @param float|null $y      The Y value of the coordinate to search for. If null,
 *                           only the X value is considered (same column).
 * @param float      $xError The allowed deviation (less or more) for an X to be "near"
 * @param float      $yError The allowed deviation (less or more) for a Y to be "near"
 *
 * @return array A list of [Tm, text] pairs for the texts near the given coordinates.
 *               Only the Tm and the text of each item are returned. If no text is
 *               near the coordinates, or if both x and y are null, an empty array
 *               is returned.
 */
public function getTextXY(?float $x = null, ?float $y = null, float $xError = 0, float $yError = 0): array
{
    // Lazily build the text matrix data; empty() covers both "not set" and "no entries".
    if (empty($this->dataTm)) {
        $this->getDataTm();
    }

    // Without any coordinate there is nothing to compare against.
    if (null === $x && null === $y) {
        return [];
    }

    $extractedData = [];
    foreach ($this->dataTm as $item) {
        $tm = $item[0];
        $xTm = (float) $tm[4];
        $yTm = (float) $tm[5];

        // A coordinate that was not given (null) does not restrict the search.
        $xIsNear = null === $x
            || (($xTm >= ($x - $xError)) && ($xTm <= ($x + $xError)));
        $yIsNear = null === $y
            || (($yTm >= ($y - $yError)) && ($yTm <= ($y + $yError)));

        if ($xIsNear && $yIsNear) {
            $extractedData[] = [$tm, $item[1]];
        }
    }

    return $extractedData;
}
961 | } |
||
962 |
This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.
Consider making the comparison explicit by using
empty(...)
or !empty(...)
instead.