smalot /
pdfparser
| 1 | <?php |
||||||
| 2 | |||||||
| 3 | /** |
||||||
| 4 | * @file |
||||||
| 5 | * This file is part of the PdfParser library. |
||||||
| 6 | * |
||||||
| 7 | * @author Sébastien MALOT <[email protected]> |
||||||
| 8 | * |
||||||
| 9 | * @date 2017-01-03 |
||||||
| 10 | * |
||||||
| 11 | * @license LGPLv3 |
||||||
| 12 | * |
||||||
| 13 | * @url <https://github.com/smalot/pdfparser> |
||||||
| 14 | * |
||||||
| 15 | * PdfParser is a pdf library written in PHP, extraction oriented. |
||||||
| 16 | * Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
||||||
| 17 | * |
||||||
| 18 | * This program is free software: you can redistribute it and/or modify |
||||||
| 19 | * it under the terms of the GNU Lesser General Public License as published by |
||||||
| 20 | * the Free Software Foundation, either version 3 of the License, or |
||||||
| 21 | * (at your option) any later version. |
||||||
| 22 | * |
||||||
| 23 | * This program is distributed in the hope that it will be useful, |
||||||
| 24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
| 25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||||||
| 26 | * GNU Lesser General Public License for more details. |
||||||
| 27 | * |
||||||
| 28 | * You should have received a copy of the GNU Lesser General Public License |
||||||
| 29 | * along with this program. |
||||||
| 30 | * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
||||||
| 31 | */ |
||||||
| 32 | |||||||
| 33 | namespace Smalot\PdfParser; |
||||||
| 34 | |||||||
| 35 | use Smalot\PdfParser\Element\ElementArray; |
||||||
| 36 | use Smalot\PdfParser\Element\ElementMissing; |
||||||
| 37 | use Smalot\PdfParser\Element\ElementNull; |
||||||
| 38 | use Smalot\PdfParser\Element\ElementXRef; |
||||||
| 39 | |||||||
| 40 | class Page extends PDFObject |
||||||
| 41 | { |
||||||
| 42 | /** |
||||||
| 43 | * @var Font[] |
||||||
| 44 | */ |
||||||
| 45 | protected $fonts; |
||||||
| 46 | |||||||
| 47 | /** |
||||||
| 48 | * @var PDFObject[] |
||||||
| 49 | */ |
||||||
| 50 | protected $xobjects; |
||||||
| 51 | |||||||
| 52 | /** |
||||||
| 53 | * @var array |
||||||
| 54 | */ |
||||||
| 55 | protected $dataTm; |
||||||
| 56 | |||||||
/**
 * Attaches a font table to this page, unless a non-empty one is already stored.
 *
 * @param array<\Smalot\PdfParser\Font> $fonts fonts to associate with the page
 *
 * @internal
 */
public function setFonts($fonts)
{
    // A font table that is already present and non-empty is never replaced.
    if (!empty($this->fonts)) {
        return;
    }

    $this->fonts = $fonts;
}
||||||
| 68 | |||||||
/**
 * Returns the fonts declared in the "Font" entry of this page's "Resources".
 *
 * Every font is stored under its resource name. It is additionally stored
 * under a cleaned alias (the name reduced to digits, dots, dashes and
 * underscores), unless that alias is itself the exact name of a declared
 * font: an alias must never shadow a real resource name.
 *
 * The table is cached on the page once it has been built. The empty arrays
 * returned when no usable "Font" entry exists are not cached.
 *
 * @return Font[]
 */
public function getFonts()
{
    if (null !== $this->fonts) {
        return $this->fonts;
    }

    $resources = $this->get('Resources');

    if (!method_exists($resources, 'has') || !$resources->has('Font')) {
        return [];
    }

    // Resolve the entry once instead of once per check.
    $fontEntry = $resources->get('Font');

    if ($fontEntry instanceof ElementMissing) {
        return [];
    }

    $elements = $fontEntry instanceof Header
        ? $fontEntry->getElements()
        : $fontEntry->getHeader()->getElements();

    // Only genuine Font objects take part in the table.
    $fonts = [];
    foreach ($elements as $id => $element) {
        if ($element instanceof Font) {
            $fonts[$id] = $element;
        }
    }

    $table = [];
    foreach ($fonts as $id => $font) {
        $table[$id] = $font;

        // Store too on cleaned id value (only numeric), but never on top of
        // a font whose exact resource name equals that cleaned value.
        $alias = (string) preg_replace('/[^0-9\.\-_]/', '', (string) $id);
        if ('' !== $alias && !\array_key_exists($alias, $fonts)) {
            $table[$alias] = $font;
        }
    }

    return $this->fonts = $table;
}
||||||
| 110 | |||||||
/**
 * Looks up one of this page's fonts by its resource name.
 *
 * The name is tried exactly as given first. If that fails, the lookup is
 * repeated with the name reduced to digits, dots, dashes and underscores.
 *
 * @param string $id font resource name, e.g. the operand of a Tf operator
 *
 * @return Font|null the matching font, or null when neither lookup succeeds
 */
public function getFont(string $id): ?Font
{
    $fonts = $this->getFonts();

    if (isset($fonts[$id])) {
        return $fonts[$id];
    }

    // According to the PDF specs (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 238)
    // "The font resource name presented to the Tf operator is arbitrary, as are the names for all kinds of resources"
    // So the unfiltered name is searched first (above) and the cleaned name is only a fallback.
    $cleanedId = (string) preg_replace('/[^0-9\.\-_]/', '', $id);

    return $fonts[$cleanedId] ?? null;
}
||||||
| 134 | |||||||
/**
 * Returns the XObjects declared in the "XObject" entry of this page's "Resources".
 *
 * Every XObject is stored under its resource name. It is additionally stored
 * under a cleaned alias (the name reduced to digits, dots, dashes and
 * underscores), unless that alias is itself the exact name of a declared
 * XObject: an alias must never shadow a real resource name.
 *
 * The table is cached on the page once it has been built. The empty array
 * returned when there is no "XObject" entry is not cached.
 *
 * @return PDFObject[]
 */
public function getXObjects()
{
    if (null !== $this->xobjects) {
        return $this->xobjects;
    }

    $resources = $this->get('Resources');

    if (!method_exists($resources, 'has') || !$resources->has('XObject')) {
        return [];
    }

    // Resolve the entry once instead of once per check.
    $xobjectEntry = $resources->get('XObject');

    $xobjects = $xobjectEntry instanceof Header
        ? $xobjectEntry->getElements()
        : $xobjectEntry->getHeader()->getElements();

    $table = [];
    foreach ($xobjects as $id => $xobject) {
        $table[$id] = $xobject;

        // Store too on cleaned id value (only numeric), but never on top of
        // an XObject whose exact resource name equals that cleaned value.
        $alias = (string) preg_replace('/[^0-9\.\-_]/', '', (string) $id);
        if ('' !== $alias && !\array_key_exists($alias, $xobjects)) {
            $table[$alias] = $xobject;
        }
    }

    return $this->xobjects = $table;
}
||||||
| 172 | |||||||
/**
 * Returns the XObject registered on this page under the given name.
 *
 * Only the name exactly as given is looked up in the table built by
 * getXObjects(); no cleaning of the name is attempted here.
 *
 * @param string $id XObject resource name
 *
 * @return PDFObject|null the XObject, or null when the name is unknown
 */
public function getXObject(string $id): ?PDFObject
{
    return $this->getXObjects()[$id] ?? null;
}
||||||
| 190 | |||||||
/**
 * Extracts the text of this page from its "Contents" entry.
 *
 * When "Contents" is split over several streams (a PDFObject whose header
 * elements are numerically keyed, or an ElementArray) the streams are first
 * concatenated into one virtual PDFObject.
 *
 * @param self|null $page not read by this method; the page itself is always
 *                        passed on to the content object
 *
 * @return string the extracted text, or '' when the page has no usable contents
 */
public function getText(?self $page = null): string
{
    $contents = $this->get('Contents');

    if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
        return '';
    }

    if ($contents instanceof PDFObject) {
        $elements = $contents->getHeader()->getElements();

        if (is_numeric(key($elements))) {
            $new_content = '';

            foreach ($elements as $element) {
                if ($element instanceof ElementXRef) {
                    $new_content .= $element->getObject()->getContent();
                } else {
                    $new_content .= $element->getContent();
                }
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        }
    } elseif ($contents instanceof ElementArray) {
        // Create a virtual global content.
        $new_content = '';

        foreach ($contents->getContent() as $content) {
            $new_content .= $content->getContent()."\n";
        }

        $header = new Header([], $this->document);
        $contents = new PDFObject($this->document, $header, $new_content, $this->config);
    }

    /*
     * Elements referencing each other on the same page can cause endless loops during text parsing.
     * To combat this we keep a recursionStack containing already parsed elements on the page.
     * The stack is emptied here after getting text from a page; the finally block makes sure
     * this also happens when parsing throws, so a failed page cannot leak entries into the next one.
     */
    try {
        return $contents->getText($this);
    } finally {
        PDFObject::$recursionStack = [];
    }
}
||||||
| 240 | |||||||
/**
 * Tells whether the document this page belongs to was produced by FPDF/FPDI
 * (setasign\Fpdi\Fpdi).
 *
 * The check is based on the 'Producer' entry of the document details: it must
 * be a string starting with "FPDF" (FPDF writes "FPDF" . FPDF_VERSION there).
 *
 * @return bool true if the page belongs to a FPDF/FPDI document; false otherwise,
 *              including when the page has no document attached
 */
public function isFpdf(): bool
{
    if (null === $this->document) {
        return false;
    }

    // Fetch the details once; a missing or non-string producer yields null here.
    $producer = $this->document->getDetails()['Producer'] ?? null;

    return \is_string($producer) && 0 === strncmp($producer, 'FPDF', 4);
}
||||||
| 259 | |||||||
/**
 * Returns the position of this page within the pages of its document.
 *
 * The position is zero-based: it is the index at which this very object is
 * found in the array returned by the document's getPages(). If the page is
 * not found, the total number of pages is returned.
 *
 * @return int the zero-based page index
 */
public function getPageNumber(): int
{
    $pages = $this->document->getPages();
    $total = \count($pages);

    $index = 0;
    while ($index < $total && $pages[$index] !== $this) {
        ++$index;
    }

    return $index;
}
||||||
| 277 | |||||||
/**
 * Returns the XObject that holds this page's content in a FPDF/FPDI document.
 *
 * The object is looked up in the page's XObject table under the page's own
 * index, as returned by getPageNumber().
 *
 * @return PDFObject the PDFObject for the page
 */
public function getPDFObjectForFpdf(): PDFObject
{
    $index = $this->getPageNumber();
    $candidates = $this->getXObjects();

    return $candidates[$index];
}
||||||
| 293 | |||||||
/**
 * Builds a new PDFObject from the FPDF/FPDI object of this page.
 *
 * The new object reuses the document, header, content and config of the
 * object returned by getPDFObjectForFpdf().
 *
 * @return PDFObject the newly created PDFObject
 */
public function createPDFObjectForFpdf(): PDFObject
{
    $source = $this->getPDFObjectForFpdf();
    $content = $source->getContent();
    $header = $source->getHeader();

    return new PDFObject($source->document, $header, $content, $source->config);
}
||||||
| 311 | |||||||
/**
 * Builds a new Page from the FPDF/FPDI object of this page.
 *
 * The new page reuses the document, header, content and config of the
 * object returned by getPDFObjectForFpdf().
 *
 * @return Page the newly created page
 */
public function createPageForFpdf(): self
{
    $source = $this->getPDFObjectForFpdf();
    $content = $source->getContent();
    $header = $source->getHeader();

    return new self($source->document, $header, $content, $source->config);
}
||||||
| 326 | |||||||
/**
 * Extracts the text of this page as an array of text fragments.
 *
 * For FPDF/FPDI documents the text is read from the page's FPDF object.
 * Otherwise it is read from the "Contents" entry; contents split over several
 * streams (a PDFObject whose header elements are numerically keyed, or an
 * ElementArray) are first concatenated into one virtual PDFObject.
 *
 * @param self|null $page not read by this method
 *
 * @return array the text fragments, or [] when the page has no usable contents
 */
public function getTextArray(?self $page = null): array
{
    if ($this->isFpdf()) {
        $pdfObject = $this->getPDFObjectForFpdf();
        $newPdfObject = $this->createPDFObjectForFpdf();

        return $newPdfObject->getTextArray($pdfObject);
    }

    $contents = $this->get('Contents');

    if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
        return [];
    }

    if ($contents instanceof PDFObject) {
        $elements = $contents->getHeader()->getElements();

        if (!is_numeric(key($elements))) {
            // Single content stream: extract with the page as context and fall
            // back to a context-free extraction if that fails. The first result
            // is returned directly instead of being thrown away and recomputed.
            try {
                return $contents->getTextArray($this);
            } catch (\Throwable $e) {
                return $contents->getTextArray();
            }
        }

        $new_content = '';

        /** @var PDFObject $element */
        foreach ($elements as $element) {
            if ($element instanceof ElementXRef) {
                $new_content .= $element->getObject()->getContent();
            } else {
                $new_content .= $element->getContent();
            }
        }

        $header = new Header([], $this->document);
        $contents = new PDFObject($this->document, $header, $new_content, $this->config);
    } elseif ($contents instanceof ElementArray) {
        // Create a virtual global content.
        $new_content = '';

        /** @var PDFObject $content */
        foreach ($contents->getContent() as $content) {
            $new_content .= $content->getContent()."\n";
        }

        $header = new Header([], $this->document);
        $contents = new PDFObject($this->document, $header, $new_content, $this->config);
    }

    return $contents->getTextArray($this);
}
||||||
| 383 | |||||||
/**
 * Gets all the commands of the page content in their raw, undecoded form.
 *
 * When the "Contents" entry yields an array of sections, their contents are
 * concatenated and parsed by the page itself. Otherwise the content object
 * (or, for FPDF/FPDI documents, the page's FPDF object) parses its own content.
 *
 * @return array the flat list of commands found in every text section
 */
public function extractRawData(): array
{
    $contents = $this->get('Contents');
    $sections = $contents->getContent();

    if (\is_array($sections)) {
        // Several sections: glue them together and let the page do the parsing.
        $rawContent = '';
        foreach ($sections as $section) {
            $rawContent .= $section->getContent();
        }
        $parser = $this;
    } else {
        if ($this->isFpdf()) {
            $contents = $this->getPDFObjectForFpdf();
        }
        $parser = $contents;
        $rawContent = $contents->getContent();
    }

    $commands = [];
    foreach ($parser->getSectionsText($rawContent) as $sectionText) {
        foreach ($parser->getCommandsText($sectionText) as $command) {
            $commands[] = $command;
        }
    }

    return $commands;
}
||||||
| 424 | |||||||
/**
 * Gets all the commands of a page with the operands of the text-showing
 * operators (Tj and TJ) decoded.
 *
 * The font selected by the most recent Tf/TF command is used for decoding.
 * A 'q' command remembers the current font and a 'Q' command restores the
 * font remembered last.
 *
 * @param array|null $extractedRawData the data returned by extractRawData(); when
 *                                     null or empty, extractRawData() is called
 *
 * @return array the commands, in their original order, with decoded text operands
 */
public function extractDecodedRawData(?array $extractedRawData = null): array
{
    if (empty($extractedRawData)) {
        $extractedRawData = $this->extractRawData();
    }

    // Common tail of the decoding pipeline shared by Tj and TJ operands:
    // unescape PDF string escapes, convert to UTF-8, then let the font (if any)
    // translate the content.
    $decode = static function ($text, $font) {
        $text = str_replace(
            ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '],
            ['\\', '(', ')', "\n", "\r", "\t", ' '],
            $text
        );
        $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');

        return null !== $font ? $font->decodeContent($text) : $text;
    };

    /** @var Font|null $currentFont */
    $currentFont = null;
    $clippedFont = null;
    // If document is a FPDI/FPDF the dedicated page has the correct fonts.
    $fpdfPage = $this->isFpdf() ? $this->createPageForFpdf() : null;

    foreach ($extractedRawData as &$command) {
        $operator = $command['o'];

        if ('Tj' === $operator || 'TJ' === $operator) {
            $operand = $command['c'];

            if (!\is_array($operand)) {
                // Without an active font the operand is replaced by an empty
                // string (the array form below keeps the raw text instead).
                $text = null !== $currentFont ? $currentFont->decodeOctal($operand) : '';
                $command['c'] = $decode($text, $currentFont);
                continue;
            }

            // Only the entries at even positions are decoded.
            // NOTE(review): presumably strings and numeric adjustments alternate
            // in the array - confirm against the command parser.
            $count = \count($operand);
            for ($i = 0; $i < $count; $i += 2) {
                $text = $operand[$i]['c'];
                if (null !== $currentFont) {
                    $text = $currentFont->decodeOctal($text);
                }
                $command['c'][$i]['c'] = $decode($text, $currentFont);
            }
        } elseif ('Tf' === $operator || 'TF' === $operator) {
            $fontId = explode(' ', $command['c'])[0];
            $currentFont = null !== $fpdfPage ? $fpdfPage->getFont($fontId) : $this->getFont($fontId);
        } elseif ('Q' === $operator) {
            $currentFont = $clippedFont;
        } elseif ('q' === $operator) {
            $clippedFont = $currentFont;
        }
    }
    // Break the reference left behind by the by-reference foreach.
    unset($command);

    return $extractedRawData;
}
||||||
| 500 | |||||||
/**
 * Gets just the commands that are involved in text positions and the
 * Text Matrix (Tm).
 *
 * The commands kept are: BT, cm, ET, TL, Td, TD, Tm, T*, Tj, ', ", Tf, TF, TJ,
 * q and Q. Every other command is dropped; the order of the kept commands is
 * preserved.
 *
 * @param array|null $extractedDecodedRawData the data returned by extractDecodedRawData();
 *                                            when null or empty, extractDecodedRawData() is called
 *
 * @return array an array with the text commands of the page
 */
public function getDataCommands(?array $extractedDecodedRawData = null): array
{
    if (empty($extractedDecodedRawData)) {
        $extractedDecodedRawData = $this->extractDecodedRawData();
    }

    $keptOperators = [
        // BT: begin a text object, initializing Tm and Tlm to the identity matrix.
        'BT',
        // cm: concatenation matrix that will transform all following Tm.
        'cm',
        // ET: end a text object, discarding the text matrix.
        'ET',
        // leading TL: set the text leading, Tl, used by the T*, ' and " operators.
        // Initial value: 0.
        'TL',
        // tx ty Td: move to the start of the next line, offset from the start of
        // the current line by (tx, ty).
        'Td',
        // tx ty TD: same as Td, but also sets the leading parameter in the text
        // state. Same effect as: "-ty TL" followed by "tx ty Td".
        'TD',
        // a b c d e f Tm: set the text matrix, Tm, and the text line matrix, Tlm.
        // The operands are all numbers; the initial value of both matrices is the
        // identity matrix [1 0 0 1 0 0].
        'Tm',
        // T*: move to the start of the next line. Same effect as "0 Tl Td", where
        // Tl is the current leading parameter in the text state.
        'T*',
        // string Tj: show a text string.
        'Tj',
        // string ': move to the next line and show a text string. Same effect as
        // "T*" followed by "string Tj".
        "'",
        // aw ac string ": move to the next line and show a text string, using aw
        // as the word spacing and ac as the character spacing. Same effect as
        // "aw Tw", "ac Tc", "string '".
        '"',
        // Tf / TF: font selection.
        'Tf',
        'TF',
        // array TJ: show one or more text strings, allowing individual glyph
        // positioning. Each element of the array can be a string or a number. A
        // string is shown; a number adjusts the text position by that amount,
        // i.e. it translates the text matrix, Tm. The amount is subtracted from
        // the current horizontal or vertical coordinate, depending on the writing
        // mode. In the default coordinate system a positive adjustment moves the
        // next glyph to the left or down.
        'TJ',
        // q: save the current graphics state to the stack.
        'q',
        // Q: load the last saved graphics state from the stack.
        'Q',
    ];

    $extractedData = [];
    foreach ($extractedDecodedRawData as $command) {
        if (\in_array($command['o'], $keptOperators, true)) {
            $extractedData[] = $command;
        }
    }

    return $extractedData;
}
||||||
| 667 | |||||||
/**
 * Gets the Text Matrix of the text in the page.
 *
 * Returns an array where every item is an array whose first item is the
 * Text Matrix (Tm) and whose second item is a string with the text data.
 * The Text Matrix is an array of 6 numbers: the last 2 are the X and Y
 * coordinates of the text, the first 4 describe its scaling, rotation and skew.
 * When the config option getDataTmFontInfoHasToBeIncluded() is enabled, every
 * item additionally carries the font id and the font size as third and fourth
 * entries.
 *
 * The result is also stored in $this->dataTm.
 *
 * @param array|null $dataCommands the data extracted by getDataCommands;
 *                                 if null or empty, getDataCommands is called
 *
 * @return array an array with the data of the page including the Tm information
 *               of any text in the page
 */
public function getDataTm(?array $dataCommands = null): array
{
    if (null === $dataCommands || [] === $dataCommands) {
        $dataCommands = $this->getDataCommands();
    }

    /*
     * At the beginning of a text object Tm is the identity matrix
     */
    $defaultTm = ['1', '0', '0', '1', '0', '0'];
    $concatTm = ['1', '0', '0', '1', '0', '0'];
    $graphicsStatesStack = [];

    /*
     * Set the text leading used by T*, ' and " operators
     */
    $defaultTl = 0;

    /*
     * Set default values for font data
     */
    $defaultFontId = -1;
    $defaultFontSize = 1;

    /*
     * Indexes of horizontal/vertical scaling and X,Y-coordinates in the matrix (Tm)
     */
    $hSc = 0; // horizontal scaling
    /**
     * index of vertical scaling in the array that encodes the text matrix.
     * for more information: https://github.com/smalot/pdfparser/pull/559#discussion_r1053415500
     */
    $vSc = 3;
    $x = 4;
    $y = 5;

    /*
     * x,y-coordinates of text space origin in user units
     *
     * These will be assigned the value of the currently printed string
     */
    $Tx = 0;
    $Ty = 0;

    $Tm = $defaultTm;
    $Tl = $defaultTl;
    $fontId = $defaultFontId;
    $fontSize = $defaultFontSize; // reflects fontSize set by Tf or Tfs

    // The config may be null; resolve the option once instead of calling a
    // method on a possibly-null object for every shown string.
    $includeFontInfo = null !== $this->config
        && $this->config->getDataTmFontInfoHasToBeIncluded();

    // Splits a space-separated operand string and pads it to the expected
    // operand count, so malformed commands do not read undefined offsets.
    $operands = static function ($raw, int $count, $fill = '0'): array {
        return array_pad(explode(' ', $raw), $count, $fill);
    };

    // Product of two matrices given as 6-number arrays [a b c d e f]
    // (left operand first). Shared by the cm and Tm operators.
    $multiply = static function (array $a, array $b): array {
        return [
            (float) $a[0] * (float) $b[0] + (float) $a[1] * (float) $b[2],
            (float) $a[0] * (float) $b[1] + (float) $a[1] * (float) $b[3],
            (float) $a[2] * (float) $b[0] + (float) $a[3] * (float) $b[2],
            (float) $a[2] * (float) $b[1] + (float) $a[3] * (float) $b[3],
            (float) $a[4] * (float) $b[0] + (float) $a[5] * (float) $b[2] + (float) $b[4],
            (float) $a[4] * (float) $b[1] + (float) $a[5] * (float) $b[3] + (float) $b[5],
        ];
    };

    // Builds one result row; font data is appended only when requested.
    $makeItem = static function (array $matrix, $text, $currentFontId, $currentFontSize) use ($includeFontInfo): array {
        return $includeFontInfo
            ? [$matrix, $text, $currentFontId, $currentFontSize]
            : [$matrix, $text];
    };

    $extractedTexts = $this->getTextArray();
    $textCount = \count($extractedTexts);
    $extractedData = [];
    foreach ($dataCommands as $command) {
        // If we've used up all the texts from getTextArray(), exit
        // so we aren't accessing non-existent array indices
        // Fixes 'undefined array key' errors in Issues #575, #576
        if ($textCount <= \count($extractedData)) {
            break;
        }
        $currentText = $extractedTexts[\count($extractedData)];
        switch ($command['o']) {
            /*
             * BT
             * Begin a text object, initializing the Tm and Tlm to identity matrix
             */
            case 'BT':
                $Tm = $defaultTm;
                $Tl = $defaultTl;
                $Tx = 0;
                $Ty = 0;
                break;

            /*
             * a b c d e f cm
             * Combine the given matrix with the accumulated one
             */
            case 'cm':
                $concatTm = $multiply($concatTm, $operands($command['c'], 6));
                break;

            /*
             * ET
             * End a text object
             */
            case 'ET':
                break;

            /*
             * text leading TL
             * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
             * Initial value: 0
             */
            case 'TL':
                // scaled text leading
                $Tl = (float) $command['c'] * (float) $Tm[$vSc];
                break;

            /*
             * tx ty Td
             * Move to the start of the next line, offset from the start of the
             * current line by tx, ty.
             */
            case 'Td':
                $coord = $operands($command['c'], 2);
                $Tx += (float) $coord[0] * (float) $Tm[$hSc];
                $Ty += (float) $coord[1] * (float) $Tm[$vSc];
                $Tm[$x] = (string) $Tx;
                $Tm[$y] = (string) $Ty;
                break;

            /*
             * tx ty TD
             * Move to the start of the next line, offset from the start of the
             * current line by tx, ty. As a side effect, this operator sets the leading
             * parameter in the text state. This operator has the same effect as the
             * code:
             * -ty TL
             * tx ty Td
             */
            case 'TD':
                $coord = $operands($command['c'], 2);
                $Tl = -((float) $coord[1] * (float) $Tm[$vSc]);
                $Tx += (float) $coord[0] * (float) $Tm[$hSc];
                $Ty += (float) $coord[1] * (float) $Tm[$vSc];
                $Tm[$x] = (string) $Tx;
                $Tm[$y] = (string) $Ty;
                break;

            /*
             * a b c d e f Tm
             * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
             * all numbers, and the initial value for Tm and Tlm is the identity matrix
             * [1 0 0 1 0 0]
             */
            case 'Tm':
                $Tm = $multiply($operands($command['c'], 6), $concatTm);
                $Tx = (float) $Tm[$x];
                $Ty = (float) $Tm[$y];
                break;

            /*
             * T*
             * Move to the start of the next line. This operator has the same effect
             * as the code:
             * 0 Tl Td
             * Where Tl is the current leading parameter in the text state.
             */
            case 'T*':
                $Ty -= $Tl;
                $Tm[$y] = (string) $Ty;
                break;

            /*
             * string Tj
             * Show a Text String
             *
             * array TJ
             * Show one or more text strings, allowing individual glyph positioning.
             * Each element of array can be a string or a number. If the element is
             * a string, this operator shows the string. If it is a number, the
             * operator adjusts the text position by that amount; that is, it translates
             * the text matrix, Tm. This amount is subtracted from the current
             * horizontal or vertical coordinate, depending on the writing mode.
             * In the default coordinate system, a positive adjustment has the effect
             * of moving the next glyph painted either to the left or down by the given
             * amount.
             */
            case 'Tj':
            case 'TJ':
                $extractedData[] = $makeItem($Tm, $currentText, $fontId, $fontSize);
                break;

            /*
             * string '
             * Move to the next line and show a text string. This operator has the
             * same effect as the code:
             * T*
             * string Tj
             */
            case "'":
                $Ty -= $Tl;
                $Tm[$y] = (string) $Ty;
                // Same row shape as Tj/TJ, including font data when enabled.
                $extractedData[] = $makeItem($Tm, $currentText, $fontId, $fontSize);
                break;

            /*
             * aw ac string "
             * Move to the next line and show a text string, using aw as the word
             * spacing and ac as the character spacing. This operator has the same
             * effect as the code:
             * aw Tw
             * ac Tc
             * string '
             * Tw set the word spacing, Tw, to wordSpace.
             * Tc Set the character spacing, Tc, to charsSpace.
             */
            case '"':
                $parts = explode(' ', $currentText);
                $Ty -= $Tl;
                $Tm[$y] = (string) $Ty;
                // NOTE(review): the third token is taken as the string operand, as
                // before; when it is missing the whole text is used instead of
                // reading an undefined offset. Confirm against getTextArray().
                $extractedData[] = $makeItem($Tm, $parts[2] ?? $currentText, $fontId, $fontSize);
                break;

            case 'Tf':
                /*
                 * From PDF 1.0 specification, page 106:
                 * fontname size Tf Set font and size
                 * Sets the text font and text size in the graphics state. There is no default value for
                 * either fontname or size; they must be selected using Tf before drawing any text.
                 * fontname is a resource name. size is a number expressed in text space units.
                 *
                 * Source: https://ia902503.us.archive.org/10/items/pdfy-0vt8s-egqFwDl7L2/PDF%20Reference%201.0.pdf
                 * Introduced with https://github.com/smalot/pdfparser/pull/516
                 */
                // A Tf without a size operand falls back to the default font size.
                [$fontId, $fontSize] = array_pad(explode(' ', $command['c'], 2), 2, $defaultFontSize);
                break;

            /*
             * q
             * Save current graphics state to stack
             */
            case 'q':
                $graphicsStatesStack[] = $concatTm;
                break;

            /*
             * Q
             * Load last saved graphics state from stack
             */
            case 'Q':
                // An unbalanced Q must not wipe the matrix: array_pop() on an
                // empty stack returns null.
                if ([] !== $graphicsStatesStack) {
                    $concatTm = array_pop($graphicsStatesStack);
                }
                break;

            default:
        }
    }
    $this->dataTm = $extractedData;

    return $extractedData;
}
||||||
| 943 | |||||||
/**
 * Gets the text items located around the given coordinates (X,Y).
 *
 * Every item of the data produced by getDataTm is compared, through the X and
 * Y entries of its Text Matrix, with the requested position. An item is kept
 * when each requested coordinate lies within the matching tolerance.
 *
 * @param float|null $x      the X value to search for; if null, only the Y
 *                           value is considered (same row)
 * @param float|null $y      the Y value to search for; if null, only the X
 *                           value is considered (same column)
 * @param float      $xError the distance below or above $x still counted as "near"
 * @param float      $yError the distance below or above $y still counted as "near"
 *
 * @return array a list of [Tm, text] pairs near the given coordinates; an
 *               empty array when nothing matches or when both $x and $y are null
 */
public function getTextXY(?float $x = null, ?float $y = null, float $xError = 0, float $yError = 0): array
{
    // Make sure the Tm data is available before anything else.
    if (empty($this->dataTm)) {
        $this->getDataTm();
    }

    // Without any coordinate there is nothing to look for.
    if (null === $x && null === $y) {
        return [];
    }

    // A coordinate that was not requested (null) matches everything.
    $isNear = static function (float $value, ?float $target, float $tolerance): bool {
        if (null === $target) {
            return true;
        }

        return $value >= ($target - $tolerance) && $value <= ($target + $tolerance);
    };

    $found = [];
    foreach ($this->dataTm as $entry) {
        $matrix = $entry[0];
        $content = $entry[1];

        $nearX = $isNear((float) $matrix[4], $x, $xError);
        $nearY = $isNear((float) $matrix[5], $y, $yError);

        if ($nearX && $nearY) {
            $found[] = [$matrix, $content];
        }
    }

    return $found;
}
||||||
| 1014 | } |
||||||
| 1015 |
This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.
This is most likely a typographical error or the method has been renamed.