Passed
Push — master ( 2939df...ddf03e )
by Konrad
02:55
created

Page::getPDFObjectForFpdf()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 3
nc 1
nop 0
dl 0
loc 6
ccs 4
cts 4
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <sebastien@malot.fr>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <sebastien@malot.fr>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementMissing;
37
use Smalot\PdfParser\Element\ElementNull;
38
use Smalot\PdfParser\Element\ElementXRef;
39
40
class Page extends PDFObject
41
{
42
    /**
43
     * @var Font[]
44
     */
45
    protected $fonts;
46
47
    /**
48
     * @var PDFObject[]
49
     */
50
    protected $xobjects;
51
52
    /**
53
     * @var array
54
     */
55
    protected $dataTm;
56
57
    /**
     * Returns the fonts declared under the 'Font' entry of this page's Resources.
     *
     * Every font is registered under its original resource name and, when the
     * name contains at least one of the characters 0-9 . - _, also under the
     * name reduced to those characters. Once a table has been built it is
     * cached on the instance; the empty fallbacks are not cached.
     *
     * @return Font[] fonts keyed by resource name, or an empty array when the
     *                page has no usable 'Font' resource
     */
    public function getFonts()
    {
        if (null !== $this->fonts) {
            return $this->fonts;
        }

        $resources = $this->get('Resources');
        if (!method_exists($resources, 'has') || !$resources->has('Font')) {
            return [];
        }

        $fontEntry = $resources->get('Font');
        if ($fontEntry instanceof ElementMissing) {
            return [];
        }

        // The entry is either a Header itself or an object carrying one.
        $candidates = $fontEntry instanceof Header
            ? $fontEntry->getElements()
            : $fontEntry->getHeader()->getElements();

        $fontsByName = [];
        foreach ($candidates as $name => $candidate) {
            if (!$candidate instanceof Font) {
                continue;
            }

            $fontsByName[$name] = $candidate;

            // Also index the font under its name stripped down to digits, '.', '-' and '_'.
            $strippedName = preg_replace('/[^0-9\.\-_]/', '', $name);
            if ('' != $strippedName) {
                $fontsByName[$strippedName] = $candidate;
            }
        }

        $this->fonts = $fontsByName;

        return $fontsByName;
    }
98
99 46
    /**
     * Looks up one of this page's fonts by its resource name.
     *
     * The unfiltered name is tried first. According to the PDF specification
     * (PDF 32000-1:2008, page 238) "the font resource name presented to the Tf
     * operator is arbitrary, as are the names for all kinds of resources", so
     * the exact name must win. Only as a fallback is the name reduced to the
     * characters 0-9 . - _ and looked up again.
     *
     * @param string $id font resource name
     *
     * @return Font|null the matching font, or null when neither lookup succeeds
     */
    public function getFont(string $id): ?Font
    {
        $fonts = $this->getFonts();

        if (isset($fonts[$id])) {
            return $fonts[$id];
        }

        // Fallback: retry with the cleaned name (getFonts() registers fonts under it too).
        $cleanedId = preg_replace('/[^0-9\.\-_]/', '', $id);

        return $fonts[$cleanedId] ?? null;
    }
122
123
    /**
     * Returns the XObjects declared under the 'XObject' entry of this page's Resources.
     *
     * Every XObject is registered under its original resource name and, when
     * the name contains at least one of the characters 0-9 . - _, also under
     * the name reduced to those characters. Once a table has been built it is
     * cached on the instance; the empty fallbacks are not cached.
     *
     * @return PDFObject[] XObjects keyed by resource name, or an empty array
     *                     when the page has no usable 'XObject' resource
     */
    public function getXObjects()
    {
        if (null !== $this->xobjects) {
            return $this->xobjects;
        }

        $resources = $this->get('Resources');
        if (!method_exists($resources, 'has') || !$resources->has('XObject')) {
            return [];
        }

        $xobjectEntry = $resources->get('XObject');

        // Same guard getFonts() applies to its 'Font' entry: an entry resolving to
        // ElementMissing is treated as "no XObjects" instead of being dereferenced.
        if ($xobjectEntry instanceof ElementMissing) {
            return [];
        }

        // The entry is either a Header itself or an object carrying one.
        $xobjects = $xobjectEntry instanceof Header
            ? $xobjectEntry->getElements()
            : $xobjectEntry->getHeader()->getElements();

        $table = [];
        foreach ($xobjects as $id => $xobject) {
            $table[$id] = $xobject;

            // Also index the XObject under its name stripped down to digits, '.', '-' and '_'.
            $cleanedId = preg_replace('/[^0-9\.\-_]/', '', $id);
            if ('' != $cleanedId) {
                $table[$cleanedId] = $xobject;
            }
        }

        return $this->xobjects = $table;
    }
160
161 15
    /**
     * Looks up one of this page's XObjects by its resource name.
     *
     * Only the name as given is tried; no cleaned-name fallback is applied here.
     *
     * @param string $id XObject resource name
     *
     * @return PDFObject|null the matching XObject, or null when there is none
     */
    public function getXObject(string $id): ?PDFObject
    {
        $available = $this->getXObjects();

        return $available[$id] ?? null;
    }
178
179 34
    /**
     * Extracts the text of this page from its 'Contents' entry.
     *
     * When 'Contents' is a PDFObject whose header elements are numerically
     * keyed, or an ElementArray, the individual contents are concatenated into
     * one virtual PDFObject before the text is extracted.
     *
     * @param self|null $page not used by this implementation; the page handed
     *                        to the contents object is always $this
     *
     * @return string the extracted text, or '' when the page has no usable contents
     */
    public function getText(?self $page = null): string
    {
        $contents = $this->get('Contents');

        if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return '';
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            if (is_numeric(key($elements))) {
                $new_content = '';

                foreach ($elements as $element) {
                    if ($element instanceof ElementXRef) {
                        $new_content .= $element->getObject()->getContent();
                    } else {
                        $new_content .= $element->getContent();
                    }
                }

                $header = new Header([], $this->document);
                $contents = new PDFObject($this->document, $header, $new_content, $this->config);
            }
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content.
            $new_content = '';

            foreach ($contents->getContent() as $content) {
                $new_content .= $content->getContent()."\n";
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        }

        /*
         * Elements referencing each other on the same page can cause endless loops during text parsing.
         * To combat this we keep a recursionStack containing already parsed elements on the page.
         * The stack is emptied here after getting text from a page; the finally block makes sure
         * this also happens when parsing throws, so a failed page cannot leak entries into the next one.
         */
        try {
            return $contents->getText($this);
        } finally {
            PDFObject::$recursionStack = [];
        }
    }
228
229
    /**
230
     * Return true if the current page is a (setasign\Fpdi\Fpdi) FPDI/FPDF document
231
     *
232
     * The metadata 'Producer' should have the value of "FPDF" . FPDF_VERSION if the
233
     * pdf file was generated by FPDF/Fpfi.
234
     *
235
     * @return bool true is the current page is a FPDI/FPDF document
236
     */
237 13
    public function isFpdf(): bool
238
    {
239 13
        if (\array_key_exists('Producer', $this->document->getDetails())
0 ignored issues
show
Bug introduced by
The method getDetails() does not exist on null. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

239
        if (\array_key_exists('Producer', $this->document->/** @scrutinizer ignore-call */ getDetails())

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
240 13
            && \is_string($this->document->getDetails()['Producer'])
241 13
            && 0 === strncmp($this->document->getDetails()['Producer'], 'FPDF', 4)) {
242 2
            return true;
243
        }
244
245 12
        return false;
246
    }
247
248
    /**
     * Returns the zero-based position of this page within the document's pages.
     *
     * The pages are scanned by consecutive integer index starting at 0. If this
     * page is not among them, the number of pages is returned.
     *
     * @return int the index of this page, or the page count when it is not found
     */
    public function getPageNumber(): int
    {
        $pages = $this->document->getPages();
        $total = \count($pages);

        $index = 0;
        while ($index < $total && $pages[$index] !== $this) {
            ++$index;
        }

        return $index;
    }
265
266
    /**
     * Returns the XObject of this page that is registered under the page's number.
     *
     * Intended for documents generated by FPDF/FPDI: the value of
     * getPageNumber() is used as the key into getXObjects().
     *
     * @return PDFObject the XObject stored under this page's number
     */
    public function getPDFObjectForFpdf(): PDFObject
    {
        $pageIndex = $this->getPageNumber();

        return $this->getXObjects()[$pageIndex];
    }
281
282
    /**
     * Builds a new PDFObject from this page's FPDF/FPDI XObject.
     *
     * The new object reuses the document, header, content and config of the
     * object returned by getPDFObjectForFpdf().
     *
     * @return PDFObject a fresh PDFObject carrying the same data as the source object
     */
    public function createPDFObjectForFpdf(): PDFObject
    {
        $source = $this->getPDFObjectForFpdf();

        return new PDFObject(
            $source->document,
            $source->getHeader(),
            $source->getContent(),
            $source->config
        );
    }
299
300
    /**
     * Builds a new Page from this page's FPDF/FPDI XObject.
     *
     * The new page reuses the document, header, content and config of the
     * object returned by getPDFObjectForFpdf().
     *
     * @return Page a fresh Page carrying the same data as the source object
     */
    public function createPageForFpdf(): self
    {
        $source = $this->getPDFObjectForFpdf();

        return new self(
            $source->document,
            $source->getHeader(),
            $source->getContent(),
            $source->config
        );
    }
314
315 8
    /**
     * Extracts the text of this page as an array of text fragments.
     *
     * For FPDF/FPDI documents the text is taken from a copy of the page's
     * XObject. Otherwise it is taken from the 'Contents' entry; when that is a
     * PDFObject whose header elements are numerically keyed, or an
     * ElementArray, the individual contents are first concatenated into one
     * virtual PDFObject.
     *
     * @param self|null $page not used by this implementation
     *
     * @return array the text fragments, or an empty array when the page has no usable contents
     */
    public function getTextArray(?self $page = null): array
    {
        if ($this->isFpdf()) {
            $pdfObject = $this->getPDFObjectForFpdf();
            $newPdfObject = $this->createPDFObjectForFpdf();

            return $newPdfObject->getTextArray($pdfObject);
        }

        $contents = $this->get('Contents');

        if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return [];
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            if (is_numeric(key($elements))) {
                $new_content = '';

                foreach ($elements as $element) {
                    if ($element instanceof ElementXRef) {
                        $new_content .= $element->getObject()->getContent();
                    } else {
                        $new_content .= $element->getContent();
                    }
                }

                $header = new Header([], $this->document);
                $contents = new PDFObject($this->document, $header, $new_content, $this->config);
            } else {
                // Return the first successful extraction directly instead of
                // discarding it and parsing the same contents a second time.
                try {
                    return $contents->getTextArray($this);
                } catch (\Throwable $e) {
                    // Deliberate best-effort fallback: retry without the page context.
                    return $contents->getTextArray();
                }
            }
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content.
            $new_content = '';

            foreach ($contents->getContent() as $content) {
                $new_content .= $content->getContent()."\n";
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        }

        return $contents->getTextArray($this);
    }
371
372
    /**
     * Collects every command found in the content of this page, undecoded.
     *
     * When the content of the 'Contents' entry is an array, the contents of its
     * sections are concatenated and parsed by this page. Otherwise the
     * 'Contents' object itself (or, for FPDF/FPDI documents, the page's
     * XObject) parses its own content.
     *
     * @return array the commands of all sections, in order of appearance
     */
    public function extractRawData(): array
    {
        $content = $this->get('Contents');
        $values = $content->getContent();

        if (\is_array($values)) {
            // Several content sections: glue them together and let the page parse them.
            $text = '';
            foreach ($values as $section) {
                $text .= $section->getContent();
            }
            $parser = $this;
        } else {
            $parser = $this->isFpdf() ? $this->getPDFObjectForFpdf() : $content;
            $text = $parser->getContent();
        }

        $extractedData = [];
        foreach ($parser->getSectionsText($text) as $sectionText) {
            foreach ($parser->getCommandsText($sectionText) as $command) {
                $extractedData[] = $command;
            }
        }

        return $extractedData;
    }
412
413
    /**
     * Returns the commands of the page with the text of the text-showing
     * operators (Tj, TJ) decoded using the font that is current at that point.
     *
     * Each command is read through its 'o' (operator) and 'c' (operand) keys.
     * The current font follows the Tf/TF operators and is saved and restored by
     * the q / Q operators. Since q and Q may be nested, the saved fonts are
     * kept on a stack.
     *
     * @param array|null $extractedRawData the data returned by extractRawData();
     *                                     when null or empty, extractRawData() is called
     *
     * @return array the commands, with decoded text for the Tj and TJ operators
     */
    public function extractDecodedRawData(?array $extractedRawData = null): array
    {
        if (empty($extractedRawData)) {
            $extractedRawData = $this->extractRawData();
        }

        // If the document is a FPDI/FPDF one, the page rebuilt from its XObject has the correct fonts.
        $fontSource = $this->isFpdf() ? $this->createPageForFpdf() : $this;

        // Shared decoding pipeline for one text operand: octal escapes, backslash
        // escapes, ISO-8859-1 to UTF-8, then the font's own content decoding.
        $decode = static function ($rawText, ?Font $font) {
            $text = null !== $font ? $font->decodeOctal($rawText) : $rawText;
            $text = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", ' '],
                $text
            );
            $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');

            return null !== $font ? $font->decodeContent($text) : $text;
        };

        $currentFont = null;
        $savedFonts = [];

        foreach ($extractedRawData as &$command) {
            $operator = $command['o'];

            if ('Tj' === $operator || 'TJ' === $operator) {
                $operand = $command['c'];

                if (!\is_array($operand)) {
                    // NOTE(review): without a current font a plain string operand is
                    // replaced by '' (unlike array operands, which keep their raw
                    // text) - behaviour kept as is, confirm it is intended.
                    $command['c'] = null === $currentFont ? '' : $decode($operand, $currentFont);
                    continue;
                }

                // Only the entries at even positions are decoded.
                // NOTE(review): presumably odd positions hold spacing adjustments - confirm.
                $count = \count($operand);
                for ($i = 0; $i < $count; $i += 2) {
                    $command['c'][$i]['c'] = $decode($operand[$i]['c'], $currentFont);
                }
            } elseif ('Tf' === $operator || 'TF' === $operator) {
                $fontId = explode(' ', $command['c'])[0];
                $currentFont = $fontSource->getFont($fontId);
            } elseif ('q' === $operator) {
                // Save the current font; nested q operators stack up.
                $savedFonts[] = $currentFont;
            } elseif ('Q' === $operator) {
                // Restore the most recently saved font (null when nothing was saved).
                $currentFont = array_pop($savedFonts);
            }
        }
        // Break the reference left behind by the by-reference foreach.
        unset($command);

        return $extractedRawData;
    }
488
489
    /**
490
     * Gets just the Text commands that are involved in text positions and
491
     * Text Matrix (Tm)
492
     *
493
     * It extract just the PDF commands that are involved with text positions, and
494
     * the Text Matrix (Tm). These are: BT, ET, TL, Td, TD, Tm, T*, Tj, ', ", and TJ
495
     *
496
     * @param array $extractedDecodedRawData The data extracted by extractDecodeRawData.
497
     *                                       If it is null, the method extractDecodeRawData is called.
498
     *
499
     * @return array An array with the text command of the page
500
     */
501 9
    public function getDataCommands(?array $extractedDecodedRawData = null): array
502
    {
503 9
        if (!isset($extractedDecodedRawData) || !$extractedDecodedRawData) {
0 ignored issues
show
Bug Best Practice introduced by
The expression $extractedDecodedRawData of type array is implicitly converted to a boolean; are you sure this is intended? If so, consider using empty($expr) instead to make it clear that you intend to check for an array without elements.

This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.

Consider making the comparison explicit by using empty(..) or ! empty(...) instead.

Loading history...
504 9
            $extractedDecodedRawData = $this->extractDecodedRawData();
505
        }
506 9
        $extractedData = [];
507 9
        foreach ($extractedDecodedRawData as $command) {
508 9
            switch ($command['o']) {
509
                /*
510
                 * BT
511
                 * Begin a text object, inicializind the Tm and Tlm to identity matrix
512
                 */
513 9
                case 'BT':
514 9
                    $extractedData[] = $command;
515 9
                    break;
516
517
                    /*
518
                     * ET
519
                     * End a text object, discarding the text matrix
520
                     */
521 9
                case 'ET':
522 9
                    $extractedData[] = $command;
523 9
                    break;
524
525
                    /*
526
                     * leading TL
527
                     * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
528
                     * Initial value: 0
529
                     */
530 9
                case 'TL':
531 5
                    $extractedData[] = $command;
532 5
                    break;
533
534
                    /*
535
                     * tx ty Td
536
                     * Move to the start of the next line, offset form the start of the
537
                     * current line by tx, ty.
538
                     */
539 9
                case 'Td':
540 9
                    $extractedData[] = $command;
541 9
                    break;
542
543
                    /*
544
                     * tx ty TD
545
                     * Move to the start of the next line, offset form the start of the
546
                     * current line by tx, ty. As a side effect, this operator set the leading
547
                     * parameter in the text state. This operator has the same effect as the
548
                     * code:
549
                     * -ty TL
550
                     * tx ty Td
551
                     */
552 9
                case 'TD':
553
                    $extractedData[] = $command;
554
                    break;
555
556
                    /*
557
                     * a b c d e f Tm
558
                     * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
559
                     * all numbers, and the initial value for Tm and Tlm is the identity matrix
560
                     * [1 0 0 1 0 0]
561
                     */
562 9
                case 'Tm':
563 7
                    $extractedData[] = $command;
564 7
                    break;
565
566
                    /*
567
                     * T*
568
                     * Move to the start of the next line. This operator has the same effect
569
                     * as the code:
570
                     * 0 Tl Td
571
                     * Where Tl is the current leading parameter in the text state.
572
                     */
573 9
                case 'T*':
574 5
                    $extractedData[] = $command;
575 5
                    break;
576
577
                    /*
578
                     * string Tj
579
                     * Show a Text String
580
                     */
581 9
                case 'Tj':
582 8
                    $extractedData[] = $command;
583 8
                    break;
584
585
                    /*
586
                     * string '
587
                     * Move to the next line and show a text string. This operator has the
588
                     * same effect as the code:
589
                     * T*
590
                     * string Tj
591
                     */
592 9
                case "'":
593
                    $extractedData[] = $command;
594
                    break;
595
596
                    /*
597
                     * aw ac string "
598
                     * Move to the next lkine and show a text string, using aw as the word
599
                     * spacing and ac as the character spacing. This operator has the same
600
                     * effect as the code:
601
                     * aw Tw
602
                     * ac Tc
603
                     * string '
604
                     * Tw set the word spacing, Tw, to wordSpace.
605
                     * Tc Set the character spacing, Tc, to charsSpace.
606
                     */
607 9
                case '"':
608
                    $extractedData[] = $command;
609
                    break;
610
611 9
                case 'Tf':
612 9
                case 'TF':
613 9
                    $extractedData[] = $command;
614 9
                    break;
615
616
                    /*
617
                     * array TJ
618
                     * Show one or more text strings allow individual glyph positioning.
619
                     * Each element of array can be a string or a number. If the element is
620
                     * a string, this operator shows the string. If it is a number, the
621
                     * operator adjust the text position by that amount; that is, it translates
622
                     * the text matrix, Tm. This amount is subtracted from the current
623
                     * horizontal or vertical coordinate, depending on the writing mode.
624
                     * in the default coordinate system, a positive adjustment has the effect
625
                     * of moving the next glyph painted either to the left or down by the given
626
                     * amount.
627
                     */
628 9
                case 'TJ':
629 9
                    $extractedData[] = $command;
630 9
                    break;
631
                default:
632
            }
633
        }
634
635 9
        return $extractedData;
636
    }
637
638
    /**
     * Gets the text matrix (Tm) of every text shown in the page.
     *
     * Returns a list where every item is an array whose first element is the
     * text matrix in effect when the text was shown and whose second element is
     * the text itself. The text matrix is normally an array of 6 numeric
     * strings: the first 4 describe scaling, rotation and skew of the text, the
     * last 2 are its X and Y coordinates.
     *
     * When the config option getDataTmFontInfoHasToBeIncluded() is enabled, the
     * font id and the font size are appended to every item as third and fourth
     * element.
     *
     * The result is also stored in $this->dataTm.
     *
     * @param array|null $dataCommands the data extracted by getDataCommands();
     *                                 if null or empty, getDataCommands() is called
     *
     * @return array an array with the data of the page including the Tm information
     *               of any text in the page
     */
    public function getDataTm(?array $dataCommands = null): array
    {
        if (empty($dataCommands)) {
            $dataCommands = $this->getDataCommands();
        }

        /*
         * At the beginning of a text object Tm is the identity matrix
         */
        $defaultTm = ['1', '0', '0', '1', '0', '0'];

        /*
         * Default text leading, used by the T*, ' and " operators
         */
        $defaultTl = 0;

        /*
         * Default values for font data
         */
        $defaultFontId = -1;
        $defaultFontSize = 1;

        /*
         * Indexes of horizontal/vertical scaling and X,Y-coordinates in the matrix (Tm).
         * For the vertical scaling index see:
         * https://github.com/smalot/pdfparser/pull/559#discussion_r1053415500
         */
        $hSc = 0;
        $vSc = 3;
        $x = 4;
        $y = 5;

        /*
         * x,y-coordinates of text space origin in user units
         *
         * These will be assigned the value of the currently printed string
         */
        $Tx = 0;
        $Ty = 0;

        $Tm = $defaultTm;
        $Tl = $defaultTl;
        $fontId = $defaultFontId;
        $fontSize = $defaultFontSize; // reflects the font size set by Tf

        // The config may be missing (null); in that case no font info is added.
        // The flag does not change while iterating, so it is read only once.
        $includeFontInfo = null !== $this->config
            && $this->config->getDataTmFontInfoHasToBeIncluded();

        $extractedTexts = $this->getTextArray();
        $textCount = \count($extractedTexts);
        $extractedData = [];

        foreach ($dataCommands as $command) {
            // If we've used up all the texts from getTextArray(), exit
            // so we aren't accessing non-existent array indices
            // Fixes 'undefined array key' errors in Issues #575, #576
            if ($textCount <= \count($extractedData)) {
                break;
            }
            $currentText = $extractedTexts[\count($extractedData)];

            // Set by the text-showing operators (Tj, TJ, ' and "); the item is
            // appended after the switch so all of them share the same format.
            $showsText = false;
            $shownText = $currentText;

            switch ($command['o']) {
                /*
                 * BT
                 * Begin a text object, initializing the Tm and Tlm to identity matrix
                 */
                case 'BT':
                    $Tm = $defaultTm;
                    $Tl = $defaultTl;
                    $Tx = 0;
                    $Ty = 0;
                    break;

                    /*
                     * ET
                     * End a text object
                     */
                case 'ET':
                    break;

                    /*
                     * text leading TL
                     * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
                     * Initial value: 0
                     */
                case 'TL':
                    // scaled text leading
                    $Tl = (float) $command['c'] * (float) $Tm[$vSc];
                    break;

                    /*
                     * tx ty Td
                     * Move to the start of the next line, offset from the start of the
                     * current line by tx, ty.
                     */
                case 'Td':
                    $coord = explode(' ', $command['c']);
                    // a missing ty operand is treated as 0 instead of raising a warning
                    $Tx += (float) $coord[0] * (float) $Tm[$hSc];
                    $Ty += (float) ($coord[1] ?? 0) * (float) $Tm[$vSc];
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                    /*
                     * tx ty TD
                     * Move to the start of the next line, offset from the start of the
                     * current line by tx, ty. As a side effect, this operator sets the leading
                     * parameter in the text state. This operator has the same effect as the
                     * code:
                     * -ty TL
                     * tx ty Td
                     */
                case 'TD':
                    $coord = explode(' ', $command['c']);
                    $scaledTy = (float) ($coord[1] ?? 0) * (float) $Tm[$vSc];
                    $Tl = -$scaledTy;
                    $Tx += (float) $coord[0] * (float) $Tm[$hSc];
                    $Ty += $scaledTy;
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                    /*
                     * a b c d e f Tm
                     * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
                     * all numbers, and the initial value for Tm and Tlm is the identity matrix
                     * [1 0 0 1 0 0]
                     */
                case 'Tm':
                    $Tm = explode(' ', $command['c']);
                    $Tx = (float) $Tm[$x];
                    $Ty = (float) $Tm[$y];
                    break;

                    /*
                     * T*
                     * Move to the start of the next line. This operator has the same effect
                     * as the code:
                     * 0 Tl Td
                     * Where Tl is the current leading parameter in the text state.
                     */
                case 'T*':
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    break;

                    /*
                     * string Tj
                     * Show a text string
                     *
                     * array TJ
                     * Show one or more text strings, allowing individual glyph positioning.
                     * Each element of array can be a string or a number. If the element is
                     * a string, this operator shows the string. If it is a number, the
                     * operator adjusts the text position by that amount; that is, it translates
                     * the text matrix, Tm. This amount is subtracted from the current
                     * horizontal or vertical coordinate, depending on the writing mode.
                     * In the default coordinate system, a positive adjustment has the effect
                     * of moving the next glyph painted either to the left or down by the given
                     * amount.
                     */
                case 'Tj':
                case 'TJ':
                    $showsText = true;
                    break;

                    /*
                     * string '
                     * Move to the next line and show a text string. This operator has the
                     * same effect as the code:
                     * T*
                     * string Tj
                     */
                case "'":
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    $showsText = true;
                    break;

                    /*
                     * aw ac string "
                     * Move to the next line and show a text string, using aw as the word
                     * spacing and ac as the character spacing. This operator has the same
                     * effect as the code:
                     * aw Tw
                     * ac Tc
                     * string '
                     * Tw sets the word spacing, Tw, to wordSpace.
                     * Tc sets the character spacing, Tc, to charsSpace.
                     */
                case '"':
                    // NOTE(review): presumably the text has the form "aw ac string";
                    // verify against getTextArray(). The limit keeps a string that
                    // itself contains spaces in one piece, and a text with fewer
                    // than 3 parts is used as is instead of reading a missing index.
                    $parts = explode(' ', $currentText, 3);
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    $shownText = $parts[2] ?? $currentText;
                    $showsText = true;
                    break;

                case 'Tf':
                    /*
                     * From PDF 1.0 specification, page 106:
                     *     fontname size Tf Set font and size
                     *     Sets the text font and text size in the graphics state. There is no default value for
                     *     either fontname or size; they must be selected using Tf before drawing any text.
                     *     fontname is a resource name. size is a number expressed in text space units.
                     *
                     * Source: https://ia902503.us.archive.org/10/items/pdfy-0vt8s-egqFwDl7L2/PDF%20Reference%201.0.pdf
                     * Introduced with https://github.com/smalot/pdfparser/pull/516
                     */
                    $fontParts = explode(' ', $command['c'], 2);
                    $fontId = $fontParts[0];
                    // keep the size currently in effect if the size operand is missing
                    $fontSize = $fontParts[1] ?? $fontSize;
                    break;

                default:
            }

            if ($showsText) {
                $data = [$Tm, $shownText];
                if ($includeFontInfo) {
                    $data[] = $fontId;
                    $data[] = $fontSize;
                }
                $extractedData[] = $data;
            }
        }
        $this->dataTm = $extractedData;

        return $extractedData;
    }
878
879
    /**
     * Gets the text data located at or near the given coordinates (X,Y).
     *
     * Every item stored in $this->dataTm whose text matrix position (elements 4
     * and 5 of the matrix) lies within the given tolerance of the requested
     * coordinate(s) is returned. If $this->dataTm is not set or empty,
     * getDataTm() is called first. The data returned by getDataTm() can be used
     * to find out the coordinates of a given text.
     *
     * @param float|null $x      The X value of the coordinate to search for. If null,
     *                           just the Y value is considered (same row)
     * @param float|null $y      The Y value of the coordinate to search for. If null,
     *                           just the X value is considered (same column)
     * @param float      $xError The value less or more to consider an X to be "near"
     * @param float      $yError The value less or more to consider an Y to be "near"
     *
     * @return array A list of [text matrix, text] pairs that are near the given
     *               coordinates. If no text is "near" the x,y coordinate, or if
     *               both x and y are null, an empty array is returned.
     */
    public function getTextXY(?float $x = null, ?float $y = null, float $xError = 0, float $yError = 0): array
    {
        if (empty($this->dataTm)) {
            $this->getDataTm();
        }

        // Without any coordinate there is nothing to search for.
        if (null === $x && null === $y) {
            return [];
        }

        $nearby = [];
        foreach ($this->dataTm as $entry) {
            $matrix = $entry[0];
            $text = $entry[1];
            $posX = (float) $matrix[4];
            $posY = (float) $matrix[5];

            // An axis without a requested coordinate never excludes an entry.
            $xMatches = null === $x
                || ($posX >= ($x - $xError) && $posX <= ($x + $xError));
            $yMatches = null === $y
                || ($posY >= ($y - $yError) && $posY <= ($y + $yError));

            if ($xMatches && $yMatches) {
                $nearby[] = [$matrix, $text];
            }
        }

        return $nearby;
    }
949
}
950