Passed
Pull Request — master (#698)
by
unknown
02:57
created

Page::setFonts()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 2
c 0
b 0
f 0
nc 2
nop 1
dl 0
loc 4
ccs 3
cts 3
cp 1
crap 2
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementMissing;
37
use Smalot\PdfParser\Element\ElementNull;
38
use Smalot\PdfParser\Element\ElementXRef;
39
40
class Page extends PDFObject
41
{
42
    /**
43
     * @var Font[]
44
     */
45
    protected $fonts;
46
47
    /**
48
     * @var PDFObject[]
49
     */
50
    protected $xobjects;
51
52
    /**
53
     * @var array
54
     */
55
    protected $dataTm;
56
57 8
    /**
     * Assign the font table of this page.
     *
     * The given table is only stored while the page has no (non-empty) font
     * table yet; an already populated table is left untouched.
     *
     * @param Font[] $fonts font table to store
     */
    public function setFonts($fonts)
    {
        if (!empty($this->fonts)) {
            return;
        }

        $this->fonts = $fonts;
    }
63
64
    /**
     * Return the fonts of this page, indexed by resource name.
     *
     * The table is built from the 'Font' entry of the page 'Resources' on the
     * first successful call and cached afterwards. Every font is stored under
     * its original resource name and, additionally, under a cleaned name that
     * only keeps digits, '.', '-' and '_' (when that cleaned name is not empty).
     *
     * @return Font[]
     */
    public function getFonts()
    {
        if (null !== $this->fonts) {
            return $this->fonts;
        }

        $resources = $this->get('Resources');

        // No usable resources or no 'Font' entry: nothing to report (and nothing cached).
        if (!method_exists($resources, 'has') || !$resources->has('Font')) {
            return [];
        }

        $fontEntry = $resources->get('Font');

        if ($fontEntry instanceof ElementMissing) {
            return [];
        }

        // The entry is either a Header itself or an object carrying one.
        $elements = $fontEntry instanceof Header
            ? $fontEntry->getElements()
            : $fontEntry->getHeader()->getElements();

        $fontsByName = [];

        foreach ($elements as $name => $candidate) {
            if (!$candidate instanceof Font) {
                continue;
            }

            $fontsByName[$name] = $candidate;

            // Also register the font under its cleaned name.
            $cleanedName = preg_replace('/[^0-9\.\-_]/', '', $name);
            if ('' != $cleanedName) {
                $fontsByName[$cleanedName] = $candidate;
            }
        }

        $this->fonts = $fontsByName;

        return $fontsByName;
    }
105
106 47
    /**
     * Look up a font of this page by its resource name.
     *
     * According to the PDF specs (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 238)
     * "The font resource name presented to the Tf operator is arbitrary, as are the names for all kinds of resources".
     * The unfiltered name is therefore tried first. Only when it is unknown, the
     * name is reduced to digits, '.', '-' and '_' and looked up again as a fallback.
     *
     * @param string $id font resource name
     *
     * @return Font|null the matching font, or null when neither lookup succeeds
     */
    public function getFont(string $id): ?Font
    {
        $fonts = $this->getFonts();

        // Exact (unfiltered) resource name.
        if (isset($fonts[$id])) {
            return $fonts[$id];
        }

        // Fallback: cleaned resource name, as also registered by getFonts().
        $cleanedId = preg_replace('/[^0-9\.\-_]/', '', $id);

        return $fonts[$cleanedId] ?? null;
    }
129
130
    /**
     * Return the XObjects of this page, indexed by resource name.
     *
     * The table is built from the 'XObject' entry of the page 'Resources' on
     * the first successful call and cached afterwards. Every object is stored
     * under its original resource name and, additionally, under a cleaned name
     * that only keeps digits, '.', '-' and '_' (when that name is not empty).
     *
     * @return PDFObject[]
     */
    public function getXObjects()
    {
        if (null !== $this->xobjects) {
            return $this->xobjects;
        }

        $resources = $this->get('Resources');

        // No usable resources or no 'XObject' entry: nothing to report (and nothing cached).
        if (!method_exists($resources, 'has') || !$resources->has('XObject')) {
            return [];
        }

        $xobjectEntry = $resources->get('XObject');

        // The entry is either a Header itself or an object carrying one.
        $elements = $xobjectEntry instanceof Header
            ? $xobjectEntry->getElements()
            : $xobjectEntry->getHeader()->getElements();

        $xobjectsByName = [];

        foreach ($elements as $name => $element) {
            $xobjectsByName[$name] = $element;

            // Also register the object under its cleaned name.
            $cleanedName = preg_replace('/[^0-9\.\-_]/', '', $name);
            if ('' != $cleanedName) {
                $xobjectsByName[$cleanedName] = $element;
            }
        }

        $this->xobjects = $xobjectsByName;

        return $xobjectsByName;
    }
167
168 15
    /**
     * Look up an XObject of this page by its exact resource name.
     *
     * The name is not cleaned here; only keys present in the table returned
     * by getXObjects() can match.
     *
     * @param string $id XObject resource name
     *
     * @return PDFObject|null the matching object, or null when the name is unknown
     */
    public function getXObject(string $id): ?PDFObject
    {
        return $this->getXObjects()[$id] ?? null;
    }
185
186 35
    /**
     * Return the text of this page.
     *
     * The 'Contents' entry is normalised first: a PDFObject whose header
     * elements are numerically keyed, or an ElementArray, is flattened into a
     * single virtual PDFObject holding the concatenated content. The text is
     * then extracted from the resulting object.
     *
     * @param self|null $page not used by this method
     *
     * @return string the page text, or '' when there is no usable 'Contents'
     */
    public function getText(?self $page = null): string
    {
        $contents = $this->get('Contents');

        if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return '';
        }

        // Stays null unless the contents have to be merged into a virtual object.
        $mergedContent = null;

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            if (is_numeric(key($elements))) {
                $mergedContent = '';

                foreach ($elements as $element) {
                    // Cross references are resolved to the object they point to.
                    $holder = $element instanceof ElementXRef ? $element->getObject() : $element;
                    $mergedContent .= $holder->getContent();
                }
            }
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content.
            $mergedContent = '';

            foreach ($contents->getContent() as $part) {
                $mergedContent .= $part->getContent()."\n";
            }
        }

        if (null !== $mergedContent) {
            $virtualHeader = new Header([], $this->document);
            $contents = new PDFObject($this->document, $virtualHeader, $mergedContent, $this->config);
        }

        /*
         * Elements referencing each other on the same page can cause endless loops during text parsing.
         * To combat this we keep a recursionStack containing already parsed elements on the page.
         * The stack is only emptied here after getting text from a page.
         */
        $pageText = $contents->getText($this);
        PDFObject::$recursionStack = [];

        return $pageText;
    }
235
236
    /**
     * Tell whether the document of this page looks like a FPDF/FPDI
     * (setasign\Fpdi\Fpdi) document.
     *
     * The decision is based on the 'Producer' entry of the document details:
     * it has to exist, be a string and start with "FPDF".
     *
     * @return bool true if the 'Producer' detail starts with "FPDF"
     */
    public function isFpdf(): bool
    {
        $details = $this->document->getDetails();

        return \array_key_exists('Producer', $details)
            && \is_string($details['Producer'])
            && 0 === strncmp($details['Producer'], 'FPDF', 4);
    }
254
255
    /**
     * Return the position of this page within the pages of its document.
     *
     * The position is the zero-based index at which this very instance is
     * found in the document's page list. When the instance is not found, the
     * number of pages is returned.
     *
     * @return int the zero-based page index
     */
    public function getPageNumber(): int
    {
        $pages = $this->document->getPages();
        $pageCount = \count($pages);

        $index = 0;
        while ($index < $pageCount && $pages[$index] !== $this) {
            ++$index;
        }

        return $index;
    }
272
273
    /**
     * Return the XObject that belongs to this page in a FPDF/FPDI document.
     *
     * The object is taken from the XObject table of the page, using the page
     * number (see getPageNumber()) as key.
     *
     * @return PDFObject The PDFObject for the page
     */
    public function getPDFObjectForFpdf(): PDFObject
    {
        $pageIndex = $this->getPageNumber();

        return $this->getXObjects()[$pageIndex];
    }
288
289
    /**
     * Create a new PDFObject from the FPDF/FPDI object of this page.
     *
     * The new object reuses the document, header, content and config of the
     * object returned by getPDFObjectForFpdf().
     *
     * @return PDFObject The PDFObject
     */
    public function createPDFObjectForFpdf(): PDFObject
    {
        $source = $this->getPDFObjectForFpdf();
        $content = $source->getContent();
        $header = $source->getHeader();

        return new PDFObject($source->document, $header, $content, $source->config);
    }
306
307
    /**
     * Create a new Page from the FPDF/FPDI object of this page.
     *
     * The new page reuses the document, header, content and config of the
     * object returned by getPDFObjectForFpdf().
     *
     * @return Page The page
     */
    public function createPageForFpdf(): self
    {
        $source = $this->getPDFObjectForFpdf();
        $content = $source->getContent();
        $header = $source->getHeader();

        return new self($source->document, $header, $content, $source->config);
    }
321
322 8
    /**
     * Return the text of this page as an array.
     *
     * For FPDF/FPDI documents the text is read from a copy of the page's
     * FPDF object. Otherwise the 'Contents' entry is normalised the same way
     * getText() does it (numerically keyed header elements or an ElementArray
     * are flattened into one virtual PDFObject) before the text is extracted.
     *
     * @param self|null $page not used by this method
     *
     * @return array the text parts, or [] when there is no usable 'Contents'
     */
    public function getTextArray(?self $page = null): array
    {
        if ($this->isFpdf()) {
            $fpdfObject = $this->getPDFObjectForFpdf();

            return $this->createPDFObjectForFpdf()->getTextArray($fpdfObject);
        }

        $contents = $this->get('Contents');

        if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return [];
        }

        // Stays null unless the contents have to be merged into a virtual object.
        $mergedContent = null;

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            if (is_numeric(key($elements))) {
                $mergedContent = '';

                foreach ($elements as $element) {
                    // Cross references are resolved to the object they point to.
                    $holder = $element instanceof ElementXRef ? $element->getObject() : $element;
                    $mergedContent .= $holder->getContent();
                }
            } else {
                // Probe the extraction with this page as context; if it fails,
                // fall back to an extraction without page context.
                // NOTE(review): the result of this probe is discarded and the
                // same call is repeated at the end of the method - confirm
                // whether returning the first result directly would be safe.
                try {
                    $contents->getTextArray($this);
                } catch (\Throwable $e) {
                    return $contents->getTextArray();
                }
            }
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content.
            $mergedContent = '';

            foreach ($contents->getContent() as $part) {
                $mergedContent .= $part->getContent()."\n";
            }
        }

        if (null !== $mergedContent) {
            $virtualHeader = new Header([], $this->document);
            $contents = new PDFObject($this->document, $virtualHeader, $mergedContent, $this->config);
        }

        return $contents->getTextArray($this);
    }
378
379
    /**
     * Gets all the text data of the page with its internal representation.
     *
     * When the content of 'Contents' is an array, the content of all its
     * sections is concatenated and parsed by this page. Otherwise the content
     * object itself (or, for FPDF/FPDI documents, the page's FPDF object) is
     * asked to parse its own content.
     *
     * @return array the commands found in all sections, in order of appearance
     */
    public function extractRawData(): array
    {
        $content = $this->get('Contents');
        $values = $content->getContent();

        if (\is_array($values)) {
            // Several content sections: parse their concatenation with this page.
            $text = '';
            foreach ($values as $section) {
                $text .= $section->getContent();
            }
            $parser = $this;
        } else {
            if ($this->isFpdf()) {
                $content = $this->getPDFObjectForFpdf();
            }
            $parser = $content;
            $text = $content->getContent();
        }

        $extractedData = [];
        foreach ($parser->getSectionsText($text) as $sectionText) {
            foreach ($parser->getCommandsText($sectionText) as $command) {
                $extractedData[] = $command;
            }
        }

        return $extractedData;
    }
419
420
    /**
     * Gets all the decoded text data with its internal representation from a page.
     *
     * The text operands of 'Tj' and 'TJ' commands are decoded with the font
     * selected by the latest 'Tf'/'TF' command. 'q' remembers the current font
     * and 'Q' restores the remembered one. All other commands are returned
     * unchanged.
     *
     * @param array|null $extractedRawData the extracted data returned by extractRawData, or
     *                                     null (or an empty array) if extractRawData should be called
     *
     * @return array An array with the data and the internal representation
     */
    public function extractDecodedRawData(?array $extractedRawData = null): array
    {
        if (empty($extractedRawData)) {
            $extractedRawData = $this->extractRawData();
        }

        /** @var Font|null $currentFont */
        $currentFont = null;
        $clippedFont = null;

        // If document is a FPDI/FPDF the dedicated page has the correct fonts.
        $fpdfPage = $this->isFpdf() ? $this->createPageForFpdf() : null;

        // Shared tail of the decoding: resolve escape sequences, re-encode to
        // UTF-8 and, when a font is active, let the font decode the content.
        $finishDecoding = static function ($text, $font) {
            $text = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", ' '],
                $text
            );
            $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');

            return null !== $font ? $font->decodeContent($text) : $text;
        };

        foreach ($extractedRawData as &$command) {
            $operator = $command['o'];

            if ('Tj' == $operator || 'TJ' == $operator) {
                $data = $command['c'];

                if (!\is_array($data)) {
                    // NOTE(review): without an active font the operand is replaced by
                    // an empty string here, while the array form below keeps the raw
                    // text - confirm this asymmetry is intended.
                    $octalDecoded = null !== $currentFont ? $currentFont->decodeOctal($data) : '';
                    $command['c'] = $finishDecoding($octalDecoded, $currentFont);
                    continue;
                }

                // Only the entries at even positions are decoded.
                $numText = \count($data);
                for ($i = 0; $i < $numText; $i += 2) {
                    $rawText = $data[$i]['c'];
                    $octalDecoded = null !== $currentFont ? $currentFont->decodeOctal($rawText) : $rawText;
                    $command['c'][$i]['c'] = $finishDecoding($octalDecoded, $currentFont);
                }
            } elseif ('Tf' == $operator || 'TF' == $operator) {
                $fontId = explode(' ', $command['c'])[0];
                $currentFont = null !== $fpdfPage ? $fpdfPage->getFont($fontId) : $this->getFont($fontId);
            } elseif ('Q' == $operator) {
                $currentFont = $clippedFont;
            } elseif ('q' == $operator) {
                $clippedFont = $currentFont;
            }
        }
        // Break the reference so the last element is no longer aliased by $command.
        unset($command);

        return $extractedRawData;
    }
495
496
    /**
497
     * Gets just the Text commands that are involved in text positions and
498
     * Text Matrix (Tm)
499
     *
500
     * It extract just the PDF commands that are involved with text positions, and
501
     * the Text Matrix (Tm). These are: BT, ET, TL, Td, TD, Tm, T*, Tj, ', ", and TJ
502
     *
503
     * @param array $extractedDecodedRawData The data extracted by extractDecodeRawData.
504
     *                                       If it is null, the method extractDecodeRawData is called.
505
     *
506
     * @return array An array with the text command of the page
507
     */
508 9
    public function getDataCommands(?array $extractedDecodedRawData = null): array
509
    {
510 9
        if (!isset($extractedDecodedRawData) || !$extractedDecodedRawData) {
0 ignored issues
show
Bug Best Practice introduced by
The expression $extractedDecodedRawData of type array is implicitly converted to a boolean; are you sure this is intended? If so, consider using empty($expr) instead to make it clear that you intend to check for an array without elements.

This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.

Consider making the comparison explicit by using empty(..) or ! empty(...) instead.

Loading history...
511 9
            $extractedDecodedRawData = $this->extractDecodedRawData();
512
        }
513 9
        $extractedData = [];
514 9
        foreach ($extractedDecodedRawData as $command) {
515 9
            switch ($command['o']) {
516
                /*
517
                 * BT
518
                 * Begin a text object, inicializind the Tm and Tlm to identity matrix
519
                 */
520 9
                case 'BT':
521 9
                    $extractedData[] = $command;
522 9
                    break;
523
524
                    /*
525
                     * ET
526
                     * End a text object, discarding the text matrix
527
                     */
528 9
                case 'ET':
529 9
                    $extractedData[] = $command;
530 9
                    break;
531
532
                    /*
533
                     * leading TL
534
                     * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
535
                     * Initial value: 0
536
                     */
537 9
                case 'TL':
538 5
                    $extractedData[] = $command;
539 5
                    break;
540
541
                    /*
542
                     * tx ty Td
543
                     * Move to the start of the next line, offset form the start of the
544
                     * current line by tx, ty.
545
                     */
546 9
                case 'Td':
547 9
                    $extractedData[] = $command;
548 9
                    break;
549
550
                    /*
551
                     * tx ty TD
552
                     * Move to the start of the next line, offset form the start of the
553
                     * current line by tx, ty. As a side effect, this operator set the leading
554
                     * parameter in the text state. This operator has the same effect as the
555
                     * code:
556
                     * -ty TL
557
                     * tx ty Td
558
                     */
559 9
                case 'TD':
560
                    $extractedData[] = $command;
561
                    break;
562
563
                    /*
564
                     * a b c d e f Tm
565
                     * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
566
                     * all numbers, and the initial value for Tm and Tlm is the identity matrix
567
                     * [1 0 0 1 0 0]
568
                     */
569 9
                case 'Tm':
570 7
                    $extractedData[] = $command;
571 7
                    break;
572
573
                    /*
574
                     * T*
575
                     * Move to the start of the next line. This operator has the same effect
576
                     * as the code:
577
                     * 0 Tl Td
578
                     * Where Tl is the current leading parameter in the text state.
579
                     */
580 9
                case 'T*':
581 5
                    $extractedData[] = $command;
582 5
                    break;
583
584
                    /*
585
                     * string Tj
586
                     * Show a Text String
587
                     */
588 9
                case 'Tj':
589 8
                    $extractedData[] = $command;
590 8
                    break;
591
592
                    /*
593
                     * string '
594
                     * Move to the next line and show a text string. This operator has the
595
                     * same effect as the code:
596
                     * T*
597
                     * string Tj
598
                     */
599 9
                case "'":
600
                    $extractedData[] = $command;
601
                    break;
602
603
                    /*
604
                     * aw ac string "
605
                     * Move to the next lkine and show a text string, using aw as the word
606
                     * spacing and ac as the character spacing. This operator has the same
607
                     * effect as the code:
608
                     * aw Tw
609
                     * ac Tc
610
                     * string '
611
                     * Tw set the word spacing, Tw, to wordSpace.
612
                     * Tc Set the character spacing, Tc, to charsSpace.
613
                     */
614 9
                case '"':
615
                    $extractedData[] = $command;
616
                    break;
617
618 9
                case 'Tf':
619 9
                case 'TF':
620 9
                    $extractedData[] = $command;
621 9
                    break;
622
623
                    /*
624
                     * array TJ
625
                     * Show one or more text strings allow individual glyph positioning.
626
                     * Each element of the array can be a string or a number. If the element is
627
                     * a string, this operator shows the string. If it is a number, the
628
                     * operator adjust the text position by that amount; that is, it translates
629
                     * the text matrix, Tm. This amount is subtracted from the current
630
                     * horizontal or vertical coordinate, depending on the writing mode.
631
                     * in the default coordinate system, a positive adjustment has the effect
632
                     * of moving the next glyph painted either to the left or down by the given
633
                     * amount.
634
                     */
635 9
                case 'TJ':
636 9
                    $extractedData[] = $command;
637 9
                    break;
638
                default:
639
            }
640
        }
641
642 9
        return $extractedData;
643
    }
644
645
    /**
646
     * Gets the Text Matrix of the text in the page
647
     *
648
     * Return an array where every item is an array where the first item is the
649
     * Text Matrix (Tm) and the second is a string with the text data.  The Text matrix
650
     * is an array of 6 numbers. The last 2 numbers are the coordinates X and Y of the
651
     * text. The first 4 numbers encode the scaling, rotation and skew of the text.
652
     *
653
     * @param array $dataCommands the data extracted by getDataCommands
654
     *                            if null getDataCommands is called
655
     *
656
     * @return array an array with the data of the page including the Tm information
657
     *               of any text in the page
658
     */
659 8
    public function getDataTm(?array $dataCommands = null): array
    {
        // Fall back to the page's own command list when none (or an empty one) is supplied.
        if (empty($dataCommands)) {
            $dataCommands = $this->getDataCommands();
        }

        /*
         * At the beginning of a text object Tm is the identity matrix
         */
        $defaultTm = ['1', '0', '0', '1', '0', '0'];

        /*
         * Text leading used by the T*, ' and " operators
         */
        $defaultTl = 0;

        /*
         * Default values for font data
         */
        $defaultFontId = -1;
        $defaultFontSize = 1;

        /*
         * Indexes of horizontal/vertical scaling and X,Y-coordinates in the matrix (Tm)
         */
        $hSc = 0; // horizontal scaling
        /**
         * index of vertical scaling in the array that encodes the text matrix.
         * for more information: https://github.com/smalot/pdfparser/pull/559#discussion_r1053415500
         */
        $vSc = 3;
        $x = 4;
        $y = 5;

        /*
         * x,y-coordinates of text space origin in user units
         *
         * These will be assigned the value of the currently printed string
         */
        $Tx = 0;
        $Ty = 0;

        $Tm = $defaultTm;
        $Tl = $defaultTl;
        $fontId = $defaultFontId;
        $fontSize = $defaultFontSize; // reflects fontSize set by Tf

        // The option cannot change while iterating, so resolve it once. A page
        // without a config object simply never reports font information instead
        // of failing with "call to a member function on null".
        $includeFontInfo = null !== $this->config
            && $this->config->getDataTmFontInfoHasToBeIncluded();

        // Builds one result item: [Tm, text] plus, when requested, [fontId, fontSize].
        // Shared by every text-showing operator so all items have the same shape.
        $buildItem = static function (array $matrix, $text, $currentFontId, $currentFontSize) use ($includeFontInfo): array {
            $item = [$matrix, $text];
            if ($includeFontInfo) {
                $item[] = $currentFontId;
                $item[] = $currentFontSize;
            }

            return $item;
        };

        $extractedTexts = $this->getTextArray();
        $extractedData = [];
        foreach ($dataCommands as $command) {
            // If we've used up all the texts from getTextArray(), exit
            // so we aren't accessing non-existent array indices
            // Fixes 'undefined array key' errors in Issues #575, #576
            if (\count($extractedTexts) <= \count($extractedData)) {
                break;
            }
            // Each emitted item consumes exactly one text, so the number of
            // items produced so far is the index of the next text to show.
            $currentText = $extractedTexts[\count($extractedData)];
            switch ($command['o']) {
                /*
                 * BT
                 * Begin a text object, initializing the Tm and Tlm to identity matrix
                 */
                case 'BT':
                    $Tm = $defaultTm;
                    $Tl = $defaultTl;
                    $Tx = 0;
                    $Ty = 0;
                    break;

                    /*
                     * ET
                     * End a text object
                     */
                case 'ET':
                    break;

                    /*
                     * text leading TL
                     * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
                     * Initial value: 0
                     */
                case 'TL':
                    // scaled text leading
                    $Tl = (float) $command['c'] * (float) $Tm[$vSc];
                    break;

                    /*
                     * tx ty Td
                     * Move to the start of the next line, offset from the start of the
                     * current line by tx, ty.
                     */
                case 'Td':
                    $coord = explode(' ', $command['c']);
                    // A missing operand counts as 0 instead of raising an undefined-offset warning.
                    $Tx += (float) ($coord[0] ?? 0) * (float) $Tm[$hSc];
                    $Ty += (float) ($coord[1] ?? 0) * (float) $Tm[$vSc];
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                    /*
                     * tx ty TD
                     * Move to the start of the next line, offset from the start of the
                     * current line by tx, ty. As a side effect, this operator sets the leading
                     * parameter in the text state. This operator has the same effect as the
                     * code:
                     * -ty TL
                     * tx ty Td
                     */
                case 'TD':
                    $coord = explode(' ', $command['c']);
                    $offsetX = (float) ($coord[0] ?? 0) * (float) $Tm[$hSc];
                    $offsetY = (float) ($coord[1] ?? 0) * (float) $Tm[$vSc];
                    $Tl = -$offsetY;
                    $Tx += $offsetX;
                    $Ty += $offsetY;
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                    /*
                     * a b c d e f Tm
                     * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
                     * all numbers, and the initial value for Tm and Tlm is the identity matrix
                     * [1 0 0 1 0 0]
                     */
                case 'Tm':
                    // The array union keeps every parsed operand and fills only the
                    // missing positions from the identity matrix, so a truncated
                    // operand list cannot leave Tm without its scaling/X/Y entries.
                    $Tm = explode(' ', $command['c']) + $defaultTm;
                    $Tx = (float) $Tm[$x];
                    $Ty = (float) $Tm[$y];
                    break;

                    /*
                     * T*
                     * Move to the start of the next line. This operator has the same effect
                     * as the code:
                     * 0 Tl Td
                     * Where Tl is the current leading parameter in the text state.
                     */
                case 'T*':
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    break;

                    /*
                     * string Tj
                     * Show a Text String
                     */
                case 'Tj':
                    $extractedData[] = $buildItem($Tm, $currentText, $fontId, $fontSize);
                    break;

                    /*
                     * string '
                     * Move to the next line and show a text string. This operator has the
                     * same effect as the code:
                     * T*
                     * string Tj
                     */
                case "'":
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    $extractedData[] = $buildItem($Tm, $currentText, $fontId, $fontSize);
                    break;

                    /*
                     * aw ac string "
                     * Move to the next line and show a text string, using aw as the word
                     * spacing and ac as the character spacing. This operator has the same
                     * effect as the code:
                     * aw Tw
                     * ac Tc
                     * string '
                     * Tw set the word spacing, Tw, to wordSpace.
                     * Tc Set the character spacing, Tc, to charsSpace.
                     */
                case '"':
                    $parts = explode(' ', $currentText);
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    // NOTE(review): the third space-separated token is taken as the
                    // string, as before; when it is absent the whole text is used
                    // rather than reading an undefined index. Confirm against the
                    // shape getTextArray() produces for this operator.
                    $extractedData[] = $buildItem($Tm, $parts[2] ?? $currentText, $fontId, $fontSize);
                    break;

                case 'Tf':
                    /*
                     * From PDF 1.0 specification, page 106:
                     *     fontname size Tf Set font and size
                     *     Sets the text font and text size in the graphics state. There is no default value for
                     *     either fontname or size; they must be selected using Tf before drawing any text.
                     *     fontname is a resource name. size is a number expressed in text space units.
                     *
                     * Source: https://ia902503.us.archive.org/10/items/pdfy-0vt8s-egqFwDl7L2/PDF%20Reference%201.0.pdf
                     * Introduced with https://github.com/smalot/pdfparser/pull/516
                     */
                    $fontOperands = explode(' ', $command['c'], 2);
                    $fontId = $fontOperands[0];
                    // Keep the previous size when the operand is missing instead of
                    // overwriting it with null.
                    $fontSize = $fontOperands[1] ?? $fontSize;
                    break;

                    /*
                     * array TJ
                     * Show one or more text strings, allowing individual glyph positioning.
                     * Each element of the array can be a string or a number. If the element is
                     * a string, this operator shows the string. If it is a number, the
                     * operator adjusts the text position by that amount; that is, it translates
                     * the text matrix, Tm. This amount is subtracted from the current
                     * horizontal or vertical coordinate, depending on the writing mode.
                     * In the default coordinate system, a positive adjustment has the effect
                     * of moving the next glyph painted either to the left or down by the given
                     * amount.
                     */
                case 'TJ':
                    $extractedData[] = $buildItem($Tm, $currentText, $fontId, $fontSize);
                    break;

                default:
                    // Any other operator has no effect on the text matrix and is ignored.
                    break;
            }
        }

        // Cache the result for later coordinate lookups on this page.
        $this->dataTm = $extractedData;

        return $extractedData;
    }
885
886
    /**
887
     * Gets text data that are around the given coordinates (X,Y)
888
     *
889
     * If the text is in near the given coordinates (X,Y) (or the TM info),
890
     * the text is returned.  The extractedData return by getDataTm, could be use to see
891
     * where is the coordinates of a given text, using the TM info for it.
892
     *
893
     * @param float $x      The X value of the coordinate to search for. if null
894
     *                      just the Y value is considered (same Row)
895
     * @param float $y      The Y value of the coordinate to search for. If null
896
     *                      just the X value is considered (same column)
897
     * @param float $xError The value less or more to consider an X to be "near"
898
     * @param float $yError The value less or more to consider an Y to be "near"
899
     *
900
     * @return array An array of text that are near the given coordinates. If no text
901
     *               "near" the x,y coordinate, an empty array is returned. If Both, x
902
     *               and y coordinates are null, an empty array is returned.
903
     */
904 2
    public function getTextXY(?float $x = null, ?float $y = null, float $xError = 0, float $yError = 0): array
    {
        // Build the text-matrix cache on demand: either it was never computed
        // or the previous computation produced nothing.
        if (empty($this->dataTm)) {
            $this->getDataTm();
        }

        // With neither coordinate given there is nothing to compare against.
        if (null === $x && null === $y) {
            return [];
        }

        $matches = [];
        foreach ($this->dataTm as $entry) {
            $matrix = $entry[0];
            $text = $entry[1];

            // Positions 4 and 5 of the text matrix hold the X and Y coordinates.
            $posX = (float) $matrix[4];
            $posY = (float) $matrix[5];

            // A coordinate that was not supplied places no constraint on its axis;
            // a supplied one must lie within +/- its tolerance.
            $xInRange = null === $x
                || ($posX >= $x - $xError && $posX <= $x + $xError);
            $yInRange = null === $y
                || ($posY >= $y - $yError && $posY <= $y + $yError);

            if ($xInRange && $yInRange) {
                // Only the matrix and the text are returned, whatever else the item carries.
                $matches[] = [$matrix, $text];
            }
        }

        return $matches;
    }
956
}
957