Test Failed
Push — fix/438-str_starts_with ( 87fc2b )
by Konrad
02:38
created

Page::getText()   B

Complexity

Conditions 10
Paths 7

Size

Total Lines 48
Code Lines 26

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 19
CRAP Score 14.0992

Importance

Changes 6
Bugs 1 Features 0
Metric Value
cc 10
eloc 26
c 6
b 1
f 0
nc 7
nop 1
dl 0
loc 48
ccs 19
cts 29
cp 0.6552
crap 14.0992
rs 7.6666

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Element\ElementArray;
34
use Smalot\PdfParser\Element\ElementMissing;
35
use Smalot\PdfParser\Element\ElementNull;
36
use Smalot\PdfParser\Element\ElementXRef;
37
38
class Page extends PDFObject
39
{
40
    /**
41
     * @var Font[]
42
     */
43
    protected $fonts = null;
44
45
    /**
46
     * @var PDFObject[]
47
     */
48
    protected $xobjects = null;
49
50
    /**
51
     * @var array
52
     */
53
    protected $dataTm = null;
54
55
    /**
     * Collect the fonts declared in this page's resource dictionary.
     *
     * Each Font found is registered under its resource name and, in addition,
     * under a "cleaned" name that keeps only digits, dots, dashes and
     * underscores. The resulting table is memoised in $this->fonts; the empty
     * arrays returned when no usable Font entry exists are not memoised.
     *
     * @return Font[] fonts indexed by resource name (and by cleaned name)
     */
    public function getFonts()
    {
        if (null !== $this->fonts) {
            return $this->fonts;
        }

        $resources = $this->get('Resources');

        // Resources may be an element without has(); then there is nothing to list.
        if (!method_exists($resources, 'has') || !$resources->has('Font')) {
            return [];
        }

        $fontEntry = $resources->get('Font');
        if ($fontEntry instanceof ElementMissing) {
            return [];
        }

        // The Font entry is either a Header itself or an object carrying one.
        $candidates = $fontEntry instanceof Header
            ? $fontEntry->getElements()
            : $fontEntry->getHeader()->getElements();

        $byName = [];
        foreach ($candidates as $name => $candidate) {
            if (!$candidate instanceof Font) {
                continue;
            }

            $byName[$name] = $candidate;

            // Register the font a second time under its cleaned name.
            $cleanedName = preg_replace('/[^0-9\.\-_]/', '', $name);
            if ('' != $cleanedName) {
                $byName[$cleanedName] = $candidate;
            }
        }

        $this->fonts = $byName;

        return $byName;
    }
96
97
    /**
     * Look up a font of this page by its resource name.
     *
     * The exact name is tried first. If it is unknown, the name is reduced to
     * digits, dots, dashes and underscores and looked up again, because
     * getFonts() also registers every font under that cleaned name.
     *
     * @param string $id font resource name as it appears in the content stream
     *
     * @return Font|null the matching font, or null when neither name is known
     */
    public function getFont(string $id): ?Font
    {
        $fonts = $this->getFonts();

        if (isset($fonts[$id])) {
            return $fonts[$id];
        }

        // According to the PDF specs (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 238)
        // "The font resource name presented to the Tf operator is arbitrary, as are the names for all kinds of resources".
        // The unfiltered name was therefore searched first; the cleaned name is only a fallback.
        // preg_replace() yields null on failure; the cast keeps the array key a string.
        $cleanedId = (string) preg_replace('/[^0-9\.\-_]/', '', $id);

        return $fonts[$cleanedId] ?? null;
    }
120
121
    /**
     * Collect the XObjects declared in this page's resource dictionary.
     *
     * Each entry is registered under its resource name and, in addition, under
     * a "cleaned" name that keeps only digits, dots, dashes and underscores.
     * The resulting table is memoised in $this->xobjects; the empty array
     * returned when there is no XObject entry is not memoised.
     *
     * @return PDFObject[] XObjects indexed by resource name (and by cleaned name)
     */
    public function getXObjects()
    {
        if (null !== $this->xobjects) {
            return $this->xobjects;
        }

        $resources = $this->get('Resources');

        // Resources may be an element without has(); then there is nothing to list.
        if (!method_exists($resources, 'has') || !$resources->has('XObject')) {
            return [];
        }

        // The XObject entry is either a Header itself or an object carrying one.
        if ($resources->get('XObject') instanceof Header) {
            $entries = $resources->get('XObject')->getElements();
        } else {
            $entries = $resources->get('XObject')->getHeader()->getElements();
        }

        $byName = [];
        foreach ($entries as $name => $entry) {
            $byName[$name] = $entry;

            // Register the entry a second time under its cleaned name.
            $cleanedName = preg_replace('/[^0-9\.\-_]/', '', $name);
            if ('' != $cleanedName) {
                $byName[$cleanedName] = $entry;
            }
        }

        $this->xobjects = $byName;

        return $byName;
    }
158
159
    /**
     * Look up an XObject of this page by its resource name.
     *
     * Only the exact name is tried. Unlike getFont(), no second lookup with a
     * cleaned name (digits, dots, dashes and underscores only) is performed.
     *
     * @param string $id XObject resource name
     *
     * @return PDFObject|null the matching XObject, or null when the name is unknown
     */
    public function getXObject(string $id): ?PDFObject
    {
        $known = $this->getXObjects();

        return isset($known[$id]) ? $known[$id] : null;
    }
176
177
    /**
     * Extract the text of this page.
     *
     * The 'Contents' entry is normalised first: a PDFObject whose header holds
     * a numerically indexed list of parts, or an ElementArray of parts, is
     * merged into one virtual PDFObject. Text extraction is then delegated to
     * that object's getText() with this page as context.
     *
     * @param self|null $page not read by this method
     *
     * @return string the page text, or '' when the page has no usable contents
     */
    public function getText(?self $page = null): string
    {
        $contents = $this->get('Contents');

        if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return '';
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            // A numeric first key means the header is a list of content parts.
            if (is_numeric(key($elements))) {
                $new_content = '';

                foreach ($elements as $element) {
                    if ($element instanceof ElementXRef) {
                        $new_content .= $element->getObject()->getContent();
                    } else {
                        $new_content .= $element->getContent();
                    }
                }

                $header = new Header([], $this->document);
                $contents = new PDFObject($this->document, $header, $new_content, $this->config);
            }
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content.
            $new_content = '';

            foreach ($contents->getContent() as $content) {
                $new_content .= $content->getContent()."\n";
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        }

        /*
         * Elements referencing each other on the same page can cause endless loops during text parsing.
         * To combat this a recursionStack containing already parsed elements on the page is kept.
         * The stack is emptied once the page has been processed; doing it in a finally block
         * guarantees that a failed extraction does not leave stale entries behind for the next page.
         */
        try {
            return $contents->getText($this);
        } finally {
            PDFObject::$recursionStack = [];
        }
    }
226
227
    /**
     * Tell whether the document of this page was produced by FPDF/FPDI (setasign\Fpdi\Fpdi).
     *
     * The check relies on the document details: the 'Producer' entry must exist,
     * be a string and start with "FPDF" (FPDF writes "FPDF" . FPDF_VERSION there).
     *
     * @return bool true if the document looks like an FPDF/FPDI document
     */
    public function isFpdf(): bool
    {
        $details = $this->document->getDetails();

        return \array_key_exists('Producer', $details)
            && \is_string($details['Producer'])
            // Prefix test without str_starts_with(), which only exists since PHP 8.0.
            && 0 === strncmp($details['Producer'], 'FPDF', \strlen('FPDF'));
    }
245
246
    /**
     * Return the position of this page object within its document.
     *
     * The value is the zero-based index of the page in the list returned by
     * the document's getPages(). If the page is not part of that list, the
     * number of pages is returned.
     *
     * @return int the page index
     */
    public function getPageNumber(): int
    {
        $pages = $this->document->getPages();
        $total = \count($pages);

        $index = 0;
        while ($index < $total && $pages[$index] !== $this) {
            ++$index;
        }

        return $index;
    }
263
264
    /**
     * Return the XObject that belongs to this page in an FPDF/FPDI document.
     *
     * The XObject is taken from getXObjects() using the index returned by
     * getPageNumber() as key.
     *
     * @return PDFObject the PDFObject for the page
     */
    public function getPDFObjectForFpdf(): PDFObject
    {
        $index = $this->getPageNumber();

        return $this->getXObjects()[$index];
    }
279
280
    /**
     * Build a new PDFObject from this page's FPDF/FPDI XObject.
     *
     * The new object reuses the document, header, content and config of the
     * object returned by getPDFObjectForFpdf().
     *
     * @return PDFObject the newly created PDFObject
     */
    public function createPDFObjectForFpdf(): PDFObject
    {
        $source = $this->getPDFObjectForFpdf();

        return new PDFObject(
            $source->document,
            $source->getHeader(),
            $source->getContent(),
            $source->config
        );
    }
297
298
    /**
     * Build a new Page from this page's FPDF/FPDI XObject.
     *
     * The new page reuses the document, header, content and config of the
     * object returned by getPDFObjectForFpdf().
     *
     * @return Page the newly created page
     */
    public function createPageForFpdf(): self
    {
        $source = $this->getPDFObjectForFpdf();

        return new self(
            $source->document,
            $source->getHeader(),
            $source->getContent(),
            $source->config
        );
    }
312
313
    /**
     * Extract the text of this page as an array of text fragments.
     *
     * For FPDF/FPDI documents the work is delegated to a PDFObject built from
     * the page's XObject. Otherwise the 'Contents' entry is normalised (a list
     * of parts is merged into one virtual PDFObject) and its getTextArray() is
     * called with this page as context.
     *
     * @param self|null $page not read by this method
     *
     * @return array the text fragments, or [] when the page has no usable contents
     */
    public function getTextArray(?self $page = null): array
    {
        if ($this->isFpdf()) {
            $pdfObject = $this->getPDFObjectForFpdf();
            $newPdfObject = $this->createPDFObjectForFpdf();

            return $newPdfObject->getTextArray($pdfObject);
        }

        $contents = $this->get('Contents');

        if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return [];
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            // A numeric first key means the header is a list of content parts.
            if (is_numeric(key($elements))) {
                $new_content = '';

                /** @var PDFObject $element */
                foreach ($elements as $element) {
                    if ($element instanceof ElementXRef) {
                        $new_content .= $element->getObject()->getContent();
                    } else {
                        $new_content .= $element->getContent();
                    }
                }

                $header = new Header([], $this->document);
                $contents = new PDFObject($this->document, $header, $new_content, $this->config);
            } else {
                // Best effort: retry without page context if extraction with it fails.
                // The successful result is returned right away instead of being
                // thrown away and computed a second time.
                try {
                    return $contents->getTextArray($this);
                } catch (\Throwable $e) {
                    return $contents->getTextArray();
                }
            }
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content.
            $new_content = '';

            /** @var PDFObject $content */
            foreach ($contents->getContent() as $content) {
                $new_content .= $content->getContent()."\n";
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        }

        return $contents->getTextArray($this);
    }
369
370
    /**
     * Get all text commands of the page in their internal (raw) representation.
     *
     * When the content of the 'Contents' entry is an array, the content of all
     * its sections is concatenated and split into commands by this page.
     * Otherwise the content object itself (or, for FPDF/FPDI documents, the
     * page's XObject) splits its own content, and a synthetic 'BT' command is
     * emitted in front of the commands of every section.
     *
     * @return array list of commands
     */
    public function extractRawData(): array
    {
        $commands = [];
        $content = $this->get('Contents');
        $values = $content->getContent();

        if (\is_array($values)) {
            $text = '';
            foreach ($values as $section) {
                $text .= $section->getContent();
            }

            foreach ($this->getSectionsText($text) as $sectionText) {
                foreach ($this->getCommandsText($sectionText) as $command) {
                    $commands[] = $command;
                }
            }

            return $commands;
        }

        if ($this->isFpdf()) {
            $content = $this->getPDFObjectForFpdf();
        }

        foreach ($content->getSectionsText($content->getContent()) as $sectionText) {
            // Marks the start of each section in the flat command list.
            $commands[] = ['t' => '', 'o' => 'BT', 'c' => ''];

            foreach ($content->getCommandsText($sectionText) as $command) {
                $commands[] = $command;
            }
        }

        return $commands;
    }
412
413
    /**
     * Get all commands of the page with the text of Tj/TJ commands decoded.
     *
     * The font selected by the most recent Tf/TF command is used for decoding.
     * A 'q' command remembers the active font and a 'Q' command reinstates the
     * remembered one. For FPDF/FPDI documents fonts are resolved on the page
     * returned by createPageForFpdf().
     *
     * @param array $extractedRawData the data returned by extractRawData, or
     *                                null/empty if extractRawData should be called
     *
     * @return array the commands, with decoded text in the 'c' entries of Tj/TJ
     */
    public function extractDecodedRawData(array $extractedRawData = null): array
    {
        if (empty($extractedRawData)) {
            $extractedRawData = $this->extractRawData();
        }

        // Escape sequences found in the raw text and the characters they stand for.
        $escaped = ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '];
        $literal = ['\\', '(', ')', "\n", "\r", "\t", ' '];

        /** @var Font|null $activeFont */
        $activeFont = null;
        $rememberedFont = null;

        // If the document is a FPDI/FPDF one, the dedicated page has the correct fonts.
        $fontSource = $this->isFpdf() ? $this->createPageForFpdf() : $this;

        foreach ($extractedRawData as &$entry) {
            switch ($entry['o']) {
                case 'Tj':
                case 'TJ':
                    $payload = $entry['c'];

                    if (!\is_array($payload)) {
                        // Without an active font the text is reduced to an empty string.
                        $decoded = null !== $activeFont ? $activeFont->decodeOctal($payload) : '';
                        $decoded = utf8_encode(str_replace($escaped, $literal, $decoded));
                        if (null !== $activeFont) {
                            $decoded = $activeFont->decodeContent($decoded);
                        }
                        $entry['c'] = $decoded;
                        break;
                    }

                    // Only the entries at even positions are decoded; odd positions stay untouched.
                    $chunkCount = \count($payload);
                    for ($i = 0; $i < $chunkCount; $i += 2) {
                        $chunk = $payload[$i]['c'];
                        $decoded = null !== $activeFont ? $activeFont->decodeOctal($chunk) : $chunk;
                        $decoded = utf8_encode(str_replace($escaped, $literal, $decoded));
                        if (null !== $activeFont) {
                            $decoded = $activeFont->decodeContent($decoded);
                        }
                        $entry['c'][$i]['c'] = $decoded;
                    }
                    break;

                case 'Tf':
                case 'TF':
                    // The font id is the first space-separated token of the operands.
                    $fontId = explode(' ', $entry['c'])[0];
                    $activeFont = $fontSource->getFont($fontId);
                    break;

                case 'Q':
                    $activeFont = $rememberedFont;
                    break;

                case 'q':
                    $rememberedFont = $activeFont;
                    break;
            }
        }
        // Release the reference left behind by the by-reference loop.
        unset($entry);

        return $extractedRawData;
    }
486
487
    /**
     * Get just the commands that are involved in text positions and the
     * Text Matrix (Tm).
     *
     * The commands kept are: BT, ET, TL, Td, TD, Tm, T*, Tj, ', " and TJ.
     * Every other command is dropped; the order of the kept ones is preserved.
     *
     * @param array $extractedDecodedRawData The data extracted by extractDecodedRawData.
     *                                       If it is null or empty, extractDecodedRawData is called.
     *
     * @return array an array with the text commands of the page
     */
    public function getDataCommands(array $extractedDecodedRawData = null): array
    {
        if (empty($extractedDecodedRawData)) {
            $extractedDecodedRawData = $this->extractDecodedRawData();
        }

        $textCommands = [];
        foreach ($extractedDecodedRawData as $command) {
            switch ($command['o']) {
                /*
                 * BT              begin a text object, initializing Tm and Tlm to the identity matrix
                 * ET              end a text object, discarding the text matrix
                 * leading TL      set the text leading Tl (initial value 0) used by T*, ' and "
                 * tx ty Td        move to the start of the next line, offset from the start of
                 *                 the current line by (tx, ty)
                 * tx ty TD        same as Td, and additionally sets the leading:
                 *                 equivalent to "-ty TL" followed by "tx ty Td"
                 * a b c d e f Tm  set the text matrix Tm and the text line matrix Tlm
                 * T*              move to the start of the next line using the current leading
                 * string Tj       show a text string
                 * string '        move to the next line and show a text string (T* then Tj)
                 * aw ac string "  set word spacing (aw Tw) and character spacing (ac Tc),
                 *                 then behave like '
                 * array TJ        show one or more text strings with individual glyph positioning;
                 *                 numbers in the array adjust the text position
                 */
                case 'BT':
                case 'ET':
                case 'TL':
                case 'Td':
                case 'TD':
                case 'Tm':
                case 'T*':
                case 'Tj':
                case "'":
                case '"':
                case 'TJ':
                    $textCommands[] = $command;
                    break;
                default:
            }
        }

        return $textCommands;
    }
630
631
    /**
632
     * Gets the Text Matrix of the text in the page
633
     *
634
     * Return an array where every item is an array where the first item is the
635
     * Text Matrix (Tm) and the second is a string with the text data.  The Text matrix
636
     * is an array of 6 numbers. The last 2 numbers are the coordinates X and Y of the
637
     * text. The first 4 numbers has to be with Scalation, Rotation and Skew of the text.
638
     *
639
     * @param array $dataCommands the data extracted by getDataCommands
640
     *                            if null getDataCommands is called
641
     *
642
     * @return array an array with the data of the page including the Tm information
643
     *               of any text in the page
644
     */
645
    public function getDataTm(array $dataCommands = null): array
646
    {
647
        if (!isset($dataCommands) || !$dataCommands) {
648
            $dataCommands = $this->getDataCommands();
649
        }
650
651
        /*
652
         * At the beginning of a text object Tm is the identity matrix
653
         */
654
        $defaultTm = ['1', '0', '0', '1', '0', '0'];
655
656
        /*
657
         *  Set the text leading used by T*, ' and " operators
658
         */
659
        $defaultTl = 0;
660
661
        /*
662
         * Setting where are the X and Y coordinates in the matrix (Tm)
663
         */
664
        $x = 4;
665
        $y = 5;
666
        $Tx = 0;
667
        $Ty = 0;
668
669
        $Tm = $defaultTm;
670
        $Tl = $defaultTl;
671
672
        $extractedTexts = $this->getTextArray();
673
        $extractedData = [];
674
        foreach ($dataCommands as $command) {
675
            $currentText = $extractedTexts[\count($extractedData)];
676
            switch ($command['o']) {
677
                /*
678
                 * BT
679
                 * Begin a text object, inicializind the Tm and Tlm to identity matrix
680
                 */
681
                case 'BT':
682
                    $Tm = $defaultTm;
683
                    $Tl = $defaultTl; //review this.
684
                    $Tx = 0;
685
                    $Ty = 0;
686
                    break;
687
688
                /*
689
                 * ET
690
                 * End a text object, discarding the text matrix
691
                 */
692
                case 'ET':
693
                    $Tm = $defaultTm;
694
                    $Tl = $defaultTl;  //review this
695
                    $Tx = 0;
696
                    $Ty = 0;
697
                    break;
698
699
                /*
700
                 * leading TL
701
                 * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
702
                 * Initial value: 0
703
                 */
704
                case 'TL':
705
                    $Tl = (float) $command['c'];
706
                    break;
707
708
                /*
709
                 * tx ty Td
710
                 * Move to the start of the next line, offset form the start of the
711
                 * current line by tx, ty.
712
                 */
713
                case 'Td':
714
                    $coord = explode(' ', $command['c']);
715
                    $Tx += (float) $coord[0];
716
                    $Ty += (float) $coord[1];
717
                    $Tm[$x] = (string) $Tx;
718
                    $Tm[$y] = (string) $Ty;
719
                    break;
720
721
                /*
722
                 * tx ty TD
723
                 * Move to the start of the next line, offset form the start of the
724
                 * current line by tx, ty. As a side effect, this operator set the leading
725
                 * parameter in the text state. This operator has the same effect as the
726
                 * code:
727
                 * -ty TL
728
                 * tx ty Td
729
                 */
730
                case 'TD':
731
                    $coord = explode(' ', $command['c']);
732
                    $Tl = (float) $coord[1];
733
                    $Tx += (float) $coord[0];
734
                    $Ty -= (float) $coord[1];
735
                    $Tm[$x] = (string) $Tx;
736
                    $Tm[$y] = (string) $Ty;
737
                    break;
738
739
                /*
740
                 * a b c d e f Tm
741
                 * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
742
                 * all numbers, and the initial value for Tm and Tlm is the identity matrix
743
                 * [1 0 0 1 0 0]
744
                 */
745
                case 'Tm':
746
                    $Tm = explode(' ', $command['c']);
747
                    $Tx = (float) $Tm[$x];
748
                    $Ty = (float) $Tm[$y];
749
                    break;
750
751
                /*
752
                 * T*
753
                 * Move to the start of the next line. This operator has the same effect
754
                 * as the code:
755
                 * 0 Tl Td
756
                 * Where Tl is the current leading parameter in the text state.
757
                 */
758
                case 'T*':
759
                    $Ty -= $Tl;
760
                    $Tm[$y] = (string) $Ty;
761
                    break;
762
763
                /*
764
                 * string Tj
765
                 * Show a Text String
766
                 */
767
                case 'Tj':
768
                    $extractedData[] = [$Tm, $currentText];
769
                    break;
770
771
                /*
772
                 * string '
773
                 * Move to the next line and show a text string. This operator has the
774
                 * same effect as the code:
775
                 * T*
776
                 * string Tj
777
                 */
778
                case "'":
779
                    $Ty -= $Tl;
780
                    $Tm[$y] = (string) $Ty;
781
                    $extractedData[] = [$Tm, $currentText];
782
                    break;
783
784
                /*
785
                 * aw ac string "
786
                 * Move to the next line and show a text string, using aw as the word
787
                 * spacing and ac as the character spacing. This operator has the same
788
                 * effect as the code:
789
                 * aw Tw
790
                 * ac Tc
791
                 * string '
792
                 * Tw set the word spacing, Tw, to wordSpace.
793
                 * Tc Set the character spacing, Tc, to charsSpace.
794
                 */
795
                case '"':
796
                    $data = explode(' ', $currentText);
797
                    $Ty -= $Tl;
798
                    $Tm[$y] = (string) $Ty;
799
                    $extractedData[] = [$Tm, $data[2]]; //Verify
800
                    break;
801
802
                /*
803
                 * array TJ
804
                 * Show one or more text strings, allowing individual glyph positioning.
805
                 * Each element of array can be a string or a number. If the element is
806
                 * a string, this operator shows the string. If it is a number, the
807
                 * operator adjust the text position by that amount; that is, it translates
808
                 * the text matrix, Tm. This amount is subtracted from the current
809
                 * horizontal or vertical coordinate, depending on the writing mode.
810
                 * in the default coordinate system, a positive adjustment has the effect
811
                 * of moving the next glyph painted either to the left or down by the given
812
                 * amount.
813
                 */
814
                case 'TJ':
815
                    $extractedData[] = [$Tm, $currentText];
816
                    break;
817
                default:
818
            }
819
        }
820
        $this->dataTm = $extractedData;
821
822
        return $extractedData;
823
    }
824
825
    /**
     * Gets text data that are around the given coordinates (X,Y)
     *
     * Every entry of the text-matrix data (as produced by getDataTm) whose
     * position lies within the given tolerance of the requested coordinates
     * is returned. The data returned by getDataTm can be used to look up the
     * coordinates of a given text, using its TM info.
     *
     * @param float|null $x      The X value of the coordinate to search for. If null,
     *                           just the Y value is considered (same row)
     * @param float|null $y      The Y value of the coordinate to search for. If null,
     *                           just the X value is considered (same column)
     * @param float      $xError The value less or more to consider an X to be "near"
     * @param float      $yError The value less or more to consider an Y to be "near"
     *
     * @return array An array of [Tm, text] pairs that are near the given coordinates.
     *               If no text is "near" the x,y coordinate, or if both x and y
     *               are null, an empty array is returned.
     */
    public function getTextXY(?float $x = null, ?float $y = null, float $xError = 0, float $yError = 0): array
    {
        // Lazily build the text-matrix data the first time it is needed.
        if (!isset($this->dataTm) || !$this->dataTm) {
            $this->getDataTm();
        }

        // Without any coordinate there is nothing to search for.
        if (null === $x && null === $y) {
            return [];
        }

        $extractedData = [];
        foreach ($this->dataTm as $item) {
            $tm = $item[0];
            $text = $item[1];

            // Indexes 4 and 5 of the text matrix hold the X and Y position.
            $xTm = (float) $tm[4];
            $yTm = (float) $tm[5];

            // A null coordinate places no constraint on its axis; otherwise the
            // position must lie within [coordinate - error, coordinate + error].
            $xNear = null === $x || (($xTm >= ($x - $xError)) && ($xTm <= ($x + $xError)));
            $yNear = null === $y || (($yTm >= ($y - $yError)) && ($yTm <= ($y + $yError)));

            if ($xNear && $yNear) {
                $extractedData[] = [$tm, $text];
            }
        }

        return $extractedData;
    }
895
}
896