Passed
Pull Request — master (#349)
by
unknown
02:03
created

Page::getXObjects()   B

Complexity

Conditions 7
Paths 8

Size

Total Lines 31
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 7.0119

Importance

Changes 0
Metric Value
cc 7
eloc 16
c 0
b 0
f 0
nc 8
nop 0
dl 0
loc 31
ccs 15
cts 16
cp 0.9375
crap 7.0119
rs 8.8333
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Element\ElementArray;
34
use Smalot\PdfParser\Element\ElementMissing;
35
use Smalot\PdfParser\Element\ElementNull;
36
use Smalot\PdfParser\Element\ElementXRef;
37
38
class Page extends PDFObject
39
{
40
    /**
     * Font table cached by getFonts(); stays null until a table has been built.
     *
     * @var Font[]|null
     */
    protected $fonts;

    /**
     * XObject table cached by getXObjects(); stays null until a table has been built.
     *
     * @var PDFObject[]|null
     */
    protected $xobjects;

    /**
     * Result of the most recent getDataTm() call; stays null until it has run.
     *
     * @var array|null
     */
    protected $dataTm;
54
55
    /**
     * Returns the fonts listed under the "Font" entry of this page's resources.
     *
     * Every font is registered under its resource name and, in addition, under
     * that name stripped of every character other than 0-9, ".", "-" and "_"
     * (when something is left after stripping). A table built this way is cached
     * on the instance; the early "nothing found" results are not cached.
     *
     * @return Font[]
     */
    public function getFonts()
    {
        if (null !== $this->fonts) {
            return $this->fonts;
        }

        $resources = $this->get('Resources');

        if (!method_exists($resources, 'has') || !$resources->has('Font')) {
            return [];
        }

        $fontEntry = $resources->get('Font');

        if ($fontEntry instanceof ElementMissing) {
            return [];
        }

        // The entry is either the dictionary itself or an object wrapping it.
        $elements = $fontEntry instanceof Header
            ? $fontEntry->getElements()
            : $fontEntry->getHeader()->getElements();

        $fontTable = [];

        foreach ($elements as $name => $candidate) {
            if (!($candidate instanceof Font)) {
                continue;
            }

            $fontTable[$name] = $candidate;

            // Also register the font under its cleaned name.
            $alias = preg_replace('/[^0-9\.\-_]/', '', $name);
            if ('' != $alias) {
                $fontTable[$alias] = $candidate;
            }
        }

        $this->fonts = $fontTable;

        return $fontTable;
    }
96
97
    /**
     * Looks up one of this page's fonts by its resource name.
     *
     * The unfiltered name is tried first: according to the PDF specification
     * (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 238)
     * "the font resource name presented to the Tf operator is arbitrary, as are
     * the names for all kinds of resources". Only when that fails is the name
     * reduced to the characters 0-9, ".", "-" and "_" and looked up again.
     *
     * @param string $id
     *
     * @return Font|null the matching font, or null when neither lookup succeeds
     */
    public function getFont($id)
    {
        $fonts = $this->getFonts();

        if (isset($fonts[$id])) {
            return $fonts[$id];
        }

        // Fallback: retry with the cleaned id (the table built by getFonts()
        // also indexes fonts under their cleaned names).
        $cleanedId = preg_replace('/[^0-9\.\-_]/', '', $id);
        if (isset($fonts[$cleanedId])) {
            return $fonts[$cleanedId];
        }

        return null;
    }
127
128
    /**
     * Returns the XObjects listed under the "XObject" entry of this page's resources.
     *
     * Every XObject is registered under its resource name and, in addition,
     * under that name stripped of every character other than 0-9, ".", "-" and
     * "_" (when something is left after stripping). A table built this way is
     * cached on the instance.
     *
     * @return PDFObject[]
     */
    public function getXObjects()
    {
        if (null !== $this->xobjects) {
            return $this->xobjects;
        }

        $resources = $this->get('Resources');

        if (!method_exists($resources, 'has') || !$resources->has('XObject')) {
            return [];
        }

        $entry = $resources->get('XObject');

        // Same guard as getFonts(): an entry that is declared but cannot be
        // resolved has no header to read elements from.
        if ($entry instanceof ElementMissing) {
            return [];
        }

        // The entry is either the dictionary itself or an object wrapping it.
        $xobjects = $entry instanceof Header
            ? $entry->getElements()
            : $entry->getHeader()->getElements();

        $table = [];

        foreach ($xobjects as $id => $xobject) {
            $table[$id] = $xobject;

            // Also register the XObject under its cleaned name.
            $cleanedId = preg_replace('/[^0-9\.\-_]/', '', $id);
            if ('' != $cleanedId) {
                $table[$cleanedId] = $xobject;
            }
        }

        return $this->xobjects = $table;
    }
165
166
    /**
     * Looks up one of this page's XObjects by the exact key used in getXObjects().
     *
     * @param string $id
     *
     * @return PDFObject|null the matching XObject, or null when the key is unknown
     */
    public function getXObject($id)
    {
        $table = $this->getXObjects();

        return isset($table[$id]) ? $table[$id] : null;
    }
188
189
    /**
     * Returns the text of this page.
     *
     * The "Contents" entry is normalised first: a PDFObject whose header
     * elements are numerically keyed, or an ElementArray, is merged into one
     * virtual PDFObject holding the concatenated content. The text is then
     * obtained by calling getText($this) on the resulting object.
     *
     * @param Page $page not used by this implementation
     *
     * @return string the page text, or '' when the page has no usable contents
     */
    public function getText(self $page = null)
    {
        $contents = $this->get('Contents');

        if (!$contents) {
            return '';
        }

        if ($contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return '';
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            // Numerically keyed elements are merged into a single content string.
            if (is_numeric(key($elements))) {
                $merged = '';

                foreach ($elements as $element) {
                    $source = $element instanceof ElementXRef ? $element->getObject() : $element;
                    $merged .= $source->getContent();
                }

                $contents = new PDFObject($this->document, new Header([], $this->document), $merged);
            }
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content, one part per line.
            $merged = '';

            foreach ($contents->getContent() as $part) {
                $merged .= $part->getContent()."\n";
            }

            $contents = new PDFObject($this->document, new Header([], $this->document), $merged);
        }

        return $contents->getText($this);
    }
235
236
    /**
     * Returns the text of this page as an array of text fragments.
     *
     * The "Contents" entry is normalised first: a PDFObject whose header
     * elements are numerically keyed, or an ElementArray, is merged into one
     * virtual PDFObject holding the concatenated content. The fragments are
     * then obtained by calling getTextArray($this) on the resulting object.
     *
     * @param Page $page not used by this implementation
     *
     * @return array the text fragments, or [] when the page has no usable contents
     */
    public function getTextArray(self $page = null)
    {
        $contents = $this->get('Contents');

        if (!$contents) {
            return [];
        }

        if ($contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return [];
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            // Numerically keyed elements are merged into a single content string.
            if (is_numeric(key($elements))) {
                $merged = '';

                foreach ($elements as $element) {
                    $source = $element instanceof ElementXRef ? $element->getObject() : $element;
                    $merged .= $source->getContent();
                }

                $contents = new PDFObject($this->document, new Header([], $this->document), $merged);
            }
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content, one part per line.
            $merged = '';

            foreach ($contents->getContent() as $part) {
                $merged .= $part->getContent()."\n";
            }

            $contents = new PDFObject($this->document, new Header([], $this->document), $merged);
        }

        return $contents->getTextArray($this);
    }
284
285
    /**
     * Gets all the text data of the page with its internal representation.
     *
     * When the "Contents" entry yields an array of sections, their contents are
     * concatenated and parsed with this page's getSectionsText() /
     * getCommandsText(). Otherwise the entry itself is asked to parse its
     * content, and an empty "BT" command is emitted in front of the commands of
     * every text section.
     *
     * @return array An array with the data and the internal representation;
     *               empty when the page has no usable "Contents" entry
     */
    public function extractRawData()
    {
        $content = $this->get('Contents');

        // Same guard as getText()/getTextArray(): a missing or null entry has
        // nothing to parse and does not offer getSectionsText().
        if ($content instanceof ElementMissing || $content instanceof ElementNull) {
            return [];
        }

        $extractedData = [];
        $values = $content->getContent();

        if (\is_array($values)) {
            // Several content sections: merge them before parsing.
            $text = '';
            foreach ($values as $section) {
                $text .= $section->getContent();
            }

            foreach ($this->getSectionsText($text) as $sectionText) {
                foreach ($this->getCommandsText($sectionText) as $command) {
                    $extractedData[] = $command;
                }
            }

            return $extractedData;
        }

        foreach ($content->getSectionsText($values) as $sectionText) {
            // Mark the beginning of each text section.
            $extractedData[] = ['t' => '', 'o' => 'BT', 'c' => ''];

            foreach ($content->getCommandsText($sectionText) as $command) {
                $extractedData[] = $command;
            }
        }

        return $extractedData;
    }
324
325
    /**
     * Gets all the decoded text data with its internal representation from a page.
     *
     * Walks the commands in order: "Tf"/"TF" commands select the current font
     * (looked up with getFont() from the first space-separated token of the
     * operand) and "Tj"/"TJ" operands are decoded with that font. All other
     * commands are returned untouched.
     *
     * @param array $extractedRawData the extracted data returned by extractRawData or
     *                                null if extractRawData should be called
     *
     * @return array An array with the data and the internal representation
     */
    public function extractDecodedRawData($extractedRawData = null)
    {
        if (!$extractedRawData) {
            $extractedRawData = $this->extractRawData();
        }

        $unicode = true;
        $currentFont = null;

        foreach ($extractedRawData as &$command) {
            $operator = $command['o'];

            if ('Tj' == $operator || 'TJ' == $operator) {
                if (!\is_array($command['c'])) {
                    $command['c'] = $this->decodeRawText($command['c'], $currentFont, $unicode);
                    continue;
                }

                // Only the even positions of an array operand are decoded.
                // NOTE(review): presumably the odd positions hold the numeric
                // adjustments of a TJ array - confirm against getCommandsText().
                $numText = \count($command['c']);
                for ($i = 0; $i < $numText; $i += 2) {
                    $command['c'][$i]['c'] = $this->decodeRawText($command['c'][$i]['c'], $currentFont, $unicode);
                }
            } elseif ('Tf' == $operator || 'TF' == $operator) {
                $currentFont = $this->getFont(explode(' ', $command['c'])[0]);
            }
        }
        // Break the reference so later writes to $command cannot alter the last entry.
        unset($command);

        return $extractedRawData;
    }

    /**
     * Decodes one raw string operand of a text-showing command.
     *
     * Steps: octal decoding by the font, unescaping of backslash sequences,
     * utf8_encode(), then content decoding by the font. Without a font the
     * operand is replaced by an empty string before unescaping.
     *
     * @param string    $raw     the raw operand
     * @param Font|null $font    the currently selected font, if any
     * @param bool      $unicode flag handed to Font::decodeContent(); taken by
     *                           reference so one variable is shared by all calls
     *
     * @return string
     */
    private function decodeRawText($raw, $font, &$unicode)
    {
        $text = isset($font) ? $font->decodeOctal($raw) : '';

        $text = str_replace(
            ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '],
            ['\\', '(', ')', "\n", "\r", "\t", ' '],
            $text
        );
        $text = utf8_encode($text);

        if (isset($font)) {
            $text = $font->decodeContent($text, $unicode);
        }

        return $text;
    }
393
394
    /**
     * Gets just the text commands that are involved in text positions and
     * the Text Matrix (Tm).
     *
     * It keeps only the PDF commands that are involved with text positions and
     * the Text Matrix (Tm). These are: BT, ET, TL, Td, TD, Tm, T*, Tj, ', ", and TJ.
     * The kept commands are returned unchanged and in their original order.
     *
     * @param array $extractedDecodedRawData The data extracted by extractDecodedRawData.
     *                                       If it is null, the method extractDecodedRawData is called.
     *
     * @return array An array with the text commands of the page
     */
    public function getDataCommands($extractedDecodedRawData = null)
    {
        if (!$extractedDecodedRawData) {
            $extractedDecodedRawData = $this->extractDecodedRawData();
        }

        $textOperators = [
            // BT: begin a text object, initializing Tm and Tlm to the identity matrix.
            'BT',
            // ET: end a text object, discarding the text matrix.
            'ET',
            // leading TL: set the text leading, Tl, used by the T*, ' and " operators
            // (initial value: 0).
            'TL',
            // tx ty Td: move to the start of the next line, offset from the start
            // of the current line by (tx, ty).
            'Td',
            // tx ty TD: like Td, but also sets the leading; same effect as
            // "-ty TL" followed by "tx ty Td".
            'TD',
            // a b c d e f Tm: set the text matrix, Tm, and the text line matrix, Tlm;
            // the initial value of both is the identity matrix [1 0 0 1 0 0].
            'Tm',
            // T*: move to the start of the next line using the current leading Tl.
            'T*',
            // string Tj: show a text string.
            'Tj',
            // string ': move to the next line and show a text string; same effect
            // as "T*" followed by "string Tj".
            "'",
            // aw ac string ": move to the next line and show a text string, using aw
            // as the word spacing and ac as the character spacing; same effect as
            // "aw Tw", "ac Tc", "string '".
            '"',
            // array TJ: show one or more text strings, allowing individual glyph
            // positioning. Each element is a string (shown) or a number (text
            // position adjustment, subtracted from the current horizontal or
            // vertical coordinate depending on the writing mode).
            'TJ',
        ];

        $extractedData = [];

        foreach ($extractedDecodedRawData as $command) {
            // Loose comparison on purpose: it mirrors the matching rules of a switch.
            if (\in_array($command['o'], $textOperators)) {
                $extractedData[] = $command;
            }
        }

        return $extractedData;
    }
537
538
    /**
     * Gets the Text Matrix of the text in the page.
     *
     * Returns an array where every item is an array whose first item is the
     * Text Matrix (Tm) and whose second item is a string with the text data.
     * The Text Matrix is an array of 6 numbers. The last 2 numbers are the
     * coordinates X and Y of the text. The first 4 numbers have to do with
     * scaling, rotation and skew of the text.
     *
     * The text of the n-th emitted item is the n-th entry of getTextArray().
     * The result is also stored on the instance for getTextXY().
     *
     * @param array $dataCommands the data extracted by getDataCommands
     *                            if null getDataCommands is called
     *
     * @return array an array with the data of the page including the Tm information
     *               of any text in the page
     */
    public function getDataTm($dataCommands = null)
    {
        if (!$dataCommands) {
            $dataCommands = $this->getDataCommands();
        }

        // At the beginning of a text object Tm is the identity matrix.
        $defaultTm = ['1', '0', '0', '1', '0', '0'];

        // Text leading used by the T*, ' and " operators.
        $defaultTl = 0;

        // Positions of the X and Y coordinates inside the matrix (Tm).
        $x = 4;
        $y = 5;
        $Tx = 0;
        $Ty = 0;

        $Tm = $defaultTm;
        $Tl = $defaultTl;

        $extractedTexts = $this->getTextArray();
        $extractedData = [];

        foreach ($dataCommands as $command) {
            // Text that belongs to the next item to emit. Positioning commands
            // may follow the last text, so the index can run past the end.
            $textIndex = \count($extractedData);
            $currentText = isset($extractedTexts[$textIndex]) ? $extractedTexts[$textIndex] : null;

            switch ($command['o']) {
                /*
                 * BT
                 * Begin a text object, initializing the Tm and Tlm to identity matrix
                 */
                case 'BT':
                    $Tm = $defaultTm;
                    $Tl = $defaultTl; //review this.
                    $Tx = 0;
                    $Ty = 0;
                    break;

                /*
                 * ET
                 * End a text object, discarding the text matrix
                 */
                case 'ET':
                    $Tm = $defaultTm;
                    $Tl = $defaultTl;  //review this
                    $Tx = 0;
                    $Ty = 0;
                    break;

                /*
                 * leading TL
                 * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
                 * Initial value: 0
                 */
                case 'TL':
                    $Tl = (float) $command['c'];
                    break;

                /*
                 * tx ty Td
                 * Move to the start of the next line, offset from the start of the
                 * current line by tx, ty.
                 */
                case 'Td':
                    $coord = explode(' ', $command['c']);
                    $Tx += (float) $coord[0];
                    $Ty += (float) $coord[1];
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * tx ty TD
                 * Move to the start of the next line, offset from the start of the
                 * current line by tx, ty. As a side effect, this operator sets the
                 * leading parameter in the text state. This operator has the same
                 * effect as the code:
                 * -ty TL
                 * tx ty Td
                 */
                case 'TD':
                    $coord = explode(' ', $command['c']);
                    // "-ty TL": the leading is the negated vertical offset.
                    $Tl = -((float) $coord[1]);
                    // "tx ty Td": the offset is added, exactly as in the Td case.
                    $Tx += (float) $coord[0];
                    $Ty += (float) $coord[1];
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * a b c d e f Tm
                 * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
                 * all numbers, and the initial value for Tm and Tlm is the identity matrix
                 * [1 0 0 1 0 0]
                 */
                case 'Tm':
                    $Tm = explode(' ', $command['c']);
                    $Tx = (float) $Tm[$x];
                    $Ty = (float) $Tm[$y];
                    break;

                /*
                 * T*
                 * Move to the start of the next line. This operator has the same effect
                 * as the code:
                 * 0 -Tl Td
                 * Where Tl is the current leading parameter in the text state.
                 */
                case 'T*':
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * string Tj
                 * Show a Text String
                 */
                case 'Tj':
                    $extractedData[] = [$Tm, $currentText];
                    break;

                /*
                 * string '
                 * Move to the next line and show a text string. This operator has the
                 * same effect as the code:
                 * T*
                 * string Tj
                 */
                case "'":
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    $extractedData[] = [$Tm, $currentText];
                    break;

                /*
                 * aw ac string "
                 * Move to the next line and show a text string, using aw as the word
                 * spacing and ac as the character spacing. This operator has the same
                 * effect as the code:
                 * aw Tw
                 * ac Tc
                 * string '
                 * Tw sets the word spacing, Tw, to wordSpace.
                 * Tc sets the character spacing, Tc, to charsSpace.
                 */
                case '"':
                    $data = explode(' ', (string) $currentText);
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    // NOTE(review): only the third space-separated token is kept,
                    // as before - still to be verified.
                    $extractedData[] = [$Tm, isset($data[2]) ? $data[2] : null];
                    break;

                /*
                 * array TJ
                 * Show one or more text strings, allowing individual glyph positioning.
                 * Each element of the array can be a string or a number. If the element is
                 * a string, this operator shows the string. If it is a number, the
                 * operator adjusts the text position by that amount; that is, it translates
                 * the text matrix, Tm. This amount is subtracted from the current
                 * horizontal or vertical coordinate, depending on the writing mode.
                 * In the default coordinate system, a positive adjustment has the effect
                 * of moving the next glyph painted either to the left or down by the given
                 * amount.
                 */
                case 'TJ':
                    $extractedData[] = [$Tm, $currentText];
                    break;
                default:
            }
        }
        $this->dataTm = $extractedData;

        return $extractedData;
    }
731
732
    /**
     * Gets the text items located around the given coordinates (X,Y).
     *
     * An item is returned when its Tm coordinates lie within the given
     * tolerances of the requested coordinates. The data comes from getDataTm(),
     * which is called first when no result has been stored on the instance yet;
     * its output can also be used to find out the coordinates of a given text.
     *
     * @param float $x      The X value of the coordinate to search for. If null,
     *                      just the Y value is considered (same row)
     * @param float $y      The Y value of the coordinate to search for. If null,
     *                      just the X value is considered (same column)
     * @param float $xError The value less or more to consider an X to be "near"
     * @param float $yError The value less or more to consider a Y to be "near"
     *
     * @return array An array of [Tm, text] items that are near the given
     *               coordinates. If no text is "near" the x,y coordinate, or if
     *               both x and y are null, an empty array is returned.
     */
    public function getTextXY($x = null, $y = null, $xError = 0, $yError = 0)
    {
        if (empty($this->dataTm)) {
            $this->getDataTm();
        }

        if (null === $x && null === $y) {
            return [];
        }

        $x = null === $x ? null : (float) $x;
        $y = null === $y ? null : (float) $y;
        $xError = (float) $xError;
        $yError = (float) $yError;

        $matches = [];

        foreach ($this->dataTm as $item) {
            $tm = $item[0];
            $text = $item[1];
            $xTm = (float) $tm[4];
            $yTm = (float) $tm[5];

            // A null coordinate places no constraint on its axis.
            $xMatches = null === $x || ($xTm >= $x - $xError && $xTm <= $x + $xError);
            $yMatches = null === $y || ($yTm >= $y - $yError && $yTm <= $y + $yError);

            if ($xMatches && $yMatches) {
                $matches[] = [$tm, $text];
            }
        }

        return $matches;
    }
802
}
803