Passed
Pull Request — master (#341)
by
unknown
01:42
created

Page::getText()   B

Complexity

Conditions 10
Paths 7

Size

Total Lines 40
Code Lines 24

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 15.2734

Importance

Changes 5
Bugs 0 Features 0
Metric Value
eloc 24
dl 0
loc 40
rs 7.6666
c 5
b 0
f 0
ccs 15
cts 24
cp 0.625
cc 10
nc 7
nop 1
crap 15.2734

How to fix   Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Element\ElementArray;
34
use Smalot\PdfParser\Element\ElementMissing;
35
use Smalot\PdfParser\Element\ElementNull;
36
use Smalot\PdfParser\Element\ElementXRef;
37
38
class Page extends PDFObject
39
{
40
    /**
41
     * @var Font[]
42
     */
43
    protected $fonts = null;
44
45
    /**
46
     * @var PDFObject[]
47
     */
48
    protected $xobjects = null;
49
50
    /**
51
     * @var array
52
     */
53
    protected $dataTm = null;
54
55
    /**
56
     * @return Font[]
57
     */
58 11
    public function getFonts()
59
    {
60 11
        if (null !== $this->fonts) {
61 9
            return $this->fonts;
62
        }
63
64 11
        $resources = $this->get('Resources');
65
66 11
        if (method_exists($resources, 'has') && $resources->has('Font')) {
67 11
            if ($resources->get('Font') instanceof ElementMissing) {
0 ignored issues
show
Bug introduced by
The method get() does not exist on Smalot\PdfParser\Element. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

67
            if ($resources->/** @scrutinizer ignore-call */ get('Font') instanceof ElementMissing) {

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
68 1
                return [];
69
            }
70
71 10
            if ($resources->get('Font') instanceof Header) {
72 5
                $fonts = $resources->get('Font')->getElements();
73
            } else {
74 7
                $fonts = $resources->get('Font')->getHeader()->getElements();
75
            }
76
77 10
            $table = [];
78
79 10
            foreach ($fonts as $id => $font) {
80 10
                if ($font instanceof Font) {
81 10
                    $table[$id] = $font;
82
83
                    // Store too on cleaned id value (only numeric)
84 10
                    $id = preg_replace('/[^0-9\.\-_]/', '', $id);
85 10
                    if ('' != $id) {
86 10
                        $table[$id] = $font;
87
                    }
88
                }
89
            }
90
91 10
            return $this->fonts = $table;
92
        }
93
94 1
        return [];
95
    }
96
97
    /**
98
     * @param string $id
99
     *
100
     * @return Font|null
101
     */
102 9
    public function getFont($id)
103
    {
104 9
        $fonts = $this->getFonts();
105
106 9
        if (isset($fonts[$id])) {
107 9
            return $fonts[$id];
108
        }
109
110 2
        $id = preg_replace('/[^0-9\.\-_]/', '', $id);
111
112 2
        if (isset($fonts[$id])) {
113 1
            return $fonts[$id];
114
        }
115
116 1
        return null;
117
    }
118
119
    /**
120
     * Support for XObject
121
     *
122
     * @return PDFObject[]
123
     */
124 2
    public function getXObjects()
125
    {
126 2
        if (null !== $this->xobjects) {
127 2
            return $this->xobjects;
128
        }
129
130 2
        $resources = $this->get('Resources');
131
132 2
        if (method_exists($resources, 'has') && $resources->has('XObject')) {
133 2
            if ($resources->get('XObject') instanceof Header) {
134 2
                $xobjects = $resources->get('XObject')->getElements();
135
            } else {
136
                $xobjects = $resources->get('XObject')->getHeader()->getElements();
137
            }
138
139 2
            $table = [];
140
141 2
            foreach ($xobjects as $id => $xobject) {
142 2
                $table[$id] = $xobject;
143
144
                // Store too on cleaned id value (only numeric)
145 2
                $id = preg_replace('/[^0-9\.\-_]/', '', $id);
146 2
                if ('' != $id) {
147 2
                    $table[$id] = $xobject;
148
                }
149
            }
150
151 2
            return $this->xobjects = $table;
152
        }
153
154
        return [];
155
    }
156
157
    /**
158
     * @param string $id
159
     *
160
     * @return PDFObject|null
161
     */
162 2
    public function getXObject($id)
163
    {
164 2
        $xobjects = $this->getXObjects();
165
166 2
        if (isset($xobjects[$id])) {
167 2
            return $xobjects[$id];
168
        }
169
170
        return null;
171
        /*$id = preg_replace('/[^0-9\.\-_]/', '', $id);
172
173
        if (isset($xobjects[$id])) {
174
            return $xobjects[$id];
175
        } else {
176
            return null;
177
        }*/
178
    }
179
180
    /**
181
     * @param Page $page
182
     *
183
     * @return string
184
     */
185 4
    public function getText(self $page = null)
186
    {
187 4
        if ($contents = $this->get('Contents')) {
188 4
            if ($contents instanceof ElementMissing) {
189
                return '';
190 4
            } elseif ($contents instanceof ElementNull) {
191
                return '';
192 4
            } elseif ($contents instanceof PDFObject) {
0 ignored issues
show
introduced by
$contents is never a sub-type of Smalot\PdfParser\PDFObject.
Loading history...
193 3
                $elements = $contents->getHeader()->getElements();
194
195 3
                if (is_numeric(key($elements))) {
196
                    $new_content = '';
197
198
                    foreach ($elements as $element) {
199
                        if ($element instanceof ElementXRef) {
200
                            $new_content .= $element->getObject()->getContent();
201
                        } else {
202
                            $new_content .= $element->getContent();
203
                        }
204
                    }
205
206
                    $header = new Header([], $this->document);
207 3
                    $contents = new PDFObject($this->document, $header, $new_content);
208
                }
209 2
            } elseif ($contents instanceof ElementArray) {
210
                // Create a virtual global content.
211 2
                $new_content = '';
212
213 2
                foreach ($contents->getContent() as $content) {
214 2
                    $new_content .= $content->getContent()."\n";
215
                }
216
217 2
                $header = new Header([], $this->document);
218 2
                $contents = new PDFObject($this->document, $header, $new_content);
219
            }
220
221 4
            return $contents->getText($this);
0 ignored issues
show
Bug introduced by
The method getText() does not exist on Smalot\PdfParser\Element. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

221
            return $contents->/** @scrutinizer ignore-call */ getText($this);

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
222
        }
223
224
        return '';
225
    }
226
227
    /**
228
     * @param Page $page
229
     *
230
     * @return array
231
     */
232
    public function getTextArray(self $page = null)
233
    {
234
        if ($contents = $this->get('Contents')) {
235
            if ($contents instanceof ElementMissing) {
236
                return [];
237
            } elseif ($contents instanceof ElementNull) {
238
                return [];
239
            } elseif ($contents instanceof PDFObject) {
0 ignored issues
show
introduced by
$contents is never a sub-type of Smalot\PdfParser\PDFObject.
Loading history...
240
                $elements = $contents->getHeader()->getElements();
241
242
                if (is_numeric(key($elements))) {
243
                    $new_content = '';
244
245
                    /** @var PDFObject $element */
246
                    foreach ($elements as $element) {
247
                        if ($element instanceof ElementXRef) {
248
                            $new_content .= $element->getObject()->getContent();
249
                        } else {
250
                            $new_content .= $element->getContent();
251
                        }
252
                    }
253
254
                    $header = new Header([], $this->document);
255
                    $contents = new PDFObject($this->document, $header, $new_content);
256
                }
257
            } elseif ($contents instanceof ElementArray) {
258
                // Create a virtual global content.
259
                $new_content = '';
260
261
                /** @var PDFObject $content */
262
                foreach ($contents->getContent() as $content) {
263
                    $new_content .= $content->getContent()."\n";
264
                }
265
266
                $header = new Header([], $this->document);
267
                $contents = new PDFObject($this->document, $header, $new_content);
268
            }
269
270
            return $contents->getTextArray($this);
0 ignored issues
show
Bug introduced by
The method getTextArray() does not exist on Smalot\PdfParser\Element. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

270
            return $contents->/** @scrutinizer ignore-call */ getTextArray($this);

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
271
        }
272
273
        return [];
274
    }
275
276
    /**
277
     * Gets all the text data with its internal representation of the page.
278
     *
279
     * @return array An array with the data and the internal representation
280
     */
281 5
    public function extractRawData()
282
    {
283
        /*
284
         * Now you can get the complete content of the object with the text on it
285
         */
286 5
        $extractedData = [];
287 5
        $content = $this->get('Contents');
288 5
        $values = $content->getContent();
289 5
        if (isset($values) and \is_array($values)) {
290
            $text = '';
291
            foreach ($values as $section) {
292
                $text .= $section->getContent();
293
            }
294
            $sectionsText = $this->getSectionsText($text);
295
            foreach ($sectionsText as $sectionText) {
296
                $commandsText = $this->getCommandsText($sectionText);
297
                foreach ($commandsText as $command) {
298
                    $extractedData[] = $command;
299
                }
300
            }
301
        } else {
302 5
            $sectionsText = $content->getSectionsText($content->getContent());
0 ignored issues
show
Bug introduced by
The method getSectionsText() does not exist on Smalot\PdfParser\Element. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

302
            /** @scrutinizer ignore-call */ 
303
            $sectionsText = $content->getSectionsText($content->getContent());

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
303 5
            foreach ($sectionsText as $sectionText) {
304 5
                $extractedData[] = ['t' => '', 'o' => 'BT', 'c' => ''];
305
306 5
                $commandsText = $content->getCommandsText($sectionText);
0 ignored issues
show
Bug introduced by
The method getCommandsText() does not exist on Smalot\PdfParser\Element. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

306
                /** @scrutinizer ignore-call */ 
307
                $commandsText = $content->getCommandsText($sectionText);

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
307 5
                foreach ($commandsText as $command) {
308 5
                    $extractedData[] = $command;
309
                }
310
            }
311
        }
312
313 5
        return $extractedData;
314
    }
315
316
    /**
317
     * Gets all the decoded text data with its internal representation from a page.
318
     *
319
     * @param array $extractedRawData the extracted data returned by extractRawData or
320
     *                                null if extractRawData should be called
321
     *
322
     * @return array An array with the data and the internal representation
323
     */
324 4
    public function extractDecodedRawData($extractedRawData = null)
325
    {
326 4
        if (!isset($extractedRawData) or !$extractedRawData) {
327 4
            $extractedRawData = $this->extractRawData();
328
        }
329 4
        $unicode = true;
330 4
        $currentFont = null;
331 4
        foreach ($extractedRawData as &$command) {
332 4
            if ('Tj' == $command['o'] or 'TJ' == $command['o']) {
333 4
                $data = $command['c'];
334 4
                if (!\is_array($data)) {
335 4
                    $tmpText = '';
336 4
                    if (isset($currentFont)) {
337 4
                        $tmpText = $currentFont->decodeOctal($data);
338
                        //$tmpText = $currentFont->decodeHexadecimal($tmpText, false);
339
                    }
340 4
                    $tmpText = str_replace(
341 4
                            ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '],
342 4
                            ['\\', '(', ')', "\n", "\r", "\t", ' '],
343
                            $tmpText
344
                    );
345 4
                    $tmpText = utf8_encode($tmpText);
346 4
                    if (isset($currentFont)) {
347 4
                        $tmpText = $currentFont->decodeContent($tmpText, $unicode);
348
                    }
349 4
                    $command['c'] = $tmpText;
350 4
                    continue;
351
                }
352 4
                $numText = \count($data);
353 4
                for ($i = 0; $i < $numText; ++$i) {
354 4
                    if (0 != ($i % 2)) {
355 4
                        continue;
356
                    }
357 4
                    $tmpText = $data[$i]['c'];
358 4
                    $decodedText = '';
359 4
                    if (isset($currentFont)) {
360 4
                        $decodedText = $currentFont->decodeOctal($tmpText);
361
                        //$tmpText = $currentFont->decodeHexadecimal($tmpText, false);
362
                    }
363 4
                    $decodedText = str_replace(
364 4
                            ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '],
365 4
                            ['\\', '(', ')', "\n", "\r", "\t", ' '],
366
                            $decodedText
367
                    );
368 4
                    $decodedText = utf8_encode($decodedText);
369 4
                    if (isset($currentFont)) {
370 4
                        $decodedText = $currentFont->decodeContent($decodedText, $unicode);
371
                    }
372 4
                    $command['c'][$i]['c'] = $decodedText;
373 4
                    continue;
374
                }
375 4
            } elseif ('Tf' == $command['o'] or 'TF' == $command['o']) {
376 4
                $fontId = explode(' ', $command['c'])[0];
377 4
                $currentFont = $this->getFont($fontId);
378 4
                continue;
379
            }
380
        }
381
382 4
        return $extractedRawData;
383
    }
384
385
    /**
386
     * Gets just the Text commands that are involved in text positions and
387
     * Text Matrix (Tm)
388
     *
389
     * It extracts just the PDF commands that are involved with text positions, and
390
     * the Text Matrix (Tm). These are: BT, ET, TL, Td, TD, Tm, T*, Tj, ', ", and TJ
391
     *
392
     * @param array $extractedDecodedRawData The data extracted by extractDecodedRawData.
393
     *                                       If it is null, the method extractDecodedRawData is called.
394
     *
395
     * @return array An array with the text command of the page
396
     */
397 3
    public function getDataCommands($extractedDecodedRawData = null)
398
    {
399 3
        if (!isset($extractedDecodedRawData) or !$extractedDecodedRawData) {
400 3
            $extractedDecodedRawData = $this->extractDecodedRawData();
401
        }
402 3
        $extractedData = [];
403 3
        foreach ($extractedDecodedRawData as $command) {
404 3
            switch ($command['o']) {
405
                /*
406
                 * BT
407
                 * Begin a text object, initializing the Tm and Tlm to the identity matrix
408
                 */
409 3
                case 'BT':
410 3
                    $extractedData[] = $command;
411 3
                    break;
412
413
                /*
414
                 * ET
415
                 * End a text object, discarding the text matrix
416
                 */
417 3
                case 'ET':
418
                    $extractedData[] = $command;
419
                    break;
420
421
                /*
422
                 * leading TL
423
                 * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
424
                 * Initial value: 0
425
                 */
426 3
                case 'TL':
427 3
                    $extractedData[] = $command;
428 3
                    break;
429
430
                /*
431
                 * tx ty Td
432
                 * Move to the start of the next line, offset from the start of the
433
                 * current line by tx, ty.
434
                 */
435 3
                case 'Td':
436 3
                    $extractedData[] = $command;
437 3
                    break;
438
439
                /*
440
                 * tx ty TD
441
                 * Move to the start of the next line, offset from the start of the
442
                 * current line by tx, ty. As a side effect, this operator sets the leading
443
                 * parameter in the text state. This operator has the same effect as the
444
                 * code:
445
                 * -ty TL
446
                 * tx ty Td
447
                 */
448 3
                case 'TD':
449
                    $extractedData[] = $command;
450
                    break;
451
452
                /*
453
                 * a b c d e f Tm
454
                 * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
455
                 * all numbers, and the initial value for Tm and Tlm is the identity matrix
456
                 * [1 0 0 1 0 0]
457
                 */
458 3
                case 'Tm':
459 3
                    $extractedData[] = $command;
460 3
                    break;
461
462
                /*
463
                 * T*
464
                 * Move to the start of the next line. This operator has the same effect
465
                 * as the code:
466
                 * 0 Tl Td
467
                 * Where Tl is the current leading parameter in the text state.
468
                 */
469 3
                case 'T*':
470 3
                    $extractedData[] = $command;
471 3
                    break;
472
473
                /*
474
                 * string Tj
475
                 * Show a Text String
476
                 */
477 3
                case 'Tj':
478 3
                    $extractedData[] = $command;
479 3
                    break;
480
481
                /*
482
                 * string '
483
                 * Move to the next line and show a text string. This operator has the
484
                 * same effect as the code:
485
                 * T*
486
                 * string Tj
487
                 */
488 3
                case "'":
489
                    $extractedData[] = $command;
490
                    break;
491
492
                /*
493
                 * aw ac string "
494
                 * Move to the next line and show a text string, using aw as the word
495
                 * spacing and ac as the character spacing. This operator has the same
496
                 * effect as the code:
497
                 * aw Tw
498
                 * ac Tc
499
                 * string '
500
                 * Tw sets the word spacing, Tw, to wordSpace.
501
                 * Tc sets the character spacing, Tc, to charSpace.
502
                 */
503 3
                case '"':
504
                    $extractedData[] = $command;
505
                    break;
506
507
                /*
508
                 * array TJ
509
                 * Show one or more text strings, allowing individual glyph positioning.
510
                 * Each element of the array can be a string or a number. If the element is
511
                 * a string, this operator shows the string. If it is a number, the
512
                 * operator adjusts the text position by that amount; that is, it translates
513
                 * the text matrix, Tm. This amount is subtracted from the current
514
                 * horizontal or vertical coordinate, depending on the writing mode.
515
                 * In the default coordinate system, a positive adjustment has the effect
516
                 * of moving the next glyph painted either to the left or down by the given
517
                 * amount.
518
                 */
519 3
                case 'TJ':
520 3
                    $extractedData[] = $command;
521 3
                    break;
522
                default:
523
            }
524
        }
525
526 3
        return $extractedData;
527
    }
528
529
    /**
530
     * Gets the Text Matrix of the text in the page
531
     *
532
     * Returns an array in which every item is an array whose first item is the
533
     * Text Matrix (Tm) and the second is a string with the text data.  The Text matrix
534
     * is an array of 6 numbers. The last 2 numbers are the coordinates X and Y of the
535
     * text. The first 4 numbers have to do with the scaling, rotation and skew of the text.
536
     *
537
     * @param array $dataCommands the data extracted by getDataCommands
538
     *                            if null getDataCommands is called
539
     *
540
     * @return array an array with the data of the page including the Tm information
541
     *               of any text in the page
542
     */
543 2
    public function getDataTm($dataCommands = null)
544
    {
545 2
        if (!isset($dataCommands) or !$dataCommands) {
546 2
            $dataCommands = $this->getDataCommands();
547
        }
548
549
        /*
550
         * At the beginning of a text object Tm is the identity matrix
551
         */
552 2
        $defaultTm = ['1', '0', '0', '1', '0', '0'];
553
554
        /*
555
         *  Set the text leading used by T*, ' and " operators
556
         */
557 2
        $defaultTl = 0;
558
559
        /*
560
         * Setting where are the X and Y coordinates in the matrix (Tm)
561
         */
562 2
        $x = 4;
563 2
        $y = 5;
564 2
        $Tx = 0;
565 2
        $Ty = 0;
566
567 2
        $Tm = $defaultTm;
568 2
        $Tl = $defaultTl;
569
570 2
        $extractedData = [];
571 2
        foreach ($dataCommands as $command) {
572 2
            switch ($command['o']) {
573
                /*
574
                 * BT
575
                 * Begin a text object, initializing the Tm and Tlm to the identity matrix
576
                 */
577 2
                case 'BT':
578 2
                    $Tm = $defaultTm;
579 2
                    $Tl = $defaultTl; //review this.
580 2
                    $Tx = 0;
581 2
                    $Ty = 0;
582 2
                    break;
583
584
                /*
585
                 * ET
586
                 * End a text object, discarding the text matrix
587
                 */
588 2
                case 'ET':
589
                    $Tm = $defaultTm;
590
                    $Tl = $defaultTl;  //review this
591
                    $Tx = 0;
592
                    $Ty = 0;
593
                    break;
594
595
                /*
596
                 * leading TL
597
                 * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
598
                 * Initial value: 0
599
                 */
600 2
                case 'TL':
601 2
                    $Tl = (float) $command['c'];
602 2
                    break;
603
604
                /*
605
                 * tx ty Td
606
                 * Move to the start of the next line, offset from the start of the
607
                 * current line by tx, ty.
608
                 */
609 2
                case 'Td':
610 2
                    $coord = explode(' ', $command['c']);
611 2
                    $Tx += (float) $coord[0];
612 2
                    $Ty += (float) $coord[1];
613 2
                    $Tm[$x] = (string) $Tx;
614 2
                    $Tm[$y] = (string) $Ty;
615 2
                    break;
616
617
                /*
618
                 * tx ty TD
619
                 * Move to the start of the next line, offset from the start of the
620
                 * current line by tx, ty. As a side effect, this operator sets the leading
621
                 * parameter in the text state. This operator has the same effect as the
622
                 * code:
623
                 * -ty TL
624
                 * tx ty Td
625
                 */
626 2
                case 'TD':
627
                    $coord = explode(' ', $command['c']);
628
                    $Tl = (float) $coord[1];
629
                    $Tx += (float) $coord[0];
630
                    $Ty -= (float) $coord[1];
631
                    $Tm[$x] = (string) $Tx;
632
                    $Tm[$y] = (string) $Ty;
633
                    break;
634
635
                /*
636
                 * a b c d e f Tm
637
                 * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
638
                 * all numbers, and the initial value for Tm and Tlm is the identity matrix
639
                 * [1 0 0 1 0 0]
640
                 */
641 2
                case 'Tm':
642 2
                    $Tm = explode(' ', $command['c']);
643 2
                    $Tx = (float) $Tm[$x];
644 2
                    $Ty = (float) $Tm[$y];
645 2
                    break;
646
647
                /*
648
                 * T*
649
                 * Move to the start of the next line. This operator has the same effect
650
                 * as the code:
651
                 * 0 Tl Td
652
                 * Where Tl is the current leading parameter in the text state.
653
                 */
654 2
                case 'T*':
655 2
                    $Ty -= $Tl;
656 2
                    $Tm[$y] = (string) $Ty;
657 2
                    break;
658
659
                /*
660
                 * string Tj
661
                 * Show a Text String
662
                 */
663 2
                case 'Tj':
664 2
                    $extractedData[] = [$Tm, $command['c']];
665 2
                    break;
666
667
                /*
668
                 * string '
669
                 * Move to the next line and show a text string. This operator has the
670
                 * same effect as the code:
671
                 * T*
672
                 * string Tj
673
                 */
674 2
                case "'":
675
                    $Ty -= $Tl;
676
                    $Tm[$y] = (string) $Ty;
677
                    $extractedData[] = [$Tm, $command['c']];
678
                    break;
679
680
                /*
681
                 * aw ac string "
682
                 * Move to the next line and show a text string, using aw as the word
683
                 * spacing and ac as the character spacing. This operator has the same
684
                 * effect as the code:
685
                 * aw Tw
686
                 * ac Tc
687
                 * string '
688
                 * Tw sets the word spacing, Tw, to wordSpace.
689
                 * Tc sets the character spacing, Tc, to charSpace.
690
                 */
691 2
                case '"':
692
                    $data = explode(' ', $command['c']);
693
                    $Ty -= $Tl;
694
                    $Tm[$y] = (string) $Ty;
695
                    $extractedData[] = [$Tm, $data[2]]; //Verify
696
                    break;
697
698
                /*
699
                 * array TJ
700
                 * Show one or more text strings, allowing individual glyph positioning.
701
                 * Each element of the array can be a string or a number. If the element is
702
                 * a string, this operator shows the string. If it is a number, the
703
                 * operator adjusts the text position by that amount; that is, it translates
704
                 * the text matrix, Tm. This amount is subtracted from the current
705
                 * horizontal or vertical coordinate, depending on the writing mode.
706
                 * In the default coordinate system, a positive adjustment has the effect
707
                 * of moving the next glyph painted either to the left or down by the given
708
                 * amount.
709
                 */
710 2
                case 'TJ':
711 2
                    $text = [];
712 2
                    $data = $command['c'];
713 2
                    $numText = \count($data);
714 2
                    for ($i = 0; $i < $numText; ++$i) {
715 2
                        if ('n' == $data[$i]['t']) {
716 2
                            continue;
717
                        }
718 2
                        $tmpText = $data[$i]['c'];
719 2
                        $text[] = $tmpText;
720
                    }
721 2
                    $tjText = ''.implode('', $text);
722 2
                    $extractedData[] = [$Tm, $tjText];
723 2
                    break;
724
                default:
725
            }
726
        }
727 2
        $this->dataTm = $extractedData;
728
729 2
        return $extractedData;
730
    }
731
732
    /**
733
     * Gets text data that are around the given coordinates (X,Y)
734
     *
735
     * If the text is near the given coordinates (X,Y) (or the TM info),
736
     * the text is returned.  The extractedData returned by getDataTm can be used to see
737
     * where the coordinates of a given text are, using the TM info for it.
738
     *
739
     * @param float $x      The X value of the coordinate to search for. If null,
740
     *                      just the Y value is considered (same row)
741
     * @param float $y      The Y value of the coordinate to search for. If null,
742
     *                      just the X value is considered (same column)
743
     * @param float $xError The value less or more to consider an X to be "near"
744
     * @param float $yError The value less or more to consider an Y to be "near"
745
     *
746
     * @return array An array of the text items that are near the given coordinates. If no
747
     *               text is "near" the x,y coordinate, an empty array is returned. If both x
748
     *               and y coordinates are null, an empty array is returned as well.
749
     */
750 1
    public function getTextXY($x = null, $y = null, $xError = 0, $yError = 0)
751
    {
752 1
        if (!isset($this->dataTm) or !$this->dataTm) {
753 1
            $this->getDataTm();
754
        }
755
756 1
        if (null !== $x) {
757 1
            $x = (float) $x;
758
        }
759
760 1
        if (null !== $y) {
761 1
            $y = (float) $y;
762
        }
763
764 1
        if (null === $x and null === $y) {
765
            return [];
766
        }
767
768 1
        $xError = (float) $xError;
769 1
        $yError = (float) $yError;
770
771 1
        $extractedData = [];
772 1
        foreach ($this->dataTm as $item) {
773 1
            $tm = $item[0];
774 1
            $xTm = (float) $tm[4];
775 1
            $yTm = (float) $tm[5];
776 1
            $text = $item[1];
777 1
            if (null === $y) {
778
                if (($xTm >= ($x - $xError)) and
779
                    ($xTm <= ($x + $xError))) {
780
                    $extractedData[] = [$tm, $text];
781
                    continue;
782
                }
783
            }
784 1
            if (null === $x) {
785
                if (($yTm >= ($y - $yError)) and
786
                    ($yTm <= ($y + $yError))) {
787
                    $extractedData[] = [$tm, $text];
788
                    continue;
789
                }
790
            }
791 1
            if (($xTm >= ($x - $xError)) and
792 1
                ($xTm <= ($x + $xError)) and
793 1
                ($yTm >= ($y - $yError)) and
794 1
                ($yTm <= ($y + $yError))) {
795 1
                $extractedData[] = [$tm, $text];
796 1
                continue;
797
            }
798
        }
799
800 1
        return $extractedData;
801
    }
802
}
803