Passed
Pull Request — master (#341)
by
unknown
01:42
created

Page::getDataCommands()   C

Complexity

Conditions 15
Paths 26

Size

Total Lines 130
Code Lines 40

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 32
CRAP Score 17.3795

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 15
eloc 40
c 1
b 0
f 0
nc 26
nop 1
dl 0
loc 130
ccs 32
cts 41
cp 0.7805
crap 17.3795
rs 5.9166

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Element\ElementArray;
34
use Smalot\PdfParser\Element\ElementMissing;
35
use Smalot\PdfParser\Element\ElementNull;
36
use Smalot\PdfParser\Element\ElementXRef;
37
38
class Page extends PDFObject
39
{
40
    /**
41
     * @var Font[]
42
     */
43
    protected $fonts = null;
44
45
    /**
46
     * @var PDFObject[]
47
     */
48
    protected $xobjects = null;
49
50
    /**
51
     * @var array
52
     */
53
    protected $dataTm = null;
54
55
    /**
56
     * @return Font[]
57
     */
58 11
    public function getFonts()
59
    {
60 11
        if (null !== $this->fonts) {
61 9
            return $this->fonts;
62
        }
63
64 11
        $resources = $this->get('Resources');
65
66 11
        if (method_exists($resources, 'has') && $resources->has('Font')) {
67 11
            if ($resources->get('Font') instanceof ElementMissing) {
0 ignored issues
show
Bug introduced by
The method get() does not exist on Smalot\PdfParser\Element. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

67
            if ($resources->/** @scrutinizer ignore-call */ get('Font') instanceof ElementMissing) {

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
68 1
                return [];
69
            }
70
71 10
            if ($resources->get('Font') instanceof Header) {
72 5
                $fonts = $resources->get('Font')->getElements();
73
            } else {
74 7
                $fonts = $resources->get('Font')->getHeader()->getElements();
75
            }
76
77 10
            $table = [];
78
79 10
            foreach ($fonts as $id => $font) {
80 10
                if ($font instanceof Font) {
81 10
                    $table[$id] = $font;
82
83
                    // Store too on cleaned id value (only numeric)
84 10
                    $id = preg_replace('/[^0-9\.\-_]/', '', $id);
85 10
                    if ('' != $id) {
86 10
                        $table[$id] = $font;
87
                    }
88
                }
89
            }
90
91 10
            return $this->fonts = $table;
92
        }
93
94 1
        return [];
95
    }
96
97
    /**
     * Looks up one font of this page by its resource identifier.
     *
     * The identifier is tried exactly as given first. If that key is unknown,
     * every character other than digits, dots, dashes and underscores is
     * stripped from it and the cleaned key is tried instead.
     *
     * @param string $id
     *
     * @return Font|null the matching font, or null when neither key is known
     */
    public function getFont($id)
    {
        $available = $this->getFonts();

        // Only fall back to the cleaned identifier when the exact one is unknown.
        $key = isset($available[$id]) ? $id : preg_replace('/[^0-9\.\-_]/', '', $id);

        return isset($available[$key]) ? $available[$key] : null;
    }
118
119
    /**
     * Support for XObject
     *
     * Returns the XObjects listed under the "XObject" key of this page's
     * "Resources" entry. Each object is stored under its original identifier
     * and, when the identifier still contains digits, dots, dashes or
     * underscores after cleaning, under that cleaned identifier as well.
     * A successfully built table is cached in $this->xobjects; the empty
     * results returned by the early exits are not cached.
     *
     * @return PDFObject[]
     */
    public function getXObjects()
    {
        if (null !== $this->xobjects) {
            return $this->xobjects;
        }

        $resources = $this->get('Resources');

        if (!method_exists($resources, 'has') || !$resources->has('XObject')) {
            return [];
        }

        $entry = $resources->get('XObject');

        // Same guard getFonts() applies to its 'Font' entry. Presumably an
        // ElementMissing cannot provide a header, so bail out before calling
        // getHeader() on it - TODO confirm against ElementMissing.
        if ($entry instanceof ElementMissing) {
            return [];
        }

        $xobjects = $entry instanceof Header
            ? $entry->getElements()
            : $entry->getHeader()->getElements();

        $table = [];

        foreach ($xobjects as $id => $xobject) {
            $table[$id] = $xobject;

            // Store too on cleaned id value (only numeric)
            $id = preg_replace('/[^0-9\.\-_]/', '', $id);
            if ('' != $id) {
                $table[$id] = $xobject;
            }
        }

        return $this->xobjects = $table;
    }
156
157
    /**
     * Looks up one XObject of this page by its identifier.
     *
     * Only the exact identifier is tried; unlike getFont(), no second lookup
     * with a cleaned identifier is performed.
     *
     * @param string $id
     *
     * @return PDFObject|null the matching object, or null when the identifier is unknown
     */
    public function getXObject($id)
    {
        $known = $this->getXObjects();

        return isset($known[$id]) ? $known[$id] : null;
    }
179
180
    /**
181
     * @param Page $page
182
     *
183
     * @return string
184
     */
185 4
    public function getText(self $page = null)
186
    {
187 4
        if ($contents = $this->get('Contents')) {
188 4
            if ($contents instanceof ElementMissing) {
189
                return '';
190 4
            } elseif ($contents instanceof ElementNull) {
191
                return '';
192 4
            } elseif ($contents instanceof PDFObject) {
0 ignored issues
show
introduced by
$contents is never a sub-type of Smalot\PdfParser\PDFObject.
Loading history...
193 3
                $elements = $contents->getHeader()->getElements();
194
195 3
                if (is_numeric(key($elements))) {
196
                    $new_content = '';
197
198
                    foreach ($elements as $element) {
199
                        if ($element instanceof ElementXRef) {
200
                            $new_content .= $element->getObject()->getContent();
201
                        } else {
202
                            $new_content .= $element->getContent();
203
                        }
204
                    }
205
206
                    $header = new Header([], $this->document);
207 3
                    $contents = new PDFObject($this->document, $header, $new_content);
208
                }
209 2
            } elseif ($contents instanceof ElementArray) {
210
                // Create a virtual global content.
211 2
                $new_content = '';
212
213 2
                foreach ($contents->getContent() as $content) {
214 2
                    $new_content .= $content->getContent()."\n";
215
                }
216
217 2
                $header = new Header([], $this->document);
218 2
                $contents = new PDFObject($this->document, $header, $new_content);
219
            }
220
221 4
            return $contents->getText($this);
0 ignored issues
show
Bug introduced by
The method getText() does not exist on Smalot\PdfParser\Element. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

221
            return $contents->/** @scrutinizer ignore-call */ getText($this);

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
222
        }
223
224
        return '';
225
    }
226
227
    /**
228
     * @param Page $page
229
     *
230
     * @return array
231
     */
232
    public function getTextArray(self $page = null)
233
    {
234
        if ($contents = $this->get('Contents')) {
235
            if ($contents instanceof ElementMissing) {
236
                return [];
237
            } elseif ($contents instanceof ElementNull) {
238
                return [];
239
            } elseif ($contents instanceof PDFObject) {
0 ignored issues
show
introduced by
$contents is never a sub-type of Smalot\PdfParser\PDFObject.
Loading history...
240
                $elements = $contents->getHeader()->getElements();
241
242
                if (is_numeric(key($elements))) {
243
                    $new_content = '';
244
245
                    /** @var PDFObject $element */
246
                    foreach ($elements as $element) {
247
                        if ($element instanceof ElementXRef) {
248
                            $new_content .= $element->getObject()->getContent();
249
                        } else {
250
                            $new_content .= $element->getContent();
251
                        }
252
                    }
253
254
                    $header = new Header([], $this->document);
255
                    $contents = new PDFObject($this->document, $header, $new_content);
256
                }
257
            } elseif ($contents instanceof ElementArray) {
258
                // Create a virtual global content.
259
                $new_content = '';
260
261
                /** @var PDFObject $content */
262
                foreach ($contents->getContent() as $content) {
263
                    $new_content .= $content->getContent()."\n";
264
                }
265
266
                $header = new Header([], $this->document);
267
                $contents = new PDFObject($this->document, $header, $new_content);
268
            }
269
270
            return $contents->getTextArray($this);
0 ignored issues
show
Bug introduced by
The method getTextArray() does not exist on Smalot\PdfParser\Element. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

270
            return $contents->/** @scrutinizer ignore-call */ getTextArray($this);

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
271
        }
272
273
        return [];
274
    }
275
276
    /**
277
     * Gets all the text data with its internal representation of the page.
278
     *
279
     * @return array An array with the data and the internal representation
280
     */
281 5
    public function extractRawData()
282
    {
283
        /*
284
         * Now you can get the complete content of the object with the text on it
285
         */
286 5
        $extractedData = [];
287 5
        $content = $this->get('Contents');
288 5
        $values = $content->getContent();
289 5
        if (isset($values) and \is_array($values)) {
290
            $text = '';
291
            foreach ($values as $section) {
292
                $text .= $section->getContent();
293
            }
294
            $sectionsText = $this->getSectionsText($text);
295
            foreach ($sectionsText as $sectionText) {
296
                $commandsText = $this->getCommandsText($sectionText);
297
                foreach ($commandsText as $command) {
298
                    $extractedData[] = $command;
299
                }
300
            }
301
        } else {
302 5
            $sectionsText = $content->getSectionsText($content->getContent());
0 ignored issues
show
Bug introduced by
The method getSectionsText() does not exist on Smalot\PdfParser\Element. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

302
            /** @scrutinizer ignore-call */ 
303
            $sectionsText = $content->getSectionsText($content->getContent());

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
303 5
            foreach ($sectionsText as $sectionText) {
304 5
                $extractedData[] = ['t' => '', 'o' => 'BT', 'c' => ''];
305
306 5
                $commandsText = $content->getCommandsText($sectionText);
0 ignored issues
show
Bug introduced by
The method getCommandsText() does not exist on Smalot\PdfParser\Element. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

306
                /** @scrutinizer ignore-call */ 
307
                $commandsText = $content->getCommandsText($sectionText);

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
307 5
                foreach ($commandsText as $command) {
308 5
                    $extractedData[] = $command;
309
                }
310
            }
311
        }
312
313 5
        return $extractedData;
314
    }
315
316
    /**
     * Gets all the decoded text data with its internal representation from a page.
     *
     * The commands are walked in order. A Tf/TF command selects the current
     * font (looked up through getFont() using the first space-separated token
     * of its content). The content of every Tj/TJ command is replaced by its
     * decoded text: a scalar content is decoded as a whole, an array content
     * has the 'c' value of its even-indexed entries decoded. All other
     * commands are returned untouched.
     *
     * @param array $extractedRawData the extracted data return by extractRawData or
     *                                null if extractRawData should be called
     *
     * @return array An array with the data and the internal representation
     */
    public function extractDecodedRawData($extractedRawData = null)
    {
        if (!$extractedRawData) {
            $extractedRawData = $this->extractRawData();
        }

        // Kept as one variable shared by every decodeContent() call, exactly as
        // before; the callee's signature is not visible here, so it may take
        // (and update) this argument by reference - TODO confirm.
        $unicode = true;
        $currentFont = null;

        foreach ($extractedRawData as &$command) {
            $operator = $command['o'];

            if ('Tf' == $operator or 'TF' == $operator) {
                $currentFont = $this->getFont(explode(' ', $command['c'])[0]);
                continue;
            }

            if ('Tj' != $operator and 'TJ' != $operator) {
                continue;
            }

            if (!\is_array($command['c'])) {
                $command['c'] = $this->decodeRawTextChunk($command['c'], $currentFont, $unicode);
                continue;
            }

            // Only the even-indexed entries are decoded; odd-indexed entries are skipped.
            $numText = \count($command['c']);
            for ($i = 0; $i < $numText; $i += 2) {
                $command['c'][$i]['c'] = $this->decodeRawTextChunk(
                    $command['c'][$i]['c'],
                    $currentFont,
                    $unicode
                );
            }
        }
        // Break the reference left behind by the by-reference foreach so a later
        // write to $command cannot silently modify the last array element.
        unset($command);

        return $extractedRawData;
    }

    /**
     * Decodes one raw text operand with the given font.
     *
     * Steps, in order: octal decoding by the font, replacement of the
     * backslash escape sequences, utf8_encode(), then content decoding by the
     * font. Without a font the operand is not used at all and the result is
     * an empty string (this matches the previous inline behaviour).
     *
     * @param string    $raw     the raw operand taken from a Tj/TJ command
     * @param Font|null $font    the font currently selected, if any
     * @param bool      $unicode flag handed to Font::decodeContent() as a variable
     *
     * @return string the decoded text
     */
    private function decodeRawTextChunk($raw, $font, &$unicode)
    {
        $text = '';
        if (null !== $font) {
            $text = $font->decodeOctal($raw);
        }

        $text = str_replace(
            ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '],
            ['\\', '(', ')', "\n", "\r", "\t", ' '],
            $text
        );
        $text = utf8_encode($text);

        if (null !== $font) {
            $text = $font->decodeContent($text, $unicode);
        }

        return $text;
    }
384
385
    /**
     * Gets just the Text commands that are involved in text positions and
     * Text Matrix (Tm)
     *
     * It extracts just the PDF commands that are involved with text positions, and
     * the Text Matrix (Tm). These are: BT, ET, TL, Td, TD, Tm, T*, Tj, ', ", and TJ.
     * Matching commands are returned unchanged and in their original order;
     * every other command is dropped.
     *
     * @param array $extractedDecodedRawData The data extracted by extractDecodedRawData.
     *                                       If it is null, the method extractDecodedRawData is called.
     *
     * @return array An array with the text command of the page
     */
    public function getDataCommands($extractedDecodedRawData = null)
    {
        if (!$extractedDecodedRawData) {
            $extractedDecodedRawData = $this->extractDecodedRawData();
        }

        // Operators that are kept. All of them were handled identically, so a
        // membership test replaces the former one-case-per-operator switch.
        $textOperators = [
            'BT', // begin a text object, initializing Tm and Tlm to the identity matrix
            'ET', // end a text object, discarding the text matrix
            'TL', // leading TL: set the text leading used by T*, ' and "
            'Td', // tx ty Td: move to the start of the next line, offset by tx, ty
            'TD', // tx ty TD: same as "-ty TL" followed by "tx ty Td"
            'Tm', // a b c d e f Tm: set the text matrix and the text line matrix
            'T*', // move to the start of the next line
            'Tj', // string Tj: show a text string
            "'",  // string ': same as T* followed by "string Tj"
            '"',  // aw ac string ": same as "aw Tw", "ac Tc", then "string '"
            'TJ', // array TJ: show text strings with individual glyph positioning
        ];

        $extractedData = [];
        foreach ($extractedDecodedRawData as $command) {
            // Strict comparison: only an operator that is exactly one of the
            // strings above is kept.
            if (\in_array($command['o'], $textOperators, true)) {
                $extractedData[] = $command;
            }
        }

        return $extractedData;
    }
528
529
    /**
     * Gets the Text Matrix of the text in the page
     *
     * Returns an array where every item is an array whose first item is the
     * Text Matrix (Tm) and whose second item is a string with the text data.
     * The Text Matrix is an array of 6 numbers. The last 2 numbers are the X
     * and Y coordinates of the text. The first 4 numbers describe the scaling,
     * rotation and skew of the text.
     *
     * The result is also stored in $this->dataTm.
     *
     * @param array $dataCommands the data extracted by getDataCommands
     *                            if null getDataCommands is called
     *
     * @return array an array with the data of the page including the Tm information
     *               of any text in the page
     */
    public function getDataTm($dataCommands = null)
    {
        if (!$dataCommands) {
            $dataCommands = $this->getDataCommands();
        }

        /*
         * At the beginning of a text object Tm is the identity matrix
         */
        $defaultTm = ['1', '0', '0', '1', '0', '0'];

        /*
         * Text leading used by the T*, ' and " operators
         */
        $defaultTl = 0;

        /*
         * Positions of the X and Y coordinates inside the matrix (Tm)
         */
        $x = 4;
        $y = 5;
        $Tx = 0;
        $Ty = 0;

        $Tm = $defaultTm;
        $Tl = $defaultTl;

        $extractedData = [];
        foreach ($dataCommands as $command) {
            switch ($command['o']) {
                /*
                 * BT
                 * Begin a text object, initializing the Tm and Tlm to identity matrix
                 */
                case 'BT':
                    $Tm = $defaultTm;
                    $Tl = $defaultTl; //review this.
                    $Tx = 0;
                    $Ty = 0;
                    break;

                /*
                 * ET
                 * End a text object, discarding the text matrix
                 */
                case 'ET':
                    $Tm = $defaultTm;
                    $Tl = $defaultTl;  //review this
                    $Tx = 0;
                    $Ty = 0;
                    break;

                /*
                 * leading TL
                 * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
                 * Initial value: 0
                 */
                case 'TL':
                    $Tl = (float) $command['c'];
                    break;

                /*
                 * tx ty Td
                 * Move to the start of the next line, offset from the start of the
                 * current line by tx, ty.
                 */
                case 'Td':
                    $coord = explode(' ', $command['c']);
                    $Tx += (float) $coord[0];
                    $Ty += (float) $coord[1];
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * tx ty TD
                 * Move to the start of the next line, offset from the start of the
                 * current line by tx, ty. As a side effect, this operator sets the leading
                 * parameter in the text state. This operator has the same effect as the
                 * code:
                 * -ty TL
                 * tx ty Td
                 */
                case 'TD':
                    $coord = explode(' ', $command['c']);
                    // The leading becomes the NEGATED ty, and the position moves
                    // by (tx, ty) exactly as Td does.
                    $Tl = -((float) $coord[1]);
                    $Tx += (float) $coord[0];
                    $Ty += (float) $coord[1];
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * a b c d e f Tm
                 * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
                 * all numbers, and the initial value for Tm and Tlm is the identity matrix
                 * [1 0 0 1 0 0]
                 */
                case 'Tm':
                    $Tm = explode(' ', $command['c']);
                    $Tx = (float) $Tm[$x];
                    $Ty = (float) $Tm[$y];
                    break;

                /*
                 * T*
                 * Move to the start of the next line. This operator has the same effect
                 * as the code:
                 * 0 -Tl Td
                 * Where Tl is the current leading parameter in the text state.
                 */
                case 'T*':
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * string Tj
                 * Show a Text String
                 */
                case 'Tj':
                    $extractedData[] = [$Tm, $command['c']];
                    break;

                /*
                 * string '
                 * Move to the next line and show a text string. This operator has the
                 * same effect as the code:
                 * T*
                 * string Tj
                 */
                case "'":
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    $extractedData[] = [$Tm, $command['c']];
                    break;

                /*
                 * aw ac string "
                 * Move to the next line and show a text string, using aw as the word
                 * spacing and ac as the character spacing. This operator has the same
                 * effect as the code:
                 * aw Tw
                 * ac Tc
                 * string '
                 * Tw sets the word spacing, Tw, to wordSpace.
                 * Tc sets the character spacing, Tc, to charsSpace.
                 */
                case '"':
                    // NOTE(review): only the third space-separated token is kept, so
                    // a string operand that itself contains a space is truncated.
                    // Confirm the shape of 'c' for this operator before changing it.
                    $data = explode(' ', $command['c']);
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    $extractedData[] = [$Tm, $data[2]]; //Verify
                    break;

                /*
                 * array TJ
                 * Show one or more text strings, allowing individual glyph positioning.
                 * Each element of the array can be a string or a number. If the element is
                 * a string, this operator shows the string. If it is a number, the
                 * operator adjusts the text position by that amount; that is, it translates
                 * the text matrix, Tm. This amount is subtracted from the current
                 * horizontal or vertical coordinate, depending on the writing mode.
                 * In the default coordinate system, a positive adjustment has the effect
                 * of moving the next glyph painted either to the left or down by the given
                 * amount.
                 */
                case 'TJ':
                    $text = [];
                    $data = $command['c'];
                    $numText = \count($data);
                    for ($i = 0; $i < $numText; ++$i) {
                        // Entries typed 'n' are skipped; only the 'c' value of the
                        // remaining entries contributes to the text.
                        if ('n' == $data[$i]['t']) {
                            continue;
                        }
                        $text[] = $data[$i]['c'];
                    }
                    $extractedData[] = [$Tm, implode('', $text)];
                    break;
                default:
            }
        }
        $this->dataTm = $extractedData;

        return $extractedData;
    }
731
732
    /**
     * Gets text data that are around the given coordinates (X,Y)
     *
     * Every item of $this->dataTm whose Tm position lies within the given
     * tolerance of the requested coordinates is returned. $this->dataTm is
     * filled through getDataTm() first when it is still empty. The data
     * returned by getDataTm can be used to find out the coordinates of a
     * given text, using its Tm information.
     *
     * @param float $x      The X value of the coordinate to search for. If null,
     *                      just the Y value is considered (same row)
     * @param float $y      The Y value of the coordinate to search for. If null,
     *                      just the X value is considered (same column)
     * @param float $xError The value less or more to consider an X to be "near"
     * @param float $yError The value less or more to consider an Y to be "near"
     *
     * @return array An array of [Tm, text] pairs that are near the given coordinates.
     *               If no text is "near" the x,y coordinate, or if both x and y
     *               are null, an empty array is returned.
     */
    public function getTextXY($x = null, $y = null, $xError = 0, $yError = 0)
    {
        if (empty($this->dataTm)) {
            $this->getDataTm();
        }

        if (null === $x and null === $y) {
            return [];
        }

        $x = null === $x ? null : (float) $x;
        $y = null === $y ? null : (float) $y;
        $xError = (float) $xError;
        $yError = (float) $yError;

        $matches = [];
        foreach ($this->dataTm as $item) {
            $tm = $item[0];
            $text = $item[1];
            $xTm = (float) $tm[4];
            $yTm = (float) $tm[5];

            // An axis without a requested coordinate never rules an item out.
            $xNear = null === $x || ($xTm >= ($x - $xError) && $xTm <= ($x + $xError));
            $yNear = null === $y || ($yTm >= ($y - $yError) && $yTm <= ($y + $yError));

            if ($xNear && $yNear) {
                $matches[] = [$tm, $text];
            }
        }

        return $matches;
    }
802
}
803