Completed
Push — master ( 6bc9dc...7f2d31 )
by Konrad
15s queued 12s
created

Page::getDataTm()   C

Complexity

Conditions 17
Paths 30

Size

Total Lines 187
Code Lines 81

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 17
eloc 81
c 2
b 0
f 0
nc 30
nop 1
dl 0
loc 187
rs 5.2166

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Element\ElementArray;
34
use Smalot\PdfParser\Element\ElementMissing;
35
use Smalot\PdfParser\Element\ElementNull;
36
use Smalot\PdfParser\Element\ElementXRef;
37
38
/**
39
 * Class Page
40
 */
41
class Page extends PDFObject
42
{
43
    /**
44
     * @var Font[]
45
     */
46
    protected $fonts = null;
47
48
    /**
49
     * @var PDFObject[]
50
     */
51
    protected $xobjects = null;
52
53
    /**
54
     * @var array|null result of the last getDataTm() call, cached for getTextXY()
55
     */
56
    protected $dataTm = null;
57
58
    /**
     * Returns the fonts found in the Font entry of this page's Resources.
     *
     * Every font is registered under its original resource id and, when the
     * result is not empty, also under a cleaned id from which every character
     * other than digits, '.', '-' and '_' has been removed.
     *
     * The table is cached once it has been built. The empty array returned
     * when no Font entry is available is not cached.
     *
     * @return Font[]
     */
    public function getFonts()
    {
        if (null !== $this->fonts) {
            return $this->fonts;
        }

        $resources = $this->get('Resources');

        // Nothing to index when Resources cannot be queried or has no Font entry.
        if (!method_exists($resources, 'has') || !$resources->has('Font')) {
            return [];
        }

        // The Font entry is either a Header itself or an object exposing one.
        $fontEntry = $resources->get('Font');
        $candidates = $fontEntry instanceof Header
            ? $fontEntry->getElements()
            : $fontEntry->getHeader()->getElements();

        $indexed = [];

        foreach ($candidates as $name => $candidate) {
            if (!($candidate instanceof Font)) {
                continue;
            }

            $indexed[$name] = $candidate;

            // Register the font a second time under its cleaned id.
            $cleaned = preg_replace('/[^0-9\.\-_]/', '', $name);
            if ('' != $cleaned) {
                $indexed[$cleaned] = $candidate;
            }
        }

        $this->fonts = $indexed;

        return $indexed;
    }
95
96
    /**
     * Looks up one of the page fonts by resource id.
     *
     * The id is tried verbatim first, then in its cleaned form (only digits,
     * '.', '-' and '_' are kept).
     *
     * @param string $id
     *
     * @return Font|null the matching font, or null when neither form is known
     */
    public function getFont($id)
    {
        $fonts = $this->getFonts();

        if (isset($fonts[$id])) {
            return $fonts[$id];
        }

        $cleaned = preg_replace('/[^0-9\.\-_]/', '', $id);

        return isset($fonts[$cleaned]) ? $fonts[$cleaned] : null;
    }
117
118
    /**
     * Returns the elements found in the XObject entry of this page's Resources.
     *
     * Every element is registered under its original resource id and, when
     * the result is not empty, also under a cleaned id from which every
     * character other than digits, '.', '-' and '_' has been removed.
     *
     * The table is cached once it has been built. The empty array returned
     * when no XObject entry is available is not cached.
     *
     * @return PDFObject[]
     */
    public function getXObjects()
    {
        if (null !== $this->xobjects) {
            return $this->xobjects;
        }

        $resources = $this->get('Resources');

        // Nothing to index when Resources cannot be queried or has no XObject entry.
        if (!method_exists($resources, 'has') || !$resources->has('XObject')) {
            return [];
        }

        // The XObject entry is either a Header itself or an object exposing one.
        $xobjectEntry = $resources->get('XObject');
        $entries = $xobjectEntry instanceof Header
            ? $xobjectEntry->getElements()
            : $xobjectEntry->getHeader()->getElements();

        $indexed = [];

        foreach ($entries as $name => $entry) {
            $indexed[$name] = $entry;

            // Register the entry a second time under its cleaned id.
            $cleaned = preg_replace('/[^0-9\.\-_]/', '', $name);
            if ('' != $cleaned) {
                $indexed[$cleaned] = $entry;
            }
        }

        $this->xobjects = $indexed;

        return $indexed;
    }
155
156
    /**
     * Looks up one of the page XObjects by resource id.
     *
     * Only the id exactly as given is tried; unlike getFont(), the id is not
     * cleaned before the lookup.
     *
     * @param string $id
     *
     * @return PDFObject|null the matching XObject, or null when the id is unknown
     */
    public function getXObject($id)
    {
        $known = $this->getXObjects();

        return isset($known[$id]) ? $known[$id] : null;
    }
178
179
    /**
     * Returns the text of the page.
     *
     * The Contents entry is first normalised into a single PDFObject:
     *  - a PDFObject whose header elements are numerically keyed is replaced
     *    by a virtual object holding the concatenated content of those
     *    elements (cross references are followed);
     *  - an ElementArray is replaced by a virtual object holding the content
     *    of every item, each followed by a line feed.
     * The text is then read from that object, with this page as context.
     *
     * @param Page $page not used by this implementation
     *
     * @return string the page text, or '' when Contents is missing or null
     */
    public function getText(self $page = null)
    {
        $contents = $this->get('Contents');

        if (!$contents) {
            return '';
        }

        if ($contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return '';
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            if (is_numeric(key($elements))) {
                $merged = '';

                foreach ($elements as $element) {
                    // Cross references are resolved to the object they point to.
                    $source = $element instanceof ElementXRef ? $element->getObject() : $element;
                    $merged .= $source->getContent();
                }

                $contents = new PDFObject($this->document, new Header([], $this->document), $merged);
            }
        } elseif ($contents instanceof ElementArray) {
            // Build one virtual content stream out of all the parts.
            $merged = '';

            foreach ($contents->getContent() as $part) {
                $merged .= $part->getContent()."\n";
            }

            $contents = new PDFObject($this->document, new Header([], $this->document), $merged);
        }

        return $contents->getText($this);
    }
225
226
    /**
     * Returns the text of the page as an array.
     *
     * The Contents entry is first normalised into a single PDFObject:
     *  - a PDFObject whose header elements are numerically keyed is replaced
     *    by a virtual object holding the concatenated content of those
     *    elements (cross references are followed);
     *  - an ElementArray is replaced by a virtual object holding the content
     *    of every item, each followed by a line feed.
     * The text array is then read from that object, with this page as context.
     *
     * @param Page $page not used by this implementation
     *
     * @return array the text pieces, or [] when Contents is missing or null
     */
    public function getTextArray(self $page = null)
    {
        $contents = $this->get('Contents');

        if (!$contents) {
            return [];
        }

        if ($contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return [];
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            if (is_numeric(key($elements))) {
                $merged = '';

                /** @var PDFObject $element */
                foreach ($elements as $element) {
                    // Cross references are resolved to the object they point to.
                    $source = $element instanceof ElementXRef ? $element->getObject() : $element;
                    $merged .= $source->getContent();
                }

                $contents = new PDFObject($this->document, new Header([], $this->document), $merged);
            }
        } elseif ($contents instanceof ElementArray) {
            // Build one virtual content stream out of all the parts.
            $merged = '';

            /** @var PDFObject $part */
            foreach ($contents->getContent() as $part) {
                $merged .= $part->getContent()."\n";
            }

            $contents = new PDFObject($this->document, new Header([], $this->document), $merged);
        }

        return $contents->getTextArray($this);
    }
274
275
    /**
     * Gets all the text data of the page with its internal representation.
     *
     * When the content of the Contents entry is an array, the content of all
     * its sections is concatenated and parsed by this page. Otherwise the
     * content is parsed by the Contents object itself.
     *
     * @return array the text commands of every text section, in document
     *               order; empty when Contents cannot be parsed
     */
    public function extractRawData()
    {
        $content = $this->get('Contents');
        $values = $content->getContent();

        if (\is_array($values)) {
            // Several content sections: glue them together before parsing.
            $text = '';
            foreach ($values as $section) {
                $text .= $section->getContent();
            }
            $parser = $this;
        } else {
            // A Contents entry that cannot split sections (presumably a
            // missing or null element) has no text commands to offer.
            if (!method_exists($content, 'getSectionsText')) {
                return [];
            }
            $text = $values;
            $parser = $content;
        }

        $extractedData = [];
        foreach ($parser->getSectionsText($text) as $sectionText) {
            foreach ($parser->getCommandsText($sectionText) as $command) {
                $extractedData[] = $command;
            }
        }

        return $extractedData;
    }
315
316
    /**
     * Gets all the decoded text data with its internal representation from a page.
     *
     * Walks the commands in order, remembers the font selected by the last
     * Tf/TF command and uses it to decode the operands of Tj and TJ commands.
     * Text met before any font has been selected is still unescaped and
     * passed through utf8_encode(); only the font-specific steps are skipped.
     *
     * @param array $extractedRawData the extracted data returned by extractRawData, or
     *                                null if extractRawData should be called
     *
     * @return array the same commands, with Tj/TJ text operands decoded
     */
    public function extractDecodedRawData($extractedRawData = null)
    {
        if (!isset($extractedRawData) or !$extractedRawData) {
            $extractedRawData = $this->extractRawData();
        }

        // Escape sequences of a literal string and their plain counterparts.
        $escaped = ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '];
        $plain = ['\\', '(', ')', "\n", "\r", "\t", ' '];

        $unicode = true;
        $currentFont = null;

        foreach ($extractedRawData as &$command) {
            if ('Tj' == $command['o'] or 'TJ' == $command['o']) {
                $data = $command['c'];

                if (!\is_array($data)) {
                    // Start from the raw operand so the text is defined even
                    // when no font has been selected yet.
                    $tmpText = $data;
                    if (isset($currentFont)) {
                        $tmpText = $currentFont->decodeOctal($tmpText);
                    }
                    $tmpText = utf8_encode(str_replace($escaped, $plain, $tmpText));
                    if (isset($currentFont)) {
                        $tmpText = $currentFont->decodeContent($tmpText, $unicode);
                    }
                    $command['c'] = $tmpText;
                    continue;
                }

                // Only the even positions of the operand array are decoded.
                $numText = \count($data);
                for ($i = 0; $i < $numText; $i += 2) {
                    $decodedText = $data[$i]['c'];
                    if (isset($currentFont)) {
                        $decodedText = $currentFont->decodeOctal($decodedText);
                    }
                    $decodedText = utf8_encode(str_replace($escaped, $plain, $decodedText));
                    if (isset($currentFont)) {
                        $decodedText = $currentFont->decodeContent($decodedText, $unicode);
                    }
                    $command['c'][$i]['c'] = $decodedText;
                }
            } elseif ('Tf' == $command['o'] or 'TF' == $command['o']) {
                // The font id is the first token of the operand.
                $fontId = explode(' ', $command['c'])[0];
                $currentFont = $this->getFont($fontId);
            }
        }
        // Break the reference so the returned array holds no aliased element.
        unset($command);

        return $extractedRawData;
    }
384
385
    /**
     * Gets just the text commands that are involved in text positions and
     * the Text Matrix (Tm).
     *
     * The commands kept are:
     *  - BT : begin a text object, initializing Tm and Tlm to the identity matrix
     *  - ET : end a text object, discarding the text matrix
     *  - TL : set the text leading used by the T*, ' and " operators (initial value 0)
     *  - Td : move to the start of the next line, offset by tx, ty
     *  - TD : same as Td, and also sets the leading (same as "-ty TL" then "tx ty Td")
     *  - Tm : set the text matrix and the text line matrix
     *  - T* : move to the start of the next line using the current leading
     *  - Tj : show a text string
     *  - '  : move to the next line and show a text string (T* then Tj)
     *  - "  : set word and character spacing, then behave like '
     *  - TJ : show one or more strings with individual glyph positioning
     * Every other command is dropped.
     *
     * @param array $extractedDecodedRawData the data extracted by extractDecodedRawData;
     *                                       if it is null or empty, extractDecodedRawData is called
     *
     * @return array the text commands of the page, in their original order
     */
    public function getDataCommands($extractedDecodedRawData = null)
    {
        if (!$extractedDecodedRawData) {
            $extractedDecodedRawData = $this->extractDecodedRawData();
        }

        $kept = [];

        foreach ($extractedDecodedRawData as $command) {
            switch ($command['o']) {
                // Text object delimiters.
                case 'BT':
                case 'ET':
                // Leading, positioning and matrix operators.
                case 'TL':
                case 'Td':
                case 'TD':
                case 'Tm':
                case 'T*':
                // Text showing operators.
                case 'Tj':
                case "'":
                case '"':
                case 'TJ':
                    $kept[] = $command;
                    break;
                default:
                    // Not related to text position: ignored.
                    break;
            }
        }

        return $kept;
    }
529
530
    /**
     * Gets the Text Matrix of the text in the page.
     *
     * Returns an array where every item is an array whose first item is the
     * Text Matrix (Tm) and whose second item is a string with the text data.
     * The Text Matrix is an array of 6 numbers. The last 2 numbers are the X
     * and Y coordinates of the text. The first 4 numbers describe the scaling,
     * rotation and skew of the text.
     *
     * Only translations are tracked: Td, TD, T*, ' and " update the X/Y slots
     * of the matrix directly, without applying its scaling components.
     *
     * The result is also stored in $this->dataTm.
     *
     * @param array $dataCommands the data extracted by getDataCommands;
     *                            if null or empty, getDataCommands is called
     *
     * @return array an array with the data of the page including the Tm information
     *               of any text in the page
     */
    public function getDataTm($dataCommands = null)
    {
        if (!isset($dataCommands) or !$dataCommands) {
            $dataCommands = $this->getDataCommands();
        }

        // At the beginning of a text object Tm is the identity matrix.
        $defaultTm = ['1', '0', '0', '1', '0', '0'];

        // Initial text leading, used by the T*, ' and " operators.
        $defaultTl = 0;

        // Positions of the X and Y coordinates inside the matrix (Tm).
        $x = 4;
        $y = 5;
        $Tx = 0;
        $Ty = 0;

        $Tm = $defaultTm;
        $Tl = $defaultTl;

        $extractedData = [];
        foreach ($dataCommands as $command) {
            switch ($command['o']) {
                /*
                 * BT
                 * Begin a text object, initializing Tm and Tlm to the identity matrix.
                 * ET
                 * End a text object, discarding the text matrix.
                 */
                case 'BT':
                case 'ET':
                    // Reset to the identity MATRIX; assigning the leading
                    // default here would turn $Tm into a scalar.
                    $Tm = $defaultTm;
                    // NOTE(review): the original flags this leading reset as
                    // "review this"; it is kept unchanged.
                    $Tl = $defaultTl;
                    $Tx = 0;
                    $Ty = 0;
                    break;

                /*
                 * leading TL
                 * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
                 * Initial value: 0
                 */
                case 'TL':
                    $Tl = (float) $command['c'];
                    break;

                /*
                 * tx ty Td
                 * Move to the start of the next line, offset from the start of the
                 * current line by tx, ty.
                 */
                case 'Td':
                    $coord = explode(' ', $command['c']);
                    $Tx += (float) $coord[0];
                    $Ty += (float) $coord[1];
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * tx ty TD
                 * Move to the start of the next line, offset from the start of the
                 * current line by tx, ty. As a side effect, this operator sets the
                 * leading parameter in the text state. This operator has the same
                 * effect as the code:
                 * -ty TL
                 * tx ty Td
                 */
                case 'TD':
                    $coord = explode(' ', $command['c']);
                    // Leading is -ty and the offset is applied exactly as Td does.
                    $Tl = -((float) $coord[1]);
                    $Tx += (float) $coord[0];
                    $Ty += (float) $coord[1];
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * a b c d e f Tm
                 * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
                 * all numbers, and the initial value for Tm and Tlm is the identity matrix
                 * [1 0 0 1 0 0]
                 */
                case 'Tm':
                    $Tm = explode(' ', $command['c']);
                    $Tx = (float) $Tm[$x];
                    $Ty = (float) $Tm[$y];
                    break;

                /*
                 * T*
                 * Move to the start of the next line, using the current leading
                 * parameter of the text state.
                 */
                case 'T*':
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * string Tj
                 * Show a text string.
                 */
                case 'Tj':
                    $extractedData[] = [$Tm, $command['c']];
                    break;

                /*
                 * string '
                 * Move to the next line and show a text string. This operator has the
                 * same effect as the code:
                 * T*
                 * string Tj
                 */
                case "'":
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    $extractedData[] = [$Tm, $command['c']];
                    break;

                /*
                 * aw ac string "
                 * Move to the next line and show a text string, using aw as the word
                 * spacing and ac as the character spacing. This operator has the same
                 * effect as the code:
                 * aw Tw
                 * ac Tc
                 * string '
                 */
                case '"':
                    // Split off aw and ac only, so that spaces inside the
                    // string operand are preserved.
                    // NOTE(review): operand layout "aw ac string" is taken
                    // from the operator description above; verify against the parser.
                    $data = explode(' ', $command['c'], 3);
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    $extractedData[] = [$Tm, isset($data[2]) ? $data[2] : ''];
                    break;

                /*
                 * array TJ
                 * Show one or more text strings, allowing individual glyph positioning.
                 * Each element of the array can be a string or a number. Strings are
                 * shown; numbers adjust the text position and are skipped here, so
                 * the strings are simply concatenated.
                 */
                case 'TJ':
                    $tjText = '';
                    foreach ($command['c'] as $part) {
                        if ('n' == $part['t']) {
                            continue;
                        }
                        $tjText .= $part['c'];
                    }
                    $extractedData[] = [$Tm, $tjText];
                    break;

                default:
                    // Any other command does not affect the text matrix.
                    break;
            }
        }
        $this->dataTm = $extractedData;

        return $extractedData;
    }
733
734
    /**
     * Gets the text data located around the given coordinates (X,Y).
     *
     * Works on the data produced by getDataTm(), which is computed first when
     * it is not available yet. An item is kept when its Tm coordinates lie
     * within the given tolerance of every coordinate that was supplied.
     *
     * @param float $x      The X value of the coordinate to search for. If null,
     *                      just the Y value is considered (same row)
     * @param float $y      The Y value of the coordinate to search for. If null,
     *                      just the X value is considered (same column)
     * @param float $xError The distance, below or above, within which an X is "near"
     * @param float $yError The distance, below or above, within which a Y is "near"
     *
     * @return array|null An array of [Tm, text] pairs that are near the given
     *                    coordinates; an empty array when nothing is near. If both
     *                    x and y are null, null is returned.
     */
    public function getTextXY($x, $y, $xError = 0, $yError = 0)
    {
        if (empty($this->dataTm)) {
            $this->getDataTm();
        }

        $hasX = isset($x);
        $hasY = isset($y);

        if ($hasX) {
            $x = (float) $x;
        }
        if ($hasY) {
            $y = (float) $y;
        }
        if (!$hasX and !$hasY) {
            return null;
        }

        $xError = isset($xError) ? (float) $xError : 0;
        $yError = isset($yError) ? (float) $yError : 0;

        $nearby = [];
        foreach ($this->dataTm as $item) {
            $tm = $item[0];
            $xTm = (float) $tm[4];
            $yTm = (float) $tm[5];
            $text = $item[1];

            // A coordinate that was not supplied places no constraint on its axis.
            $xNear = !$hasX || (($xTm >= ($x - $xError)) && ($xTm <= ($x + $xError)));
            $yNear = !$hasY || (($yTm >= ($y - $yError)) && ($yTm <= ($y + $yError)));

            if ($xNear && $yNear) {
                $nearby[] = [$tm, $text];
            }
        }

        return $nearby;
    }
808
}
809