Test Failed
Pull Request — master (#455)
by
unknown
02:09
created

Page::createPageForFpdf()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 8
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 5
nc 1
nop 0
dl 0
loc 8
ccs 1
cts 1
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 * @date    2017-01-03
9
 *
10
 * @license LGPLv3
11
 * @url     <https://github.com/smalot/pdfparser>
12
 *
13
 *  PdfParser is a pdf library written in PHP, extraction oriented.
14
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
15
 *
16
 *  This program is free software: you can redistribute it and/or modify
17
 *  it under the terms of the GNU Lesser General Public License as published by
18
 *  the Free Software Foundation, either version 3 of the License, or
19
 *  (at your option) any later version.
20
 *
21
 *  This program is distributed in the hope that it will be useful,
22
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
23
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24
 *  GNU Lesser General Public License for more details.
25
 *
26
 *  You should have received a copy of the GNU Lesser General Public License
27
 *  along with this program.
28
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
29
 */
30
31
namespace Smalot\PdfParser;
32
33
use Smalot\PdfParser\Element\ElementArray;
34
use Smalot\PdfParser\Element\ElementMissing;
35
use Smalot\PdfParser\Element\ElementNull;
36
use Smalot\PdfParser\Element\ElementXRef;
37
38
class Page extends PDFObject
39
{
40
    /**
41
     * @var Font[]
42
     */
43
    protected $fonts = null;
44
45
    /**
46
     * @var PDFObject[]
47
     */
48
    protected $xobjects = null;
49
50
    /**
51
     * @var array
52
     */
53
    protected $dataTm = null;
54
55
    /**
     * Returns the fonts declared in the 'Font' entry of this page's resources.
     *
     * Each Font instance is registered under its original resource name and,
     * when stripping every character other than digits, ".", "-" and "_" leaves
     * a non-empty string, under that cleaned name as well. The table is cached
     * in $this->fonts once built; the empty arrays returned when no usable
     * 'Font' entry is present are not cached.
     *
     * @return Font[] fonts indexed by resource name (and cleaned name)
     */
    public function getFonts()
    {
        if (null !== $this->fonts) {
            return $this->fonts;
        }

        $resources = $this->get('Resources');

        // Not every value returned for 'Resources' exposes has()/get().
        if (!method_exists($resources, 'has') || !$resources->has('Font')) {
            return [];
        }

        // Look the entry up once instead of once per type check.
        $fontEntry = $resources->get('Font');

        if ($fontEntry instanceof ElementMissing) {
            return [];
        }

        $fonts = $fontEntry instanceof Header
            ? $fontEntry->getElements()
            : $fontEntry->getHeader()->getElements();

        $table = [];

        foreach ($fonts as $id => $font) {
            if (!$font instanceof Font) {
                continue;
            }

            $table[$id] = $font;

            // Also store the font under the cleaned id (digits, ".", "-" and "_" only).
            $cleanedId = preg_replace('/[^0-9\.\-_]/', '', $id);
            if ('' != $cleanedId) {
                $table[$cleanedId] = $font;
            }
        }

        return $this->fonts = $table;
    }
96
97 21
    /**
     * Looks a font up by its resource name.
     *
     * The name is tried verbatim first. If that fails, the lookup is retried
     * with every character other than digits, ".", "-" and "_" removed.
     *
     * @param string $id font resource name
     *
     * @return Font|null the matching font, or null when neither key is present
     */
    public function getFont(string $id): ?Font
    {
        $fonts = $this->getFonts();

        // According to the PDF specs (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 238)
        // "The font resource name presented to the Tf operator is arbitrary, as are the names for all kinds of resources".
        // The unfiltered name is therefore searched first; the cleaned name is only a fallback.
        if (isset($fonts[$id])) {
            return $fonts[$id];
        }

        $cleanedId = preg_replace('/[^0-9\.\-_]/', '', $id);

        return $fonts[$cleanedId] ?? null;
    }
120
121
    /**
     * Returns the XObjects declared in the 'XObject' entry of this page's resources.
     *
     * Each entry is registered under its original resource name and, when
     * stripping every character other than digits, ".", "-" and "_" leaves a
     * non-empty string, under that cleaned name as well. The table is cached in
     * $this->xobjects once built; the empty array returned when there is no
     * 'XObject' entry is not cached.
     *
     * @return PDFObject[] XObjects indexed by resource name (and cleaned name)
     */
    public function getXObjects()
    {
        if (null !== $this->xobjects) {
            return $this->xobjects;
        }

        $resources = $this->get('Resources');

        // Not every value returned for 'Resources' exposes has()/get().
        if (!method_exists($resources, 'has') || !$resources->has('XObject')) {
            return [];
        }

        // Look the entry up once instead of once per use.
        $xobjectEntry = $resources->get('XObject');

        $xobjects = $xobjectEntry instanceof Header
            ? $xobjectEntry->getElements()
            : $xobjectEntry->getHeader()->getElements();

        $table = [];

        foreach ($xobjects as $id => $xobject) {
            $table[$id] = $xobject;

            // Also store the object under the cleaned id (digits, ".", "-" and "_" only).
            $cleanedId = preg_replace('/[^0-9\.\-_]/', '', $id);
            if ('' != $cleanedId) {
                $table[$cleanedId] = $xobject;
            }
        }

        return $this->xobjects = $table;
    }
158
159 4
    /**
     * Looks an XObject up by its resource name.
     *
     * Only the name exactly as given is tried; no cleaned-name fallback is
     * performed here.
     *
     * @param string $id XObject resource name
     *
     * @return PDFObject|null the matching object, or null when the key is absent
     */
    public function getXObject(string $id): ?PDFObject
    {
        return $this->getXObjects()[$id] ?? null;
    }
176
177 13
    /**
     * Extracts the text of this page from its 'Contents' entry.
     *
     * - A falsy, missing or null 'Contents' entry yields an empty string.
     * - A PDFObject whose header elements are numerically keyed is treated as a
     *   list of parts whose contents are concatenated into one virtual object.
     * - An ElementArray has the content of its items joined, each followed by a
     *   newline, into one virtual object.
     *
     * @param self|null $page not read by this implementation; the current page
     *                        is always passed on as the extraction context
     *
     * @return string the extracted text, or '' when there is no usable content
     */
    public function getText(?self $page = null): string
    {
        $contents = $this->get('Contents');

        if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return '';
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            if (is_numeric(key($elements))) {
                $new_content = '';

                foreach ($elements as $element) {
                    // Cross references are resolved to the object they point to.
                    $new_content .= $element instanceof ElementXRef
                        ? $element->getObject()->getContent()
                        : $element->getContent();
                }

                $header = new Header([], $this->document);
                $contents = new PDFObject($this->document, $header, $new_content, $this->config);
            }
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content.
            $new_content = '';

            foreach ($contents->getContent() as $content) {
                $new_content .= $content->getContent()."\n";
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        }

        /*
         * Elements referencing each other on the same page can cause endless loops during text parsing.
         * To combat this we keep a recursionStack containing already parsed elements on the page.
         * The stack is emptied here after getting text from a page; doing it in "finally" makes sure
         * a failed extraction does not leave stale entries behind for the next page.
         */
        try {
            return $contents->getText($this);
        } finally {
            PDFObject::$recursionStack = [];
        }
    }
226 4
227 4
    /**
     * Tells whether the document this page belongs to was produced by FPDF/FPDI
     * (setasign\Fpdi\Fpdi).
     *
     * The check passes when the document details contain a 'Producer' entry
     * that is a string starting with "FPDF".
     *
     * @return bool true if the 'Producer' detail starts with "FPDF"
     */
    public function isFpdf(): bool
    {
        return \array_key_exists('Producer', $this->document->getDetails())
            && \is_string($this->document->getDetails()['Producer'])
            && str_starts_with($this->document->getDetails()['Producer'], 'FPDF');
    }
245 4
246 1
    /**
     * Returns the position of this page in the document's page list.
     *
     * The value is the zero-based index of the first entry identical to this
     * object. When no entry matches, the number of pages is returned.
     *
     * @return int zero-based page index, or the page count when not found
     */
    public function getPageNumber(): int
    {
        $pages = $this->document->getPages();
        $total = \count($pages);

        $index = 0;
        while ($index < $total && $pages[$index] !== $this) {
            ++$index;
        }

        return $index;
    }
263 3
264
    /**
     * Returns the XObject that holds this page's content in a FPDF/FPDI document.
     *
     * The object is taken from the page's XObject table, using the value of
     * getPageNumber() as the key.
     *
     * @return PDFObject the XObject stored under this page's index
     */
    public function getPDFObjectForFpdf(): PDFObject
    {
        $index = $this->getPageNumber();
        $table = $this->getXObjects();

        return $table[$index];
    }
279 8
280 8
    /**
     * Builds a new PDFObject from this page's FPDF/FPDI XObject.
     *
     * The new object receives the document, header, content and config of the
     * object returned by getPDFObjectForFpdf().
     *
     * @return PDFObject a fresh PDFObject built from the source object's parts
     */
    public function createPDFObjectForFpdf(): PDFObject
    {
        $source = $this->getPDFObjectForFpdf();
        $content = $source->getContent();
        $header = $source->getHeader();

        return new PDFObject($source->document, $header, $content, $source->config);
    }
297 8
298
    /**
     * Builds a new Page from this page's FPDF/FPDI XObject.
     *
     * The new page receives the document, header, content and config of the
     * object returned by getPDFObjectForFpdf().
     *
     * @return Page a fresh page built from the source object's parts
     */
    public function createPageForFpdf(): self
    {
        $source = $this->getPDFObjectForFpdf();
        $content = $source->getContent();
        $header = $source->getHeader();

        return new self($source->document, $header, $content, $source->config);
    }
312
313
    /**
     * Extracts the text of this page as an array.
     *
     * - For FPDF/FPDI documents the text is read from a PDFObject rebuilt from
     *   the page's XObject.
     * - A falsy, missing or null 'Contents' entry yields an empty array.
     * - A PDFObject whose header elements are numerically keyed is treated as a
     *   list of parts whose contents are concatenated into one virtual object.
     * - Any other PDFObject is asked directly, first with this page as context
     *   and, if that throws, once more without a page.
     * - An ElementArray has the content of its items joined, each followed by a
     *   newline, into one virtual object.
     *
     * @param self|null $page not read by this implementation
     *
     * @return array the text array returned by the underlying object's getTextArray()
     */
    public function getTextArray(?self $page = null): array
    {
        if ($this->isFpdf()) {
            $pdfObject = $this->getPDFObjectForFpdf();
            $newPdfObject = $this->createPDFObjectForFpdf();

            return $newPdfObject->getTextArray($pdfObject);
        }

        $contents = $this->get('Contents');

        if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return [];
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            if (!is_numeric(key($elements))) {
                // Return the first successful result directly; previously it was
                // discarded and the same extraction was run a second time.
                try {
                    return $contents->getTextArray($this);
                } catch (\Throwable $e) {
                    return $contents->getTextArray();
                }
            }

            $new_content = '';

            /** @var PDFObject $element */
            foreach ($elements as $element) {
                // Cross references are resolved to the object they point to.
                $new_content .= $element instanceof ElementXRef
                    ? $element->getObject()->getContent()
                    : $element->getContent();
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content.
            $new_content = '';

            /** @var PDFObject $content */
            foreach ($contents->getContent() as $content) {
                $new_content .= $content->getContent()."\n";
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        }

        return $contents->getTextArray($this);
    }
368 7
369
    /**
     * Collects the raw commands of the page's content.
     *
     * When the content of the 'Contents' entry is an array, the content of every
     * item is concatenated and split into sections and commands by this page.
     * Otherwise the 'Contents' object itself (or, for FPDF/FPDI documents, the
     * page's XObject) does the splitting, and a synthetic
     * ['t' => '', 'o' => 'BT', 'c' => ''] entry is emitted before the commands
     * of each section.
     *
     * @return array the flat list of extracted commands
     */
    public function extractRawData(): array
    {
        $content = $this->get('Contents');
        $values = $content->getContent();
        $extractedData = [];

        if (\is_array($values)) {
            // Several parts: glue their content together before splitting it up.
            $text = '';
            foreach ($values as $part) {
                $text .= $part->getContent();
            }

            foreach ($this->getSectionsText($text) as $sectionText) {
                foreach ($this->getCommandsText($sectionText) as $command) {
                    $extractedData[] = $command;
                }
            }

            return $extractedData;
        }

        if ($this->isFpdf()) {
            $content = $this->getPDFObjectForFpdf();
        }

        foreach ($content->getSectionsText($content->getContent()) as $sectionText) {
            // Mark the start of each section explicitly.
            $extractedData[] = ['t' => '', 'o' => 'BT', 'c' => ''];

            foreach ($content->getCommandsText($sectionText) as $command) {
                $extractedData[] = $command;
            }
        }

        return $extractedData;
    }
411
412
    /**
     * Gets all the raw commands of the page with their text operands decoded.
     *
     * The operands of Tj/TJ commands are octal-decoded with the current font,
     * unescaped, passed through utf8_encode() and finally through the font's
     * decodeContent(). The current font is switched by Tf/TF commands, saved by
     * "q" and restored by "Q".
     *
     * @param array|null $extractedRawData the data returned by extractRawData();
     *                                     when null or empty, extractRawData() is called
     *
     * @return array the commands, with Tj/TJ operands replaced by decoded text
     */
    public function extractDecodedRawData(?array $extractedRawData = null): array
    {
        if (empty($extractedRawData)) {
            $extractedRawData = $this->extractRawData();
        }

        /** @var Font|null $currentFont */
        $currentFont = null;
        $clippedFont = null;

        // If the document is a FPDI/FPDF one, the rebuilt page carries the correct fonts.
        $fpdfPage = $this->isFpdf() ? $this->createPageForFpdf() : null;

        // Steps shared by plain strings (Tj) and array items (TJ): unescape,
        // convert to UTF-8, then let the font (if any) decode the content.
        $unescapeAndDecode = static function ($text, ?Font $font) {
            $text = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", ' '],
                $text
            );
            $text = utf8_encode($text);

            return null !== $font ? $font->decodeContent($text) : $text;
        };

        foreach ($extractedRawData as &$command) {
            $operator = $command['o'];

            if ('Tj' === $operator || 'TJ' === $operator) {
                $data = $command['c'];

                if (!\is_array($data)) {
                    // Without a current font a plain operand is replaced by an empty string.
                    $text = null !== $currentFont ? $currentFont->decodeOctal($data) : '';
                    $command['c'] = $unescapeAndDecode($text, $currentFont);
                    continue;
                }

                // Only the even positions are decoded; odd positions are left untouched.
                $numText = \count($data);
                for ($i = 0; $i < $numText; $i += 2) {
                    $text = $data[$i]['c'];
                    if (null !== $currentFont) {
                        $text = $currentFont->decodeOctal($text);
                    }
                    $command['c'][$i]['c'] = $unescapeAndDecode($text, $currentFont);
                }
            } elseif ('Tf' === $operator || 'TF' === $operator) {
                $fontId = explode(' ', $command['c'])[0];
                $currentFont = null !== $fpdfPage ? $fpdfPage->getFont($fontId) : $this->getFont($fontId);
            } elseif ('Q' === $operator) {
                $currentFont = $clippedFont;
            } elseif ('q' === $operator) {
                $clippedFont = $currentFont;
            }
        }
        // Break the reference so the last element is not left aliased to $command.
        unset($command);

        return $extractedRawData;
    }
485
486
    /**
     * Gets just the text commands that are involved in text positions and the
     * Text Matrix (Tm).
     *
     * Only the PDF commands that deal with text positioning and text showing are
     * kept, in their original order: BT, ET, TL, Td, TD, Tm, T*, Tj, ', " and TJ.
     * Every other command is dropped.
     *
     * @param array|null $extractedDecodedRawData the data extracted by extractDecodedRawData();
     *                                            when null or empty, extractDecodedRawData() is called
     *
     * @return array an array with the text commands of the page
     */
    public function getDataCommands(?array $extractedDecodedRawData = null): array
    {
        if (empty($extractedDecodedRawData)) {
            $extractedDecodedRawData = $this->extractDecodedRawData();
        }

        $textOperators = [
            // BT: begin a text object, initializing Tm and Tlm to the identity matrix.
            'BT',
            // ET: end a text object, discarding the text matrix.
            'ET',
            // leading TL: set the text leading, Tl, used by the T*, ' and " operators.
            // Initial value: 0.
            'TL',
            // tx ty Td: move to the start of the next line, offset from the start of
            // the current line by (tx, ty).
            'Td',
            // tx ty TD: same as Td, but as a side effect also sets the leading
            // parameter in the text state. Equivalent to: -ty TL, then tx ty Td.
            'TD',
            // a b c d e f Tm: set the text matrix, Tm, and the text line matrix, Tlm.
            // The operands are all numbers; the initial value of both matrices is
            // the identity matrix [1 0 0 1 0 0].
            'Tm',
            // T*: move to the start of the next line. Equivalent to: 0 Tl Td, where
            // Tl is the current leading parameter in the text state.
            'T*',
            // string Tj: show a text string.
            'Tj',
            // string ': move to the next line and show a text string.
            // Equivalent to: T*, then string Tj.
            "'",
            // aw ac string ": move to the next line and show a text string, using aw
            // as the word spacing and ac as the character spacing.
            // Equivalent to: aw Tw, ac Tc, then string '.
            '"',
            // array TJ: show one or more text strings, allowing individual glyph
            // positioning. Each element of the array can be a string or a number.
            // A string is shown; a number adjusts the text position by that amount,
            // i.e. it translates the text matrix, Tm. The amount is subtracted from
            // the current horizontal or vertical coordinate, depending on the
            // writing mode. In the default coordinate system a positive adjustment
            // moves the next glyph to the left or down by the given amount.
            'TJ',
        ];

        $extractedData = [];
        foreach ($extractedDecodedRawData as $command) {
            if (\in_array($command['o'], $textOperators, true)) {
                $extractedData[] = $command;
            }
        }

        return $extractedData;
    }
629
630
    /**
631
     * Gets the Text Matrix of the text in the page
632
     *
633
     * Return an array where every item is an array where the first item is the
634
     * Text Matrix (Tm) and the second is a string with the text data.  The Text matrix
635
     * is an array of 6 numbers. The last 2 numbers are the coordinates X and Y of the
636 4
     * text. The first 4 numbers have to do with the scaling, rotation and skew of the text.
637 2
     *
638 2
     * @param array $dataCommands the data extracted by getDataCommands
639 2
     *                            if null getDataCommands is called
640 2
     *
641
     * @return array an array with the data of the page including the Tm information
642
     *               of any text in the page
643
     */
644
    public function getDataTm(array $dataCommands = null): array
    {
        // Fall back to the page's own command list when none (or an empty one) is supplied.
        if (empty($dataCommands)) {
            $dataCommands = $this->getDataCommands();
        }

        /*
         * At the beginning of a text object Tm is the identity matrix
         */
        $defaultTm = ['1', '0', '0', '1', '0', '0'];

        /*
         * Text leading used by the T*, ' and " operators (initial value: 0)
         */
        $defaultTl = 0;

        /*
         * Indexes of the X and Y translation components inside the matrix (Tm)
         */
        $x = 4;
        $y = 5;
        $Tx = 0;
        $Ty = 0;

        $Tm = $defaultTm;
        $Tl = $defaultTl;

        $extractedTexts = $this->getTextArray();
        $extractedData = [];
        foreach ($dataCommands as $command) {
            /*
             * The n-th text-showing operator is paired with the n-th entry of
             * getTextArray(). Commands that follow the last text (e.g. the
             * closing ET) point one past the end of that array, so default to
             * an empty string instead of reading an undefined offset.
             */
            $currentText = $extractedTexts[\count($extractedData)] ?? '';

            switch ($command['o']) {
                /*
                 * BT
                 * Begin a text object, initializing Tm and Tlm to the identity matrix
                 */
                case 'BT':
                    $Tm = $defaultTm;
                    $Tl = $defaultTl; // NOTE(review): leading is reset together with the matrix here - confirm this is intended.
                    $Tx = 0;
                    $Ty = 0;
                    break;

                /*
                 * ET
                 * End a text object, discarding the text matrix
                 */
                case 'ET':
                    $Tm = $defaultTm;
                    $Tl = $defaultTl; // NOTE(review): same reset as BT - confirm this is intended.
                    $Tx = 0;
                    $Ty = 0;
                    break;

                /*
                 * leading TL
                 * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
                 * Initial value: 0
                 */
                case 'TL':
                    $Tl = (float) $command['c'];
                    break;

                /*
                 * tx ty Td
                 * Move to the start of the next line, offset from the start of the
                 * current line by tx, ty.
                 * The offset is added directly to the translation components of Tm.
                 */
                case 'Td':
                    $coord = explode(' ', $command['c']);
                    $Tx += (float) $coord[0];
                    $Ty += (float) $coord[1];
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * tx ty TD
                 * Move to the start of the next line, offset from the start of the
                 * current line by tx, ty. As a side effect, this operator sets the leading
                 * parameter in the text state. This operator has the same effect as the
                 * code:
                 * -ty TL
                 * tx ty Td
                 */
                case 'TD':
                    $coord = explode(' ', $command['c']);
                    // Leading is the NEGATED vertical offset, and the move itself is a plain Td.
                    $Tl = -((float) $coord[1]);
                    $Tx += (float) $coord[0];
                    $Ty += (float) $coord[1];
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * a b c d e f Tm
                 * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
                 * all numbers, and the initial value for Tm and Tlm is the identity matrix
                 * [1 0 0 1 0 0]
                 */
                case 'Tm':
                    $Tm = explode(' ', $command['c']);
                    $Tx = (float) $Tm[$x];
                    $Ty = (float) $Tm[$y];
                    break;

                /*
                 * T*
                 * Move to the start of the next line. This operator has the same effect
                 * as the code:
                 * 0 -Tl Td
                 * Where Tl is the current leading parameter in the text state.
                 */
                case 'T*':
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * string Tj
                 * Show a text string
                 */
                case 'Tj':
                    $extractedData[] = [$Tm, $currentText];
                    break;

                /*
                 * string '
                 * Move to the next line and show a text string. This operator has the
                 * same effect as the code:
                 * T*
                 * string Tj
                 */
                case "'":
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    $extractedData[] = [$Tm, $currentText];
                    break;

                /*
                 * aw ac string "
                 * Move to the next line and show a text string, using aw as the word
                 * spacing and ac as the character spacing. This operator has the same
                 * effect as the code:
                 * aw Tw
                 * ac Tc
                 * string '
                 * Tw sets the word spacing, Tw, to wordSpace.
                 * Tc sets the character spacing, Tc, to charsSpace.
                 */
                case '"':
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    /*
                     * NOTE(review): this assumes the text entry still carries the
                     * "aw ac" operands in front of the string - verify against
                     * getTextArray(). When there is no third space-separated part,
                     * keep the whole entry instead of reading an undefined offset.
                     */
                    $data = explode(' ', $currentText);
                    $extractedData[] = [$Tm, $data[2] ?? $currentText];
                    break;

                /*
                 * array TJ
                 * Show one or more text strings, allowing individual glyph positioning.
                 * Each element of the array can be a string or a number. If the element is
                 * a string, this operator shows the string. If it is a number, the
                 * operator adjusts the text position by that amount; that is, it translates
                 * the text matrix, Tm. This amount is subtracted from the current
                 * horizontal or vertical coordinate, depending on the writing mode.
                 * In the default coordinate system, a positive adjustment has the effect
                 * of moving the next glyph painted either to the left or down by the given
                 * amount.
                 * Only the matrix at the start of the array is recorded here.
                 */
                case 'TJ':
                    $extractedData[] = [$Tm, $currentText];
                    break;

                // Every other operator leaves the tracked text state untouched.
                default:
            }
        }

        // Cache the result on the object; getTextXY() reads it from there.
        $this->dataTm = $extractedData;

        return $extractedData;
    }
823
824
    /**
825
     * Gets text data that are around the given coordinates (X,Y)
826
     *
827
     * If the text is near the given coordinates (X,Y) (or the TM info),
828
     * the text is returned. The extractedData returned by getDataTm can be used to see
829
     * where the coordinates of a given text are, using the TM info for it.
830
     *
831
     * @param float $x      The X value of the coordinate to search for. if null
832
     *                      just the Y value is considered (same Row)
833
     * @param float $y      The Y value of the coordinate to search for
834
     *                      (if null, just the X value is considered: same column)
835
     * @param float $xError The value less or more to consider an X to be "near"
836
     * @param float $yError The value less or more to consider an Y to be "near"
837
     *
838
     * @return array An array of text that are near the given coordinates. If no text
839
     *               "near" the x,y coordinate, an empty array is returned. If Both, x
840
     *               and y coordinates are null, an empty array is returned.
841
     */
842
    public function getTextXY(float $x = null, float $y = null, float $xError = 0, float $yError = 0): array
    {
        // Build the [Tm, text] pairs on demand when they are missing or still empty.
        if (empty($this->dataTm)) {
            $this->getDataTm();
        }

        // Without at least one coordinate there is nothing to compare against.
        if (null === $x && null === $y) {
            return [];
        }

        $matches = [];
        foreach ($this->dataTm as $entry) {
            $matrix = $entry[0];
            $text = $entry[1];

            // Indexes 4 and 5 of the text matrix hold the X and Y translation.
            $posX = (float) $matrix[4];
            $posY = (float) $matrix[5];

            // An axis that was not requested (null) never rejects an entry;
            // a requested axis must lie within [value - error, value + error].
            $xAccepted = null === $x
                || ($posX >= ($x - $xError) && $posX <= ($x + $xError));
            $yAccepted = null === $y
                || ($posY >= ($y - $yError) && $posY <= ($y + $yError));

            if ($xAccepted && $yAccepted) {
                $matches[] = [$matrix, $text];
            }
        }

        return $matches;
    }
894
}
895