Test Failed
Pull Request — master (#559)
by
unknown
02:31
created

Page::createPDFObjectForFpdf()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 8
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 5
nc 1
nop 0
dl 0
loc 8
ccs 6
cts 6
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementMissing;
37
use Smalot\PdfParser\Element\ElementNull;
38
use Smalot\PdfParser\Element\ElementXRef;
39
40
class Page extends PDFObject
41
{
42
    /**
43
     * @var Font[]
44
     */
45
    protected $fonts = null;
46
47
    /**
48
     * @var PDFObject[]
49
     */
50
    protected $xobjects = null;
51
52
    /**
53
     * @var array
54
     */
55
    protected $dataTm = null;
56
57
    /**
     * Returns the fonts declared in the 'Font' entry of this page's 'Resources'.
     *
     * Each font is stored under its original resource name and, in addition,
     * under that name reduced to the characters 0-9, '.', '-' and '_' whenever
     * the reduced name is not empty. Once a table has been built it is cached
     * in $this->fonts and returned directly on later calls.
     *
     * @return Font[] fonts keyed by resource name; an empty array when the
     *                resources expose no usable 'Font' entry
     */
    public function getFonts()
    {
        if (null !== $this->fonts) {
            return $this->fonts;
        }

        $resources = $this->get('Resources');

        if (!method_exists($resources, 'has') || !$resources->has('Font')) {
            return [];
        }

        // Resolve the entry once instead of repeating the lookup on every branch.
        $fontResource = $resources->get('Font');

        if ($fontResource instanceof ElementMissing) {
            return [];
        }

        // The entry is either the dictionary itself or an object wrapping it.
        $fonts = $fontResource instanceof Header
            ? $fontResource->getElements()
            : $fontResource->getHeader()->getElements();

        $table = [];

        foreach ($fonts as $id => $font) {
            if (!$font instanceof Font) {
                continue;
            }

            $table[$id] = $font;

            // Also register the font under its cleaned id (digits, '.', '-', '_' only).
            $cleanedId = preg_replace('/[^0-9\.\-_]/', '', $id);
            if ('' != $cleanedId) {
                $table[$cleanedId] = $font;
            }
        }

        return $this->fonts = $table;
    }
98
99 28
    /**
     * Returns the font registered under the given resource name.
     *
     * According to the PDF specs (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 238)
     * "The font resource name presented to the Tf operator is arbitrary, as are the names for all kinds of resources".
     * The unfiltered name is therefore tried first; the name reduced to the
     * characters 0-9, '.', '-' and '_' is only used as a fallback.
     *
     * @param string $id font resource name as it appears in the content stream
     *
     * @return Font|null the matching font, or null when neither lookup succeeds
     */
    public function getFont(string $id): ?Font
    {
        $fonts = $this->getFonts();

        if (isset($fonts[$id])) {
            return $fonts[$id];
        }

        // Fallback: retry with the cleaned id.
        $cleanedId = preg_replace('/[^0-9\.\-_]/', '', $id);

        return $fonts[$cleanedId] ?? null;
    }
122
123
    /**
     * Returns the objects declared in the 'XObject' entry of this page's 'Resources'.
     *
     * Each object is stored under its original resource name and, in addition,
     * under that name reduced to the characters 0-9, '.', '-' and '_' whenever
     * the reduced name is not empty. Once a table has been built it is cached
     * in $this->xobjects and returned directly on later calls.
     *
     * @return PDFObject[] objects keyed by resource name; an empty array when
     *                     the resources expose no usable 'XObject' entry
     */
    public function getXObjects()
    {
        if (null !== $this->xobjects) {
            return $this->xobjects;
        }

        $resources = $this->get('Resources');

        if (!method_exists($resources, 'has') || !$resources->has('XObject')) {
            return [];
        }

        // Resolve the entry once instead of repeating the lookup on every branch.
        $xobjectResource = $resources->get('XObject');

        // Same guard as getFonts(): a missing entry has nothing to enumerate.
        if ($xobjectResource instanceof ElementMissing) {
            return [];
        }

        // The entry is either the dictionary itself or an object wrapping it.
        $xobjects = $xobjectResource instanceof Header
            ? $xobjectResource->getElements()
            : $xobjectResource->getHeader()->getElements();

        $table = [];

        foreach ($xobjects as $id => $xobject) {
            $table[$id] = $xobject;

            // Also register the object under its cleaned id (digits, '.', '-', '_' only).
            $cleanedId = preg_replace('/[^0-9\.\-_]/', '', $id);
            if ('' != $cleanedId) {
                $table[$cleanedId] = $xobject;
            }
        }

        return $this->xobjects = $table;
    }
160
161 4
    /**
     * Returns the XObject registered under the given resource name.
     *
     * Only the name exactly as given is looked up in the table built by
     * getXObjects(); no cleaned-name fallback is attempted here.
     *
     * @param string $id XObject resource name
     *
     * @return PDFObject|null the matching object, or null when there is none
     */
    public function getXObject(string $id): ?PDFObject
    {
        $table = $this->getXObjects();

        return $table[$id] ?? null;
    }
178
179 18
    /**
     * Extracts the text of this page from its 'Contents' entry.
     *
     * When 'Contents' is a PDFObject whose header elements are numerically
     * keyed, or an ElementArray, the individual content streams are first
     * concatenated into one virtual PDFObject, which is then asked for its text.
     *
     * @param self|null $page not used by this implementation
     *
     * @return string the page text, or '' when there is no usable 'Contents'
     */
    public function getText(?self $page = null): string
    {
        $contents = $this->get('Contents');

        if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return '';
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            if (is_numeric(key($elements))) {
                $new_content = '';

                foreach ($elements as $element) {
                    if ($element instanceof ElementXRef) {
                        $new_content .= $element->getObject()->getContent();
                    } else {
                        $new_content .= $element->getContent();
                    }
                }

                $header = new Header([], $this->document);
                $contents = new PDFObject($this->document, $header, $new_content, $this->config);
            }
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content.
            $new_content = '';

            foreach ($contents->getContent() as $content) {
                $new_content .= $content->getContent()."\n";
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        }

        /*
         * Elements referencing each other on the same page can cause endless loops during text parsing.
         * To combat this we keep a recursionStack containing already parsed elements on the page.
         * The stack is emptied here after getting text from a page; the finally block makes sure
         * that also happens when parsing throws, so stale entries cannot leak into the next call.
         */
        try {
            return $contents->getText($this);
        } finally {
            PDFObject::$recursionStack = [];
        }
    }
228
229
    /**
     * Tells whether the document of this page was produced by FPDF/FPDI (setasign\Fpdi\Fpdi).
     *
     * The check looks at the 'Producer' entry of the document details: it must
     * be a string starting with "FPDF" (FPDF writes "FPDF" followed by its version).
     *
     * @return bool true if the 'Producer' detail is a string beginning with "FPDF"
     */
    public function isFpdf(): bool
    {
        // Fetch the details once; all three conditions read the same entry.
        $details = $this->document->getDetails();

        return \array_key_exists('Producer', $details)
            && \is_string($details['Producer'])
            && 0 === strncmp($details['Producer'], 'FPDF', 4);
    }
247
248
    /**
     * Returns the position of this page within the document's page list.
     *
     * The list returned by the document's getPages() is scanned from index 0
     * for an entry identical to this object.
     *
     * @return int the zero-based index of this page; if the page is not found,
     *             the number of pages in the list
     */
    public function getPageNumber(): int
    {
        $pages = $this->document->getPages();
        $total = \count($pages);

        $index = 0;
        while ($index < $total && $pages[$index] !== $this) {
            ++$index;
        }

        return $index;
    }
265
266
    /**
     * Returns the object backing this page in a FPDF/FPDI-generated document.
     *
     * The object is taken from the table returned by getXObjects(), using the
     * value of getPageNumber() as the key.
     *
     * @return PDFObject the PDFObject for the page
     */
    public function getPDFObjectForFpdf(): PDFObject
    {
        $index = $this->getPageNumber();
        $table = $this->getXObjects();

        return $table[$index];
    }
281
282
    /**
     * Builds a new PDFObject from the FPDF/FPDI object of this page.
     *
     * The new object is constructed with the document, header, content and
     * config of the object returned by getPDFObjectForFpdf().
     *
     * @return PDFObject the newly created PDFObject
     */
    public function createPDFObjectForFpdf(): PDFObject
    {
        $source = $this->getPDFObjectForFpdf();

        return new PDFObject(
            $source->document,
            $source->getHeader(),
            $source->getContent(),
            $source->config
        );
    }
299
300
    /**
     * Builds a new Page from the FPDF/FPDI object of this page.
     *
     * The new page is constructed with the document, header, content and
     * config of the object returned by getPDFObjectForFpdf().
     *
     * @return Page the newly created page
     */
    public function createPageForFpdf(): self
    {
        $source = $this->getPDFObjectForFpdf();

        return new self(
            $source->document,
            $source->getHeader(),
            $source->getContent(),
            $source->config
        );
    }
314
315 6
    /**
     * Extracts the text of this page as an array of text fragments.
     *
     * For FPDF/FPDI documents the work is delegated to a PDFObject created from
     * the page's FPDF object. Otherwise the 'Contents' entry is used; when it is
     * a PDFObject whose header elements are numerically keyed, or an
     * ElementArray, the individual content streams are first concatenated into
     * one virtual PDFObject.
     *
     * @param self|null $page not used by this implementation
     *
     * @return array the text fragments, or an empty array when there is no usable 'Contents'
     */
    public function getTextArray(?self $page = null): array
    {
        if ($this->isFpdf()) {
            $pdfObject = $this->getPDFObjectForFpdf();
            $newPdfObject = $this->createPDFObjectForFpdf();

            return $newPdfObject->getTextArray($pdfObject);
        }

        $contents = $this->get('Contents');

        if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return [];
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            if (is_numeric(key($elements))) {
                $new_content = '';

                /** @var PDFObject $element */
                foreach ($elements as $element) {
                    if ($element instanceof ElementXRef) {
                        $new_content .= $element->getObject()->getContent();
                    } else {
                        $new_content .= $element->getContent();
                    }
                }

                $header = new Header([], $this->document);
                $contents = new PDFObject($this->document, $header, $new_content, $this->config);
            } else {
                // Probe the page-aware extraction; if it throws, fall back to
                // extracting without page context.
                // NOTE(review): on success the result of this probe is discarded and
                // the same call is repeated below — looks like it could be returned
                // directly; confirm PDFObject::getTextArray() has no state that a
                // second call depends on before changing it.
                try {
                    $contents->getTextArray($this);
                } catch (\Throwable $e) {
                    return $contents->getTextArray();
                }
            }
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content.
            $new_content = '';

            /** @var PDFObject $content */
            foreach ($contents->getContent() as $content) {
                $new_content .= $content->getContent()."\n";
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        }

        return $contents->getTextArray($this);
    }
371
372
    /**
     * Collects the raw text commands of the page.
     *
     * When the content of 'Contents' is an array, the content of every section
     * is concatenated and split into text sections and commands using this
     * page's own getSectionsText()/getCommandsText(). Otherwise the content
     * object itself (or, for FPDF/FPDI documents, the page's FPDF object) is
     * used, and a synthetic 'BT' command is inserted before the commands of
     * each text section.
     *
     * @return array the list of commands in their internal representation
     */
    public function extractRawData(): array
    {
        $commands = [];
        $content = $this->get('Contents');
        $values = $content->getContent();

        if (\is_array($values)) {
            // Several content streams: merge them before parsing.
            $merged = '';
            foreach ($values as $section) {
                $merged .= $section->getContent();
            }

            foreach ($this->getSectionsText($merged) as $sectionText) {
                foreach ($this->getCommandsText($sectionText) as $command) {
                    $commands[] = $command;
                }
            }

            return $commands;
        }

        if ($this->isFpdf()) {
            $content = $this->getPDFObjectForFpdf();
        }

        foreach ($content->getSectionsText($content->getContent()) as $sectionText) {
            // Mark the start of every text section.
            $commands[] = ['t' => '', 'o' => 'BT', 'c' => ''];

            foreach ($content->getCommandsText($sectionText) as $command) {
                $commands[] = $command;
            }
        }

        return $commands;
    }
414
415
    /**
     * Gets all the decoded text data with its internal representation from a page.
     *
     * The commands are walked in order while tracking the current font (set by
     * Tf/TF, saved by 'q' and restored by 'Q'). The operands of the text-showing
     * operators Tj and TJ are unescaped, converted from ISO-8859-1 to UTF-8 and
     * decoded with the current font.
     *
     * @param array|null $extractedRawData the data returned by extractRawData();
     *                                     when null or empty, extractRawData() is called
     *
     * @return array an array with the data and the internal representation
     */
    public function extractDecodedRawData(?array $extractedRawData = null): array
    {
        if (empty($extractedRawData)) {
            $extractedRawData = $this->extractRawData();
        }

        /** @var Font|null $currentFont */
        $currentFont = null;
        $clippedFont = null;
        $fpdfPage = $this->isFpdf() ? $this->createPageForFpdf() : null;

        // Escape sequences found in PDF literal strings and their replacements.
        $escaped = ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '];
        $unescaped = ['\\', '(', ')', "\n", "\r", "\t", ' '];

        foreach ($extractedRawData as &$command) {
            if ('Tj' == $command['o'] || 'TJ' == $command['o']) {
                $data = $command['c'];

                if (!\is_array($data)) {
                    // NOTE(review): without a current font the operand is replaced by
                    // an empty string here, whereas the array branch below keeps the
                    // raw text — confirm whether that asymmetry is intended.
                    $tmpText = '';
                    if (isset($currentFont)) {
                        $tmpText = $currentFont->decodeOctal($data);
                    }
                    $tmpText = str_replace($escaped, $unescaped, $tmpText);
                    $tmpText = mb_convert_encoding($tmpText, 'UTF-8', 'ISO-8859-1');
                    if (isset($currentFont)) {
                        $tmpText = $currentFont->decodeContent($tmpText);
                    }
                    $command['c'] = $tmpText;
                    continue;
                }

                // Only the entries at even indexes are decoded; odd indexes are left untouched.
                $numText = \count($data);
                for ($i = 0; $i < $numText; $i += 2) {
                    $tmpText = $data[$i]['c'];
                    $decodedText = isset($currentFont) ? $currentFont->decodeOctal($tmpText) : $tmpText;
                    $decodedText = str_replace($escaped, $unescaped, $decodedText);
                    $decodedText = mb_convert_encoding($decodedText, 'UTF-8', 'ISO-8859-1');

                    if (isset($currentFont)) {
                        $decodedText = $currentFont->decodeContent($decodedText);
                    }
                    $command['c'][$i]['c'] = $decodedText;
                }
            } elseif ('Tf' == $command['o'] || 'TF' == $command['o']) {
                $fontId = explode(' ', $command['c'])[0];
                // If document is a FPDI/FPDF the $page has the correct font
                $currentFont = isset($fpdfPage) ? $fpdfPage->getFont($fontId) : $this->getFont($fontId);
            } elseif ('Q' == $command['o']) {
                $currentFont = $clippedFont;
            } elseif ('q' == $command['o']) {
                $clippedFont = $currentFont;
            }
        }
        // Break the reference so $command cannot alias the last element any longer.
        unset($command);

        return $extractedRawData;
    }
490
491
    /**
     * Gets just the text commands that are involved in text positions and
     * the Text Matrix (Tm).
     *
     * The operators kept are BT, ET, TL, Td, TD, Tm, T*, Tj, ', " and TJ. The
     * font operators Tf/TF are kept only when the configuration's
     * getDataTmFontInfoHasToBeIncluded() says so. Every other command is dropped.
     *
     * @param array|null $extractedDecodedRawData the data returned by extractDecodedRawData();
     *                                            when null or empty, extractDecodedRawData() is called
     *
     * @return array the text commands of the page, in their original order
     */
    public function getDataCommands(?array $extractedDecodedRawData = null): array
    {
        if (empty($extractedDecodedRawData)) {
            $extractedDecodedRawData = $this->extractDecodedRawData();
        }

        $extractedData = [];

        foreach ($extractedDecodedRawData as $command) {
            switch ($command['o']) {
                /*
                 * BT
                 * Begin a text object, initializing the Tm and Tlm to the identity matrix.
                 */
                case 'BT':
                /*
                 * ET
                 * End a text object, discarding the text matrix.
                 */
                case 'ET':
                /*
                 * leading TL
                 * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
                 * Initial value: 0
                 */
                case 'TL':
                /*
                 * tx ty Td
                 * Move to the start of the next line, offset from the start of the
                 * current line by tx, ty.
                 */
                case 'Td':
                /*
                 * tx ty TD
                 * Move to the start of the next line, offset from the start of the
                 * current line by tx, ty. As a side effect, this operator sets the leading
                 * parameter in the text state. It has the same effect as the code:
                 * -ty TL
                 * tx ty Td
                 */
                case 'TD':
                /*
                 * a b c d e f Tm
                 * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
                 * all numbers, and the initial value for Tm and Tlm is the identity matrix
                 * [1 0 0 1 0 0]
                 */
                case 'Tm':
                /*
                 * T*
                 * Move to the start of the next line. This operator has the same effect
                 * as the code:
                 * 0 Tl Td
                 * where Tl is the current leading parameter in the text state.
                 */
                case 'T*':
                /*
                 * string Tj
                 * Show a text string.
                 */
                case 'Tj':
                /*
                 * string '
                 * Move to the next line and show a text string. This operator has the
                 * same effect as the code:
                 * T*
                 * string Tj
                 */
                case "'":
                /*
                 * aw ac string "
                 * Move to the next line and show a text string, using aw as the word
                 * spacing and ac as the character spacing. This operator has the same
                 * effect as the code:
                 * aw Tw
                 * ac Tc
                 * string '
                 * Tw sets the word spacing, Tw, to wordSpace.
                 * Tc sets the character spacing, Tc, to charSpace.
                 */
                case '"':
                /*
                 * array TJ
                 * Show one or more text strings, allowing individual glyph positioning.
                 * Each element of the array can be a string or a number. If the element is
                 * a string, this operator shows the string. If it is a number, the
                 * operator adjusts the text position by that amount; that is, it translates
                 * the text matrix, Tm. This amount is subtracted from the current
                 * horizontal or vertical coordinate, depending on the writing mode.
                 * In the default coordinate system, a positive adjustment has the effect
                 * of moving the next glyph painted either to the left or down by the given
                 * amount.
                 */
                case 'TJ':
                    $extractedData[] = $command;
                    break;

                // TODO Tfs

                /*
                 * Tf / TF
                 * Font commands are only passed on when the configuration asks for
                 * font information to be included.
                 */
                case 'Tf':
                case 'TF':
                    if ($this->config->getDataTmFontInfoHasToBeIncluded()) {
                        $extractedData[] = $command;
                    }
                    break;
            }
        }

        return $extractedData;
    }
643
644
    /**
645
     * Gets the Text Matrix of the text in the page
646
     *
647
     * Return an array where every item is an array where the first item is the
648
     * Text Matrix (Tm) and the second is a string with the text data.  The Text matrix
649
     * is an array of 6 numbers. The last 2 numbers are the coordinates X and Y of the
650
     * text. The first 4 numbers has to be with Scalation, Rotation and Skew of the text.
651
     *
652
     * @param array $dataCommands the data extracted by getDataCommands
653
     *                            if null getDataCommands is called
654
     *
655
     * @return array an array with the data of the page including the Tm information
656
     *               of any text in the page
657
     */
658 6
    public function getDataTm(array $dataCommands = null): array
    {
        // Fall back to the page's own commands when none (or an empty list) is given.
        if (empty($dataCommands)) {
            $dataCommands = $this->getDataCommands();
        }

        /*
         * At the beginning of a text object Tm is the identity matrix
         */
        $defaultTm = ['1', '0', '0', '1', '0', '0'];

        /*
         * Text leading used by the T*, ' and " operators
         */
        $defaultTl = 0;

        /*
         * Default values for font data
         */
        $defaultFontId = -1;
        $defaultFontSize = 1;

        /*
         * Positions of the scaling factors and the X,Y-coordinates inside the matrix (Tm)
         */
        $hSc = 0; // horizontal scaling
        $vSc = 3; // vertical scaling
        $x = 4;
        $y = 5;

        /*
         * x,y-coordinates of text space origin in user units
         *
         * These will be assigned the value of the currently printed string
         */
        $Tx = 0;
        $Ty = 0;

        $Tm = $defaultTm;
        $Tl = $defaultTl;
        $fontId = $defaultFontId;
        $fontSize = $defaultFontSize; // reflects fontSize set by Tf or Tfs

        /*
         * Builds one result entry. Every text-showing operator goes through
         * this closure so that the optional font information is appended
         * consistently (previously only Tj and TJ included it).
         */
        $buildEntry = function (array $matrix, $text, $currentFontId, $currentFontSize): array {
            $entry = [$matrix, $text];
            if ($this->config->getDataTmFontInfoHasToBeIncluded()) {
                $entry[] = $currentFontId;
                $entry[] = $currentFontSize;
            }

            return $entry;
        };

        $extractedTexts = $this->getTextArray();
        $extractedData = [];
        foreach ($dataCommands as $command) {
            /*
             * The n-th text-showing operator is paired with the n-th text
             * returned by getTextArray(). Once every text has been used there
             * is nothing left to emit, so stop instead of reading a
             * non-existent index (which raised "undefined array key" warnings
             * on trailing commands such as ET).
             */
            $textIndex = \count($extractedData);
            if (!\array_key_exists($textIndex, $extractedTexts)) {
                break;
            }
            $currentText = $extractedTexts[$textIndex];

            switch ($command['o']) {
                /*
                 * BT
                 * Begin a text object, initializing the Tm and Tlm to identity matrix
                 *
                 * ET
                 * End a text object, discarding the text matrix
                 *
                 * Both reset the complete text state tracked here.
                 */
                case 'BT':
                case 'ET':
                    $Tm = $defaultTm;
                    $Tl = $defaultTl; // review this.
                    $Tx = 0;
                    $Ty = 0;
                    $fontId = $defaultFontId;
                    $fontSize = $defaultFontSize;
                    break;

                /*
                 * leading TL
                 * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
                 * Initial value: 0
                 */
                case 'TL':
                    $Tl = (float) $command['c'];
                    break;

                /*
                 * tx ty Td
                 * Move to the start of the next line, offset from the start of the
                 * current line by tx, ty.
                 */
                case 'Td':
                    $coord = explode(' ', $command['c']);
                    $Tx += (float) $coord[0] * (float) $fontSize * (float) $Tm[$hSc];
                    $Ty += (float) $coord[1] * (float) $fontSize * (float) $Tm[$vSc];
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * tx ty TD
                 * Move to the start of the next line, offset from the start of the
                 * current line by tx, ty. As a side effect, this operator sets the leading
                 * parameter in the text state. This operator has the same effect as the
                 * code:
                 * -ty TL
                 * tx ty Td
                 */
                case 'TD':
                    $coord = explode(' ', $command['c']);
                    $offsetX = (float) $coord[0] * (float) $fontSize * (float) $Tm[$hSc];
                    $offsetY = (float) $coord[1] * (float) $fontSize * (float) $Tm[$vSc];
                    // The leading is the NEGATED vertical offset ("-ty TL"), so
                    // that a later T* ($Ty -= $Tl) keeps moving in the same
                    // direction as this TD did.
                    $Tl = -$offsetY;
                    $Tx += $offsetX;
                    $Ty += $offsetY;
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * a b c d e f Tm
                 * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
                 * all numbers, and the initial value for Tm and Tlm is the identity matrix
                 * [1 0 0 1 0 0]
                 */
                case 'Tm':
                    $Tm = explode(' ', $command['c']);
                    $Tx = (float) $Tm[$x];
                    $Ty = (float) $Tm[$y];
                    break;

                /*
                 * T*
                 * Move to the start of the next line. This operator has the same effect
                 * as the code:
                 * 0 -Tl Td
                 * Where Tl is the current leading parameter in the text state.
                 */
                case 'T*':
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    break;

                /*
                 * string Tj
                 * Show a text string
                 *
                 * array TJ
                 * Show one or more text strings, allowing individual glyph positioning.
                 * Each element of the array can be a string or a number. If the element is
                 * a string, this operator shows the string. If it is a number, the
                 * operator adjusts the text position by that amount.
                 *
                 * Both are recorded the same way: current matrix plus the paired text.
                 */
                case 'Tj':
                case 'TJ':
                    $extractedData[] = $buildEntry($Tm, $currentText, $fontId, $fontSize);
                    break;

                /*
                 * string '
                 * Move to the next line and show a text string. This operator has the
                 * same effect as the code:
                 * T*
                 * string Tj
                 */
                case "'":
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    $extractedData[] = $buildEntry($Tm, $currentText, $fontId, $fontSize);
                    break;

                /*
                 * aw ac string "
                 * Move to the next line and show a text string, using aw as the word
                 * spacing and ac as the character spacing. This operator has the same
                 * effect as the code:
                 * aw Tw
                 * ac Tc
                 * string '
                 * Tw sets the word spacing, Tw, to wordSpace.
                 * Tc sets the character spacing, Tc, to charsSpace.
                 */
                case '"':
                    // NOTE(review): assumes the paired text has the form
                    // "aw ac string" and keeps only the third space-separated
                    // part - TODO confirm against getTextArray().
                    $parts = explode(' ', $currentText);
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    // Fall back to an empty string instead of reading a missing index.
                    $extractedData[] = $buildEntry($Tm, $parts[2] ?? '', $fontId, $fontSize);
                    break;

                case 'Tf':
                    /*
                     * From PDF 1.0 specification, page 106:
                     *     fontname size Tf Set font and size
                     *     Sets the text font and text size in the graphics state. There is no default value for
                     *     either fontname or size; they must be selected using Tf before drawing any text.
                     *     fontname is a resource name. size is a number expressed in text space units.
                     *
                     * Source: https://ia902503.us.archive.org/10/items/pdfy-0vt8s-egqFwDl7L2/PDF%20Reference%201.0.pdf
                     * Introduced with https://github.com/smalot/pdfparser/pull/516
                     */
                    list($fontId, $fontSize) = explode(' ', $command['c'], 2);
                    break;

                default:
            }
        }

        // Cache the result so getTextXY() can reuse it.
        $this->dataTm = $extractedData;

        return $extractedData;
    }
881
882
    /**
883
     * Gets text data that are around the given coordinates (X,Y)
884
     *
885
     * If the text is in near the given coordinates (X,Y) (or the TM info),
886
     * the text is returned. The extractedData returned by getDataTm can be used to see
887
     * where the coordinates of a given text are, using the Tm info for it.
888
     *
889
     * @param float $x      The X value of the coordinate to search for. if null
890
     *                      just the Y value is considered (same Row)
891
     * @param float $y      The Y value of the coordinate to search for. If null
892
     *                      just the X value is considered (same column)
893
     * @param float $xError The value less or more to consider an X to be "near"
894
     * @param float $yError The value less or more to consider an Y to be "near"
895
     *
896
     * @return array An array of text that are near the given coordinates. If no text
897
     *               "near" the x,y coordinate, an empty array is returned. If Both, x
898
     *               and y coordinates are null, an empty array is returned.
899
     */
900 2
    public function getTextXY(float $x = null, float $y = null, float $xError = 0, float $yError = 0): array
    {
        // Build the Tm data on demand when it has not been computed yet
        // (or when the previous computation yielded nothing).
        if (empty($this->dataTm)) {
            $this->getDataTm();
        }

        // Without at least one coordinate there is nothing to search for.
        if (null === $x && null === $y) {
            return [];
        }

        // The parameters are already typed as float, so no further casting
        // is needed before comparing.
        $matches = [];
        foreach ($this->dataTm as $entry) {
            $matrix = $entry[0];
            $string = $entry[1];
            $posX = (float) $matrix[4];
            $posY = (float) $matrix[5];

            // A coordinate that was not given (null) never restricts the
            // search; a given one must lie within its tolerance window.
            $xIsNear = null === $x
                || ($posX >= ($x - $xError) && $posX <= ($x + $xError));
            $yIsNear = null === $y
                || ($posY >= ($y - $yError) && $posY <= ($y + $yError));

            if ($xIsNear && $yIsNear) {
                // Only the matrix and the text are returned, even if the
                // cached entry carries extra elements.
                $matches[] = [$matrix, $string];
            }
        }

        return $matches;
    }
}
953