Passed
Pull Request — master (#698)
by
unknown
02:57
created

Page::setFonts()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 2
c 0
b 0
f 0
nc 2
nop 1
dl 0
loc 4
ccs 3
cts 3
cp 1
crap 2
rs 10
1
<?php
2
3
/**
4
 * @file
5
 *          This file is part of the PdfParser library.
6
 *
7
 * @author  Sébastien MALOT <[email protected]>
8
 *
9
 * @date    2017-01-03
10
 *
11
 * @license LGPLv3
12
 *
13
 * @url     <https://github.com/smalot/pdfparser>
14
 *
15
 *  PdfParser is a pdf library written in PHP, extraction oriented.
16
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17
 *
18
 *  This program is free software: you can redistribute it and/or modify
19
 *  it under the terms of the GNU Lesser General Public License as published by
20
 *  the Free Software Foundation, either version 3 of the License, or
21
 *  (at your option) any later version.
22
 *
23
 *  This program is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU Lesser General Public License for more details.
27
 *
28
 *  You should have received a copy of the GNU Lesser General Public License
29
 *  along with this program.
30
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31
 */
32
33
namespace Smalot\PdfParser;
34
35
use Smalot\PdfParser\Element\ElementArray;
36
use Smalot\PdfParser\Element\ElementMissing;
37
use Smalot\PdfParser\Element\ElementNull;
38
use Smalot\PdfParser\Element\ElementXRef;
39
40
class Page extends PDFObject
41
{
42
    /**
43
     * @var Font[]
44
     */
45
    protected $fonts;
46
47
    /**
48
     * @var PDFObject[]
49
     */
50
    protected $xobjects;
51
52
    /**
53
     * @var array
54
     */
55
    protected $dataTm;
56
57
    /**
58
     * @param array<\Smalot\PdfParser\Font> $fonts
59
     */
60 8
    public function setFonts($fonts)
    {
        // Fonts that are already set (by an earlier call or by getFonts()) are never overwritten.
        if (!empty($this->fonts)) {
            return;
        }

        $this->fonts = $fonts;
    }
66
67
    /**
68
     * @return Font[]
69
     */
70 50
    public function getFonts()
    {
        // Serve the cached table when the fonts were already set or resolved.
        if (null !== $this->fonts) {
            return $this->fonts;
        }

        $resources = $this->get('Resources');

        // has() is probed with method_exists() because the returned element
        // is not guaranteed to provide it. An empty result is not cached.
        if (!method_exists($resources, 'has') || !$resources->has('Font')) {
            return [];
        }

        // Resolve the font entry once instead of once per check below.
        $fontResource = $resources->get('Font');

        if ($fontResource instanceof ElementMissing) {
            return [];
        }

        $fonts = $fontResource instanceof Header
            ? $fontResource->getElements()
            : $fontResource->getHeader()->getElements();

        $table = [];

        foreach ($fonts as $id => $font) {
            if (!$font instanceof Font) {
                continue;
            }

            $table[$id] = $font;

            // Also register the font under its cleaned id, i.e. the id reduced
            // to digits, '.', '-' and '_'. Skipped when nothing is left (or the
            // regex failed and returned null).
            $cleanId = preg_replace('/[^0-9\.\-_]/', '', (string) $id);
            if (null !== $cleanId && '' !== $cleanId) {
                $table[$cleanId] = $font;
            }
        }

        return $this->fonts = $table;
    }
108
109 47
    /**
     * Looks up a font of this page by its resource name.
     *
     * The exact name is tried first. If it is unknown, the name reduced to
     * digits, '.', '-' and '_' is tried as a fallback.
     *
     * @param string $id font resource name
     *
     * @return Font|null the matching font, or null when neither lookup succeeds
     */
    public function getFont(string $id): ?Font
    {
        $fonts = $this->getFonts();

        // According to the PDF specs (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 238)
        // "The font resource name presented to the Tf operator is arbitrary, as are the names for all kinds of resources"
        // Therefore the unfiltered name is searched first ...
        if (isset($fonts[$id])) {
            return $fonts[$id];
        }

        // ... and the cleaned name is only used as a fallback.
        $cleanId = preg_replace('/[^0-9\.\-_]/', '', $id);

        return $fonts[$cleanId] ?? null;
    }
132
133
    /**
134
     * Support for XObject
135
     *
136
     * @return PDFObject[]
137
     */
138 16
    public function getXObjects()
    {
        // Serve the cached table when the XObjects were already resolved.
        if (null !== $this->xobjects) {
            return $this->xobjects;
        }

        $resources = $this->get('Resources');

        // has() is probed with method_exists() because the returned element
        // is not guaranteed to provide it. An empty result is not cached.
        if (!method_exists($resources, 'has') || !$resources->has('XObject')) {
            return [];
        }

        // Resolve the XObject entry once instead of once per check below.
        $xobjectResource = $resources->get('XObject');

        // Same guard as getFonts(): a missing element carries no header to read.
        if ($xobjectResource instanceof ElementMissing) {
            return [];
        }

        $xobjects = $xobjectResource instanceof Header
            ? $xobjectResource->getElements()
            : $xobjectResource->getHeader()->getElements();

        $table = [];

        foreach ($xobjects as $id => $xobject) {
            $table[$id] = $xobject;

            // Also register the object under its cleaned id, i.e. the id reduced
            // to digits, '.', '-' and '_'. Skipped when nothing is left (or the
            // regex failed and returned null).
            $cleanId = preg_replace('/[^0-9\.\-_]/', '', (string) $id);
            if (null !== $cleanId && '' !== $cleanId) {
                $table[$cleanId] = $xobject;
            }
        }

        return $this->xobjects = $table;
    }
170
171 15
    /**
     * Looks up an XObject of this page by its key in the table returned by getXObjects().
     *
     * Only the exact key is tried; the id is not cleaned before the lookup.
     *
     * @param string $id key of the XObject
     *
     * @return PDFObject|null the matching object, or null when the key is unknown
     */
    public function getXObject(string $id): ?PDFObject
    {
        return $this->getXObjects()[$id] ?? null;
    }
188
189 35
    /**
     * Extracts the text of this page from its 'Contents' entry.
     *
     * @param self|null $page not used by this implementation
     *
     * @return string the page text, or an empty string when the page has no
     *                'Contents' entry or the entry is missing/null
     */
    public function getText(?self $page = null): string
    {
        $contents = $this->get('Contents');

        if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return '';
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            // A numerically keyed header is treated as a list of content parts,
            // which are concatenated into one virtual content object.
            if (is_numeric(key($elements))) {
                $new_content = '';

                foreach ($elements as $element) {
                    $new_content .= $element instanceof ElementXRef
                        ? $element->getObject()->getContent()
                        : $element->getContent();
                }

                $header = new Header([], $this->document);
                $contents = new PDFObject($this->document, $header, $new_content, $this->config);
            }
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content.
            $new_content = '';

            foreach ($contents->getContent() as $content) {
                $new_content .= $content->getContent()."\n";
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        }

        /*
         * Elements referencing each other on the same page can cause endless loops during text parsing.
         * To combat this we keep a recursionStack containing already parsed elements on the page.
         * The stack is emptied here after getting text from a page; doing it in "finally" makes sure
         * a failed extraction does not leave stale entries behind for the next page.
         */
        try {
            return $contents->getText($this);
        } finally {
            PDFObject::$recursionStack = [];
        }
    }
238
239
    /**
240
     * Return true if the current page is a (setasign\Fpdi\Fpdi) FPDI/FPDF document
241
     *
242
     * The metadata 'Producer' should have the value of "FPDF" . FPDF_VERSION if the
243
     * pdf file was generated by FPDF/FPDI.
244
     *
245
     * @return bool true if the current page is a FPDI/FPDF document
246
     */
247 13
    public function isFpdf(): bool
    {
        // Without a document there is no metadata to inspect.
        if (null === $this->document) {
            return false;
        }

        // Fetch the details once instead of once per condition.
        $details = $this->document->getDetails();

        // FPDF/FPDI is detected by a string 'Producer' entry starting with "FPDF".
        return \array_key_exists('Producer', $details)
            && \is_string($details['Producer'])
            && 0 === strncmp($details['Producer'], 'FPDF', 4);
    }
257
258
    /**
259
     * Return the page number of the PDF document of the page object
260
     *
261
     * @return int the page number
262
     */
263 2
    public function getPageNumber(): int
    {
        $pages = $this->document->getPages();
        $total = \count($pages);

        // Walk the page list until this very instance is found. The result is
        // the zero-based position, or the page count when no entry matches.
        $index = 0;
        while ($index < $total && $pages[$index] !== $this) {
            ++$index;
        }

        return $index;
    }
275
276
    /**
277
     * Return the Object of the page if the document is a FPDF/FPDI document
278
     *
279
     * If the document was generated by FPDF/FPDI it returns the
280
     * PDFObject of the given page
281
     *
282
     * @return PDFObject The PDFObject for the page
283
     */
284 1
    public function getPDFObjectForFpdf(): PDFObject
    {
        // The XObject table is indexed with the value returned by getPageNumber().
        $index = $this->getPageNumber();
        $candidates = $this->getXObjects();

        return $candidates[$index];
    }
291
292
    /**
293
     * Return a new PDFObject of the document created with FPDF/FPDI
294
     *
295
     * For a document generated by FPDF/FPDI, it generates a
296
     * new PDFObject for that document
297
     *
298
     * @return PDFObject The PDFObject
299
     */
300 1
    public function createPDFObjectForFpdf(): PDFObject
    {
        $source = $this->getPDFObjectForFpdf();

        // The new object reuses the document, header, content and config of the source object.
        $body = $source->getContent();
        $head = $source->getHeader();
        $settings = $source->config;

        return new PDFObject($source->document, $head, $body, $settings);
    }
309
310
    /**
311
     * Return page if document is a FPDF/FPDI document
312
     *
313
     * @return Page The page
314
     */
315 1
    public function createPageForFpdf(): self
    {
        $source = $this->getPDFObjectForFpdf();

        // The new page reuses the document, header, content and config of the source object.
        $body = $source->getContent();
        $head = $source->getHeader();
        $settings = $source->config;

        return new self($source->document, $head, $body, $settings);
    }
324
325 8
    /**
     * Extracts the text of this page as an array.
     *
     * FPDF/FPDI documents are read from the page's XObject; every other
     * document is read from the page's 'Contents' entry.
     *
     * @param self|null $page not used by this implementation
     *
     * @return array the text pieces, or an empty array when the page has no
     *               'Contents' entry or the entry is missing/null
     */
    public function getTextArray(?self $page = null): array
    {
        if ($this->isFpdf()) {
            $pdfObject = $this->getPDFObjectForFpdf();

            return $this->createPDFObjectForFpdf()->getTextArray($pdfObject);
        }

        $contents = $this->get('Contents');

        if (!$contents || $contents instanceof ElementMissing || $contents instanceof ElementNull) {
            return [];
        }

        if ($contents instanceof PDFObject) {
            $elements = $contents->getHeader()->getElements();

            if (!is_numeric(key($elements))) {
                // Return the first successful result directly instead of
                // discarding it and parsing the same content a second time.
                try {
                    return $contents->getTextArray($this);
                } catch (\Throwable $e) {
                    // Best-effort fallback: retry without the page context.
                    return $contents->getTextArray();
                }
            }

            // A numerically keyed header is treated as a list of content parts,
            // which are concatenated into one virtual content object.
            $new_content = '';

            foreach ($elements as $element) {
                $new_content .= $element instanceof ElementXRef
                    ? $element->getObject()->getContent()
                    : $element->getContent();
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        } elseif ($contents instanceof ElementArray) {
            // Create a virtual global content.
            $new_content = '';

            foreach ($contents->getContent() as $content) {
                $new_content .= $content->getContent()."\n";
            }

            $header = new Header([], $this->document);
            $contents = new PDFObject($this->document, $header, $new_content, $this->config);
        }

        return $contents->getTextArray($this);
    }
381
382
    /**
383
     * Gets all the text data with its internal representation of the page.
384
     *
385
     * Returns an array with the data and the internal representation
386
     */
387 12
    public function extractRawData(): array
    {
        $content = $this->get('Contents');
        $values = $content->getContent();

        if (\is_array($values)) {
            // Several content parts: join their text and let this page split it.
            $text = '';
            foreach ($values as $section) {
                $text .= $section->getContent();
            }
            $parser = $this;
        } else {
            // Single content object. For FPDF/FPDI documents the object is
            // taken from getPDFObjectForFpdf() instead of 'Contents'.
            if ($this->isFpdf()) {
                $content = $this->getPDFObjectForFpdf();
            }
            $parser = $content;
            $text = $content->getContent();
        }

        // Split the text into sections, each section into commands, and
        // collect all commands in one flat list.
        $extractedData = [];
        foreach ($parser->getSectionsText($text) as $sectionText) {
            foreach ($parser->getCommandsText($sectionText) as $command) {
                $extractedData[] = $command;
            }
        }

        return $extractedData;
    }
422
423
    /**
424
     * Gets all the decoded text data with its internal representation from a page.
425
     *
426
     * @param array $extractedRawData the extracted data returned by extractRawData or
427
     *                                null if extractRawData should be called
428
     *
429
     * @return array An array with the data and the internal representation
430
     */
431 11
    public function extractDecodedRawData(?array $extractedRawData = null): array
    {
        // null and an empty array both mean "extract the raw data first".
        if (empty($extractedRawData)) {
            $extractedRawData = $this->extractRawData();
        }

        /** @var Font|null $currentFont font selected by the most recent Tf/TF command */
        $currentFont = null;
        // Font remembered at the last 'q' and restored at the next 'Q'.
        // NOTE(review): a single slot, not a stack, so nested q/Q pairs restore
        // the innermost saved font only — confirm this is sufficient.
        $clippedFont = null;
        // If the document is a FPDI/FPDF document, this page holds the correct fonts.
        $fpdfPage = $this->isFpdf() ? $this->createPageForFpdf() : null;

        // Shared tail of the decoding pipeline: resolve escape sequences,
        // convert ISO-8859-1 to UTF-8, then apply the font's own decoding.
        $finishDecoding = static function ($text, ?Font $font) {
            $text = str_replace(
                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\ '],
                ['\\', '(', ')', "\n", "\r", "\t", ' '],
                $text
            );
            $text = mb_convert_encoding($text, 'UTF-8', 'ISO-8859-1');

            return null !== $font ? $font->decodeContent($text) : $text;
        };

        foreach ($extractedRawData as &$command) {
            // Presumably always a string operator name — TODO confirm against getCommandsText().
            $operator = $command['o'];

            if ('Tj' === $operator || 'TJ' === $operator) {
                $data = $command['c'];

                if (!\is_array($data)) {
                    // NOTE(review): without a current font the operand becomes an empty
                    // string here, while the array form below keeps the raw text —
                    // confirm this asymmetry is intended.
                    $text = null !== $currentFont ? $currentFont->decodeOctal($data) : '';
                    $command['c'] = $finishDecoding($text, $currentFont);
                    continue;
                }

                // Only the entries at even positions are decoded; odd ones are left untouched.
                for ($i = 0, $numText = \count($data); $i < $numText; $i += 2) {
                    $text = $data[$i]['c'];
                    if (null !== $currentFont) {
                        $text = $currentFont->decodeOctal($text);
                    }
                    $command['c'][$i]['c'] = $finishDecoding($text, $currentFont);
                }
            } elseif ('Tf' === $operator || 'TF' === $operator) {
                // The font id is the first space-separated token of the operand.
                $fontId = explode(' ', $command['c'])[0];
                $currentFont = null !== $fpdfPage ? $fpdfPage->getFont($fontId) : $this->getFont($fontId);
            } elseif ('Q' === $operator) {
                $currentFont = $clippedFont;
            } elseif ('q' === $operator) {
                $clippedFont = $currentFont;
            }
        }
        // Break the reference so later writes to $command cannot alter the last entry.
        unset($command);

        return $extractedRawData;
    }
498
499
    /**
500
     * Gets just the Text commands that are involved in text positions and
501
     * Text Matrix (Tm)
502
     *
503
     * It extract just the PDF commands that are involved with text positions, and
504
     * the Text Matrix (Tm). These are: BT, ET, TL, Td, TD, Tm, T*, Tj, ', ", and TJ
505
     *
506
     * @param array $extractedDecodedRawData The data extracted by extractDecodedRawData.
507
     *                                       If it is null, the method extractDecodedRawData is called.
508
     *
509
     * @return array An array with the text command of the page
510
     */
511 9
    public function getDataCommands(?array $extractedDecodedRawData = null): array
512
    {
513 9
        if (!isset($extractedDecodedRawData) || !$extractedDecodedRawData) {
0 ignored issues
show
Bug Best Practice introduced by
The expression $extractedDecodedRawData of type array is implicitly converted to a boolean; are you sure this is intended? If so, consider using empty($expr) instead to make it clear that you intend to check for an array without elements.

This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.

Consider making the comparison explicit by using empty(..) or ! empty(...) instead.

Loading history...
514 9
            $extractedDecodedRawData = $this->extractDecodedRawData();
515
        }
516 9
        $extractedData = [];
517 9
        foreach ($extractedDecodedRawData as $command) {
518 9
            switch ($command['o']) {
519
                /*
520
                 * BT
521
                 * Begin a text object, inicializind the Tm and Tlm to identity matrix
522
                 */
523 9
                case 'BT':
524 9
                    $extractedData[] = $command;
525 9
                    break;
526
527
                    /*
528
                     * ET
529
                     * End a text object, discarding the text matrix
530
                     */
531 9
                case 'ET':
532 9
                    $extractedData[] = $command;
533 9
                    break;
534
535
                    /*
536
                     * leading TL
537
                     * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
538
                     * Initial value: 0
539
                     */
540 9
                case 'TL':
541 5
                    $extractedData[] = $command;
542 5
                    break;
543
544
                    /*
545
                     * tx ty Td
546
                     * Move to the start of the next line, offset form the start of the
547
                     * current line by tx, ty.
548
                     */
549 9
                case 'Td':
550 9
                    $extractedData[] = $command;
551 9
                    break;
552
553
                    /*
554
                     * tx ty TD
555
                     * Move to the start of the next line, offset form the start of the
556
                     * current line by tx, ty. As a side effect, this operator set the leading
557
                     * parameter in the text state. This operator has the same effect as the
558
                     * code:
559
                     * -ty TL
560
                     * tx ty Td
561
                     */
562 9
                case 'TD':
563
                    $extractedData[] = $command;
564
                    break;
565
566
                    /*
567
                     * a b c d e f Tm
568
                     * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
569
                     * all numbers, and the initial value for Tm and Tlm is the identity matrix
570
                     * [1 0 0 1 0 0]
571
                     */
572 9
                case 'Tm':
573 7
                    $extractedData[] = $command;
574 7
                    break;
575
576
                    /*
577
                     * T*
578
                     * Move to the start of the next line. This operator has the same effect
579
                     * as the code:
580
                     * 0 Tl Td
581
                     * Where Tl is the current leading parameter in the text state.
582
                     */
583 9
                case 'T*':
584 5
                    $extractedData[] = $command;
585 5
                    break;
586
587
                    /*
588
                     * string Tj
589
                     * Show a Text String
590
                     */
591 9
                case 'Tj':
592 8
                    $extractedData[] = $command;
593 8
                    break;
594
595
                    /*
596
                     * string '
597
                     * Move to the next line and show a text string. This operator has the
598
                     * same effect as the code:
599
                     * T*
600
                     * string Tj
601
                     */
602 9
                case "'":
603
                    $extractedData[] = $command;
604
                    break;
605
606
                    /*
607
                     * aw ac string "
608
                     * Move to the next lkine and show a text string, using aw as the word
609
                     * spacing and ac as the character spacing. This operator has the same
610
                     * effect as the code:
611
                     * aw Tw
612
                     * ac Tc
613
                     * string '
614
                     * Tw set the word spacing, Tw, to wordSpace.
615
                     * Tc Set the character spacing, Tc, to charsSpace.
616
                     */
617 9
                case '"':
618
                    $extractedData[] = $command;
619
                    break;
620
621 9
                case 'Tf':
622 9
                case 'TF':
623 9
                    $extractedData[] = $command;
624 9
                    break;
625
626
                    /*
627
                     * array TJ
628
                     * Show one or more text strings, allowing individual glyph positioning.
629
                     * Each element of array can be a string or a number. If the element is
630
                     * a string, this operator shows the string. If it is a number, the
631
                     * operator adjust the text position by that amount; that is, it translates
632
                     * the text matrix, Tm. This amount is subtracted from the current
633
                     * horizontal or vertical coordinate, depending on the writing mode.
634
                     * in the default coordinate system, a positive adjustment has the effect
635
                     * of moving the next glyph painted either to the left or down by the given
636
                     * amount.
637
                     */
638 9
                case 'TJ':
639 9
                    $extractedData[] = $command;
640 9
                    break;
641
                default:
642
            }
643
        }
644
645 9
        return $extractedData;
646
    }
647
648
    /**
     * Gets the Text Matrix of the text in the page
     *
     * Walks the text commands of the page while tracking the text state (text
     * matrix, leading, font id and font size). For every text-showing operator
     * (Tj, TJ, ' and ") one item is appended to the result:
     *
     *   [0] the Text Matrix (Tm) in effect, an array of 6 numeric strings. The
     *       last 2 numbers are the X and Y coordinates of the text. The first 4
     *       numbers have to do with scaling, rotation and skew of the text.
     *   [1] a string with the text data, taken from getTextArray() in order of
     *       appearance.
     *   [2] the font id and [3] the font size, only when the configuration
     *       reports that font info has to be included.
     *
     * The result is also stored in $this->dataTm.
     *
     * @param array|null $dataCommands the data extracted by getDataCommands;
     *                                 if null or empty getDataCommands is called
     *
     * @return array an array with the data of the page including the Tm information
     *               of any text in the page
     */
    public function getDataTm(?array $dataCommands = null): array
    {
        if (empty($dataCommands)) {
            $dataCommands = $this->getDataCommands();
        }

        /*
         * At the beginning of a text object Tm is the identity matrix
         */
        $defaultTm = ['1', '0', '0', '1', '0', '0'];

        /*
         * Default text leading, used by the T*, ' and " operators
         */
        $defaultTl = 0;

        /*
         * Indexes of horizontal/vertical scaling and X,Y-coordinates in the matrix (Tm)
         */
        $hSc = 0; // horizontal scaling
        /**
         * index of vertical scaling in the array that encodes the text matrix.
         * for more information: https://github.com/smalot/pdfparser/pull/559#discussion_r1053415500
         */
        $vSc = 3;
        $x = 4;
        $y = 5;

        /*
         * x,y-coordinates of text space origin in user units
         *
         * These will be assigned the value of the currently printed string
         */
        $Tx = 0;
        $Ty = 0;

        $Tm = $defaultTm;
        $Tl = $defaultTl;

        /*
         * Default values for font data; replaced by the operands of Tf
         */
        $fontId = -1;
        $fontSize = 1;

        /*
         * The configuration object may be missing; in that case no font info is
         * added instead of failing with a call on null. The flag does not change
         * while the commands are processed, so it is read only once.
         */
        $includeFontInfo = null !== $this->config
            && $this->config->getDataTmFontInfoHasToBeIncluded();

        $extractedTexts = $this->getTextArray();
        $extractedData = [];
        foreach ($dataCommands as $command) {
            // If we've used up all the texts from getTextArray(), exit
            // so we aren't accessing non-existent array indices
            // Fixes 'undefined array key' errors in Issues #575, #576
            if (\count($extractedTexts) <= \count($extractedData)) {
                break;
            }
            $currentText = $extractedTexts[\count($extractedData)];

            // Set by the text-showing operators; the row itself is built once,
            // after the switch, so that all of them produce the same row shape.
            $showsText = false;
            $shownText = null;

            switch ($command['o']) {
                /*
                 * BT
                 * Begin a text object, initializing the Tm and Tlm to identity matrix
                 */
                case 'BT':
                    $Tm = $defaultTm;
                    $Tl = $defaultTl;
                    $Tx = 0;
                    $Ty = 0;
                    break;

                    /*
                     * ET
                     * End a text object
                     */
                case 'ET':
                    break;

                    /*
                     * text leading TL
                     * Set the text leading, Tl, to leading. Tl is used by the T*, ' and " operators.
                     * Initial value: 0
                     */
                case 'TL':
                    // scaled text leading
                    $Tl = (float) $command['c'] * (float) $Tm[$vSc];
                    break;

                    /*
                     * tx ty Td
                     * Move to the start of the next line, offset from the start of the
                     * current line by tx, ty.
                     *
                     * tx ty TD
                     * Same move, but as a side effect this operator also sets the leading
                     * parameter in the text state. It has the same effect as the code:
                     * -ty TL
                     * tx ty Td
                     */
                case 'Td':
                case 'TD':
                    $coord = explode(' ', $command['c']);
                    $offsetX = (float) $coord[0] * (float) $Tm[$hSc];
                    // a missing ty operand counts as 0 instead of reading an undefined index
                    $offsetY = (float) ($coord[1] ?? 0) * (float) $Tm[$vSc];
                    if ('TD' === $command['o']) {
                        $Tl = -$offsetY;
                    }
                    $Tx += $offsetX;
                    $Ty += $offsetY;
                    $Tm[$x] = (string) $Tx;
                    $Tm[$y] = (string) $Ty;
                    break;

                    /*
                     * a b c d e f Tm
                     * Set the text matrix, Tm, and the text line matrix, Tlm. The operands are
                     * all numbers, and the initial value for Tm and Tlm is the identity matrix
                     * [1 0 0 1 0 0]
                     */
                case 'Tm':
                    // A truncated operand list is padded with '0' so the matrix always
                    // has the 6 entries that are read here and by the other operators.
                    $Tm = array_pad(explode(' ', $command['c']), 6, '0');
                    $Tx = (float) $Tm[$x];
                    $Ty = (float) $Tm[$y];
                    break;

                    /*
                     * T*
                     * Move to the start of the next line. This operator has the same effect
                     * as the code:
                     * 0 Tl Td
                     * Where Tl is the current leading parameter in the text state.
                     */
                case 'T*':
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    break;

                    /*
                     * string Tj
                     * Show a Text String
                     */
                case 'Tj':
                    $showsText = true;
                    $shownText = $currentText;
                    break;

                    /*
                     * string '
                     * Move to the next line and show a text string. This operator has the
                     * same effect as the code:
                     * T*
                     * string Tj
                     */
                case "'":
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    $showsText = true;
                    $shownText = $currentText;
                    break;

                    /*
                     * aw ac string "
                     * Move to the next line and show a text string, using aw as the word
                     * spacing and ac as the character spacing. This operator has the same
                     * effect as the code:
                     * aw Tw
                     * ac Tc
                     * string '
                     * Tw set the word spacing, Tw, to wordSpace.
                     * Tc Set the character spacing, Tc, to charsSpace.
                     */
                case '"':
                    $Ty -= $Tl;
                    $Tm[$y] = (string) $Ty;
                    // NOTE(review): this assumes the text is laid out as "aw ac string";
                    // confirm against getTextArray(). When there is no third token the
                    // text is used as is instead of reading an undefined index.
                    $parts = explode(' ', $currentText);
                    $showsText = true;
                    $shownText = $parts[2] ?? $currentText;
                    break;

                case 'Tf':
                    /*
                     * From PDF 1.0 specification, page 106:
                     *     fontname size Tf Set font and size
                     *     Sets the text font and text size in the graphics state. There is no default value for
                     *     either fontname or size; they must be selected using Tf before drawing any text.
                     *     fontname is a resource name. size is a number expressed in text space units.
                     *
                     * Source: https://ia902503.us.archive.org/10/items/pdfy-0vt8s-egqFwDl7L2/PDF%20Reference%201.0.pdf
                     * Introduced with https://github.com/smalot/pdfparser/pull/516
                     */
                    // padded so that a missing size operand yields null without a warning
                    list($fontId, $fontSize) = array_pad(explode(' ', $command['c'], 2), 2, null);
                    break;

                    /*
                     * array TJ
                     * Show one or more text strings, allowing individual glyph positioning.
                     * Each element of array can be a string or a number. If the element is
                     * a string, this operator shows the string. If it is a number, the
                     * operator adjusts the text position by that amount; that is, it translates
                     * the text matrix, Tm. This amount is subtracted from the current
                     * horizontal or vertical coordinate, depending on the writing mode.
                     * In the default coordinate system, a positive adjustment has the effect
                     * of moving the next glyph painted either to the left or down by the given
                     * amount.
                     */
                case 'TJ':
                    $showsText = true;
                    $shownText = $currentText;
                    break;

                default:
                    // every other operator leaves the tracked text state untouched
                    break;
            }

            if ($showsText) {
                $data = [$Tm, $shownText];
                if ($includeFontInfo) {
                    $data[] = $fontId;
                    $data[] = $fontSize;
                }
                $extractedData[] = $data;
            }
        }
        $this->dataTm = $extractedData;

        return $extractedData;
    }
888
889
    /**
     * Gets the text data located at, or near, the given coordinates (X,Y)
     *
     * The items collected by getDataTm() are scanned; the entries 4 and 5 of the
     * text matrix of an item are taken as the X and Y position of its text. An
     * item is kept when its position lies within [$x - $xError, $x + $xError] and
     * [$y - $yError, $y + $yError]. An axis whose coordinate is null is not
     * compared at all.
     *
     * If no data has been collected yet, getDataTm() is called first.
     *
     * @param float|null $x      The X value of the coordinate to search for. If null,
     *                           just the Y value is considered (same row)
     * @param float|null $y      The Y value of the coordinate to search for. If null,
     *                           just the X value is considered (same column)
     * @param float      $xError The value less or more to consider an X to be "near"
     * @param float      $yError The value less or more to consider a Y to be "near"
     *
     * @return array An array of [text matrix, text] pairs that are near the given
     *               coordinates. If no text is "near" the x,y coordinate, or if both
     *               x and y are null, an empty array is returned.
     */
    public function getTextXY(?float $x = null, ?float $y = null, float $xError = 0, float $yError = 0): array
    {
        if (empty($this->dataTm)) {
            $this->getDataTm();
        }

        // without any coordinate there is nothing to search for
        if (null === $x && null === $y) {
            return [];
        }

        $nearbyTexts = [];
        foreach ($this->dataTm as $entry) {
            $matrix = $entry[0];
            $content = $entry[1];
            $posX = (float) $matrix[4];
            $posY = (float) $matrix[5];

            // a null coordinate means "do not restrict this axis"
            $xIsNear = null === $x
                || ($posX >= $x - $xError && $posX <= $x + $xError);
            $yIsNear = null === $y
                || ($posY >= $y - $yError && $posY <= $y + $yError);

            if ($xIsNear && $yIsNear) {
                $nearbyTexts[] = [$matrix, $content];
            }
        }

        return $nearbyTexts;
    }
959
}
960