ParserTest::testChangedFontSpaceLimit() - Code Metrics - Inspection of "Might avoid invalid characters message" - smalot/pdfparser - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#510)

by Jeremy

created 2022-01-31 13:28 UTC

ParserTest::testChangedFontSpaceLimit() A

↳ Parent: ParserTest

Complexity

Conditions	1
Paths	1

Size

Total Lines	11
Code Lines	6

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	6
nc	1
nop	0
dl	0
loc	11
rs	10
c	0
b	0
f	0

<?php

/**
 * @file This file is part of the PdfParser library.
 *
 * @author  Konrad Abicht <[email protected]>
 * @date    2020-06-01
 *
 * @author  Sébastien MALOT <[email protected]>
 * @date    2017-01-03
 *
 * @license LGPLv3
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */

namespace Tests\Smalot\PdfParser\Integration;

use Exception;
use Smalot\PdfParser\Config;
use Smalot\PdfParser\Document;
use Smalot\PdfParser\Parser;
use Smalot\PdfParser\XObject\Image;
use Tests\Smalot\PdfParser\TestCase;

class ParserTest extends TestCase
{
    protected function setUp(): void
    {
        // Run the parent's setUp before adding this class's own fixture.
        parent::setUp();

        // Tests that need a custom configuration overwrite this default-constructed parser.
        $parser = new Parser();
        $this->fixture = $parser;
    }

    /**
     * Notice: it may fail to run in Scrutinizer because of memory limitations.
     *
     * @group memory-heavy
     */
    public function testParseFile(): void
    {
        // Smoke test: every PDF in samples/bugs must parse into at least one page,
        // and every page must yield non-empty text. Only two known failure kinds
        // are tolerated (see the catch block below).
        $directory = $this->rootDir.'/samples/bugs';

        // Nothing to check when the sample directory is not available.
        if (!is_dir($directory)) {
            return;
        }

        // scandir() returns false on failure; fall back to an empty list.
        $files = scandir($directory) ?: [];

        foreach ($files as $file) {
            if (!preg_match('/^.*\.pdf$/i', $file)) {
                continue;
            }

            try {
                $document = $this->fixture->parseFile($directory.'/'.$file);
                $pages = $document->getPages();
                $this->assertTrue(0 < \count($pages), 'No pages found in '.$file);

                foreach ($pages as $page) {
                    $content = $page->getText();
                    $this->assertTrue(0 < \strlen($content), 'Empty page text in '.$file);
                }
            } catch (Exception $e) {
                $message = $e->getMessage();

                $isSecuredPdf = 'Secured pdf file are currently not supported.' === $message;
                // Strict comparison is required: strpos() returns false when the needle is
                // missing, and the previous loose check (0 != false) evaluated to false,
                // which silently swallowed almost every exception caught here.
                $isTcpdfParserError = 0 === strpos($message, 'TCPDF_PARSER');

                if (!$isSecuredPdf && !$isTcpdfParserError) {
                    throw $e;
                }
            }
        }
    }

    /**
     * Properly decode international unicode characters
     *
     * @todo the other languages in the test document need work because of issues with UTF-16 decoding (Chinese, Japanese) and missing right-to-left language support
     */
    public function testUnicodeDecoding(): void
    {
        $document = $this->fixture->parseFile($this->rootDir.'/samples/InternationalChars.pdf');

        // Expected excerpts, keyed by script; each must appear verbatim in the extracted text.
        $expectedByScript = [
            'cyrillic' => "Лорем ипсум долор сит амет, еу сед либрис долорем инцоррупте. Ут лорем долоре граеце хис, модо \nаппареат сапиентем ут мел. Хис ат лаборе омнесяуе сигниферумяуе, тале анциллае ан еум, ех сед синт \nнобис. Сед модус вивендо цопиосае еа, сапиентем цонцептам хис не, яуандо сплендиде еум те.",
            'greek' => "Λορεμ ιπσθμ δολορ σιτ αμετ, τατιον cονστιτθαμ ομιτταντθρ εα σεα, αθδιαμ μανδαμθσ μελ τε. Διcο μθτατ \nινδοcτθμ εοσ ει, ει vιξ σονετ παρτιενδο ινcορρθπτε. Επιcθρι αντιοπαμ εθ νεc, ναμ λεγιμθσ γθβεργρεν ιν. \nVιξ σολετ ρεcτεqθε εα, ηασ νο αλιqθαμ μινιμθμ. Ιδ προ περcιπιτ περιcθλισ δετερρθισσετ, ιν νεc αππετερε \nομιτταντθρ ελοqθεντιαμ, ορατιο δοcτθσ ναμ αδ. Ετ σιτ σολθμ ρεcθσαβο, vιξ θτ λοβορτισ σπλενδιδε \nρεπθδιανδαε.",
            'armenian' => "լոռեմ իպսում դոլոռ սիթ ամեթ վիս ին իմպեդիթ ադմոդում ծու ապպառեաթ սծռիպսեռիթ մել մել եթ \nդոմինգ ծոնսեքուունթուռ ծիվիբուս վիվենդում պռոդեսսեթ ադ մեի թիբիքուե ապպառեաթ սիմիլիքուե թե \nվիմ վիխ ծասե սեմպեռ դոլոռեմ եխ եամ եա սթեթ մեդիոծռեմ ծոնսեթեթուռ ռաթիոնիբուս ինթելլեգամ \nմել թե",
            'georgean' => "ლორემ იფსუმ დოლორ სით ამეთ ესთ ეთ სონეთ ზრილ მელიუს ელიგენდი თორყუათოს \nელოყუენთიამ ესთ ეხ უსუ ფალლი ალთერა ცეთეროს ინ ეთ ომითთამ თრაცთათოს ჰის ეუ ველ \nალთერუმ ვოლუფთათუმ მაზიმ ფერთინახ ჰენდრერით ინ ფრი ნეც ინ თემფორ ფეთენთიუმ ვერო \nფოსთულანთ ელოყუენთიამ უსუ ნე ან ყუი ლიბერ ეფიცური ასსუევერით იდ ნიბჰ ყუას ჰაბემუს სეა",
            'korean' => "그 임기는 4년으로 하며. 이 경우 그 명령에 의하여 개정 또는 폐지되었던 법률은 그 명령이 승인을 얻지 못한 때부터 당연히 효력을 \n회복한다. 가부동수인 때에는 부결된 것으로 본다. 법률과 적법한 절차에 의하지 아니하고는 처벌·보안처분 또는 강제노역을 받지 \n아니한다.",
            'western' => 'ÄÖÜöäüßẞ Ññ¡¿ øÅå',
        ];

        foreach ($expectedByScript as $expectedExcerpt) {
            $this->assertStringContainsString($expectedExcerpt, $document->getText());
        }
    }

    /**
     * Tests that xrefs with line breaks between id and position are parsed correctly
     *
     * @see https://github.com/smalot/pdfparser/issues/336
     */
    public function testIssue19(): void
    {
        // Hand-built parse structure for object "19_0": a dictionary part followed by
        // a stream part whose nested payload carries the reference "17\n0", i.e. id and
        // position separated by a line break.
        $dictionaryPart = [
            '<<',
            [
                ['/', 'Type', 7735],
                ['/', 'ObjStm', 7742],
            ],
        ];
        $streamPart = [
            'stream',
            '',
            7804,
            ["17\n0", []],
        ];

        $parser = new ParserSub();
        $parser->exposedParseObject('19_0', [$dictionaryPart, $streamPart], new Document());

        // The line-broken reference must have been registered under the key 17_0.
        $this->assertArrayHasKey('17_0', $parser->getObjects());
    }

    /**
     * Properly decode ANSI encodings without producing scrambled UTF-8 characters
     *
     * @see https://github.com/smalot/pdfparser/issues/202
     * @see https://github.com/smalot/pdfparser/pull/257
     */
    public function testIssue202(): void
    {
        $document = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue202.pdf');

        // The whole extracted text must match exactly, quotes and umlauts included.
        $extractedText = $document->getText();
        $this->assertEquals('„fööbär“', $extractedText);
    }

    /**
     * Without a proper fix it throws deprecated message like:
     *
     * `Invalid characters passed for attempted conversion, these have been ignored`
     */
    public function testEbook(): void
    {
        $document = $this->fixture->parseFile($this->rootDir.'/samples/ebook_sept2003.pdf');

        // Metadata entries that must come out correctly decoded.
        $expectedDetails = [
            'Title' => 'Ebook3-6-version-pdf.sxw',
            'Author' => 'Communauté Wireless francophone',
        ];

        foreach ($expectedDetails as $detailName => $expectedValue) {
            $this->assertEquals($expectedValue, $document->getDetails()[$detailName]);
        }
    }

    /**
     * Test that issue related pdf can now be parsed
     *
     * @see https://github.com/smalot/pdfparser/issues/267
     */
    public function testIssue267(): void
    {
        $document = $this->fixture->parseFile(
            $this->rootDir.'/samples/bugs/Issue267_array_access_on_int.pdf'
        );

        // Object 128_0 must be recognised as an image XObject.
        $object = $document->getObjectById('128_0');
        $this->assertEquals(Image::class, \get_class($object));

        // Text extraction must still work for this document.
        $text = $document->getText();
        $this->assertStringContainsString('4 von 4', $text);
    }

    /**
     * Test that issue related pdf can now be parsed:
     * Too many slashes were being stripped and resulted
     * in malformed encoding of parts of the text content.
     *
     * @see https://github.com/smalot/pdfparser/issues/322
     */
    public function testIssue322(): void
    {
        $document = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue322.pdf');

        // The sentence, including its typographic apostrophes, must be present in the text.
        $text = $document->getText();
        $this->assertStringContainsString(
            'this text isn’t working properly, I’ve edited it in Google Documents',
            $text
        );
    }

    /**
     * Test that issue related pdf can now be parsed:
     * Too many slashes were being stripped and resulted
     * in malformed encoding of parts of the text content.
     *
     * License of the content taken from https://stackoverflow.com in the sample PDF:
     * CC BY-SA 2.5 https://creativecommons.org/licenses/by-sa/2.5/
     *
     * @see https://github.com/smalot/pdfparser/issues/334
     */
    public function testIssue334(): void
    {
        $document = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue334.pdf');

        // A known phrase from the sample must be part of the extracted text.
        $text = $document->getText();
        $this->assertStringContainsString('This question already has an answer here', $text);
    }

    /**
     * Test that issue related pdf can now be parsed:
     * Glyphs not in the Postscript lookup table would cause "Notice: Undefined offset"
     *
     * @see https://github.com/smalot/pdfparser/issues/359
     */
    public function testIssue359(): void
    {
        $document = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue359.pdf');

        // Fragments that must be found in the extracted text.
        $expectedFragments = [
            'dnia 10 maja 2018 roku o ochronie danych osobowych',
            'sprawie ochrony osób fizycznych w związku',
        ];

        foreach ($expectedFragments as $fragment) {
            $this->assertStringContainsString($fragment, $document->getText());
        }

        /*
         * @todo The "ł" in "przepływu" currently comes out as a space character. That was
         * already true before the PR that caused this issue, so this test does not cover it.
         * Once it is fixed, enable the assertion below to cover it.
         */
        // $this->assertStringContainsString('sprawie swobodnego przepływu takich danych oraz uchylenia dyrektywy', $document->getText());
    }

    /**
     * Tests if PDF triggers "Call to undefined method Smalot\PdfParser\Header::__toString()".
     *
     * It happened because there was a check missing in Font.php (~ line 109).
     *
     * @see https://github.com/smalot/pdfparser/issues/391
     */
    public function testIssue391(): void
    {
        /*
         * The sample PDF was provided by @dhildreth for use in the test environment.
         *
         * @see https://github.com/smalot/pdfparser/issues/391#issuecomment-783504599
         */
        $document = $this->fixture->parseFile($this->rootDir.'/samples/bugs/Issue391.pdf');

        // The PDF has many pages, so only one representative string is checked.
        $text = $document->getText();
        $this->assertStringContainsString('(This Code will be changed while mass production)', $text);
    }

    /**
     * Tests behavior when changing default font space limit (-50).
     *
     * Test is based on testIssue359 (above).
     */
    public function testChangedFontSpaceLimit(): void
    {
        // Use a non-default font space limit for this parser instance.
        $customConfig = new Config();
        $customConfig->setFontSpaceLimit(1);
        $this->fixture = new Parser([], $customConfig);

        $text = $this->fixture
            ->parseFile($this->rootDir.'/samples/bugs/Issue359.pdf')
            ->getText();

        // With the changed limit the extracted text contains these additional spaces.
        $this->assertStringContainsString('dni a  10  maj a  2018', $text);
    }

    /**
     * Tests if a given Config object is really used.
     * Or if a default one is generated, if null was given.
     */
    public function testUsageOfConfigObject(): void
    {
        // Case 1: the config argument is omitted, so a default Config is expected.
        $this->fixture = new Parser([]);
        $expectedDefault = new Config();
        $this->assertEquals($expectedDefault, $this->fixture->getConfig());

        // Case 2: null is passed explicitly, so a default Config is expected as well.
        $this->fixture = new Parser([], null);
        $expectedDefault = new Config();
        $this->assertEquals($expectedDefault, $this->fixture->getConfig());

        // Case 3: a customised Config is passed and must be the one reported back.
        $customConfig = new Config();
        $customConfig->setFontSpaceLimit(1000);
        $this->fixture = new Parser([], $customConfig);
        $this->assertEquals($customConfig, $this->fixture->getConfig());
    }

    /**
     * Tests the impact of the retainImageContent config setting on memory usage
     *
     * @group memory-heavy
     *
     * @see https://github.com/smalot/pdfparser/issues/104#issuecomment-883422508
     */
    public function testRetainImageContentImpact(): void
    {
        // Compares memory usage after parsing the same PDF with retainImageContent
        // left at its default (true) and then explicitly set to false.
        if (version_compare(\PHP_VERSION, '7.3.0', '<')) {
            $this->markTestSkipped('Garbage collection doesn\'t work reliably enough for this test in PHP < 7.3');
        }

        // Collect garbage first so the baseline is not inflated by leftovers,
        // then record the reference value both phases are compared against.
        gc_collect_cycles();
        $baselineMemory = memory_get_usage(true);

        $filename = $this->rootDir.'/samples/bugs/Issue104a.pdf';
        $iterations = 2;

        /*
         * check default (= true)
         */
        $this->fixture = new Parser([]);
        $this->assertTrue($this->fixture->getConfig()->getRetainImageContent());
        $document = null;

        // Parse the file repeatedly; only the document from the last pass stays referenced.
        for ($i = 0; $i < $iterations; ++$i) {
            $document = $this->fixture->parseFile($filename);
        }

        // With image content retained, usage must exceed the baseline by more than 50 %.
        $usedMemory = memory_get_usage(true);
        $this->assertTrue($usedMemory > ($baselineMemory * 1.5), 'Memory is only '.$usedMemory);
        // NOTE(review): `null != $document` is a loose comparison; `null !== $document` looks intended — confirm.
        $this->assertTrue(null != $document && 0 < \strlen($document->getText()));

        // force garbage collection
        // (both references are dropped first so the parser and document become collectable)
        $this->fixture = $document = null;
        gc_collect_cycles();

        /*
         * check false
         */
        $config = new Config();
        $config->setRetainImageContent(false);
        $this->fixture = new Parser([], $config);
        $this->assertEquals($config, $this->fixture->getConfig());

        // Same workload as above, now without retaining image content.
        for ($i = 0; $i < $iterations; ++$i) {
            $document = $this->fixture->parseFile($filename);
        }

        $usedMemory = memory_get_usage(true);
        /*
         * note: the following memory value is set manually and may differ from system to system.
         *       it must be high enough to not produce a false negative though.
         */
        // Usage must stay below 105 % of the baseline when image content is not retained.
        $this->assertTrue($usedMemory < ($baselineMemory * 1.05), 'Memory is '.$usedMemory);
        $this->assertTrue(0 < \strlen($document->getText()));
    }
}

/**
 * Test-only subclass of Parser that makes parser internals reachable from test code.
 */
class ParserSub extends Parser
{
    /**
     * Public pass-through to the inherited parseObject().
     *
     * The arguments are forwarded unchanged. Nothing is returned: the inspection
     * report for this file flags parseObject() as always returning null, so there
     * is no result to pass on.
     *
     * @param mixed $id        object id, e.g. '19_0'
     * @param mixed $structure parsed structure of the object
     * @param mixed $document  document handed on to parseObject()
     */
    public function exposedParseObject($id, $structure, $document): void
    {
        $this->parseObject($id, $structure, $document);
    }

    /**
     * Returns the inherited $objects property so tests can inspect it.
     */
    public function getObjects(): array
    {
        return $this->objects;
    }
}


1			<?php
2
3			/**
4			* @file This file is part of the PdfParser library.
5			*
6			* @author Konrad Abicht <[email protected]>
7			* @date 2020-06-01
8			*
9			* @author Sébastien MALOT <[email protected]>
10			* @date 2017-01-03
11			*
12			* @license LGPLv3
13			* @url <https://github.com/smalot/pdfparser>
14			*
15			* PdfParser is a pdf library written in PHP, extraction oriented.
16			* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17			*
18			* This program is free software: you can redistribute it and/or modify
19			* it under the terms of the GNU Lesser General Public License as published by
20			* the Free Software Foundation, either version 3 of the License, or
21			* (at your option) any later version.
22			*
23			* This program is distributed in the hope that it will be useful,
24			* but WITHOUT ANY WARRANTY; without even the implied warranty of
25			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26			* GNU Lesser General Public License for more details.
27			*
28			* You should have received a copy of the GNU Lesser General Public License
29			* along with this program.
30			* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31			*/
32
33			namespace Tests\Smalot\PdfParser\Integration;
34
35			use Exception;
36			use Smalot\PdfParser\Config;
37			use Smalot\PdfParser\Document;
38			use Smalot\PdfParser\Parser;
39			use Smalot\PdfParser\XObject\Image;
40			use Tests\Smalot\PdfParser\TestCase;
41
42			class ParserTest extends TestCase
43			{
44			protected function setUp(): void
45			{
46			parent::setUp();
47
48			$this->fixture = new Parser();
49			}
50
51			/**
52			* Notice: it may fail to run in Scrutinizer because of memory limitations.
53			*
54			* @group memory-heavy
55			*/
56			public function testParseFile(): void
57			{
58			$directory = $this->rootDir.'/samples/bugs';
59
60			if (is_dir($directory)) {
61			$files = scandir($directory);
62
63			foreach ($files as $file) {
64			if (preg_match('/^.*\.pdf$/i', $file)) {
65			try {
66			$document = $this->fixture->parseFile($directory.'/'.$file);
67			$pages = $document->getPages();
68			$this->assertTrue(0 < \count($pages));
69
70			foreach ($pages as $page) {
71			$content = $page->getText();
72			$this->assertTrue(0 < \strlen($content));
73			}
74			} catch (Exception $e) {
75			if (
76			'Secured pdf file are currently not supported.' !== $e->getMessage()
77			&& 0 != strpos($e->getMessage(), 'TCPDF_PARSER')
78			) {
79			throw $e;
80			}
81			}
82			}
83			}
84			}
85			}
86
87			/**
88			* Properly decode international unicode characters
89			*
90			* @todo the other languages in the test document need work because of issues with UTF-16 decoding (Chinese, Japanese) and missing right-to-left language support
91			*/
92			public function testUnicodeDecoding(): void
93			{
94			$filename = $this->rootDir.'/samples/InternationalChars.pdf';
95
96			$document = $this->fixture->parseFile($filename);
97
98			$testString_cyrillic = "Лорем ипсум долор сит амет, еу сед либрис долорем инцоррупте. Ут лорем долоре граеце хис, модо \nаппареат сапиентем ут мел. Хис ат лаборе омнесяуе сигниферумяуе, тале анциллае ан еум, ех сед синт \nнобис. Сед модус вивендо цопиосае еа, сапиентем цонцептам хис не, яуандо сплендиде еум те.";
99			$testString_greek = "Λορεμ ιπσθμ δολορ σιτ αμετ, τατιον cονστιτθαμ ομιτταντθρ εα σεα, αθδιαμ μανδαμθσ μελ τε. Διcο μθτατ \nινδοcτθμ εοσ ει, ει vιξ σονετ παρτιενδο ινcορρθπτε. Επιcθρι αντιοπαμ εθ νεc, ναμ λεγιμθσ γθβεργρεν ιν. \nVιξ σολετ ρεcτεqθε εα, ηασ νο αλιqθαμ μινιμθμ. Ιδ προ περcιπιτ περιcθλισ δετερρθισσετ, ιν νεc αππετερε \nομιτταντθρ ελοqθεντιαμ, ορατιο δοcτθσ ναμ αδ. Ετ σιτ σολθμ ρεcθσαβο, vιξ θτ λοβορτισ σπλενδιδε \nρεπθδιανδαε.";
100			$testString_armenian = "լոռեմ իպսում դոլոռ սիթ ամեթ վիս ին իմպեդիթ ադմոդում ծու ապպառեաթ սծռիպսեռիթ մել մել եթ \nդոմինգ ծոնսեքուունթուռ ծիվիբուս վիվենդում պռոդեսսեթ ադ մեի թիբիքուե ապպառեաթ սիմիլիքուե թե \nվիմ վիխ ծասե սեմպեռ դոլոռեմ եխ եամ եա սթեթ մեդիոծռեմ ծոնսեթեթուռ ռաթիոնիբուս ինթելլեգամ \nմել թե";
101			$testString_georgean = "ლორემ იფსუმ დოლორ სით ამეთ ესთ ეთ სონეთ ზრილ მელიუს ელიგენდი თორყუათოს \nელოყუენთიამ ესთ ეხ უსუ ფალლი ალთერა ცეთეროს ინ ეთ ომითთამ თრაცთათოს ჰის ეუ ველ \nალთერუმ ვოლუფთათუმ მაზიმ ფერთინახ ჰენდრერით ინ ფრი ნეც ინ თემფორ ფეთენთიუმ ვერო \nფოსთულანთ ელოყუენთიამ უსუ ნე ან ყუი ლიბერ ეფიცური ასსუევერით იდ ნიბჰ ყუას ჰაბემუს სეა";
102			$testString_korean = "그 임기는 4년으로 하며. 이 경우 그 명령에 의하여 개정 또는 폐지되었던 법률은 그 명령이 승인을 얻지 못한 때부터 당연히 효력을 \n회복한다. 가부동수인 때에는 부결된 것으로 본다. 법률과 적법한 절차에 의하지 아니하고는 처벌·보안처분 또는 강제노역을 받지 \n아니한다.";
103			$testString_western = 'ÄÖÜöäüßẞ Ññ¡¿ øÅå';
104
105			$this->assertStringContainsString($testString_cyrillic, $document->getText());
106			$this->assertStringContainsString($testString_greek, $document->getText());
107			$this->assertStringContainsString($testString_armenian, $document->getText());
108			$this->assertStringContainsString($testString_georgean, $document->getText());
109			$this->assertStringContainsString($testString_korean, $document->getText());
110			$this->assertStringContainsString($testString_western, $document->getText());
111			}
112
113			/**
114			* Tests that xrefs with line breaks between id and position are parsed correctly
115			*
116			* @see https://github.com/smalot/pdfparser/issues/336
117			*/
118			public function testIssue19(): void
119			{
120			$fixture = new ParserSub();
121			$structure = [
122			[
123			'<<',
124			[
125			[
126			'/',
127			'Type',
128			7735,
129			],
130			[
131			'/',
132			'ObjStm',
133			7742,
134			],
135			],
136			],
137			[
138			'stream',
139			'',
140			7804,
141			[
142			"17\n0",
143			[],
144			],
145			],
146			];
147			$document = new Document();
148
149			$fixture->exposedParseObject('19_0', $structure, $document);
150			$objects = $fixture->getObjects();
151
152			$this->assertArrayHasKey('17_0', $objects);
153			}
154
155			/**
156			* Properly decode ANSI encodings without producing scrambled UTF-8 characters
157			*
158			* @see https://github.com/smalot/pdfparser/issues/202
159			* @see https://github.com/smalot/pdfparser/pull/257
160			*/
161			public function testIssue202(): void
162			{
163			$filename = $this->rootDir.'/samples/bugs/Issue202.pdf';
164
165			$document = $this->fixture->parseFile($filename);
166
167			$this->assertEquals('„fööbär“', $document->getText());
168			}
169
170			/**
171			* Without a proper fix it throws deprecated message like:
172			*
173			* `Invalid characters passed for attempted conversion, these have been ignored`
174			*/
175			public function testEbook(): void
176			{
177			$filename = $this->rootDir.'/samples/ebook_sept2003.pdf';
178
179			$document = $this->fixture->parseFile($filename);
180
181			$this->assertEquals('Ebook3-6-version-pdf.sxw', $document->getDetails()['Title']);
182			$this->assertEquals('Communauté Wireless francophone', $document->getDetails()['Author']);
183			}
184
185			/**
186			* Test that issue related pdf can now be parsed
187			*
188			* @see https://github.com/smalot/pdfparser/issues/267
189			*/
190			public function testIssue267(): void
191			{
192			$filename = $this->rootDir.'/samples/bugs/Issue267_array_access_on_int.pdf';
193
194			$document = $this->fixture->parseFile($filename);
195
196			$this->assertEquals(Image::class, \get_class($document->getObjectById('128_0')));
197			$this->assertStringContainsString('4 von 4', $document->getText());
198			}
199
200			/**
201			* Test that issue related pdf can now be parsed:
202			* Too many slashes were being stripped and resulted
203			* in malformed encoding of parts of the text content.
204			*
205			* @see https://github.com/smalot/pdfparser/issues/322
206			*/
207			public function testIssue322(): void
208			{
209			$filename = $this->rootDir.'/samples/bugs/Issue322.pdf';
210
211			$document = $this->fixture->parseFile($filename);
212
213			$this->assertStringContainsString('this text isn’t working properly, I’ve edited it in Google Documents', $document->getText());
214			}
215
216			/**
217			* Test that issue related pdf can now be parsed:
218			* Too many slashes were being stripped and resulted
219			* in malformed encoding of parts of the text content.
220			*
221			* License of the content taken from https://stackoverflow.com in the sample PDF:
222			* CC BY-SA 2.5 https://creativecommons.org/licenses/by-sa/2.5/
223			*
224			* @see https://github.com/smalot/pdfparser/issues/334
225			*/
226			public function testIssue334(): void
227			{
228			$filename = $this->rootDir.'/samples/bugs/Issue334.pdf';
229
230			$document = $this->fixture->parseFile($filename);
231
232			$this->assertStringContainsString('This question already has an answer here', $document->getText());
233			}
234
235			/**
236			* Test that issue related pdf can now be parsed:
237			* Glyphs not in the Postscript lookup table would cause "Notice: Undefined offset"
238			*
239			* @see https://github.com/smalot/pdfparser/issues/359
240			*/
241			public function testIssue359(): void
242			{
243			$filename = $this->rootDir.'/samples/bugs/Issue359.pdf';
244
245			$document = $this->fixture->parseFile($filename);
246
247			$this->assertStringContainsString(
248			'dnia 10 maja 2018 roku o ochronie danych osobowych',
249			$document->getText()
250			);
251			$this->assertStringContainsString('sprawie ochrony osób fizycznych w związku', $document->getText());
252			/*
253			* @todo Note that the "ł" in przepływu is decoded as a space character. This was already
254			* the case before the PR that caused this issue and is not currently covered by this
255			* test case. However, this issue should be addressed in the future and its fix can then
256			* be incorporated into this test by uncommenting the following assertion.
257			*/
258			// $this->assertStringContainsString('sprawie swobodnego przepływu takich danych oraz uchylenia dyrektywy', $document->getText());
259			}
260
261			/**
262			* Tests if PDF triggers "Call to undefined method Smalot\PdfParser\Header::__toString()".
263			*
264			* It happened because there was a check missing in Font.php (~ line 109).
265			*
266			* @see https://github.com/smalot/pdfparser/issues/391
267			*/
268			public function testIssue391(): void
269			{
270			/**
271			* PDF provided by @dhildreth for usage in our test environment.
272			*
273			* @see https://github.com/smalot/pdfparser/issues/391#issuecomment-783504599
274			*/
275			$filename = $this->rootDir.'/samples/bugs/Issue391.pdf';
276
277			$document = $this->fixture->parseFile($filename);
278
279			// check for an example string (PDF consists of many pages)
280			$this->assertStringContainsString(
281			'(This Code will be changed while mass production)',
282			$document->getText()
283			);
284			}
285
286			/**
287			* Tests behavior when changing default font space limit (-50).
288			*
289			* Test is based on testIssue359 (above).
290			*/
291			public function testChangedFontSpaceLimit(): void
292			{
293			$filename = $this->rootDir.'/samples/bugs/Issue359.pdf';
294
295			$config = new Config();
296			$config->setFontSpaceLimit(1); // change default value
297
298			$this->fixture = new Parser([], $config);
299			$document = $this->fixture->parseFile($filename);
300
301			$this->assertStringContainsString('dni a 10 maj a 2018', $document->getText());
302			}
303
304			/**
305			* Tests if a given Config object is really used.
306			* Or if a default one is generated, if null was given.
307			*/
308			public function testUsageOfConfigObject(): void
309			{
310			// check default
311			$this->fixture = new Parser([]);
312			$this->assertEquals(new Config(), $this->fixture->getConfig());
313
314			// check default 2
315			$this->fixture = new Parser([], null);
316			$this->assertEquals(new Config(), $this->fixture->getConfig());
317
318			// check given
319			$config = new Config();
320			$config->setFontSpaceLimit(1000);
321			$this->fixture = new Parser([], $config);
322			$this->assertEquals($config, $this->fixture->getConfig());
323			}
324
325			/**
326			* Tests the impact of the retainImageContent config setting on memory usage
327			*
328			* @group memory-heavy
329			*
330			* @see https://github.com/smalot/pdfparser/issues/104#issuecomment-883422508
331			*/
332			public function testRetainImageContentImpact(): void
333			{
334			if (version_compare(\PHP_VERSION, '7.3.0', '<')) {
335			$this->markTestSkipped('Garbage collection doesn\'t work reliably enough for this test in PHP < 7.3');
336			}
337
338			gc_collect_cycles();
339			$baselineMemory = memory_get_usage(true);
340
341			$filename = $this->rootDir.'/samples/bugs/Issue104a.pdf';
342			$iterations = 2;
343
344			/*
345			* check default (= true)
346			*/
347			$this->fixture = new Parser([]);
348			$this->assertTrue($this->fixture->getConfig()->getRetainImageContent());
349			$document = null;
350
351			for ($i = 0; $i < $iterations; ++$i) {
352			$document = $this->fixture->parseFile($filename);
353			}
354
355			$usedMemory = memory_get_usage(true);
356			$this->assertTrue($usedMemory > ($baselineMemory * 1.5), 'Memory is only '.$usedMemory);
357			$this->assertTrue(null != $document && 0 < \strlen($document->getText()));
358
359			// force garbage collection
360			$this->fixture = $document = null;
361			gc_collect_cycles();
362
363			/*
364			* check false
365			*/
366			$config = new Config();
367			$config->setRetainImageContent(false);
368			$this->fixture = new Parser([], $config);
369			$this->assertEquals($config, $this->fixture->getConfig());
370
371			for ($i = 0; $i < $iterations; ++$i) {
372			$document = $this->fixture->parseFile($filename);
373			}
374
375			$usedMemory = memory_get_usage(true);
376			/*
377			* note: the following memory value is set manually and may differ from system to system.
378			* it must be high enough to not produce a false negative though.
379			*/
380			$this->assertTrue($usedMemory < ($baselineMemory * 1.05), 'Memory is '.$usedMemory);
381			$this->assertTrue(0 < \strlen($document->getText()));
382			}
383			}
384
385			class ParserSub extends Parser
386			{
387			public function exposedParseObject($id, $structure, $document)
388			{
389			return $this->parseObject($id, $structure, $document);
			0 ignored issues – show Bug introduced 2020-09-29 13:49 UTC by Report Bug Copy Issue Report Are you sure the usage of `$this->parseObject($id, $structure, $document)` targeting `Smalot\PdfParser\Parser::parseObject()` is correct as it seems to always return null? This check looks for function or method calls that always return null and whose return value is used. class A { function getObject() { return null; } } $a = new A(); if ($a->getObject()) { The method `getObject()` can return nothing but null, so it makes no sense to use the return value. The reason is most likely that a function or method is incomplete or has been reduced for debug purposes. Loading history...
390			}
391
392			public function getObjects(): array
393			{
394			return $this->objects;
395			}
396			}
397

smalot / pdfparser

Pull Request — master (#510)

ParserTest::testChangedFontSpaceLimit() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like