ParserTest::testUnicodeDecoding() - Code Metrics - Inspection of "revived #257: Properly decode ANSI encodings" - smalot/pdfparser - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#349)

unknown

created 2020-09-30 18:43 UTC

ParserTest::testUnicodeDecoding() A

↳ Parent: ParserTest

Complexity

Conditions	1
Paths	1

Size

Total Lines	20
Code Lines	14

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	1

Metric	Value
cc	1
eloc	14
c	1
b	0
f	1
nc	1
nop	0
dl	0
loc	20
rs	9.7998

<?php

/**
 * @file This file is part of the PdfParser library.
 *
 * @author  Konrad Abicht <[email protected]>
 * @date    2020-06-01
 *
 * @author  Sébastien MALOT <[email protected]>
 * @date    2017-01-03
 *
 * @license LGPLv3
 * @url     <https://github.com/smalot/pdfparser>
 *
 *  PdfParser is a pdf library written in PHP, extraction oriented.
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
 */

namespace Tests\Smalot\PdfParser\Integration;

use Exception;
use Smalot\PdfParser\Parser;
use Smalot\PdfParser\XObject\Image;
use Tests\Smalot\PdfParser\TestCase;

/**
 * Integration tests that run the Parser against the sample PDF files
 * stored below the project's "samples" directory.
 */
class ParserTest extends TestCase
{
    /**
     * Creates a fresh Parser instance as the fixture for every test.
     */
    public function setUp()
    {
        parent::setUp();

        $this->fixture = new Parser();
    }

    /**
     * Parses every *.pdf file in samples/bugs and checks that each one
     * yields at least one page and that every page yields non-empty text.
     *
     * Two kinds of exceptions are tolerated: the "secured pdf" message and
     * any message that starts with "TCPDF_PARSER". Everything else is rethrown.
     *
     * @throws Exception if parsing a sample fails for a reason that is not tolerated
     */
    public function testParseFile()
    {
        $directory = $this->rootDir.'/samples/bugs';

        if (is_dir($directory)) {
            $files = scandir($directory);

            foreach ($files as $file) {
                if (preg_match('/^.*\.pdf$/i', $file)) {
                    try {
                        $document = $this->fixture->parseFile($directory.'/'.$file);
                        $pages = $document->getPages();
                        $this->assertGreaterThan(0, \count($pages));

                        foreach ($pages as $page) {
                            $content = $page->getText();
                            $this->assertGreaterThan(0, \strlen($content));
                        }
                    } catch (Exception $e) {
                        $message = $e->getMessage();

                        // strpos() returns false when the needle is absent. The comparison
                        // has to be strict: with "0 != false" (loose) the condition was false
                        // for every message lacking "TCPDF_PARSER", which silently swallowed
                        // nearly all exceptions instead of rethrowing them.
                        if (
                            'Secured pdf file are currently not supported.' !== $message
                            && 0 !== strpos($message, 'TCPDF_PARSER')
                        ) {
                            throw $e;
                        }
                    }
                }
            }
        }
    }

    /**
     * Properly decode international unicode characters
     */
    public function testUnicodeDecoding()
    {
        $filename = $this->rootDir.'/samples/InternationalChars.pdf';

        $document = $this->fixture->parseFile($filename);

        $testString_cyrillic = "Лорем ипсум долор сит амет, еу сед либрис долорем инцоррупте. Ут лорем долоре граеце хис, модо \nаппареат сапиентем ут мел. Хис ат лаборе омнесяуе сигниферумяуе, тале анциллае ан еум, ех сед синт \nнобис. Сед модус вивендо цопиосае еа, сапиентем цонцептам хис не, яуандо сплендиде еум те.";
        $testString_greek = "Λορεμ ιπσθμ δολορ σιτ αμετ, τατιον cονστιτθαμ ομιτταντθρ εα σεα, αθδιαμ μανδαμθσ μελ τε. Διcο μθτατ \nινδοcτθμ εοσ ει, ει vιξ σονετ παρτιενδο ινcορρθπτε. Επιcθρι αντιοπαμ εθ νεc, ναμ λεγιμθσ γθβεργρεν ιν. \nVιξ σολετ ρεcτεqθε εα, ηασ νο αλιqθαμ μινιμθμ. Ιδ προ περcιπιτ περιcθλισ δετερρθισσετ, ιν νεc αππετερε \nομιτταντθρ ελοqθεντιαμ, ορατιο δοcτθσ ναμ αδ. Ετ σιτ σολθμ ρεcθσαβο, vιξ θτ λοβορτισ σπλενδιδε \nρεπθδιανδαε.";
        $testString_armenian = "լոռեմ իպսում դոլոռ սիթ ամեթ վիս ին իմպեդիթ ադմոդում ծու ապպառեաթ սծռիպսեռիթ մել մել եթ \nդոմինգ ծոնսեքուունթուռ ծիվիբուս վիվենդում պռոդեսսեթ ադ մեի թիբիքուե ապպառեաթ սիմիլիքուե թե \nվիմ վիխ ծասե սեմպեռ դոլոռեմ եխ եամ եա սթեթ մեդիոծռեմ ծոնսեթեթուռ ռաթիոնիբուս ինթելլեգամ \nմել թե";
        $testString_georgean = "ლორემ იფსუმ დოლორ სით ამეთ ესთ ეთ სონეთ ზრილ მელიუს ელიგენდი თორყუათოს \nელოყუენთიამ ესთ ეხ უსუ ფალლი ალთერა ცეთეროს ინ ეთ ომითთამ თრაცთათოს ჰის ეუ ველ \nალთერუმ ვოლუფთათუმ მაზიმ ფერთინახ ჰენდრერით ინ ფრი ნეც ინ თემფორ ფეთენთიუმ ვერო \nფოსთულანთ ელოყუენთიამ უსუ ნე ან ყუი ლიბერ ეფიცური ასსუევერით იდ ნიბჰ ყუას ჰაბემუს სეა";
        $testString_korean = "그 임기는 4년으로 하며. 이 경우 그 명령에 의하여 개정 또는 폐지되었던 법률은 그 명령이 승인을 얻지 못한 때부터 당연히 효력을 \n회복한다. 가부동수인 때에는 부결된 것으로 본다. 법률과 적법한 절차에 의하지 아니하고는 처벌·보안처분 또는 강제노역을 받지 \n아니한다.";
        $testString_western = 'ÄÖÜöäüßẞ Ññ¡¿ øÅå';
        // @todo the other languages in the test document need work because of issues with UTF-16 decoding (Chinese, Japanese) and missing right-to-left language support

        // Extract the text once and reuse it for all assertions below.
        $text = $document->getText();

        $this->assertStringContainsString($testString_cyrillic, $text);
        $this->assertStringContainsString($testString_greek, $text);
        $this->assertStringContainsString($testString_armenian, $text);
        $this->assertStringContainsString($testString_georgean, $text);
        $this->assertStringContainsString($testString_korean, $text);
        $this->assertStringContainsString($testString_western, $text);
    }

    /**
     * Properly decode ANSI encodings without producing scrambled UTF-8 characters
     *
     * @see https://github.com/smalot/pdfparser/issues/202
     * @see https://github.com/smalot/pdfparser/pull/257
     */
    public function testIssue202()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue202.pdf';

        $document = $this->fixture->parseFile($filename);

        $this->assertSame('„fööbär“', $document->getText());
    }

    /**
     * Test that issue related pdf can now be parsed
     *
     * @see https://github.com/smalot/pdfparser/issues/267
     */
    public function testIssue267()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue267_array_access_on_int.pdf';

        $document = $this->fixture->parseFile($filename);

        $this->assertSame(Image::class, \get_class($document->getObjectById('128_0')));
        $this->assertStringContainsString('4 von 4', $document->getText());
    }

    /**
     * Test that issue related pdf can now be parsed:
     * Too many slashes were being stripped and resulted
     * in malformed encoding of parts of the text content.
     *
     * @see https://github.com/smalot/pdfparser/issues/322
     */
    public function testIssue322()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue322.pdf';

        $document = $this->fixture->parseFile($filename);

        $this->assertStringContainsString('this text isn’t working properly, I’ve edited it in Google Documents', $document->getText());
    }

    /**
     * Test that issue related pdf can now be parsed:
     * Too many slashes were being stripped and resulted
     * in malformed encoding of parts of the text content.
     *
     * License of the content taken from https://stackoverflow.com in the sample PDF:
     * CC BY-SA 2.5 https://creativecommons.org/licenses/by-sa/2.5/
     *
     * @see https://github.com/smalot/pdfparser/issues/334
     */
    public function testIssue334()
    {
        $filename = $this->rootDir.'/samples/bugs/Issue334.pdf';

        $document = $this->fixture->parseFile($filename);

        $this->assertStringContainsString('This question already has an answer here', $document->getText());
    }
}


1			<?php
2
3			/**
4			* @file This file is part of the PdfParser library.
5			*
6			* @author Konrad Abicht <[email protected]>
7			* @date 2020-06-01
8			*
9			* @author Sébastien MALOT <[email protected]>
10			* @date 2017-01-03
11			*
12			* @license LGPLv3
13			* @url <https://github.com/smalot/pdfparser>
14			*
15			* PdfParser is a pdf library written in PHP, extraction oriented.
16			* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17			*
18			* This program is free software: you can redistribute it and/or modify
19			* it under the terms of the GNU Lesser General Public License as published by
20			* the Free Software Foundation, either version 3 of the License, or
21			* (at your option) any later version.
22			*
23			* This program is distributed in the hope that it will be useful,
24			* but WITHOUT ANY WARRANTY; without even the implied warranty of
25			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26			* GNU Lesser General Public License for more details.
27			*
28			* You should have received a copy of the GNU Lesser General Public License
29			* along with this program.
30			* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31			*/
32
33			namespace Tests\Smalot\PdfParser\Integration;
34
35			use Exception;
36			use Smalot\PdfParser\Parser;
37			use Smalot\PdfParser\XObject\Image;
38			use Tests\Smalot\PdfParser\TestCase;
39
40			class ParserTest extends TestCase
41			{
42			public function setUp()
43			{
44			parent::setUp();
45
46			$this->fixture = new Parser();
47			}
48
49			public function testParseFile()
50			{
51			$directory = $this->rootDir.'/samples/bugs';
52
53			if (is_dir($directory)) {
54			$files = scandir($directory);
55
56			foreach ($files as $file) {
57			if (preg_match('/^.*\.pdf$/i', $file)) {
58			try {
59			$document = $this->fixture->parseFile($directory.'/'.$file);
60			$pages = $document->getPages();
61			$this->assertTrue(0 < \count($pages));
62
63			foreach ($pages as $page) {
64			$content = $page->getText();
65			$this->assertTrue(0 < \strlen($content));
66			}
67			} catch (Exception $e) {
68			if (
69			'Secured pdf file are currently not supported.' !== $e->getMessage()
70			&& 0 != strpos($e->getMessage(), 'TCPDF_PARSER')
71			) {
72			throw $e;
73			}
74			}
75			}
76			}
77			}
78			}
79
80			/**
81			* Properly decode international unicode characters
82			*/
83			public function testUnicodeDecoding()
84			{
85			$filename = $this->rootDir.'/samples/InternationalChars.pdf';
86
87			$document = $this->fixture->parseFile($filename);
88
89			$testString_cyrillic = "Лорем ипсум долор сит амет, еу сед либрис долорем инцоррупте. Ут лорем долоре граеце хис, модо \nаппареат сапиентем ут мел. Хис ат лаборе омнесяуе сигниферумяуе, тале анциллае ан еум, ех сед синт \nнобис. Сед модус вивендо цопиосае еа, сапиентем цонцептам хис не, яуандо сплендиде еум те.";
90			$testString_greek = "Λορεμ ιπσθμ δολορ σιτ αμετ, τατιον cονστιτθαμ ομιτταντθρ εα σεα, αθδιαμ μανδαμθσ μελ τε. Διcο μθτατ \nινδοcτθμ εοσ ει, ει vιξ σονετ παρτιενδο ινcορρθπτε. Επιcθρι αντιοπαμ εθ νεc, ναμ λεγιμθσ γθβεργρεν ιν. \nVιξ σολετ ρεcτεqθε εα, ηασ νο αλιqθαμ μινιμθμ. Ιδ προ περcιπιτ περιcθλισ δετερρθισσετ, ιν νεc αππετερε \nομιτταντθρ ελοqθεντιαμ, ορατιο δοcτθσ ναμ αδ. Ετ σιτ σολθμ ρεcθσαβο, vιξ θτ λοβορτισ σπλενδιδε \nρεπθδιανδαε.";
91			$testString_armenian = "լոռեմ իպսում դոլոռ սիթ ամեթ վիս ին իմպեդիթ ադմոդում ծու ապպառեաթ սծռիպսեռիթ մել մել եթ \nդոմինգ ծոնսեքուունթուռ ծիվիբուս վիվենդում պռոդեսսեթ ադ մեի թիբիքուե ապպառեաթ սիմիլիքուե թե \nվիմ վիխ ծասե սեմպեռ դոլոռեմ եխ եամ եա սթեթ մեդիոծռեմ ծոնսեթեթուռ ռաթիոնիբուս ինթելլեգամ \nմել թե";
92			$testString_georgean = "ლორემ იფსუმ დოლორ სით ამეთ ესთ ეთ სონეთ ზრილ მელიუს ელიგენდი თორყუათოს \nელოყუენთიამ ესთ ეხ უსუ ფალლი ალთერა ცეთეროს ინ ეთ ომითთამ თრაცთათოს ჰის ეუ ველ \nალთერუმ ვოლუფთათუმ მაზიმ ფერთინახ ჰენდრერით ინ ფრი ნეც ინ თემფორ ფეთენთიუმ ვერო \nფოსთულანთ ელოყუენთიამ უსუ ნე ან ყუი ლიბერ ეფიცური ასსუევერით იდ ნიბჰ ყუას ჰაბემუს სეა";
93			$testString_korean = "그 임기는 4년으로 하며. 이 경우 그 명령에 의하여 개정 또는 폐지되었던 법률은 그 명령이 승인을 얻지 못한 때부터 당연히 효력을 \n회복한다. 가부동수인 때에는 부결된 것으로 본다. 법률과 적법한 절차에 의하지 아니하고는 처벌·보안처분 또는 강제노역을 받지 \n아니한다.";
94			$testString_western = 'ÄÖÜöäüßẞ Ññ¡¿ øÅå';
95			// @todo the other languages in the test document need work because of issues with UTF-16 decoding (Chinese, Japanese) and missing right-to-left language support
96
97			$this->assertStringContainsString($testString_cyrillic, $document->getText());
98			$this->assertStringContainsString($testString_greek, $document->getText());
99			$this->assertStringContainsString($testString_armenian, $document->getText());
100			$this->assertStringContainsString($testString_georgean, $document->getText());
101			$this->assertStringContainsString($testString_korean, $document->getText());
102			$this->assertStringContainsString($testString_western, $document->getText());
103			}
104
105			/**
106			* Properly decode ANSI encodings without producing scrambled UTF-8 characters
107			*
108			* @see https://github.com/smalot/pdfparser/issues/202
109			* @see https://github.com/smalot/pdfparser/pull/257
110			*/
111			public function testIssue202()
112			{
113			$filename = $this->rootDir.'/samples/bugs/Issue202.pdf';
114
115			$document = $this->fixture->parseFile($filename);
116
117			$this->assertEquals('„fööbär“', $document->getText());
118			}
119
120			/**
121			* Test that issue related pdf can now be parsed
122			*
123			* @see https://github.com/smalot/pdfparser/issues/267
124			*/
125			public function testIssue267()
126			{
127			$filename = $this->rootDir.'/samples/bugs/Issue267_array_access_on_int.pdf';
128
129			$document = $this->fixture->parseFile($filename);
130
131			$this->assertEquals(Image::class, \get_class($document->getObjectById('128_0')));
132			$this->assertStringContainsString('4 von 4', $document->getText());
133			}
134
135			/**
136			* Test that issue related pdf can now be parsed:
137			* Too many slashes were being stripped and resulted
138			* in malformed encoding of parts of the text content.
139			*
140			* @see https://github.com/smalot/pdfparser/issues/322
141			*/
142			public function testIssue322()
143			{
144			$filename = $this->rootDir.'/samples/bugs/Issue322.pdf';
145
146			$document = $this->fixture->parseFile($filename);
147
148			$this->assertStringContainsString('this text isn’t working properly, I’ve edited it in Google Documents', $document->getText());
149			}
150
151			/**
152			* Test that issue related pdf can now be parsed:
153			* Too many slashes were being stripped and resulted
154			* in malformed encoding of parts of the text content.
155			*
156			* License of the content taken from https://stackoverflow.com in the sample PDF:
157			* CC BY-SA 2.5 https://creativecommons.org/licenses/by-sa/2.5/
158			*
159			* @see https://github.com/smalot/pdfparser/issues/334
160			*/
161			public function testIssue334()
162			{
163			$filename = $this->rootDir.'/samples/bugs/Issue334.pdf';
164
165			$document = $this->fixture->parseFile($filename);
166
167			$this->assertStringContainsString('This question already has an answer here', $document->getText());
168			}
169			}
170

smalot / pdfparser

Pull Request — master (#349)

ParserTest::testUnicodeDecoding() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like