Test Failed
Pull Request — master (#611)
by
unknown
02:00
created

PDFDocEncoding   A

Complexity

Total Complexity 2

Size/Duplication

Total Lines 148
Duplicated Lines 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
eloc 140
c 2
b 0
f 0
dl 0
loc 148
rs 10
wmc 2

2 Methods

Rating   Name   Duplication   Size   Complexity  
B getCodePage() 0 140 1
A convertPDFDoc2UTF8() 0 3 1
1
<?php
2
3
/**
4
 * @file    This file is part of the PdfParser library.
5
 *
6
 * @author  Brian Huisman <[email protected]>
7
 *
8
 * @date    2023-06-28
9
 *
10
 * @license LGPLv3
11
 *
12
 * @url     <https://github.com/smalot/pdfparser>
13
 *
14
 *  PdfParser is a pdf library written in PHP, extraction oriented.
15
 *  Copyright (C) 2017 - Sébastien MALOT <[email protected]>
16
 *
17
 *  This program is free software: you can redistribute it and/or modify
18
 *  it under the terms of the GNU Lesser General Public License as published by
19
 *  the Free Software Foundation, either version 3 of the License, or
20
 *  (at your option) any later version.
21
 *
22
 *  This program is distributed in the hope that it will be useful,
23
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
24
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
25
 *  GNU Lesser General Public License for more details.
26
 *
27
 *  You should have received a copy of the GNU Lesser General Public License
28
 *  along with this program.
29
 *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
30
 */
31
32
// Source : https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.2.pdf
33
// Source : https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf
34
35
namespace Smalot\PdfParser\Encoding;
36
37
/**
 * Translation table and converter for PDFDocEncoding, the single-byte
 * encoding the PDF reference defines for text strings outside content streams.
 *
 * Only the bytes whose meaning differs from their raw value are listed in the
 * code page; every other byte is passed through untouched by the converter.
 */
class PDFDocEncoding
{
    /**
     * Returns the PDFDocEncoding code page.
     *
     * Keys are single raw bytes; values are the UTF-8 encoded replacement
     * (the "\u{...}" escapes are expanded to UTF-8 by PHP). The bytes 0x7f,
     * 0x9f and 0xad are undefined in PDFDocEncoding and are mapped to an
     * empty string, so they are removed during conversion.
     *
     * @return array<string, string>
     */
    public static function getCodePage(): array
    {
        return [
            "\x18" => "\u{02d8}", // breve
            "\x19" => "\u{02c7}", // caron
            "\x1a" => "\u{02c6}", // circumflex
            "\x1b" => "\u{02d9}", // dotaccent
            "\x1c" => "\u{02dd}", // hungarumlaut
            "\x1d" => "\u{02db}", // ogonek
            "\x1e" => "\u{02da}", // ring (U+02DA RING ABOVE, not U+02DE rhotic hook)
            "\x1f" => "\u{02dc}", // tilde
            "\x7f" => '',
            "\x80" => "\u{2022}", // bullet
            "\x81" => "\u{2020}", // dagger
            "\x82" => "\u{2021}", // daggerdbl
            "\x83" => "\u{2026}", // ellipsis
            "\x84" => "\u{2014}", // emdash
            "\x85" => "\u{2013}", // endash
            "\x86" => "\u{0192}", // florin
            "\x87" => "\u{2044}", // fraction
            "\x88" => "\u{2039}", // guilsinglleft
            "\x89" => "\u{203a}", // guilsinglright
            "\x8a" => "\u{2212}", // minus
            "\x8b" => "\u{2030}", // perthousand
            "\x8c" => "\u{201e}", // quotedblbase
            "\x8d" => "\u{201c}", // quotedblleft
            "\x8e" => "\u{201d}", // quotedblright
            "\x8f" => "\u{2018}", // quoteleft
            "\x90" => "\u{2019}", // quoteright
            "\x91" => "\u{201a}", // quotesinglbase
            "\x92" => "\u{2122}", // trademark
            "\x93" => "\u{fb01}", // fi
            "\x94" => "\u{fb02}", // fl
            "\x95" => "\u{0141}", // Lslash
            "\x96" => "\u{0152}", // OE
            "\x97" => "\u{0160}", // Scaron
            "\x98" => "\u{0178}", // Ydieresis
            "\x99" => "\u{017d}", // Zcaron
            "\x9a" => "\u{0131}", // dotlessi
            "\x9b" => "\u{0142}", // lslash
            "\x9c" => "\u{0153}", // oe
            "\x9d" => "\u{0161}", // scaron
            "\x9e" => "\u{017e}", // zcaron
            "\x9f" => '',
            "\xa0" => "\u{20ac}", // Euro
            "\xa1" => "\u{00a1}", // exclamdown
            "\xa2" => "\u{00a2}", // cent
            "\xa3" => "\u{00a3}", // sterling
            "\xa4" => "\u{00a4}", // currency
            "\xa5" => "\u{00a5}", // yen
            "\xa6" => "\u{00a6}", // brokenbar
            "\xa7" => "\u{00a7}", // section
            "\xa8" => "\u{00a8}", // dieresis
            "\xa9" => "\u{00a9}", // copyright
            "\xaa" => "\u{00aa}", // ordfeminine
            "\xab" => "\u{00ab}", // guillemotleft
            "\xac" => "\u{00ac}", // logicalnot
            "\xad" => '',
            "\xae" => "\u{00ae}", // registered
            "\xaf" => "\u{00af}", // macron
            "\xb0" => "\u{00b0}", // degree
            "\xb1" => "\u{00b1}", // plusminus
            "\xb2" => "\u{00b2}", // twosuperior
            "\xb3" => "\u{00b3}", // threesuperior
            "\xb4" => "\u{00b4}", // acute
            "\xb5" => "\u{00b5}", // mu
            "\xb6" => "\u{00b6}", // paragraph
            "\xb7" => "\u{00b7}", // periodcentered
            "\xb8" => "\u{00b8}", // cedilla
            "\xb9" => "\u{00b9}", // onesuperior
            "\xba" => "\u{00ba}", // ordmasculine
            "\xbb" => "\u{00bb}", // guillemotright
            "\xbc" => "\u{00bc}", // onequarter
            "\xbd" => "\u{00bd}", // onehalf
            "\xbe" => "\u{00be}", // threequarters
            "\xbf" => "\u{00bf}", // questiondown
            "\xc0" => "\u{00c0}", // Agrave
            "\xc1" => "\u{00c1}", // Aacute
            "\xc2" => "\u{00c2}", // Acircumflex
            "\xc3" => "\u{00c3}", // Atilde
            "\xc4" => "\u{00c4}", // Adieresis
            "\xc5" => "\u{00c5}", // Aring
            "\xc6" => "\u{00c6}", // AE
            "\xc7" => "\u{00c7}", // Ccedilla
            "\xc8" => "\u{00c8}", // Egrave
            "\xc9" => "\u{00c9}", // Eacute
            "\xca" => "\u{00ca}", // Ecircumflex
            "\xcb" => "\u{00cb}", // Edieresis
            "\xcc" => "\u{00cc}", // Igrave
            "\xcd" => "\u{00cd}", // Iacute
            "\xce" => "\u{00ce}", // Icircumflex
            "\xcf" => "\u{00cf}", // Idieresis
            "\xd0" => "\u{00d0}", // Eth
            "\xd1" => "\u{00d1}", // Ntilde
            "\xd2" => "\u{00d2}", // Ograve
            "\xd3" => "\u{00d3}", // Oacute
            "\xd4" => "\u{00d4}", // Ocircumflex
            "\xd5" => "\u{00d5}", // Otilde
            "\xd6" => "\u{00d6}", // Odieresis
            "\xd7" => "\u{00d7}", // multiply
            "\xd8" => "\u{00d8}", // Oslash
            "\xd9" => "\u{00d9}", // Ugrave
            "\xda" => "\u{00da}", // Uacute
            "\xdb" => "\u{00db}", // Ucircumflex
            "\xdc" => "\u{00dc}", // Udieresis
            "\xdd" => "\u{00dd}", // Yacute
            "\xde" => "\u{00de}", // Thorn
            "\xdf" => "\u{00df}", // germandbls
            "\xe0" => "\u{00e0}", // agrave
            "\xe1" => "\u{00e1}", // aacute
            "\xe2" => "\u{00e2}", // acircumflex
            "\xe3" => "\u{00e3}", // atilde
            "\xe4" => "\u{00e4}", // adieresis
            "\xe5" => "\u{00e5}", // aring
            "\xe6" => "\u{00e6}", // ae
            "\xe7" => "\u{00e7}", // ccedilla
            "\xe8" => "\u{00e8}", // egrave
            "\xe9" => "\u{00e9}", // eacute
            "\xea" => "\u{00ea}", // ecircumflex
            "\xeb" => "\u{00eb}", // edieresis
            "\xec" => "\u{00ec}", // igrave
            "\xed" => "\u{00ed}", // iacute
            "\xee" => "\u{00ee}", // icircumflex
            "\xef" => "\u{00ef}", // idieresis
            "\xf0" => "\u{00f0}", // eth
            "\xf1" => "\u{00f1}", // ntilde
            "\xf2" => "\u{00f2}", // ograve
            "\xf3" => "\u{00f3}", // oacute
            "\xf4" => "\u{00f4}", // ocircumflex
            "\xf5" => "\u{00f5}", // otilde
            "\xf6" => "\u{00f6}", // odieresis
            "\xf7" => "\u{00f7}", // divide
            "\xf8" => "\u{00f8}", // oslash
            "\xf9" => "\u{00f9}", // ugrave
            "\xfa" => "\u{00fa}", // uacute
            "\xfb" => "\u{00fb}", // ucircumflex
            "\xfc" => "\u{00fc}", // udieresis
            "\xfd" => "\u{00fd}", // yacute
            "\xfe" => "\u{00fe}", // thorn
            "\xff" => "\u{00ff}", // ydieresis
        ];
    }

    /**
     * Converts a PDFDocEncoding byte string to UTF-8.
     *
     * Each byte listed in the code page is replaced by its UTF-8 sequence;
     * bytes that are not listed are copied through unchanged. Because every
     * key is exactly one byte and strtr() never rescans text it has already
     * replaced, the multi-byte output is not translated a second time.
     *
     * @param string $content raw bytes in PDFDocEncoding
     *
     * @return string the converted string
     */
    public static function convertPDFDoc2UTF8(string $content): string
    {
        // Late static binding is kept so a subclass may supply its own table.
        return strtr($content, static::getCodePage());
    }
}
190