1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/** |
4
|
|
|
* @file This file is part of the PdfParser library. |
5
|
|
|
* |
6
|
|
|
* @author Brian Huisman <[email protected]> |
7
|
|
|
* |
8
|
|
|
* @date 2023-06-28 |
9
|
|
|
* |
10
|
|
|
* @license LGPLv3 |
11
|
|
|
* |
12
|
|
|
* @url <https://github.com/smalot/pdfparser> |
13
|
|
|
* |
14
|
|
|
* PdfParser is a pdf library written in PHP, extraction oriented. |
15
|
|
|
* Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
16
|
|
|
* |
17
|
|
|
* This program is free software: you can redistribute it and/or modify |
18
|
|
|
* it under the terms of the GNU Lesser General Public License as published by |
19
|
|
|
* the Free Software Foundation, either version 3 of the License, or |
20
|
|
|
* (at your option) any later version. |
21
|
|
|
* |
22
|
|
|
* This program is distributed in the hope that it will be useful, |
23
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
24
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
25
|
|
|
* GNU Lesser General Public License for more details. |
26
|
|
|
* |
27
|
|
|
* You should have received a copy of the GNU Lesser General Public License |
28
|
|
|
* along with this program. |
29
|
|
|
* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
30
|
|
|
*/ |
31
|
|
|
|
32
|
|
|
// Source : https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.2.pdf |
33
|
|
|
// Source : https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf |
34
|
|
|
|
35
|
|
|
namespace Smalot\PdfParser\Encoding; |
36
|
|
|
|
37
|
|
|
/** |
38
|
|
|
* Class PDFDocEncoding |
39
|
|
|
*/ |
40
|
|
|
class PDFDocEncoding
{
    /**
     * Returns the PDFDocEncoding translation table.
     *
     * Each key is a single PDFDocEncoding byte and each value is the UTF-8
     * encoded character it stands for. Only bytes that differ from their
     * UTF-8 representation are listed; bytes without an entry are left
     * untouched by strtr().
     *
     * The codes 0x7f, 0x9f and 0xad are undefined in PDFDocEncoding and map
     * to an empty string, so they are removed from converted text.
     *
     * @return array<string, string> single-byte source => UTF-8 replacement
     */
    public static function getCodePage(): array
    {
        return [
            "\x18" => "\u{02d8}", // breve
            "\x19" => "\u{02c7}", // caron
            "\x1a" => "\u{02c6}", // circumflex
            "\x1b" => "\u{02d9}", // dotaccent
            "\x1c" => "\u{02dd}", // hungarumlaut
            "\x1d" => "\u{02db}", // ogonek
            "\x1e" => "\u{02da}", // ring (U+02DA RING ABOVE, not U+02DE rhotic hook)
            "\x1f" => "\u{02dc}", // tilde
            "\x7f" => '', // undefined
            "\x80" => "\u{2022}", // bullet
            "\x81" => "\u{2020}", // dagger
            "\x82" => "\u{2021}", // daggerdbl
            "\x83" => "\u{2026}", // ellipsis
            "\x84" => "\u{2014}", // emdash
            "\x85" => "\u{2013}", // endash
            "\x86" => "\u{0192}", // florin
            "\x87" => "\u{2044}", // fraction
            "\x88" => "\u{2039}", // guilsinglleft
            "\x89" => "\u{203a}", // guilsinglright
            "\x8a" => "\u{2212}", // minus
            "\x8b" => "\u{2030}", // perthousand
            "\x8c" => "\u{201e}", // quotedblbase
            "\x8d" => "\u{201c}", // quotedblleft
            "\x8e" => "\u{201d}", // quotedblright
            "\x8f" => "\u{2018}", // quoteleft
            "\x90" => "\u{2019}", // quoteright
            "\x91" => "\u{201a}", // quotesinglbase
            "\x92" => "\u{2122}", // trademark
            "\x93" => "\u{fb01}", // fi
            "\x94" => "\u{fb02}", // fl
            "\x95" => "\u{0141}", // Lslash
            "\x96" => "\u{0152}", // OE
            "\x97" => "\u{0160}", // Scaron
            "\x98" => "\u{0178}", // Ydieresis
            "\x99" => "\u{017d}", // Zcaron
            "\x9a" => "\u{0131}", // dotlessi
            "\x9b" => "\u{0142}", // lslash
            "\x9c" => "\u{0153}", // oe
            "\x9d" => "\u{0161}", // scaron
            "\x9e" => "\u{017e}", // zcaron
            "\x9f" => '', // undefined
            "\xa0" => "\u{20ac}", // Euro
            "\xa1" => "\u{00a1}", // exclamdown
            "\xa2" => "\u{00a2}", // cent
            "\xa3" => "\u{00a3}", // sterling
            "\xa4" => "\u{00a4}", // currency
            "\xa5" => "\u{00a5}", // yen
            "\xa6" => "\u{00a6}", // brokenbar
            "\xa7" => "\u{00a7}", // section
            "\xa8" => "\u{00a8}", // dieresis
            "\xa9" => "\u{00a9}", // copyright
            "\xaa" => "\u{00aa}", // ordfeminine
            "\xab" => "\u{00ab}", // guillemotleft
            "\xac" => "\u{00ac}", // logicalnot
            "\xad" => '', // undefined
            "\xae" => "\u{00ae}", // registered
            "\xaf" => "\u{00af}", // macron
            "\xb0" => "\u{00b0}", // degree
            "\xb1" => "\u{00b1}", // plusminus
            "\xb2" => "\u{00b2}", // twosuperior
            "\xb3" => "\u{00b3}", // threesuperior
            "\xb4" => "\u{00b4}", // acute
            "\xb5" => "\u{00b5}", // mu
            "\xb6" => "\u{00b6}", // paragraph
            "\xb7" => "\u{00b7}", // periodcentered
            "\xb8" => "\u{00b8}", // cedilla
            "\xb9" => "\u{00b9}", // onesuperior
            "\xba" => "\u{00ba}", // ordmasculine
            "\xbb" => "\u{00bb}", // guillemotright
            "\xbc" => "\u{00bc}", // onequarter
            "\xbd" => "\u{00bd}", // onehalf
            "\xbe" => "\u{00be}", // threequarters
            "\xbf" => "\u{00bf}", // questiondown
            "\xc0" => "\u{00c0}", // Agrave
            "\xc1" => "\u{00c1}", // Aacute
            "\xc2" => "\u{00c2}", // Acircumflex
            "\xc3" => "\u{00c3}", // Atilde
            "\xc4" => "\u{00c4}", // Adieresis
            "\xc5" => "\u{00c5}", // Aring
            "\xc6" => "\u{00c6}", // AE
            "\xc7" => "\u{00c7}", // Ccedilla
            "\xc8" => "\u{00c8}", // Egrave
            "\xc9" => "\u{00c9}", // Eacute
            "\xca" => "\u{00ca}", // Ecircumflex
            "\xcb" => "\u{00cb}", // Edieresis
            "\xcc" => "\u{00cc}", // Igrave
            "\xcd" => "\u{00cd}", // Iacute
            "\xce" => "\u{00ce}", // Icircumflex
            "\xcf" => "\u{00cf}", // Idieresis
            "\xd0" => "\u{00d0}", // Eth
            "\xd1" => "\u{00d1}", // Ntilde
            "\xd2" => "\u{00d2}", // Ograve
            "\xd3" => "\u{00d3}", // Oacute
            "\xd4" => "\u{00d4}", // Ocircumflex
            "\xd5" => "\u{00d5}", // Otilde
            "\xd6" => "\u{00d6}", // Odieresis
            "\xd7" => "\u{00d7}", // multiply
            "\xd8" => "\u{00d8}", // Oslash
            "\xd9" => "\u{00d9}", // Ugrave
            "\xda" => "\u{00da}", // Uacute
            "\xdb" => "\u{00db}", // Ucircumflex
            "\xdc" => "\u{00dc}", // Udieresis
            "\xdd" => "\u{00dd}", // Yacute
            "\xde" => "\u{00de}", // Thorn
            "\xdf" => "\u{00df}", // germandbls
            "\xe0" => "\u{00e0}", // agrave
            "\xe1" => "\u{00e1}", // aacute
            "\xe2" => "\u{00e2}", // acircumflex
            "\xe3" => "\u{00e3}", // atilde
            "\xe4" => "\u{00e4}", // adieresis
            "\xe5" => "\u{00e5}", // aring
            "\xe6" => "\u{00e6}", // ae
            "\xe7" => "\u{00e7}", // ccedilla
            "\xe8" => "\u{00e8}", // egrave
            "\xe9" => "\u{00e9}", // eacute
            "\xea" => "\u{00ea}", // ecircumflex
            "\xeb" => "\u{00eb}", // edieresis
            "\xec" => "\u{00ec}", // igrave
            "\xed" => "\u{00ed}", // iacute
            "\xee" => "\u{00ee}", // icircumflex
            "\xef" => "\u{00ef}", // idieresis
            "\xf0" => "\u{00f0}", // eth
            "\xf1" => "\u{00f1}", // ntilde
            "\xf2" => "\u{00f2}", // ograve
            "\xf3" => "\u{00f3}", // oacute
            "\xf4" => "\u{00f4}", // ocircumflex
            "\xf5" => "\u{00f5}", // otilde
            "\xf6" => "\u{00f6}", // odieresis
            "\xf7" => "\u{00f7}", // divide
            "\xf8" => "\u{00f8}", // oslash
            "\xf9" => "\u{00f9}", // ugrave
            "\xfa" => "\u{00fa}", // uacute
            "\xfb" => "\u{00fb}", // ucircumflex
            "\xfc" => "\u{00fc}", // udieresis
            "\xfd" => "\u{00fd}", // yacute
            "\xfe" => "\u{00fe}", // thorn
            "\xff" => "\u{00ff}", // ydieresis
        ];
    }

    /**
     * Converts a PDFDocEncoding byte string to UTF-8.
     *
     * Every byte listed in getCodePage() is replaced by its UTF-8 value in a
     * single strtr() pass, so replacement output is never re-scanned. Bytes
     * that are not in the table are copied through unchanged.
     *
     * @param string $content byte string to convert
     *
     * @return string the converted string
     */
    public static function convertPDFDoc2UTF8(string $content): string
    {
        // Late static binding is kept so a subclass may supply its own table.
        return strtr($content, static::getCodePage());
    }
}
190
|
|
|
|