|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
/** |
|
4
|
|
|
* @file This file is part of the PdfParser library. |
|
5
|
|
|
* |
|
6
|
|
|
* @author Brian Huisman <[email protected]> |
|
7
|
|
|
* |
|
8
|
|
|
* @date 2023-06-28 |
|
9
|
|
|
* |
|
10
|
|
|
* @license LGPLv3 |
|
11
|
|
|
* |
|
12
|
|
|
* @url <https://github.com/smalot/pdfparser> |
|
13
|
|
|
* |
|
14
|
|
|
* PdfParser is a pdf library written in PHP, extraction oriented. |
|
15
|
|
|
* Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
|
16
|
|
|
* |
|
17
|
|
|
* This program is free software: you can redistribute it and/or modify |
|
18
|
|
|
* it under the terms of the GNU Lesser General Public License as published by |
|
19
|
|
|
* the Free Software Foundation, either version 3 of the License, or |
|
20
|
|
|
* (at your option) any later version. |
|
21
|
|
|
* |
|
22
|
|
|
* This program is distributed in the hope that it will be useful, |
|
23
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
24
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
25
|
|
|
* GNU Lesser General Public License for more details. |
|
26
|
|
|
* |
|
27
|
|
|
* You should have received a copy of the GNU Lesser General Public License |
|
28
|
|
|
* along with this program. |
|
29
|
|
|
* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
|
30
|
|
|
*/ |
|
31
|
|
|
|
|
32
|
|
|
// Source : https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.2.pdf |
|
33
|
|
|
// Source : https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf |
|
34
|
|
|
|
|
35
|
|
|
namespace Smalot\PdfParser\Encoding; |
|
36
|
|
|
|
|
37
|
|
|
/**
 * Class PDFDocEncoding
 *
 * Translation table and converter from the single-byte PDFDocEncoding
 * (the default encoding of PDF text strings outside content streams)
 * to UTF-8.
 */
class PDFDocEncoding
{
    /**
     * Returns the PDFDocEncoding to UTF-8 translation table.
     *
     * Only the bytes whose UTF-8 representation differs from the raw byte are
     * listed: 0x18-0x1F, 0x7F and 0x80-0xFF. Every other byte is absent from
     * the table. The bytes 0x7F, 0x9F and 0xAD are mapped to an empty string.
     *
     * @return array<string, string> single raw byte => UTF-8 encoded replacement
     */
    public static function getCodePage(): array
    {
        return [
            "\x18" => "\u{02d8}", // breve
            "\x19" => "\u{02c7}", // caron
            "\x1a" => "\u{02c6}", // circumflex
            "\x1b" => "\u{02d9}", // dotaccent
            "\x1c" => "\u{02dd}", // hungarumlaut
            "\x1d" => "\u{02db}", // ogonek
            "\x1e" => "\u{02da}", // ring (U+02DA RING ABOVE, not U+02DE RHOTIC HOOK)
            "\x1f" => "\u{02dc}", // tilde
            "\x7f" => '',
            "\x80" => "\u{2022}", // bullet
            "\x81" => "\u{2020}", // dagger
            "\x82" => "\u{2021}", // daggerdbl
            "\x83" => "\u{2026}", // ellipsis
            "\x84" => "\u{2014}", // emdash
            "\x85" => "\u{2013}", // endash
            "\x86" => "\u{0192}", // florin
            "\x87" => "\u{2044}", // fraction
            "\x88" => "\u{2039}", // guilsinglleft
            "\x89" => "\u{203a}", // guilsinglright
            "\x8a" => "\u{2212}", // minus
            "\x8b" => "\u{2030}", // perthousand
            "\x8c" => "\u{201e}", // quotedblbase
            "\x8d" => "\u{201c}", // quotedblleft
            "\x8e" => "\u{201d}", // quotedblright
            "\x8f" => "\u{2018}", // quoteleft
            "\x90" => "\u{2019}", // quoteright
            "\x91" => "\u{201a}", // quotesinglbase
            "\x92" => "\u{2122}", // trademark
            "\x93" => "\u{fb01}", // fi
            "\x94" => "\u{fb02}", // fl
            "\x95" => "\u{0141}", // Lslash
            "\x96" => "\u{0152}", // OE
            "\x97" => "\u{0160}", // Scaron
            "\x98" => "\u{0178}", // Ydieresis
            "\x99" => "\u{017d}", // Zcaron
            "\x9a" => "\u{0131}", // dotlessi
            "\x9b" => "\u{0142}", // lslash
            "\x9c" => "\u{0153}", // oe
            "\x9d" => "\u{0161}", // scaron
            "\x9e" => "\u{017e}", // zcaron
            "\x9f" => '',
            "\xa0" => "\u{20ac}", // Euro
            "\xa1" => "\u{00a1}", // exclamdown
            "\xa2" => "\u{00a2}", // cent
            "\xa3" => "\u{00a3}", // sterling
            "\xa4" => "\u{00a4}", // currency
            "\xa5" => "\u{00a5}", // yen
            "\xa6" => "\u{00a6}", // brokenbar
            "\xa7" => "\u{00a7}", // section
            "\xa8" => "\u{00a8}", // dieresis
            "\xa9" => "\u{00a9}", // copyright
            "\xaa" => "\u{00aa}", // ordfeminine
            "\xab" => "\u{00ab}", // guillemotleft
            "\xac" => "\u{00ac}", // logicalnot
            "\xad" => '',
            "\xae" => "\u{00ae}", // registered
            "\xaf" => "\u{00af}", // macron
            "\xb0" => "\u{00b0}", // degree
            "\xb1" => "\u{00b1}", // plusminus
            "\xb2" => "\u{00b2}", // twosuperior
            "\xb3" => "\u{00b3}", // threesuperior
            "\xb4" => "\u{00b4}", // acute
            "\xb5" => "\u{00b5}", // mu
            "\xb6" => "\u{00b6}", // paragraph
            "\xb7" => "\u{00b7}", // periodcentered
            "\xb8" => "\u{00b8}", // cedilla
            "\xb9" => "\u{00b9}", // onesuperior
            "\xba" => "\u{00ba}", // ordmasculine
            "\xbb" => "\u{00bb}", // guillemotright
            "\xbc" => "\u{00bc}", // onequarter
            "\xbd" => "\u{00bd}", // onehalf
            "\xbe" => "\u{00be}", // threequarters
            "\xbf" => "\u{00bf}", // questiondown
            "\xc0" => "\u{00c0}", // Agrave
            "\xc1" => "\u{00c1}", // Aacute
            "\xc2" => "\u{00c2}", // Acircumflex
            "\xc3" => "\u{00c3}", // Atilde
            "\xc4" => "\u{00c4}", // Adieresis
            "\xc5" => "\u{00c5}", // Aring
            "\xc6" => "\u{00c6}", // AE
            "\xc7" => "\u{00c7}", // Ccedilla
            "\xc8" => "\u{00c8}", // Egrave
            "\xc9" => "\u{00c9}", // Eacute
            "\xca" => "\u{00ca}", // Ecircumflex
            "\xcb" => "\u{00cb}", // Edieresis
            "\xcc" => "\u{00cc}", // Igrave
            "\xcd" => "\u{00cd}", // Iacute
            "\xce" => "\u{00ce}", // Icircumflex
            "\xcf" => "\u{00cf}", // Idieresis
            "\xd0" => "\u{00d0}", // Eth
            "\xd1" => "\u{00d1}", // Ntilde
            "\xd2" => "\u{00d2}", // Ograve
            "\xd3" => "\u{00d3}", // Oacute
            "\xd4" => "\u{00d4}", // Ocircumflex
            "\xd5" => "\u{00d5}", // Otilde
            "\xd6" => "\u{00d6}", // Odieresis
            "\xd7" => "\u{00d7}", // multiply
            "\xd8" => "\u{00d8}", // Oslash
            "\xd9" => "\u{00d9}", // Ugrave
            "\xda" => "\u{00da}", // Uacute
            "\xdb" => "\u{00db}", // Ucircumflex
            "\xdc" => "\u{00dc}", // Udieresis
            "\xdd" => "\u{00dd}", // Yacute
            "\xde" => "\u{00de}", // Thorn
            "\xdf" => "\u{00df}", // germandbls
            "\xe0" => "\u{00e0}", // agrave
            "\xe1" => "\u{00e1}", // aacute
            "\xe2" => "\u{00e2}", // acircumflex
            "\xe3" => "\u{00e3}", // atilde
            "\xe4" => "\u{00e4}", // adieresis
            "\xe5" => "\u{00e5}", // aring
            "\xe6" => "\u{00e6}", // ae
            "\xe7" => "\u{00e7}", // ccedilla
            "\xe8" => "\u{00e8}", // egrave
            "\xe9" => "\u{00e9}", // eacute
            "\xea" => "\u{00ea}", // ecircumflex
            "\xeb" => "\u{00eb}", // edieresis
            "\xec" => "\u{00ec}", // igrave
            "\xed" => "\u{00ed}", // iacute
            "\xee" => "\u{00ee}", // icircumflex
            "\xef" => "\u{00ef}", // idieresis
            "\xf0" => "\u{00f0}", // eth
            "\xf1" => "\u{00f1}", // ntilde
            "\xf2" => "\u{00f2}", // ograve
            "\xf3" => "\u{00f3}", // oacute
            "\xf4" => "\u{00f4}", // ocircumflex
            "\xf5" => "\u{00f5}", // otilde
            "\xf6" => "\u{00f6}", // odieresis
            "\xf7" => "\u{00f7}", // divide
            "\xf8" => "\u{00f8}", // oslash
            "\xf9" => "\u{00f9}", // ugrave
            "\xfa" => "\u{00fa}", // uacute
            "\xfb" => "\u{00fb}", // ucircumflex
            "\xfc" => "\u{00fc}", // udieresis
            "\xfd" => "\u{00fd}", // yacute
            "\xfe" => "\u{00fe}", // thorn
            "\xff" => "\u{00ff}", // ydieresis
        ];
    }

    /**
     * Converts a PDFDocEncoding byte string to UTF-8.
     *
     * Every key of the code page is a single byte, so strtr() translates the
     * input byte by byte in one pass: replacements are never re-scanned, and
     * bytes without an entry are copied through unchanged. Bytes mapped to an
     * empty string are removed from the result.
     *
     * @param string $content raw bytes encoded in PDFDocEncoding
     *
     * @return string the UTF-8 encoded equivalent of $content
     */
    public static function convertPDFDoc2UTF8(string $content): string
    {
        // Late static binding on purpose: a subclass may override getCodePage().
        return strtr($content, static::getCodePage());
    }
}
|
190
|
|
|
|