1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/** |
4
|
|
|
* @file |
5
|
|
|
* This file is part of the PdfParser library. |
6
|
|
|
* |
7
|
|
|
* @author Sébastien MALOT <[email protected]> |
8
|
|
|
* |
9
|
|
|
* @date 2017-01-03 |
10
|
|
|
* |
11
|
|
|
* @license LGPLv3 |
12
|
|
|
* |
13
|
|
|
* @url <https://github.com/smalot/pdfparser> |
14
|
|
|
* |
15
|
|
|
* PdfParser is a pdf library written in PHP, extraction oriented. |
16
|
|
|
* Copyright (C) 2017 - Sébastien MALOT <[email protected]> |
17
|
|
|
* |
18
|
|
|
* This program is free software: you can redistribute it and/or modify |
19
|
|
|
* it under the terms of the GNU Lesser General Public License as published by |
20
|
|
|
* the Free Software Foundation, either version 3 of the License, or |
21
|
|
|
* (at your option) any later version. |
22
|
|
|
* |
23
|
|
|
* This program is distributed in the hope that it will be useful, |
24
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
25
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
26
|
|
|
* GNU Lesser General Public License for more details. |
27
|
|
|
* |
28
|
|
|
* You should have received a copy of the GNU Lesser General Public License |
29
|
|
|
* along with this program. |
30
|
|
|
* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. |
31
|
|
|
*/ |
32
|
|
|
|
33
|
|
|
// Source : https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.2.pdf |
34
|
|
|
// Source : https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf |
35
|
|
|
|
36
|
|
|
namespace Smalot\PdfParser\Encoding; |
37
|
|
|
|
38
|
|
|
/**
 * Class PDFDocEncoding
 *
 * Translation of single-byte PDFDocEncoding text strings into UTF-8.
 *
 * The mapping follows the "PDFDocEncoding Character Set" table of the
 * PDF reference (see the "Source" links at the top of this file).
 */
abstract class PDFDocEncoding extends AbstractEncoding
{
    /**
     * Converts a PDFDocEncoding byte string to UTF-8.
     *
     * Only the bytes listed in the table below are rewritten; every other
     * byte is copied to the result unchanged, because strtr() leaves
     * anything that is not a key of the replacement array untouched.
     * The bytes 0x7f, 0x9f and 0xad are removed from the output (they are
     * mapped to the empty string; the PDF reference leaves them undefined).
     *
     * @param string $content raw bytes encoded with PDFDocEncoding
     *
     * @return string the same text encoded as UTF-8
     */
    public static function convertPDFDoc2UTF8(string $content): string
    {
        // strtr() with an array never re-scans text it has already replaced,
        // so the multi-byte UTF-8 sequences produced here are not translated
        // a second time.
        return strtr($content, [
            "\x18" => "\u{02d8}", // breve
            "\x19" => "\u{02c7}", // caron
            "\x1a" => "\u{02c6}", // circumflex
            "\x1b" => "\u{02d9}", // dotaccent
            "\x1c" => "\u{02dd}", // hungarumlaut
            "\x1d" => "\u{02db}", // ogonek
            "\x1e" => "\u{02da}", // ring (U+02DA RING ABOVE; U+02DE is the rhotic hook)
            "\x1f" => "\u{02dc}", // tilde
            "\x7f" => '',
            "\x80" => "\u{2022}", // bullet
            "\x81" => "\u{2020}", // dagger
            "\x82" => "\u{2021}", // daggerdbl
            "\x83" => "\u{2026}", // ellipsis
            "\x84" => "\u{2014}", // emdash
            "\x85" => "\u{2013}", // endash
            "\x86" => "\u{0192}", // florin
            "\x87" => "\u{2044}", // fraction
            "\x88" => "\u{2039}", // guilsinglleft
            "\x89" => "\u{203a}", // guilsinglright
            "\x8a" => "\u{2212}", // minus
            "\x8b" => "\u{2030}", // perthousand
            "\x8c" => "\u{201e}", // quotedblbase
            "\x8d" => "\u{201c}", // quotedblleft
            "\x8e" => "\u{201d}", // quotedblright
            "\x8f" => "\u{2018}", // quoteleft
            "\x90" => "\u{2019}", // quoteright
            "\x91" => "\u{201a}", // quotesinglbase
            "\x92" => "\u{2122}", // trademark
            "\x93" => "\u{fb01}", // fi
            "\x94" => "\u{fb02}", // fl
            "\x95" => "\u{0141}", // Lslash
            "\x96" => "\u{0152}", // OE
            "\x97" => "\u{0160}", // Scaron
            "\x98" => "\u{0178}", // Ydieresis
            "\x99" => "\u{017d}", // Zcaron
            "\x9a" => "\u{0131}", // dotlessi
            "\x9b" => "\u{0142}", // lslash
            "\x9c" => "\u{0153}", // oe
            "\x9d" => "\u{0161}", // scaron
            "\x9e" => "\u{017e}", // zcaron
            "\x9f" => '',
            "\xa0" => "\u{20ac}", // Euro
            "\xa1" => "\u{00a1}", // exclamdown
            "\xa2" => "\u{00a2}", // cent
            "\xa3" => "\u{00a3}", // sterling
            "\xa4" => "\u{00a4}", // currency
            "\xa5" => "\u{00a5}", // yen
            "\xa6" => "\u{00a6}", // brokenbar
            "\xa7" => "\u{00a7}", // section
            "\xa8" => "\u{00a8}", // dieresis
            "\xa9" => "\u{00a9}", // copyright
            "\xaa" => "\u{00aa}", // ordfeminine
            "\xab" => "\u{00ab}", // guillemotleft
            "\xac" => "\u{00ac}", // logicalnot
            "\xad" => '',
            "\xae" => "\u{00ae}", // registered
            "\xaf" => "\u{00af}", // macron
            "\xb0" => "\u{00b0}", // degree
            "\xb1" => "\u{00b1}", // plusminus
            "\xb2" => "\u{00b2}", // twosuperior
            "\xb3" => "\u{00b3}", // threesuperior
            "\xb4" => "\u{00b4}", // acute
            "\xb5" => "\u{00b5}", // mu
            "\xb6" => "\u{00b6}", // paragraph
            "\xb7" => "\u{00b7}", // periodcentered
            "\xb8" => "\u{00b8}", // cedilla
            "\xb9" => "\u{00b9}", // onesuperior
            "\xba" => "\u{00ba}", // ordmasculine
            "\xbb" => "\u{00bb}", // guillemotright
            "\xbc" => "\u{00bc}", // onequarter
            "\xbd" => "\u{00bd}", // onehalf
            "\xbe" => "\u{00be}", // threequarters
            "\xbf" => "\u{00bf}", // questiondown
            "\xc0" => "\u{00c0}", // Agrave
            "\xc1" => "\u{00c1}", // Aacute
            "\xc2" => "\u{00c2}", // Acircumflex
            "\xc3" => "\u{00c3}", // Atilde
            "\xc4" => "\u{00c4}", // Adieresis
            "\xc5" => "\u{00c5}", // Aring
            "\xc6" => "\u{00c6}", // AE
            "\xc7" => "\u{00c7}", // Ccedilla
            "\xc8" => "\u{00c8}", // Egrave
            "\xc9" => "\u{00c9}", // Eacute
            "\xca" => "\u{00ca}", // Ecircumflex
            "\xcb" => "\u{00cb}", // Edieresis
            "\xcc" => "\u{00cc}", // Igrave
            "\xcd" => "\u{00cd}", // Iacute
            "\xce" => "\u{00ce}", // Icircumflex
            "\xcf" => "\u{00cf}", // Idieresis
            "\xd0" => "\u{00d0}", // Eth
            "\xd1" => "\u{00d1}", // Ntilde
            "\xd2" => "\u{00d2}", // Ograve
            "\xd3" => "\u{00d3}", // Oacute
            "\xd4" => "\u{00d4}", // Ocircumflex
            "\xd5" => "\u{00d5}", // Otilde
            "\xd6" => "\u{00d6}", // Odieresis
            "\xd7" => "\u{00d7}", // multiply
            "\xd8" => "\u{00d8}", // Oslash
            "\xd9" => "\u{00d9}", // Ugrave
            "\xda" => "\u{00da}", // Uacute
            "\xdb" => "\u{00db}", // Ucircumflex
            "\xdc" => "\u{00dc}", // Udieresis
            "\xdd" => "\u{00dd}", // Yacute
            "\xde" => "\u{00de}", // Thorn
            "\xdf" => "\u{00df}", // germandbls
            "\xe0" => "\u{00e0}", // agrave
            "\xe1" => "\u{00e1}", // aacute
            "\xe2" => "\u{00e2}", // acircumflex
            "\xe3" => "\u{00e3}", // atilde
            "\xe4" => "\u{00e4}", // adieresis
            "\xe5" => "\u{00e5}", // aring
            "\xe6" => "\u{00e6}", // ae
            "\xe7" => "\u{00e7}", // ccedilla
            "\xe8" => "\u{00e8}", // egrave
            "\xe9" => "\u{00e9}", // eacute
            "\xea" => "\u{00ea}", // ecircumflex
            "\xeb" => "\u{00eb}", // edieresis
            "\xec" => "\u{00ec}", // igrave
            "\xed" => "\u{00ed}", // iacute
            "\xee" => "\u{00ee}", // icircumflex
            "\xef" => "\u{00ef}", // idieresis
            "\xf0" => "\u{00f0}", // eth
            "\xf1" => "\u{00f1}", // ntilde
            "\xf2" => "\u{00f2}", // ograve
            "\xf3" => "\u{00f3}", // oacute
            "\xf4" => "\u{00f4}", // ocircumflex
            "\xf5" => "\u{00f5}", // otilde
            "\xf6" => "\u{00f6}", // odieresis
            "\xf7" => "\u{00f7}", // divide
            "\xf8" => "\u{00f8}", // oslash
            "\xf9" => "\u{00f9}", // ugrave
            "\xfa" => "\u{00fa}", // uacute
            "\xfb" => "\u{00fb}", // ucircumflex
            "\xfc" => "\u{00fc}", // udieresis
            "\xfd" => "\u{00fd}", // yacute
            "\xfe" => "\u{00fe}", // thorn
            "\xff" => "\u{00ff}", // ydieresis
        ]);
    }
}
186
|
|
|
|