1
|
|
|
"use strict"; |
2
|
|
|
var Buffer = require("buffer").Buffer; |
3
|
|
|
|
4
|
|
|
// Multibyte codec. In this scheme, a character is represented by 1 or more bytes. |
5
|
|
|
// Our codec supports UTF-16 surrogates, extensions for GB18030 and unicode sequences. |
6
|
|
|
// To save memory and loading time, we read table files only when requested. |
7
|
|
|
|
8
|
|
|
exports._dbcs = DBCSCodec; |
9
|
|
|
|
10
|
|
|
// Sentinel values stored in the decode/encode tables in place of real character codes.
var UNASSIGNED = -1;       // No mapping exists for this byte / character.
var GB18030_CODE = -2;     // Marks the end of a GB18030 4-byte sequence in the decode trie.
var SEQ_START = -10;       // Values <= SEQ_START refer to an entry of a sequence table.
var NODE_START = -1000;    // Values <= NODE_START refer to another node of the decode trie.
var DEF_CHAR = -1;         // Key used inside encodeTableSeq objects for "end of sequence".

// Template node/bucket of 256 entries, all unassigned. Users copy it with .slice(0).
var UNASSIGNED_NODE = new Array(0x100);

for (var i = 0; i < 0x100; i++) {
    UNASSIGNED_NODE[i] = UNASSIGNED;
}
19
|
|
|
|
20
|
|
|
|
21
|
|
|
// Class DBCSCodec reads and initializes mapping tables.
//   codecOptions.encodingName   - name of the encoding, used in error messages.
//   codecOptions.table          - function returning the mapping table (an array of chunks).
//   codecOptions.encodeSkipVals - optional array of multibyte codes (numbers or {from, to} ranges)
//                                 that are decoded but never produced by the encoder.
//   codecOptions.encodeAdd      - optional object {unicode char: multibyte code} of extra encode-only pairs.
//   codecOptions.gb18030        - optional function returning the GB18030 ranges.
//   iconv                       - provides defaultCharUnicode and defaultCharSingleByte.
// Throws an Error when codecOptions or codecOptions.table is missing.
function DBCSCodec(codecOptions, iconv) {
    // Validate first: reading a property of a missing codecOptions would raise a TypeError
    // before the intended error could be thrown.
    if (!codecOptions)
        throw new Error("DBCS codec is called without the data.")
    this.encodingName = codecOptions.encodingName;
    if (!codecOptions.table)
        throw new Error("Encoding '" + this.encodingName + "' has no data.");

    // Load tables.
    var mappingTable = codecOptions.table();


    // Decode tables: MBCS -> Unicode.

    // decodeTables is a trie, encoded as an array of arrays of integers. Internal arrays are trie nodes and all have len = 256.
    // Trie root is decodeTables[0].
    // Values: >=  0 -> unicode character code. can be > 0xFFFF
    //         == UNASSIGNED -> unknown/unassigned sequence.
    //         == GB18030_CODE -> this is the end of a GB18030 4-byte sequence.
    //         <= NODE_START -> index of the next node in our trie to process next byte.
    //         <= SEQ_START  -> index of the start of a character code sequence, in decodeTableSeq.
    this.decodeTables = [];
    this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node.

    // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here.
    this.decodeTableSeq = [];

    // Actual mapping tables consist of chunks. Use them to fill up decode tables.
    for (var i = 0; i < mappingTable.length; i++)
        this._addDecodeChunk(mappingTable[i]);

    this.defaultCharUnicode = iconv.defaultCharUnicode;


    // Encode tables: Unicode -> DBCS.

    // `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance.
    // Because it can be sparse, it is represented as array of buckets by 256 chars each. Bucket can be null.
    // Values: >=  0 -> it is a normal char. Write the value (if <=256 then 1 byte, if <=65536 then 2 bytes, etc.).
    //         == UNASSIGNED -> no conversion found. Output a default char.
    //         <= SEQ_START  -> it's an index in encodeTableSeq, see below. The character starts a sequence.
    this.encodeTable = [];

    // `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of
    // objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key
    // means end of sequence (needed when one sequence is a strict subsequence of another).
    // Objects are kept separately from encodeTable to increase performance.
    this.encodeTableSeq = [];

    // Some chars can be decoded, but need not be encoded.
    var skipEncodeChars = {};
    if (codecOptions.encodeSkipVals)
        for (var i = 0; i < codecOptions.encodeSkipVals.length; i++) {
            var val = codecOptions.encodeSkipVals[i];
            if (typeof val === 'number')
                skipEncodeChars[val] = true;
            else
                for (var j = val.from; j <= val.to; j++)
                    skipEncodeChars[j] = true;
        }

    // Use decode trie to recursively fill out encode tables.
    this._fillEncodeTable(0, 0, skipEncodeChars);

    // Add more encoding pairs when needed.
    if (codecOptions.encodeAdd) {
        for (var uChar in codecOptions.encodeAdd)
            if (Object.prototype.hasOwnProperty.call(codecOptions.encodeAdd, uChar))
                this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]);
    }

    // Pick the code written for unencodable characters: the configured default char, then the
    // encoding's own '?', then plain ASCII '?'. Buckets are indexed by char code, so the '?'
    // lookup must use its code (indexing by the string '?' always yields undefined).
    this.defCharSB = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)];
    if (this.defCharSB === UNASSIGNED) this.defCharSB = this.encodeTable[0]["?".charCodeAt(0)];
    if (this.defCharSB === UNASSIGNED) this.defCharSB = "?".charCodeAt(0);


    // Load & create GB18030 tables when needed.
    if (typeof codecOptions.gb18030 === 'function') {
        this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges.

        // Add GB18030 decode tables.
        var thirdByteNodeIdx = this.decodeTables.length;
        var thirdByteNode = this.decodeTables[thirdByteNodeIdx] = UNASSIGNED_NODE.slice(0);

        var fourthByteNodeIdx = this.decodeTables.length;
        var fourthByteNode = this.decodeTables[fourthByteNodeIdx] = UNASSIGNED_NODE.slice(0);

        // Every lead byte 0x81..0xFE is expected to already have a second-byte node in the trie;
        // its digits 0x30..0x39 are routed to the shared third-byte node.
        for (var i = 0x81; i <= 0xFE; i++) {
            var secondByteNodeIdx = NODE_START - this.decodeTables[0][i];
            var secondByteNode = this.decodeTables[secondByteNodeIdx];
            for (var j = 0x30; j <= 0x39; j++)
                secondByteNode[j] = NODE_START - thirdByteNodeIdx;
        }
        for (var i = 0x81; i <= 0xFE; i++)
            thirdByteNode[i] = NODE_START - fourthByteNodeIdx;
        for (var i = 0x30; i <= 0x39; i++)
            fourthByteNode[i] = GB18030_CODE
    }
}

// Constructors for the streaming encoder and decoder of this codec.
DBCSCodec.prototype.encoder = DBCSEncoder;
DBCSCodec.prototype.decoder = DBCSDecoder;
123
|
|
|
|
124
|
|
|
// Decoder helpers

// Returns the trie node (256-entry array) in which the last byte of the multibyte code `addr`
// is to be stored, creating the intermediate nodes for the leading bytes when they don't exist yet.
// Throws an Error if a leading byte is already mapped to something other than a trie node.
DBCSCodec.prototype._getDecodeTrieNode = function(addr) {
    // Split the code into bytes, least significant first. A separate variable is shifted so
    // that `addr` is still intact when it is reported in the error message below.
    var bytes = [];
    for (var rest = addr; rest > 0; rest >>= 8)
        bytes.push(rest & 0xFF);
    if (bytes.length == 0)
        bytes.push(0);

    var node = this.decodeTables[0];
    for (var i = bytes.length-1; i > 0; i--) { // Traverse nodes deeper into the trie.
        var val = node[bytes[i]];

        if (val == UNASSIGNED) { // Create new node.
            node[bytes[i]] = NODE_START - this.decodeTables.length;
            this.decodeTables.push(node = UNASSIGNED_NODE.slice(0));
        }
        else if (val <= NODE_START) { // Existing node.
            node = this.decodeTables[NODE_START - val];
        }
        else
            throw new Error("Overwrite byte in " + this.encodingName + ", addr: " + addr.toString(16));
    }
    return node;
}
148
|
|
|
|
149
|
|
|
|
150
|
|
|
// Writes one chunk of a mapping table into the decode trie.
// chunk[0] is the hex multibyte code of the first character; every following element is either
//   - a string: its characters are mapped to consecutive codes (surrogate pairs become one astral
//     code point, a marker char in 0x0FF1..0x0FFF introduces a character sequence), or
//   - a number N: N more consecutive codes continue counting up from the previously written value.
// Throws an Error on a broken surrogate pair, an element of another type, or a chunk that is too long.
DBCSCodec.prototype._addDecodeChunk = function(chunk) {
    var startCode = parseInt(chunk[0], 16);

    // The node receives the last byte of each code; `pos` is that byte.
    var target = this._getDecodeTrieNode(startCode);
    var pos = startCode & 0xFF;

    for (var idx = 1; idx < chunk.length; idx++) {
        var piece = chunk[idx];
        var kind = typeof piece;

        if (kind === "string") {
            var at = 0;
            while (at < piece.length) {
                var unit = piece.charCodeAt(at++);

                if (0xD800 <= unit && unit < 0xDC00) {
                    // Lead surrogate: it must be followed by a trail surrogate.
                    var trail = piece.charCodeAt(at++);
                    if (!(0xDC00 <= trail && trail < 0xE000))
                        throw new Error("Incorrect surrogate pair in " + this.encodingName + " at chunk " + chunk[0]);
                    target[pos++] = 0x10000 + (unit - 0xD800) * 0x400 + (trail - 0xDC00);
                }
                else if (0x0FF0 < unit && unit <= 0x0FFF) {
                    // Sequence marker: it encodes how many of the following chars form the sequence.
                    // No surrogates or nested sequences are supported inside it.
                    var count = 0xFFF - unit + 2;
                    var codes = [];
                    while (codes.length < count)
                        codes.push(piece.charCodeAt(at++));

                    target[pos++] = SEQ_START - this.decodeTableSeq.length;
                    this.decodeTableSeq.push(codes);
                }
                else {
                    target[pos++] = unit; // Plain BMP character.
                }
            }
        }
        else if (kind === "number") {
            var next = target[pos - 1] + 1;
            for (var made = 0; made < piece; made++)
                target[pos++] = next++;
        }
        else {
            throw new Error("Incorrect type '" + kind + "' given in " + this.encodingName + " at chunk " + chunk[0]);
        }
    }

    if (pos > 0xFF)
        throw new Error("Incorrect chunk in " + this.encodingName + " at addr " + chunk[0] + ": too long" + pos);
}
195
|
|
|
|
196
|
|
|
// Encoder helpers

// Returns the 256-entry bucket of encodeTable that holds the unicode code `uCode`,
// allocating an all-unassigned bucket the first time it is requested.
DBCSCodec.prototype._getEncodeBucket = function(uCode) {
    var bucketIdx = uCode >> 8; // Can exceed 0xFF for astral characters.
    var bucket = this.encodeTable[bucketIdx];
    if (bucket === undefined)
        bucket = this.encodeTable[bucketIdx] = UNASSIGNED_NODE.slice(0);
    return bucket;
}
203
|
|
|
|
204
|
|
|
// Registers `dbcsCode` as the encoding of the unicode code `uCode`.
// The first mapping wins: an already assigned plain character is left untouched. When `uCode`
// already starts a sequence, the code is stored as that sequence's single-char (DEF_CHAR) value.
DBCSCodec.prototype._setEncodeChar = function(uCode, dbcsCode) {
    var slots = this._getEncodeBucket(uCode);
    var pos = uCode & 0xFF;
    var current = slots[pos];

    if (current <= SEQ_START) {
        this.encodeTableSeq[SEQ_START - current][DEF_CHAR] = dbcsCode;
        return;
    }
    if (current == UNASSIGNED)
        slots[pos] = dbcsCode;
}
212
|
|
|
|
213
|
|
|
// Registers `dbcsCode` as the encoding of the sequence of unicode codes `seq`.
// The first code of the sequence selects (or creates) a root object in encodeTableSeq; every
// following code is a key one level deeper, and the last code maps to `dbcsCode`.
// A DEF_CHAR key on a node holds the code of the sequence that ends at that node.
DBCSCodec.prototype._setEncodeSequence = function(seq, dbcsCode) {

    // Get the root of character tree according to first character of the sequence.
    var uCode = seq[0];
    var bucket = this._getEncodeBucket(uCode);
    var low = uCode & 0xFF;

    var node;
    if (bucket[low] <= SEQ_START) {
        // There's already a sequence starting with this character - use it.
        node = this.encodeTableSeq[SEQ_START-bucket[low]];
    }
    else {
        // There was no sequence object - allocate a new one.
        node = {};
        if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low]; // If a char was set before - make it a single-char subsequence.
        bucket[low] = SEQ_START - this.encodeTableSeq.length;
        this.encodeTableSeq.push(node);
    }

    // Traverse the character tree, allocating new nodes as needed.
    for (var j = 1; j < seq.length-1; j++) {
        uCode = seq[j]; // Descend by the j-th character of the sequence (not by the first one).
        var oldVal = node[uCode];
        if (typeof oldVal === 'object')
            node = oldVal;
        else {
            node = node[uCode] = {};
            if (oldVal !== undefined)
                node[DEF_CHAR] = oldVal; // A shorter sequence ended here - keep it as the node's default.
        }
    }

    // Set the leaf to given dbcsCode. If longer sequences already continue from this point,
    // keep their subtree and record the code as the value for "sequence ends here".
    uCode = seq[seq.length-1];
    if (typeof node[uCode] === 'object')
        node[uCode][DEF_CHAR] = dbcsCode;
    else
        node[uCode] = dbcsCode;
}
249
|
|
|
|
250
|
|
|
// Walks the decode trie node `nodeIdx` and registers the reverse (unicode -> multibyte) mapping
// for each of its entries. `prefix` holds the bytes leading to this node, shifted left so that
// the current byte can be added to it. Codes present in `skipEncodeChars` are not visited.
DBCSCodec.prototype._fillEncodeTable = function(nodeIdx, prefix, skipEncodeChars) {
    var trieNode = this.decodeTables[nodeIdx];

    for (var b = 0; b < 0x100; b++) {
        var code = prefix + b;
        if (skipEncodeChars[code])
            continue;

        var entry = trieNode[b];
        if (entry >= 0) {
            // Plain character.
            this._setEncodeChar(entry, code);
        }
        else if (entry <= NODE_START) {
            // Link to a deeper node: recurse with the current code as the new prefix.
            this._fillEncodeTable(NODE_START - entry, code << 8, skipEncodeChars);
        }
        else if (entry <= SEQ_START) {
            // Character sequence.
            this._setEncodeSequence(this.decodeTableSeq[SEQ_START - entry], code);
        }
        // Any other negative value has no encode-side counterpart.
    }
}
266
|
|
|
|
267
|
|
|
|
268
|
|
|
|
269
|
|
|
// == Encoder ==================================================================

// Streaming encoder. `codec` supplies the prepared encode tables; `options` is not read here.
function DBCSEncoder(options, codec) {
    // State carried between write() calls.
    this.leadSurrogate = -1;     // Pending lead surrogate, or -1 when there is none.
    this.seqObj = undefined;     // Current node of encodeTableSeq while inside a sequence.

    // Read-only data taken from the codec.
    this.encodeTable = codec.encodeTable;
    this.encodeTableSeq = codec.encodeTableSeq;
    this.defaultCharSingleByte = codec.defCharSB;
    this.gb18030 = codec.gb18030;
}
282
|
|
|
|
283
|
|
|
// Encodes the next piece of the input string and returns the resulting bytes as a Buffer.
// A trailing lead surrogate or an unfinished character sequence is remembered in the encoder
// state and completed by the next write() (or flushed by end()).
DBCSEncoder.prototype.write = function(str) {
    // One emitted code takes at most 3 bytes (4 for the GB18030 algorithmic range).
    // Besides one code per input char, a lead surrogate and a sequence start left over from the
    // previous call can each emit one extra code, hence the "+ 2". Without that slack the extra
    // bytes would fall outside the buffer and be silently dropped.
    var bytesPerCode = this.gb18030 ? 4 : 3;
    var newBuf = new Buffer((str.length + 2) * bytesPerCode),
        leadSurrogate = this.leadSurrogate,
        seqObj = this.seqObj, nextChar = -1,
        i = 0, j = 0;

    while (true) {
        // 0. Get next character.
        var uCode;
        if (nextChar === -1) {
            if (i == str.length) break;
            uCode = str.charCodeAt(i++);
        }
        else {
            // A character put back by the previous iteration.
            uCode = nextChar;
            nextChar = -1;
        }

        // 1. Handle surrogates.
        if (0xD800 <= uCode && uCode < 0xE000) { // Char is one of surrogates.
            if (uCode < 0xDC00) { // We've got lead surrogate.
                if (leadSurrogate === -1) {
                    leadSurrogate = uCode;
                    continue;
                } else {
                    leadSurrogate = uCode;
                    // Double lead surrogate found.
                    uCode = UNASSIGNED;
                }
            } else { // We've got trail surrogate.
                if (leadSurrogate !== -1) {
                    uCode = 0x10000 + (leadSurrogate - 0xD800) * 0x400 + (uCode - 0xDC00);
                    leadSurrogate = -1;
                } else {
                    // Incomplete surrogate pair - only trail surrogate found.
                    uCode = UNASSIGNED;
                }
            }
        }
        else if (leadSurrogate !== -1) {
            // Incomplete surrogate pair - only lead surrogate found.
            nextChar = uCode; uCode = UNASSIGNED; // Write an error, then current char.
            leadSurrogate = -1;
        }

        // 2. Convert uCode character.
        var dbcsCode = UNASSIGNED;
        if (seqObj !== undefined && uCode != UNASSIGNED) { // We are in the middle of the sequence
            var resCode = seqObj[uCode];
            if (typeof resCode === 'object') { // Sequence continues.
                seqObj = resCode;
                continue;

            } else if (typeof resCode == 'number') { // Sequence finished. Write it.
                dbcsCode = resCode;

            } else if (resCode == undefined) { // Current character is not part of the sequence.

                // Try default character for this sequence
                resCode = seqObj[DEF_CHAR];
                if (resCode !== undefined) {
                    dbcsCode = resCode; // Found. Write it.
                    nextChar = uCode; // Current character will be written too in the next iteration.

                } else {
                    // TODO: What if we have no default? (resCode == undefined)
                    // Then, we should write first char of the sequence as-is and try the rest recursively.
                    // Didn't do it for now because no encoding has this situation yet.
                    // Currently, just skip the sequence and write current char.
                }
            }
            seqObj = undefined;
        }
        else if (uCode >= 0) {  // Regular character
            var subtable = this.encodeTable[uCode >> 8];
            if (subtable !== undefined)
                dbcsCode = subtable[uCode & 0xFF];

            if (dbcsCode <= SEQ_START) { // Sequence start
                seqObj = this.encodeTableSeq[SEQ_START-dbcsCode];
                continue;
            }

            if (dbcsCode == UNASSIGNED && this.gb18030) {
                // Use GB18030 algorithm to find character(s) to write.
                var idx = findIdx(this.gb18030.uChars, uCode);
                if (idx != -1) {
                    dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]);
                    newBuf[j++] = 0x81 + Math.floor(dbcsCode / 12600); dbcsCode = dbcsCode % 12600;
                    newBuf[j++] = 0x30 + Math.floor(dbcsCode / 1260); dbcsCode = dbcsCode % 1260;
                    newBuf[j++] = 0x81 + Math.floor(dbcsCode / 10); dbcsCode = dbcsCode % 10;
                    newBuf[j++] = 0x30 + dbcsCode;
                    continue;
                }
            }
        }

        // 3. Write dbcsCode character.
        if (dbcsCode === UNASSIGNED)
            dbcsCode = this.defaultCharSingleByte;

        if (dbcsCode < 0x100) {
            newBuf[j++] = dbcsCode;
        }
        else if (dbcsCode < 0x10000) {
            newBuf[j++] = dbcsCode >> 8;   // high byte
            newBuf[j++] = dbcsCode & 0xFF; // low byte
        }
        else {
            newBuf[j++] = dbcsCode >> 16;
            newBuf[j++] = (dbcsCode >> 8) & 0xFF;
            newBuf[j++] = dbcsCode & 0xFF;
        }
    }

    this.seqObj = seqObj;
    this.leadSurrogate = leadSurrogate;
    return newBuf.slice(0, j);
}
402
|
|
|
|
403
|
|
|
// Flushes the encoder state at the end of the input.
// Returns undefined when nothing is pending; otherwise a Buffer with the code of an unfinished
// sequence (its DEF_CHAR value, if it has one) followed by the default char for a dangling
// lead surrogate.
DBCSEncoder.prototype.end = function() {
    if (this.leadSurrogate === -1 && this.seqObj === undefined)
        return; // All clean. Most often case.

    var newBuf = new Buffer(10), j = 0;

    if (this.seqObj) { // We're in the sequence.
        var dbcsCode = this.seqObj[DEF_CHAR];
        if (dbcsCode !== undefined) { // Write beginning of the sequence.
            // Same byte layout as in write(): 1, 2 or 3 bytes depending on the code's magnitude.
            if (dbcsCode < 0x100) {
                newBuf[j++] = dbcsCode;
            }
            else if (dbcsCode < 0x10000) {
                newBuf[j++] = dbcsCode >> 8;   // high byte
                newBuf[j++] = dbcsCode & 0xFF; // low byte
            }
            else {
                newBuf[j++] = dbcsCode >> 16;
                newBuf[j++] = (dbcsCode >> 8) & 0xFF;
                newBuf[j++] = dbcsCode & 0xFF;
            }
        } else {
            // No default for this sequence: nothing is written (same open TODO as in write()).
        }
        this.seqObj = undefined;
    }

    if (this.leadSurrogate !== -1) {
        // Incomplete surrogate pair - only lead surrogate found.
        newBuf[j++] = this.defaultCharSingleByte;
        this.leadSurrogate = -1;
    }

    return newBuf.slice(0, j);
}

// Export for testing
DBCSEncoder.prototype.findIdx = findIdx;
436
|
|
|
|
437
|
|
|
|
438
|
|
|
// == Decoder ==================================================================

// Streaming decoder. `codec` supplies the prepared decode tables; `options` is not read here.
function DBCSDecoder(options, codec) {
    // State carried between write() calls.
    this.nodeIdx = 0;                // Trie node at which the next byte is looked up (0 is the root).
    this.prevBuf = new Buffer(0);    // Bytes of a sequence left unfinished by the previous call.

    // Read-only data taken from the codec.
    this.decodeTables = codec.decodeTables;
    this.decodeTableSeq = codec.decodeTableSeq;
    this.defaultCharUnicode = codec.defaultCharUnicode;
    this.gb18030 = codec.gb18030;
}
451
|
|
|
|
452
|
|
|
// Decodes the next chunk of bytes and returns the decoded string.
// Bytes of a multibyte sequence that is not finished at the end of `buf` are kept in
// this.prevBuf (together with this.nodeIdx) and are continued by the next call.
DBCSDecoder.prototype.write = function(buf) {
    var prevBufOffset = this.prevBuf.length;

    // Output is built as UCS-2, two bytes per code unit. Bytes carried over from the previous
    // call are counted as well: when the sequence they started turns out to be invalid they are
    // re-parsed and produce characters of their own, which would not fit into buf.length*2.
    var newBuf = new Buffer((buf.length + prevBufOffset) * 2),
        nodeIdx = this.nodeIdx,
        prevBuf = this.prevBuf,
        seqStart = -prevBufOffset, // idx of the start of current parsed sequence.
        uCode;

    if (prevBufOffset > 0) // Make prev buf overlap a little to make it easier to slice later.
        prevBuf = Buffer.concat([prevBuf, buf.slice(0, 10)]);

    for (var i = 0, j = 0; i < buf.length; i++) {
        // Negative indexes address the bytes carried over from the previous call.
        var curByte = (i >= 0) ? buf[i] : prevBuf[i + prevBufOffset];

        // Lookup in current trie node.
        uCode = this.decodeTables[nodeIdx][curByte];

        if (uCode >= 0) {
            // Normal character, just use it.
        }
        else if (uCode === UNASSIGNED) { // Unknown char.
            // TODO: Callback with seq.
            i = seqStart; // Try to parse again, after skipping first byte of the sequence ('i' will be incremented by 'for' cycle).
            uCode = this.defaultCharUnicode.charCodeAt(0);
        }
        else if (uCode === GB18030_CODE) {
            var curSeq = (seqStart >= 0) ? buf.slice(seqStart, i+1) : prevBuf.slice(seqStart + prevBufOffset, i+1 + prevBufOffset);
            var ptr = (curSeq[0]-0x81)*12600 + (curSeq[1]-0x30)*1260 + (curSeq[2]-0x81)*10 + (curSeq[3]-0x30);
            var idx = findIdx(this.gb18030.gbChars, ptr);
            uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx];
        }
        else if (uCode <= NODE_START) { // Go to next trie node.
            nodeIdx = NODE_START - uCode;
            continue;
        }
        else if (uCode <= SEQ_START) { // Output a sequence of chars.
            var seq = this.decodeTableSeq[SEQ_START - uCode];
            for (var k = 0; k < seq.length - 1; k++) {
                uCode = seq[k];
                newBuf[j++] = uCode & 0xFF;
                newBuf[j++] = uCode >> 8;
            }
            uCode = seq[seq.length-1]; // The last one is written by the common code below.
        }
        else
            throw new Error("iconv-lite internal error: invalid decoding table value " + uCode + " at " + nodeIdx + "/" + curByte);

        // Write the character to buffer, handling higher planes using surrogate pair.
        if (uCode > 0xFFFF) {
            uCode -= 0x10000;
            var uCodeLead = 0xD800 + Math.floor(uCode / 0x400);
            newBuf[j++] = uCodeLead & 0xFF;
            newBuf[j++] = uCodeLead >> 8;

            uCode = 0xDC00 + uCode % 0x400;
        }
        newBuf[j++] = uCode & 0xFF;
        newBuf[j++] = uCode >> 8;

        // Reset trie node.
        nodeIdx = 0; seqStart = i+1;
    }

    this.nodeIdx = nodeIdx;
    this.prevBuf = (seqStart >= 0) ? buf.slice(seqStart) : prevBuf.slice(seqStart + prevBufOffset);
    return newBuf.slice(0, j).toString('ucs2');
}
519
|
|
|
|
520
|
|
|
// Flushes the decoder at the end of the input and returns the remaining decoded text.
// Bytes still pending form an unfinished sequence: its first byte is replaced by the default
// character and the rest is decoded again from the root, until nothing is pending anymore.
DBCSDecoder.prototype.end = function() {
    var output = '';

    for (;;) {
        var pending = this.prevBuf;
        if (!(pending.length > 0))
            break;

        // Drop the first pending byte, emitting the default character in its place.
        output += this.defaultCharUnicode;
        var rest = pending.slice(1);

        // Start over with a clean state; write() may leave new pending bytes behind.
        this.prevBuf = new Buffer(0);
        this.nodeIdx = 0;
        if (rest.length > 0)
            output += this.write(rest);
    }

    this.nodeIdx = 0;
    return output;
}
539
|
|
|
|
540
|
|
|
// Binary search for GB18030. `table` is expected to be sorted in ascending order.
// Returns the largest index whose element is <= val, or -1 when even table[0] is greater than val.
function findIdx(table, val) {
    if (table[0] > val)
        return -1;

    // Invariant: table[lo] <= val, and val < table[hi] (hi may be one past the end).
    var lo = 0, hi = table.length;
    while (hi - lo > 1) {
        var mid = lo + ((hi - lo + 1) >> 1);
        if (table[mid] <= val)
            lo = mid;
        else
            hi = mid;
    }
    return lo;
}
555
|
|
|
|
556
|
|
|
|