You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

555 lines
21 KiB

4 years ago
  1. "use strict";
  2. var Buffer = require("safer-buffer").Buffer;
  3. // Multibyte codec. In this scheme, a character is represented by 1 or more bytes.
  4. // Our codec supports UTF-16 surrogates, extensions for GB18030 and unicode sequences.
  5. // To save memory and loading time, we read table files only when requested.
  6. exports._dbcs = DBCSCodec;
  // Sentinel values stored in the decode/encode tables in place of character codes.
  var UNASSIGNED = -1;      // No mapping exists for this byte / character.
  var GB18030_CODE = -2;    // Marks the end of a GB18030 4-byte sequence in the decode trie.
  var SEQ_START = -10;      // Values <= SEQ_START are indices into the sequence tables.
  var NODE_START = -1000;   // Values <= NODE_START are indices of further decode trie nodes.
  var DEF_CHAR = -1;        // Key used inside encodeTableSeq nodes for "end of sequence".

  // Template node with 256 slots, all unassigned. New trie nodes and encode buckets are
  // made by copying it with slice(0).
  var UNASSIGNED_NODE = new Array(0x100);
  var i = 0;
  while (i < 0x100) {
      UNASSIGNED_NODE[i] = UNASSIGNED;
      i++;
  }
  // Class DBCSCodec reads and initializes mapping tables.
  //
  // codecOptions fields read here:
  //   encodingName   - name used in error messages.
  //   table          - function returning the mapping table: an array of chunks (see _addDecodeChunk).
  //   encodeSkipVals - optional array of DBCS codes (numbers) or {from, to} ranges that can be
  //                    decoded but must not be produced when encoding.
  //   encodeAdd      - optional object: key is a unicode character, value is the DBCS code to
  //                    use for it when encoding.
  //   gb18030        - optional function returning the GB18030 ranges; when present, the decode
  //                    trie is extended with 4-byte GB18030 sequences.
  // iconv fields read here: defaultCharUnicode, defaultCharSingleByte.
  //
  // Throws Error when codecOptions or codecOptions.table is missing.
  function DBCSCodec(codecOptions, iconv) {
      // Validate first: reading a property of a missing codecOptions would throw a TypeError
      // before the intended error could be raised.
      if (!codecOptions)
          throw new Error("DBCS codec is called without the data.");
      this.encodingName = codecOptions.encodingName;
      if (!codecOptions.table)
          throw new Error("Encoding '" + this.encodingName + "' has no data.");

      // Load tables.
      var mappingTable = codecOptions.table();

      // Decode tables: MBCS -> Unicode.

      // decodeTables is a trie, encoded as an array of arrays of integers. Internal arrays are trie nodes and all have len = 256.
      // Trie root is decodeTables[0].
      // Values: >=  0 -> unicode character code. can be > 0xFFFF
      //         == UNASSIGNED -> unknown/unassigned sequence.
      //         == GB18030_CODE -> this is the end of a GB18030 4-byte sequence.
      //         <= NODE_START -> index of the next node in our trie to process next byte.
      //         <= SEQ_START  -> index of the start of a character code sequence, in decodeTableSeq.
      this.decodeTables = [];
      this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node.

      // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here.
      this.decodeTableSeq = [];

      // Actual mapping tables consist of chunks. Use them to fill up decode tables.
      for (var i = 0; i < mappingTable.length; i++)
          this._addDecodeChunk(mappingTable[i]);

      this.defaultCharUnicode = iconv.defaultCharUnicode;

      // Encode tables: Unicode -> DBCS.

      // `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance.
      // Because it can be sparse, it is represented as array of buckets by 256 chars each. Bucket can be null.
      // Values: >=  0 -> it is a normal char. Write the value (if <=256 then 1 byte, if <=65536 then 2 bytes, etc.).
      //         == UNASSIGNED -> no conversion found. Output a default char.
      //         <= SEQ_START  -> it's an index in encodeTableSeq, see below. The character starts a sequence.
      this.encodeTable = [];

      // `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of
      // objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key
      // means end of sequence (needed when one sequence is a strict subsequence of another).
      // Objects are kept separately from encodeTable to increase performance.
      this.encodeTableSeq = [];

      // Some chars can be decoded, but need not be encoded.
      var skipEncodeChars = {};
      if (codecOptions.encodeSkipVals)
          for (var i = 0; i < codecOptions.encodeSkipVals.length; i++) {
              var val = codecOptions.encodeSkipVals[i];
              if (typeof val === 'number')
                  skipEncodeChars[val] = true;
              else
                  for (var j = val.from; j <= val.to; j++)
                      skipEncodeChars[j] = true;
          }

      // Use decode trie to recursively fill out encode tables.
      this._fillEncodeTable(0, 0, skipEncodeChars);

      // Add more encoding pairs when needed.
      if (codecOptions.encodeAdd) {
          for (var uChar in codecOptions.encodeAdd)
              if (Object.prototype.hasOwnProperty.call(codecOptions.encodeAdd, uChar))
                  this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]);
      }

      // Single-byte default character written when a character cannot be encoded.
      this.defCharSB = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)];
      // Fall back to the encoding of '?'. Buckets are indexed by char code, so the code of '?'
      // must be used as the index, not the string itself.
      if (this.defCharSB === UNASSIGNED) this.defCharSB = this.encodeTable[0]["?".charCodeAt(0)];
      // Last resort: the ASCII code of '?'.
      if (this.defCharSB === UNASSIGNED) this.defCharSB = "?".charCodeAt(0);

      // Load & create GB18030 tables when needed.
      if (typeof codecOptions.gb18030 === 'function') {
          this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges.

          // Add GB18030 decode tables.
          var thirdByteNodeIdx = this.decodeTables.length;
          var thirdByteNode = this.decodeTables[thirdByteNodeIdx] = UNASSIGNED_NODE.slice(0);

          var fourthByteNodeIdx = this.decodeTables.length;
          var fourthByteNode = this.decodeTables[fourthByteNodeIdx] = UNASSIGNED_NODE.slice(0);

          // Second byte 0x30..0x39 after any lead byte 0x81..0xFE continues into the shared third-byte node.
          for (var i = 0x81; i <= 0xFE; i++) {
              var secondByteNodeIdx = NODE_START - this.decodeTables[0][i];
              var secondByteNode = this.decodeTables[secondByteNodeIdx];
              for (var j = 0x30; j <= 0x39; j++)
                  secondByteNode[j] = NODE_START - thirdByteNodeIdx;
          }
          for (var i = 0x81; i <= 0xFE; i++)
              thirdByteNode[i] = NODE_START - fourthByteNodeIdx;
          for (var i = 0x30; i <= 0x39; i++)
              fourthByteNode[i] = GB18030_CODE;
      }
  }
  // Attach the decoder and encoder constructors (both defined further down in this file)
  // to the codec prototype.
  DBCSCodec.prototype.decoder = DBCSDecoder;
  DBCSCodec.prototype.encoder = DBCSEncoder;
  96. // Decoder helpers
  // Returns the decode trie node that holds the value for multibyte code `addr`, creating
  // intermediate nodes on the way when they do not exist yet. The lowest byte of `addr` is the
  // index inside the returned node and is not consumed here.
  //
  // addr - multibyte code as an integer (e.g. 0x8140).
  // Throws Error when a prefix byte of `addr` is already mapped to a character, i.e. walking
  // deeper would overwrite an existing value.
  DBCSCodec.prototype._getDecodeTrieNode = function(addr) {
      // Split the address into bytes, least significant first. A copy is shifted so that `addr`
      // stays intact and the error message below can report the real address.
      var bytes = [];
      for (var rest = addr; rest > 0; rest >>= 8)
          bytes.push(rest & 0xFF);
      if (bytes.length == 0)
          bytes.push(0);

      var node = this.decodeTables[0];
      for (var i = bytes.length-1; i > 0; i--) { // Traverse nodes deeper into the trie.
          var val = node[bytes[i]];

          if (val == UNASSIGNED) { // Create new node.
              node[bytes[i]] = NODE_START - this.decodeTables.length;
              this.decodeTables.push(node = UNASSIGNED_NODE.slice(0));
          }
          else if (val <= NODE_START) { // Existing node.
              node = this.decodeTables[NODE_START - val];
          }
          else
              throw new Error("Overwrite byte in " + this.encodingName + ", addr: " + addr.toString(16));
      }
      return node;
  };
  // Loads one chunk of the mapping table into the decode trie.
  //
  // chunk[0] is the starting multibyte code as a hex string. Every following element is either
  //   - a string: its characters are assigned to consecutive codes (surrogate pairs are merged
  //     into one code point; a character in 0x0FF1..0x0FFF announces a multi-character sequence), or
  //   - a number N: N further consecutive code points, continuing from the previously written value.
  // Throws Error on a broken surrogate pair, an element of another type, or a chunk that runs
  // past the end of its trie node.
  DBCSCodec.prototype._addDecodeChunk = function(chunk) {
      var startAddr = parseInt(chunk[0], 16);

      // Node that receives the values; only the lowest address byte indexes into it.
      var target = this._getDecodeTrieNode(startAddr);
      var pos = startAddr & 0xFF;

      for (var partIdx = 1; partIdx < chunk.length; partIdx++) {
          var part = chunk[partIdx];

          switch (typeof part) {
              case "string":
                  var cursor = 0;
                  while (cursor < part.length) {
                      var unit = part.charCodeAt(cursor++);

                      if (0xD800 <= unit && unit < 0xDC00) {
                          // Lead surrogate: it must be followed by a trail surrogate.
                          var trail = part.charCodeAt(cursor++);
                          if (!(0xDC00 <= trail && trail < 0xE000))
                              throw new Error("Incorrect surrogate pair in "  + this.encodingName + " at chunk " + chunk[0]);
                          target[pos++] = 0x10000 + (unit - 0xD800) * 0x400 + (trail - 0xDC00);
                      }
                      else if (0x0FF0 < unit && unit <= 0x0FFF) {
                          // Marker of a character sequence: 0x0FFF means 2 chars follow, 0x0FFE means 3, ...
                          // Surrogates and nested sequences inside the sequence are not supported.
                          var seqLen = 0xFFF - unit + 2;
                          var seq = [];
                          while (seq.length < seqLen)
                              seq.push(part.charCodeAt(cursor++));

                          target[pos++] = SEQ_START - this.decodeTableSeq.length;
                          this.decodeTableSeq.push(seq);
                      }
                      else {
                          target[pos++] = unit; // Plain character.
                      }
                  }
                  break;

              case "number":
                  // Run of `part` consecutive code points following the last written value.
                  var nextCode = target[pos - 1] + 1;
                  for (var n = 0; n < part; n++)
                      target[pos++] = nextCode++;
                  break;

              default:
                  throw new Error("Incorrect type '" + typeof part + "' given in "  + this.encodingName + " at chunk " + chunk[0]);
          }
      }

      if (pos > 0xFF)
          throw new Error("Incorrect chunk in "  + this.encodingName + " at addr " + chunk[0] + ": too long" + pos);
  };
  160. // Encoder helpers
  // Returns the 256-entry encode bucket that covers unicode code `uCode`, allocating an
  // all-unassigned bucket the first time a given high part is requested.
  DBCSCodec.prototype._getEncodeBucket = function(uCode) {
      // Bucket index is everything above the low byte; it exceeds 0xFF for astral characters.
      var bucketIdx = uCode >> 8;
      var bucket = this.encodeTable[bucketIdx];
      if (bucket === undefined) {
          bucket = UNASSIGNED_NODE.slice(0);
          this.encodeTable[bucketIdx] = bucket;
      }
      return bucket;
  };
  // Registers `dbcsCode` as the encoding of the single unicode character `uCode`.
  // An already assigned character keeps its first mapping. When the character already starts a
  // sequence, the code is stored as that sequence tree's single-character fallback instead.
  DBCSCodec.prototype._setEncodeChar = function(uCode, dbcsCode) {
      var bucket = this._getEncodeBucket(uCode);
      var slot = uCode & 0xFF;
      var current = bucket[slot];

      if (current <= SEQ_START) {
          // The char is the head of a sequence: record the code under DEF_CHAR of its tree root.
          this.encodeTableSeq[SEQ_START - current][DEF_CHAR] = dbcsCode;
          return;
      }
      if (current == UNASSIGNED)
          bucket[slot] = dbcsCode;
  };
  // Registers `dbcsCode` as the encoding of the unicode character sequence `seq`.
  //
  // seq      - array of unicode char codes (at least 2 elements).
  // dbcsCode - multibyte code written when the whole sequence is matched.
  //
  // The first character selects (or creates) a tree root in encodeTableSeq; each following
  // character is a key one level deeper, and the last character's key holds dbcsCode. A code
  // that was previously stored where a subtree is now needed is kept under the DEF_CHAR key.
  DBCSCodec.prototype._setEncodeSequence = function(seq, dbcsCode) {

      // Get the root of character tree according to first character of the sequence.
      var uCode = seq[0];
      var bucket = this._getEncodeBucket(uCode);
      var low = uCode & 0xFF;

      var node;
      if (bucket[low] <= SEQ_START) {
          // There's already a sequence with - use it.
          node = this.encodeTableSeq[SEQ_START-bucket[low]];
      }
      else {
          // There was no sequence object - allocate a new one.
          node = {};
          if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low]; // If a char was set before - make it a single-char subsequence.
          bucket[low] = SEQ_START - this.encodeTableSeq.length;
          this.encodeTableSeq.push(node);
      }

      // Traverse the character tree, allocating new nodes as needed.
      for (var j = 1; j < seq.length-1; j++) {
          // Each level is keyed by its own character of the sequence, not by the first one.
          uCode = seq[j];
          var oldVal = node[uCode];
          if (typeof oldVal === 'object')
              node = oldVal;
          else {
              node = node[uCode] = {};
              if (oldVal !== undefined)
                  node[DEF_CHAR] = oldVal;
          }
      }

      // Set the leaf to given dbcsCode.
      uCode = seq[seq.length-1];
      node[uCode] = dbcsCode;
  };
  // Walks decode trie node `nodeIdx` and registers the reverse (unicode -> multibyte) mapping
  // for every entry found, recursing into child nodes.
  //
  // nodeIdx         - index of the trie node in this.decodeTables.
  // prefix          - multibyte code formed by the bytes leading to this node, already shifted
  //                   left by one byte so the current byte can simply be added.
  // skipEncodeChars - object whose truthy keys are multibyte codes to leave out; a skipped code
  //                   that leads to a child node excludes that whole subtree.
  DBCSCodec.prototype._fillEncodeTable = function(nodeIdx, prefix, skipEncodeChars) {
      var trieNode = this.decodeTables[nodeIdx];

      for (var byteVal = 0; byteVal < 0x100; byteVal++) {
          var mbCode = prefix + byteVal;
          if (skipEncodeChars[mbCode])
              continue;

          var value = trieNode[byteVal];
          if (value >= 0) {
              // Plain character.
              this._setEncodeChar(value, mbCode);
          }
          else if (value <= NODE_START) {
              // Pointer to a deeper node: descend with the extended prefix.
              this._fillEncodeTable(NODE_START - value, mbCode << 8, skipEncodeChars);
          }
          else if (value <= SEQ_START) {
              // Pointer to a character sequence.
              this._setEncodeSequence(this.decodeTableSeq[SEQ_START - value], mbCode);
          }
          // Remaining negative values (unassigned, GB18030 marker) have nothing to encode.
      }
  };
  222. // == Encoder ==================================================================
  // Streaming encoder (unicode string -> multibyte buffer) for a DBCSCodec.
  // `options` is accepted but not read here; `codec` supplies the shared, read-only tables.
  function DBCSEncoder(options, codec) {
      // Tables shared with the codec.
      this.encodeTable = codec.encodeTable;
      this.encodeTableSeq = codec.encodeTableSeq;
      this.gb18030 = codec.gb18030;
      this.defaultCharSingleByte = codec.defCharSB;

      // State carried between write() calls.
      this.seqObj = undefined;   // Sequence tree node we are in the middle of, if any.
      this.leadSurrogate = -1;   // Pending lead surrogate, or -1 when there is none.
  }
  // Encodes one chunk of a string and returns the resulting bytes as a Buffer.
  //
  // str - string chunk. A lead surrogate at the end of the chunk, or a character sequence that
  //       is still open, is remembered in this.leadSurrogate / this.seqObj and finished by the
  //       next write() or by end().
  //
  // Characters without a mapping (and broken surrogate pairs) are written as
  // this.defaultCharSingleByte. With GB18030 enabled, unmapped characters found in the GB18030
  // ranges are written as 4-byte sequences.
  DBCSEncoder.prototype.write = function(str) {
      // Each char of `str` yields at most 3 bytes (4 with GB18030). State left over from the
      // previous call can add output that no char of `str` accounts for: 1 byte for a dangling
      // lead surrogate plus up to 3 bytes for the fallback code of an unfinished sequence.
      // Without this slack those bytes overflow the buffer and are silently lost.
      var newBuf = Buffer.alloc(str.length * (this.gb18030 ? 4 : 3) + 4),
          leadSurrogate = this.leadSurrogate,
          seqObj = this.seqObj, nextChar = -1,
          i = 0, j = 0,
          uCode;

      while (true) {
          // 0. Get next character.
          if (nextChar === -1) {
              if (i == str.length) break;
              uCode = str.charCodeAt(i++);
          }
          else {
              // Re-process a character that was put back by the previous iteration.
              uCode = nextChar;
              nextChar = -1;
          }

          // 1. Handle surrogates.
          if (0xD800 <= uCode && uCode < 0xE000) { // Char is one of surrogates.
              if (uCode < 0xDC00) { // We've got lead surrogate.
                  if (leadSurrogate === -1) {
                      leadSurrogate = uCode;
                      continue;
                  } else {
                      leadSurrogate = uCode;
                      // Double lead surrogate found.
                      uCode = UNASSIGNED;
                  }
              } else { // We've got trail surrogate.
                  if (leadSurrogate !== -1) {
                      uCode = 0x10000 + (leadSurrogate - 0xD800) * 0x400 + (uCode - 0xDC00);
                      leadSurrogate = -1;
                  } else {
                      // Incomplete surrogate pair - only trail surrogate found.
                      uCode = UNASSIGNED;
                  }
              }
          }
          else if (leadSurrogate !== -1) {
              // Incomplete surrogate pair - only lead surrogate found.
              nextChar = uCode; uCode = UNASSIGNED; // Write an error, then current char.
              leadSurrogate = -1;
          }

          // 2. Convert uCode character.
          var dbcsCode = UNASSIGNED;
          if (seqObj !== undefined && uCode != UNASSIGNED) { // We are in the middle of the sequence
              var resCode = seqObj[uCode];
              if (typeof resCode === 'object') { // Sequence continues.
                  seqObj = resCode;
                  continue;

              } else if (typeof resCode == 'number') { // Sequence finished. Write it.
                  dbcsCode = resCode;

              } else if (resCode == undefined) { // Current character is not part of the sequence.

                  // Try default character for this sequence
                  resCode = seqObj[DEF_CHAR];
                  if (resCode !== undefined) {
                      dbcsCode = resCode; // Found. Write it.
                      nextChar = uCode; // Current character will be written too in the next iteration.

                  } else {
                      // TODO: What if we have no default? (resCode == undefined)
                      // Then, we should write first char of the sequence as-is and try the rest recursively.
                      // Didn't do it for now because no encoding has this situation yet.
                      // Currently, just skip the sequence and write current char.
                  }
              }
              seqObj = undefined;
          }
          else if (uCode >= 0) { // Regular character
              var subtable = this.encodeTable[uCode >> 8];
              if (subtable !== undefined)
                  dbcsCode = subtable[uCode & 0xFF];

              if (dbcsCode <= SEQ_START) { // Sequence start
                  seqObj = this.encodeTableSeq[SEQ_START-dbcsCode];
                  continue;
              }

              if (dbcsCode == UNASSIGNED && this.gb18030) {
                  // Use GB18030 algorithm to find character(s) to write.
                  var idx = findIdx(this.gb18030.uChars, uCode);
                  if (idx != -1) {
                      // Linear pointer inside the matching range, split into the four bytes.
                      dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]);
                      newBuf[j++] = 0x81 + Math.floor(dbcsCode / 12600); dbcsCode = dbcsCode % 12600;
                      newBuf[j++] = 0x30 + Math.floor(dbcsCode / 1260); dbcsCode = dbcsCode % 1260;
                      newBuf[j++] = 0x81 + Math.floor(dbcsCode / 10); dbcsCode = dbcsCode % 10;
                      newBuf[j++] = 0x30 + dbcsCode;
                      continue;
                  }
              }
          }

          // 3. Write dbcsCode character.
          if (dbcsCode === UNASSIGNED)
              dbcsCode = this.defaultCharSingleByte;

          if (dbcsCode < 0x100) {
              newBuf[j++] = dbcsCode;
          }
          else if (dbcsCode < 0x10000) {
              newBuf[j++] = dbcsCode >> 8;   // high byte
              newBuf[j++] = dbcsCode & 0xFF; // low byte
          }
          else {
              newBuf[j++] = dbcsCode >> 16;
              newBuf[j++] = (dbcsCode >> 8) & 0xFF;
              newBuf[j++] = dbcsCode & 0xFF;
          }
      }

      // Keep unfinished state for the next call.
      this.seqObj = seqObj;
      this.leadSurrogate = leadSurrogate;
      return newBuf.slice(0, j);
  };
  // Flushes the state left over after the last write().
  // Returns undefined when nothing is pending; otherwise a Buffer holding the fallback code of
  // an unfinished sequence (if it has one) followed by the default char for a dangling lead
  // surrogate. Resets the encoder state.
  DBCSEncoder.prototype.end = function() {
      if (this.leadSurrogate === -1 && this.seqObj === undefined)
          return; // All clean. Most often case.

      var newBuf = Buffer.alloc(10), j = 0;

      if (this.seqObj) { // We're in the sequence.
          var dbcsCode = this.seqObj[DEF_CHAR];
          if (dbcsCode !== undefined) { // Write beginning of the sequence.
              // Same byte layout as in write(): 1, 2 or 3 bytes depending on the code's size.
              if (dbcsCode < 0x100) {
                  newBuf[j++] = dbcsCode;
              }
              else if (dbcsCode < 0x10000) {
                  newBuf[j++] = dbcsCode >> 8;   // high byte
                  newBuf[j++] = dbcsCode & 0xFF; // low byte
              }
              else {
                  newBuf[j++] = dbcsCode >> 16;
                  newBuf[j++] = (dbcsCode >> 8) & 0xFF;
                  newBuf[j++] = dbcsCode & 0xFF;
              }
          } else {
              // No single-char fallback for this sequence: nothing is written (see the TODO in write()).
          }
          this.seqObj = undefined;
      }

      if (this.leadSurrogate !== -1) {
          // Incomplete surrogate pair - only lead surrogate found.
          newBuf[j++] = this.defaultCharSingleByte;
          this.leadSurrogate = -1;
      }

      return newBuf.slice(0, j);
  };
  // Make the findIdx helper reachable through the encoder prototype; it is exposed for tests only.
  DBCSEncoder.prototype.findIdx = findIdx;
  367. // == Decoder ==================================================================
  // Streaming decoder (multibyte buffer -> unicode string) for a DBCSCodec.
  // `options` is accepted but not read here; `codec` supplies the shared, read-only tables.
  function DBCSDecoder(options, codec) {
      // Tables shared with the codec.
      this.decodeTables = codec.decodeTables;
      this.decodeTableSeq = codec.decodeTableSeq;
      this.gb18030 = codec.gb18030;
      this.defaultCharUnicode = codec.defaultCharUnicode;

      // State carried between write() calls.
      this.prevBuf = Buffer.alloc(0);   // Bytes of a multibyte sequence that is not complete yet.
      this.nodeIdx = 0;                 // Current trie node; 0 is the root.
  }
  // Decodes one chunk of bytes and returns the decoded text as a string.
  //
  // buf - Buffer with the next bytes of the stream. Bytes of a multibyte sequence that is cut
  //       off at the end of the chunk are kept in this.prevBuf (and the trie position in
  //       this.nodeIdx) and are completed by the next write() or flushed by end().
  //
  // An unknown sequence produces this.defaultCharUnicode for its first byte; parsing then
  // resumes from the byte after it.
  DBCSDecoder.prototype.write = function(buf) {
      var nodeIdx = this.nodeIdx,
          prevBuf = this.prevBuf, prevBufOffset = this.prevBuf.length,
          seqStart = -this.prevBuf.length, // idx of the start of current parsed sequence.
          uCode;

      // Output is UTF-16LE, 2 bytes per input byte in the worst case. Bytes kept from the
      // previous call count as input too: when they turn out to start an invalid sequence, each
      // of them is re-parsed and can produce a character of its own. Sizing by buf.length alone
      // lets that output overflow the buffer and get dropped.
      var newBuf = Buffer.alloc((buf.length + prevBufOffset) * 2);

      if (prevBufOffset > 0) // Make prev buf overlap a little to make it easier to slice later.
          prevBuf = Buffer.concat([prevBuf, buf.slice(0, 10)]);

      for (var i = 0, j = 0; i < buf.length; i++) {
          // Negative i addresses bytes remembered from the previous call.
          var curByte = (i >= 0) ? buf[i] : prevBuf[i + prevBufOffset];

          // Lookup in current trie node.
          uCode = this.decodeTables[nodeIdx][curByte];

          if (uCode >= 0) {
              // Normal character, just use it.
          }
          else if (uCode === UNASSIGNED) { // Unknown char.
              // TODO: Callback with seq.
              //var curSeq = (seqStart >= 0) ? buf.slice(seqStart, i+1) : prevBuf.slice(seqStart + prevBufOffset, i+1 + prevBufOffset);
              i = seqStart; // Try to parse again, after skipping first byte of the sequence ('i' will be incremented by 'for' cycle).
              uCode = this.defaultCharUnicode.charCodeAt(0);
          }
          else if (uCode === GB18030_CODE) {
              // End of a 4-byte GB18030 sequence: turn the bytes into a linear pointer and map it through the ranges.
              var curSeq = (seqStart >= 0) ? buf.slice(seqStart, i+1) : prevBuf.slice(seqStart + prevBufOffset, i+1 + prevBufOffset);
              var ptr = (curSeq[0]-0x81)*12600 + (curSeq[1]-0x30)*1260 + (curSeq[2]-0x81)*10 + (curSeq[3]-0x30);
              var idx = findIdx(this.gb18030.gbChars, ptr);
              uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx];
          }
          else if (uCode <= NODE_START) { // Go to next trie node.
              nodeIdx = NODE_START - uCode;
              continue;
          }
          else if (uCode <= SEQ_START) { // Output a sequence of chars.
              var seq = this.decodeTableSeq[SEQ_START - uCode];
              for (var k = 0; k < seq.length - 1; k++) {
                  uCode = seq[k];
                  newBuf[j++] = uCode & 0xFF;
                  newBuf[j++] = uCode >> 8;
              }
              uCode = seq[seq.length-1]; // Last char of the sequence is written by the common code below.
          }
          else
              throw new Error("iconv-lite internal error: invalid decoding table value " + uCode + " at " + nodeIdx + "/" + curByte);

          // Write the character to buffer, handling higher planes using surrogate pair.
          if (uCode > 0xFFFF) {
              uCode -= 0x10000;
              var uCodeLead = 0xD800 + Math.floor(uCode / 0x400);
              newBuf[j++] = uCodeLead & 0xFF;
              newBuf[j++] = uCodeLead >> 8;

              uCode = 0xDC00 + uCode % 0x400;
          }
          newBuf[j++] = uCode & 0xFF;
          newBuf[j++] = uCode >> 8;

          // Reset trie node.
          nodeIdx = 0; seqStart = i+1;
      }

      // Remember the unfinished sequence (if any) for the next call.
      this.nodeIdx = nodeIdx;
      this.prevBuf = (seqStart >= 0) ? buf.slice(seqStart) : prevBuf.slice(seqStart + prevBufOffset);
      return newBuf.slice(0, j).toString('ucs2');
  };
  // Flushes bytes of an unfinished multibyte sequence left over after the last write().
  // The first leftover byte is replaced by the default character and the bytes after it are
  // decoded again with write(); this repeats until nothing is left. Returns the resulting
  // string ('' when nothing was pending) and resets the decoder state.
  DBCSDecoder.prototype.end = function() {
      var output = '';

      while (this.prevBuf.length > 0) {
          // Drop one byte, emitting the default character in its place.
          var remainder = this.prevBuf.slice(1);
          output += this.defaultCharUnicode;

          // Start over from a clean state and decode what followed the dropped byte.
          this.nodeIdx = 0;
          this.prevBuf = Buffer.alloc(0);
          if (remainder.length > 0)
              output += this.write(remainder);
      }

      this.nodeIdx = 0;
      return output;
  };
  // Binary search used for the GB18030 range tables.
  // Returns the largest index i such that table[i] <= val, or -1 when val is below table[0].
  // The search relies on `table` being in ascending order.
  function findIdx(table, val) {
      if (table[0] > val)
          return -1;

      var lo = 0;
      var hi = table.length;
      // Invariant: table[lo] <= val, and val < table[hi] (hi may be one past the last element).
      while (hi - lo > 1) {
          var mid = lo + ((hi - lo + 1) >> 1);
          if (table[mid] <= val) {
              lo = mid;
          } else {
              hi = mid;
          }
      }
      return lo;
  }
  466. }