Skip to content

Commit 5d99a92

Browse files
authored
Convert dbcs codec and tests (#256)
1 parent a1bd8f7 commit 5d99a92

File tree

8 files changed

+673
-628
lines changed

8 files changed

+673
-628
lines changed

encodings/dbcs-codec.js

Lines changed: 513 additions & 501 deletions
Large diffs are not rendered by default.
File renamed without changes.
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"use strict";
2+
3+
const Iconv = require("iconv").Iconv,
4+
fs = require("fs"),
5+
path = require("path"),
6+
utils = require("../test/utils");
7+
8+
const fixtures = {
9+
big5: big5(),
10+
gbk: gbk(),
11+
};
12+
const outputFile = path.resolve(__dirname, "..", "test", "fixtures", "gbk-big5.json");
13+
fs.writeFileSync(outputFile, JSON.stringify(fixtures));
14+
15+
function gbk() {
16+
const inputFile = path.resolve(__dirname, "fixtures", "gbkFile.txt");
17+
const contentBuffer = fs.readFileSync(inputFile);
18+
19+
const codec = Iconv("GBK", "utf8");
20+
const str = codec.convert(contentBuffer).toString();
21+
22+
return {
23+
bytes: utils.hex(contentBuffer, true),
24+
string: str,
25+
};
26+
}
27+
28+
function big5() {
29+
const contentBuffer = Buffer.from(
30+
"PEhUTUw+DQo8SEVBRD4gICAgDQoJPFRJVExFPiBtZXRhILzQxdKquqjPpc6hR6SkpOW69K22IDwvVElUTEU+DQoJPG1ldGEgSFRUUC1FUVVJVj0iQ29udGVudC1UeXBlIiBDT05URU5UPSJ0ZXh0L2h0bWw7IGNoYXJzZXQ9YmlnNSI+DQo8L0hFQUQ+DQo8Qk9EWT4NCg0Ks2+sT6RArdPBY8XppKSk5br0rbahSTxicj4NCihUaGlzIHBhZ2UgdXNlcyBiaWc1IGNoYXJhY3RlciBzZXQuKTxicj4NCmNoYXJzZXQ9YmlnNQ0KDQo8L0JPRFk+DQo8L0hUTUw+",
31+
"base64"
32+
);
33+
34+
const codec = Iconv("big5", "utf8");
35+
const str = codec.convert(contentBuffer).toString();
36+
37+
return {
38+
bytes: utils.hex(contentBuffer, true),
39+
string: str,
40+
};
41+
}

test/big5-test.js

Lines changed: 31 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,71 +1,68 @@
11
"use strict";
22

3-
var assert = require("assert"),
4-
Buffer = require("safer-buffer").Buffer,
5-
iconv = require("../");
3+
const assert = require("assert"),
4+
utils = require("./utils"),
5+
fixtures = require("./fixtures/gbk-big5.json"),
6+
iconv = utils.requireIconv();
67

7-
var testString = "中文abc", //unicode contains Big5-code and ascii
8-
testStringBig5Buffer = Buffer.from([0xa4, 0xa4, 0xa4, 0xe5, 0x61, 0x62, 0x63]),
8+
const testString = "中文abc", //unicode contains Big5-code and ascii
9+
testStringBig5Buffer = utils.bytes("a4 a4 a4 e5 61 62 63"),
910
testString2 = "測試",
10-
testStringBig5Buffer2 = Buffer.from([0xb4, 0xfa, 0xb8, 0xd5]);
11+
testStringBig5Buffer2 = utils.bytes("b4 fa b8 d5");
1112

12-
describe("Big5 tests", function () {
13+
describe("Big5 tests #node-web", function () {
1314
it("Big5 correctly encoded/decoded", function () {
1415
assert.strictEqual(
15-
iconv.encode(testString, "big5").toString("hex"),
16-
testStringBig5Buffer.toString("hex")
16+
utils.hex(iconv.encode(testString, "big5")),
17+
utils.hex(testStringBig5Buffer)
1718
);
1819
assert.strictEqual(iconv.decode(testStringBig5Buffer, "big5"), testString);
1920
assert.strictEqual(
20-
iconv.encode(testString2, "big5").toString("hex"),
21-
testStringBig5Buffer2.toString("hex")
21+
utils.hex(iconv.encode(testString2, "big5")),
22+
utils.hex(testStringBig5Buffer2)
2223
);
2324
assert.strictEqual(iconv.decode(testStringBig5Buffer2, "big5"), testString2);
2425
});
2526

2627
it("cp950 correctly encoded/decoded", function () {
2728
assert.strictEqual(
28-
iconv.encode(testString, "cp950").toString("hex"),
29-
testStringBig5Buffer.toString("hex")
29+
utils.hex(iconv.encode(testString, "cp950")),
30+
utils.hex(testStringBig5Buffer)
3031
);
3132
assert.strictEqual(iconv.decode(testStringBig5Buffer, "cp950"), testString);
3233
});
3334

3435
it("Big5 file read decoded,compare with iconv result", function () {
35-
var contentBuffer = Buffer.from(
36-
"PEhUTUw+DQo8SEVBRD4gICAgDQoJPFRJVExFPiBtZXRhILzQxdKquqjPpc6hR6SkpOW69K22IDwvVElUTEU+DQoJPG1ldGEgSFRUUC1FUVVJVj0iQ29udGVudC1UeXBlIiBDT05URU5UPSJ0ZXh0L2h0bWw7IGNoYXJzZXQ9YmlnNSI+DQo8L0hFQUQ+DQo8Qk9EWT4NCg0Ks2+sT6RArdPBY8XppKSk5br0rbahSTxicj4NCihUaGlzIHBhZ2UgdXNlcyBiaWc1IGNoYXJhY3RlciBzZXQuKTxicj4NCmNoYXJzZXQ9YmlnNQ0KDQo8L0JPRFk+DQo8L0hUTUw+",
37-
"base64"
38-
);
39-
var str = iconv.decode(contentBuffer, "big5");
40-
var iconvc = new (require("iconv").Iconv)("big5", "utf8");
41-
assert.strictEqual(iconvc.convert(contentBuffer).toString(), str);
36+
const contentBuffer = utils.bytes(fixtures.big5.bytes);
37+
const str = iconv.decode(contentBuffer, "big5");
38+
assert.strictEqual(fixtures.big5.string, str);
4239
});
4340

4441
it("Big5 correctly decodes and encodes characters · and ×", function () {
4542
// https://github.com/ashtuchkin/iconv-lite/issues/13
4643
// Reference: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT
47-
var chars = "·×";
48-
var big5Chars = Buffer.from([0xa1, 0x50, 0xa1, 0xd1]);
49-
assert.strictEqual(iconv.encode(chars, "big5").toString("hex"), big5Chars.toString("hex"));
44+
const chars = "·×";
45+
const big5Chars = utils.bytes("a1 50 a1 d1");
46+
assert.strictEqual(utils.hex(iconv.encode(chars, "big5")), utils.hex(big5Chars));
5047
assert.strictEqual(iconv.decode(big5Chars, "big5"), chars);
5148
});
5249

5350
it("Big5 correctly encodes & decodes sequences", function () {
54-
assert.strictEqual(iconv.encode("\u00CA\u0304", "big5").toString("hex"), "8862");
55-
assert.strictEqual(iconv.encode("\u00EA\u030C", "big5").toString("hex"), "88a5");
56-
assert.strictEqual(iconv.encode("\u00CA", "big5").toString("hex"), "8866");
57-
assert.strictEqual(iconv.encode("\u00CA\u00CA", "big5").toString("hex"), "88668866");
51+
assert.strictEqual(utils.hex(iconv.encode("\u00CA\u0304", "big5")), "88 62");
52+
assert.strictEqual(utils.hex(iconv.encode("\u00EA\u030C", "big5")), "88 a5");
53+
assert.strictEqual(utils.hex(iconv.encode("\u00CA", "big5")), "88 66");
54+
assert.strictEqual(utils.hex(iconv.encode("\u00CA\u00CA", "big5")), "88 66 88 66");
5855

59-
assert.strictEqual(iconv.encode("\u00CA\uD800", "big5").toString("hex"), "88663f"); // Unfinished surrogate.
60-
assert.strictEqual(iconv.encode("\u00CA\uD841\uDD47", "big5").toString("hex"), "8866fa40"); // Finished surrogate ('𠕇').
61-
assert.strictEqual(iconv.encode("\u00CA𠕇", "big5").toString("hex"), "8866fa40"); // Finished surrogate ('𠕇').
56+
assert.strictEqual(utils.hex(iconv.encode("\u00CA\uD800", "big5")), "88 66 3f"); // Unfinished surrogate.
57+
assert.strictEqual(utils.hex(iconv.encode("\u00CA\uD841\uDD47", "big5")), "88 66 fa 40"); // Finished surrogate ('𠕇').
58+
assert.strictEqual(utils.hex(iconv.encode("\u00CA𠕇", "big5")), "88 66 fa 40"); // Finished surrogate ('𠕇').
6259

63-
assert.strictEqual(iconv.decode(Buffer.from("8862", "hex"), "big5"), "\u00CA\u0304");
64-
assert.strictEqual(iconv.decode(Buffer.from("8866", "hex"), "big5"), "\u00CA");
65-
assert.strictEqual(iconv.decode(Buffer.from("8866fa40", "hex"), "big5"), "\u00CA𠕇");
60+
assert.strictEqual(iconv.decode(utils.bytes("88 62"), "big5"), "\u00CA\u0304");
61+
assert.strictEqual(iconv.decode(utils.bytes("88 66"), "big5"), "\u00CA");
62+
assert.strictEqual(iconv.decode(utils.bytes("88 66 fa 40"), "big5"), "\u00CA𠕇");
6663
});
6764

6865
it("Big5 correctly encodes 十", function () {
69-
assert.strictEqual(iconv.encode("十", "big5").toString("hex"), "a451");
66+
assert.strictEqual(utils.hex(iconv.encode("十", "big5")), "a4 51");
7067
});
7168
});

test/fixtures/gbk-big5.json

Lines changed: 10 additions & 0 deletions
Large diffs are not rendered by default.

test/gbk-test.js

Lines changed: 56 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,55 +1,51 @@
11
"use strict";
22

3-
var fs = require("fs"),
4-
assert = require("assert"),
5-
Buffer = require("safer-buffer").Buffer,
6-
iconv = require("../");
3+
const assert = require("assert"),
4+
utils = require("./utils"),
5+
fixtures = require("./fixtures/gbk-big5.json"),
6+
iconv = utils.requireIconv();
77

8-
var testString = "中国abc", //unicode contains GBK-code and ascii
9-
testStringGBKBuffer = Buffer.from([0xd6, 0xd0, 0xb9, 0xfa, 0x61, 0x62, 0x63]);
8+
const testString = "中国abc", //unicode contains GBK-code and ascii
9+
testStringGBKBuffer = utils.bytes("d6 d0 b9 fa 61 62 63");
1010

11-
describe("GBK tests", function () {
11+
describe("GBK tests #node-web", function () {
1212
it("GBK correctly encoded/decoded", function () {
1313
assert.strictEqual(
14-
iconv.encode(testString, "GBK").toString("binary"),
15-
testStringGBKBuffer.toString("binary")
14+
utils.hex(iconv.encode(testString, "GBK")),
15+
utils.hex(testStringGBKBuffer)
1616
);
1717
assert.strictEqual(iconv.decode(testStringGBKBuffer, "GBK"), testString);
1818
});
1919

2020
it("GB2312 correctly encoded/decoded", function () {
2121
assert.strictEqual(
22-
iconv.encode(testString, "GB2312").toString("binary"),
23-
testStringGBKBuffer.toString("binary")
22+
utils.hex(iconv.encode(testString, "GB2312")),
23+
utils.hex(testStringGBKBuffer)
2424
);
2525
assert.strictEqual(iconv.decode(testStringGBKBuffer, "GB2312"), testString);
2626
});
2727

2828
it("GBK file read decoded,compare with iconv result", function () {
29-
var contentBuffer = fs.readFileSync(__dirname + "/gbkFile.txt");
30-
var str = iconv.decode(contentBuffer, "GBK");
31-
var iconvc = new (require("iconv").Iconv)("GBK", "utf8");
32-
assert.strictEqual(iconvc.convert(contentBuffer).toString(), str);
29+
const contentBuffer = utils.bytes(fixtures.gbk.bytes);
30+
const str = iconv.decode(contentBuffer, "GBK");
31+
assert.strictEqual(fixtures.gbk.string, str);
3332
});
3433

3534
it("GBK correctly decodes and encodes characters · and ×", function () {
3635
// https://github.com/ashtuchkin/iconv-lite/issues/13
3736
// Reference: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT
38-
var chars = "·×";
39-
var gbkChars = Buffer.from([0xa1, 0xa4, 0xa1, 0xc1]);
40-
assert.strictEqual(
41-
iconv.encode(chars, "GBK").toString("binary"),
42-
gbkChars.toString("binary")
43-
);
37+
const chars = "·×";
38+
const gbkChars = utils.bytes("a1 a4 a1 c1");
39+
assert.strictEqual(utils.hex(iconv.encode(chars, "GBK")), utils.hex(gbkChars));
4440
assert.strictEqual(iconv.decode(gbkChars, "GBK"), chars);
4541
});
4642

4743
it("GBK and GB18030 correctly decodes and encodes Euro character", function () {
4844
// Euro character (U+20AC) has two encodings in GBK family: 0x80 and 0xA2 0xE3
4945
// According to W3C's technical recommendation (https://www.w3.org/TR/encoding/#gbk-encoder),
5046
// Both GBK and GB18030 decoders should accept both encodings.
51-
var gbkEuroEncoding1 = Buffer.from([0x80]),
52-
gbkEuroEncoding2 = Buffer.from([0xa2, 0xe3]),
47+
const gbkEuroEncoding1 = utils.bytes("80"),
48+
gbkEuroEncoding2 = utils.bytes("a2 e3"),
5349
strEuro = "€";
5450

5551
assert.strictEqual(iconv.decode(gbkEuroEncoding1, "GBK"), strEuro);
@@ -58,13 +54,10 @@ describe("GBK tests", function () {
5854
assert.strictEqual(iconv.decode(gbkEuroEncoding2, "GB18030"), strEuro);
5955

6056
// But when encoding, GBK should produce 0x80, while GB18030 should produce 0xA2 0xE3.
57+
assert.strictEqual(utils.hex(iconv.encode(strEuro, "GBK")), utils.hex(gbkEuroEncoding1));
6158
assert.strictEqual(
62-
iconv.encode(strEuro, "GBK").toString("hex"),
63-
gbkEuroEncoding1.toString("hex")
64-
);
65-
assert.strictEqual(
66-
iconv.encode(strEuro, "GB18030").toString("hex"),
67-
gbkEuroEncoding2.toString("hex")
59+
utils.hex(iconv.encode(strEuro, "GB18030")),
60+
utils.hex(gbkEuroEncoding2)
6861
);
6962
});
7063

@@ -92,65 +85,54 @@ describe("GBK tests", function () {
9285
);
9386
});
9487

95-
function swapBytes(buf) {
96-
for (var i = 0; i < buf.length; i += 2) buf.writeUInt16LE(buf.readUInt16BE(i), i);
97-
return buf;
98-
}
99-
function spacify4(str) {
100-
return str.replace(/(....)/g, "$1 ").trim();
101-
}
102-
function strToHex(str) {
103-
return spacify4(swapBytes(Buffer.from(str, "ucs2")).toString("hex"));
104-
}
105-
10688
it("GB18030 encodes/decodes 4 byte sequences", function () {
107-
var chars = {
108-
"\u0080": Buffer.from([0x81, 0x30, 0x81, 0x30]),
109-
"\u0081": Buffer.from([0x81, 0x30, 0x81, 0x31]),
110-
"\u008b": Buffer.from([0x81, 0x30, 0x82, 0x31]),
111-
"\u0615": Buffer.from([0x81, 0x31, 0x82, 0x31]),
112-
㦟: Buffer.from([0x82, 0x31, 0x82, 0x31]),
113-
"\udbd9\ude77": Buffer.from([0xe0, 0x31, 0x82, 0x31]),
89+
const chars = {
90+
"\u0080": utils.bytes("81 30 81 30"),
91+
"\u0081": utils.bytes("81 30 81 31"),
92+
"\u008b": utils.bytes("81 30 82 31"),
93+
"\u0615": utils.bytes("81 31 82 31"),
94+
㦟: utils.bytes("82 31 82 31"),
95+
"\udbd9\ude77": utils.bytes("e0 31 82 31"),
11496
};
115-
for (var uChar in chars) {
116-
var gbkBuf = chars[uChar];
97+
for (const uChar in chars) {
98+
const gbkBuf = chars[uChar];
99+
assert.strictEqual(utils.hex(iconv.encode(uChar, "GB18030")), utils.hex(gbkBuf));
117100
assert.strictEqual(
118-
iconv.encode(uChar, "GB18030").toString("hex"),
119-
gbkBuf.toString("hex")
101+
utils.strToHex(iconv.decode(gbkBuf, "GB18030")),
102+
utils.strToHex(uChar)
120103
);
121-
assert.strictEqual(strToHex(iconv.decode(gbkBuf, "GB18030")), strToHex(uChar));
122104
}
123105
});
124106

125107
it("GB18030 correctly decodes incomplete 4 byte sequences", function () {
126-
var chars = {
127-
"�": Buffer.from([0x82]),
128-
"�1": Buffer.from([0x82, 0x31]),
129-
"�1�": Buffer.from([0x82, 0x31, 0x82]),
130-
㦟: Buffer.from([0x82, 0x31, 0x82, 0x31]),
131-
"� ": Buffer.from([0x82, 0x20]),
132-
"�1 ": Buffer.from([0x82, 0x31, 0x20]),
133-
"�1� ": Buffer.from([0x82, 0x31, 0x82, 0x20]),
134-
"\u399f ": Buffer.from([0x82, 0x31, 0x82, 0x31, 0x20]),
135-
"�1\u4fdb": Buffer.from([0x82, 0x31, 0x82, 0x61]),
136-
"�1\u5010\u0061": Buffer.from([0x82, 0x31, 0x82, 0x82, 0x61]),
137-
㦟俛: Buffer.from([0x82, 0x31, 0x82, 0x31, 0x82, 0x61]),
138-
"�1\u50101�1": Buffer.from([0x82, 0x31, 0x82, 0x82, 0x31, 0x82, 0x31]),
108+
const chars = {
109+
"�": utils.bytes("82"),
110+
"�1": utils.bytes("82 31"),
111+
"�1�": utils.bytes("82 31 82"),
112+
㦟: utils.bytes("82 31 82 31"),
113+
"� ": utils.bytes("82 20"),
114+
"�1 ": utils.bytes("82 31 20"),
115+
"�1� ": utils.bytes("82 31 82 20"),
116+
"\u399f ": utils.bytes("82 31 82 31 20"),
117+
"�1\u4fdb": utils.bytes("82 31 82 61"),
118+
"�1\u5010\u0061": utils.bytes("82 31 82 82 61"),
119+
㦟俛: utils.bytes("82 31 82 31 82 61"),
120+
"�1\u50101�1": utils.bytes("82 31 82 82 31 82 31"),
139121
};
140-
for (var uChar in chars) {
141-
var gbkBuf = chars[uChar];
142-
assert.strictEqual(strToHex(iconv.decode(gbkBuf, "GB18030")), strToHex(uChar));
122+
for (const uChar in chars) {
123+
const gbkBuf = chars[uChar];
124+
assert.strictEqual(
125+
utils.strToHex(iconv.decode(gbkBuf, "GB18030")),
126+
utils.strToHex(uChar)
127+
);
143128
}
144129
});
145130

146131
it("GB18030:2005 changes are applied", function () {
147132
// See https://github.com/whatwg/encoding/issues/22
148-
var chars = "\u1E3F\u0000\uE7C7"; // Use \u0000 as separator
149-
var gbkChars = Buffer.from([0xa8, 0xbc, 0x00, 0x81, 0x35, 0xf4, 0x37]);
133+
const chars = "\u1E3F\u0000\uE7C7"; // Use \u0000 as separator
134+
const gbkChars = utils.bytes("a8 bc 00 81 35 f4 37");
150135
assert.strictEqual(iconv.decode(gbkChars, "GB18030"), chars);
151-
assert.strictEqual(
152-
iconv.encode(chars, "GB18030").toString("hex"),
153-
gbkChars.toString("hex")
154-
);
136+
assert.strictEqual(utils.hex(iconv.encode(chars, "GB18030")), utils.hex(gbkChars));
155137
});
156138
});

0 commit comments

Comments
 (0)