Skip to content

Commit 84ee650

Browse files
committed
Implement UTF-16LE encoding, update tests, adjust codec interface
Three major reasons for reimplementing UTF-16 instead of using the native codec: 1. We want to remove StringDecoder & Buffer references due to #235. 2. StringDecoder is inconsistent in handling surrogates on Node v6-9. 3. The NPM module string_decoder gives strange results when processing chunks - it sometimes prepends '\u0000', likely due to a bug. Performance was and is a major concern here. The decoder shouldn't be affected because it uses backend methods directly. The encoder is affected by the introduction of a character-level loop. It's still very fast (~450Mb/s), so I'm not too worried. If needed, we can make it about 4x faster in Node.js by introducing a dedicated backend method. Browser speeds will be the same.
1 parent e567849 commit 84ee650

File tree

6 files changed

+360
-87
lines changed

6 files changed

+360
-87
lines changed

encodings/internal.js

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@ module.exports = {
99
cesu8: { type: "_internal", bomAware: true},
1010
unicode11utf8: "utf8",
1111

12-
ucs2: { type: "_internal", bomAware: true},
13-
utf16le: "ucs2",
12+
// NOTE: utf-16le/ucs2 are in utf16.js.
1413

1514
binary: { type: "_internal" },
1615
base64: { type: "_internal" },

encodings/utf16.js

Lines changed: 149 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,123 @@
11
"use strict";
22

3-
// Note: UTF16-LE (or UCS2) codec is Node.js native. See encodings/internal.js
3+
// == UTF16-LE codec. ==========================================================
4+
// Note: We're not using Node.js native codec because StringDecoder implementation is buggy
5+
// (adds \0 in some chunks; doesn't flag non-even number of bytes). We do use raw encoding/decoding
6+
// routines for performance, though.
7+
8+
exports.utf16le = class Utf16LECodec {
9+
createEncoder(options, iconv) {
10+
return new Utf16LEEncoder(iconv.backend);
11+
}
12+
createDecoder(options, iconv) {
13+
return new Utf16LEDecoder(iconv.backend, iconv.defaultCharUnicode);
14+
}
15+
get bomAware() { return true; }
16+
}
17+
18+
/**
 * Streaming UTF-16LE encoder.
 *
 * Every UTF-16 code unit of the input string is emitted as two bytes,
 * low byte first. No state is kept between `write()` calls.
 */
class Utf16LEEncoder {
    /**
     * @param backend Object providing `allocBytes(size)` and `bytesToResult(bytes, length)`.
     */
    constructor(backend) {
        this.backend = backend;
        // Typed arrays store multi-byte elements in host byte order, so the Uint16Array
        // fast path below only yields little-endian output on little-endian hosts.
        this.hostIsLittleEndian = new Uint8Array(new Uint16Array([1]).buffer)[0] === 1;
    }

    /**
     * Encodes `str` into UTF-16LE bytes.
     *
     * @param {string} str Text to encode.
     * @returns Whatever `backend.bytesToResult` produces for the encoded bytes.
     */
    write(str) {
        const bytes = this.backend.allocBytes(str.length * 2);

        if (this.hostIsLittleEndian && (bytes.byteOffset & 1) === 0) {
            // Fast path: one 16-bit store per code unit. A Uint16Array view requires
            // an even byte offset, hence the alignment check.
            const chars = new Uint16Array(bytes.buffer, bytes.byteOffset, str.length);
            for (let i = 0; i < str.length; i++) {
                chars[i] = str.charCodeAt(i);
            }
        } else {
            // Portable path: write both bytes explicitly. Used when the allocation is
            // not 2-byte aligned or the host is big-endian.
            for (let i = 0, pos = 0; i < str.length; i++) {
                const code = str.charCodeAt(i);
                bytes[pos++] = code & 0xFF;
                bytes[pos++] = code >> 8;
            }
        }
        return this.backend.bytesToResult(bytes, bytes.length);
    }

    /** Nothing is buffered between writes, so there is nothing to flush. */
    end() {}
}
34+
35+
/**
 * Streaming UTF-16LE decoder.
 *
 * Two pieces of state are carried between `write()` calls so that chunk
 * boundaries never split a character in the output:
 *  - `overflowByte`: the first (low) byte of a code unit whose second byte has
 *    not arrived yet, or -1 when there is none;
 *  - `prefixSurrogate`: a high surrogate that ended the previous chunk and is
 *    held back until the next chunk (or `end()`).
 *
 * NOTE: the bytes are reinterpreted through a Uint16Array, which uses host byte
 * order; this matches UTF-16LE on little-endian hosts.
 */
class Utf16LEDecoder {
    /**
     * @param backend Object providing `allocRawChars(count)` and `rawCharsToResult(chars, length)`.
     * @param {string} defaultChar Emitted by `end()` in place of a dangling odd byte.
     */
    constructor(backend, defaultChar) {
        this.backend = backend;
        this.defaultChar = defaultChar;
        this.overflowByte = -1;
        this.prefixSurrogate = undefined;
    }

    /**
     * Decodes one chunk of UTF-16LE bytes.
     *
     * @param buf Byte view exposing `length`, `byteOffset`, `buffer` and indexed access.
     * @returns {string} Decoded text, excluding any held-back trailing high surrogate.
     */
    write(buf) {
        if (buf.length === 0) {
            return '';
        }
        let byteOffset = buf.byteOffset;
        let byteLen = buf.length;

        // Complete the code unit that was split by the previous chunk boundary.
        let prefix = '';
        if (this.overflowByte !== -1) {
            byteOffset++; byteLen--;
            prefix = String.fromCharCode(this.overflowByte + (buf[0] << 8));
        }

        // An odd number of remaining bytes means the last byte starts a code unit
        // that will be completed by the next chunk.
        if (byteLen & 1) {
            this.overflowByte = buf[buf.length-1];
            byteLen--;
        } else {
            this.overflowByte = -1;
        }

        let chars;
        // Parentheses are required: `===` binds tighter than `&`, so the unparenthesized
        // form evaluates `byteOffset & (1 === 0)`, which is always 0.
        if ((byteOffset & 1) === 0) {
            // If byteOffset is aligned, just use the ArrayBuffer from input buf.
            chars = new Uint16Array(buf.buffer, byteOffset, byteLen >> 1);
        } else {
            // If byteOffset is NOT aligned, create a new aligned buffer and copy the data.
            chars = this.backend.allocRawChars(byteLen >> 1);
            const srcByteView = new Uint8Array(buf.buffer, byteOffset, byteLen);
            const destByteView = new Uint8Array(chars.buffer, chars.byteOffset, byteLen);
            destByteView.set(srcByteView);
        }

        let res = prefix + this.backend.rawCharsToResult(chars, chars.length);
        if (res) {
            // Add high surrogate from previous chunk.
            if (this.prefixSurrogate) {
                res = this.prefixSurrogate + res;
                this.prefixSurrogate = undefined;
            }

            // Slice off a new high surrogate at the end of the current chunk.
            const lastChar = res.charCodeAt(res.length-1);
            if (0xD800 <= lastChar && lastChar < 0xDC00) {
                this.prefixSurrogate = res[res.length-1];
                res = res.slice(0, -1);
            }
        }
        return res;
    }

    /**
     * Flushes pending state: a held-back high surrogate is emitted as is, and a
     * dangling odd byte is replaced with `defaultChar`.
     *
     * @returns {string|undefined} The flushed text, or undefined when nothing was pending.
     */
    end() {
        if (this.prefixSurrogate || this.overflowByte !== -1) {
            const res = (this.prefixSurrogate ? this.prefixSurrogate : '') + (this.overflowByte !== -1 ? this.defaultChar : '');
            this.prefixSurrogate = undefined;
            this.overflowByte = -1;
            return res;
        }
    }
}
104+
exports.ucs2 = "utf16le"; // Alias
105+
4106

5107
// == UTF16-BE codec. ==========================================================
6108

7109
exports.utf16be = class Utf16BECodec {
8-
get encoder() { return Utf16BEEncoder; }
9-
get decoder() { return Utf16BEDecoder; }
110+
createEncoder(options, iconv) {
111+
return new Utf16BEEncoder(iconv.backend);
112+
}
113+
createDecoder(options, iconv) {
114+
return new Utf16BEDecoder(iconv.backend, iconv.defaultCharUnicode);
115+
}
10116
get bomAware() { return true; }
11117
}
12118

13119
class Utf16BEEncoder {
14-
constructor(opts, codec, backend) {
120+
constructor(backend) {
15121
this.backend = backend;
16122
}
17123

@@ -30,30 +136,59 @@ class Utf16BEEncoder {
30136
}
31137

32138
class Utf16BEDecoder {
33-
constructor(opts, codec, backend) {
139+
constructor(backend, defaultChar) {
34140
this.backend = backend;
141+
this.defaultChar = defaultChar;
35142
this.overflowByte = -1;
143+
this.prefixSurrogate = undefined;
36144
}
37145

38146
write(buf) {
147+
if (buf.length === 0) {
148+
return '';
149+
}
150+
39151
const chars = this.backend.allocRawChars((buf.length+1) >> 1);
40152
let charsPos = 0, i = 0;
41153

42-
if (this.overflowByte !== -1 && i < buf.length) {
154+
if (this.overflowByte !== -1) {
43155
chars[charsPos++] = (this.overflowByte << 8) + buf[i++];
44156
}
45157

158+
// NOTE: we can win another 10% perf by using chars[i >> 1].
159+
// NOTE: the double-reverse method takes almost the same time.
46160
for (; i < buf.length-1; i += 2) {
47161
chars[charsPos++] = (buf[i] << 8) + buf[i+1];
48162
}
49163

50164
this.overflowByte = (i == buf.length-1) ? buf[i] : -1;
51165

52-
return this.backend.rawCharsToResult(chars, charsPos);
166+
let res = this.backend.rawCharsToResult(chars, charsPos);
167+
if (res) {
168+
// Add high surrogate from previous chunk.
169+
if (this.prefixSurrogate) {
170+
res = this.prefixSurrogate + res;
171+
this.prefixSurrogate = undefined;
172+
}
173+
174+
// Slice off a new high surrogate at the end of the current chunk.
175+
const lastChar = res.charCodeAt(res.length-1);
176+
if (0xD800 <= lastChar && lastChar < 0xDC00) {
177+
this.prefixSurrogate = res[res.length-1];
178+
res = res.slice(0, -1);
179+
}
180+
}
181+
return res;
182+
53183
}
54184

55185
end() {
56-
this.overflowByte = -1;
186+
if (this.prefixSurrogate || this.overflowByte !== -1) {
187+
const res = (this.prefixSurrogate ? this.prefixSurrogate : '') + (this.overflowByte !== -1 ? this.defaultChar : '');
188+
this.prefixSurrogate = undefined;
189+
this.overflowByte = -1;
190+
return res;
191+
}
57192
}
58193
}
59194

@@ -67,39 +202,25 @@ class Utf16BEDecoder {
67202
// Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false).
68203

69204
exports.utf16 = class Utf16Codec {
70-
constructor(opts, iconv) {
71-
this.iconv = iconv;
72-
}
73-
get encoder() { return Utf16Encoder; }
74-
get decoder() { return Utf16Decoder; }
75-
}
76-
77-
class Utf16Encoder {
78-
constructor(options, codec) {
205+
createEncoder(options, iconv) {
79206
options = options || {};
80207
if (options.addBOM === undefined)
81208
options.addBOM = true;
82-
this.encoder = codec.iconv.getEncoder(options.use || 'utf-16le', options);
209+
return iconv.getEncoder('utf-16le', options);
83210
}
84-
85-
// Pass-through to this.encoder
86-
write(str) {
87-
return this.encoder.write(str);
88-
}
89-
90-
end() {
91-
return this.encoder.end();
211+
createDecoder(options, iconv) {
212+
return new Utf16Decoder(options, iconv);
92213
}
93214
}
94215

95216
class Utf16Decoder {
96-
constructor(options, codec) {
217+
constructor(options, iconv) {
97218
this.decoder = null;
98219
this.initialBufs = [];
99220
this.initialBufsLen = 0;
100221

101222
this.options = options || {};
102-
this.iconv = codec.iconv;
223+
this.iconv = iconv;
103224
}
104225

105226
write(buf) {

lib/index.js

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,11 @@ iconv._canonicalizeEncoding = function(encoding) {
105105
}
106106

107107
iconv.getEncoder = function getEncoder(encoding, options) {
108-
var codec = iconv.getCodec(encoding),
109-
encoder = new codec.encoder(options, codec, iconv.backend);
108+
const codec = iconv.getCodec(encoding);
109+
110+
let encoder = codec.createEncoder
111+
? codec.createEncoder(options, iconv)
112+
: new codec.encoder(options, codec, iconv.backend);
110113

111114
if (codec.bomAware && options && options.addBOM)
112115
encoder = new bomHandling.PrependBOM(encoder, options);
@@ -115,8 +118,11 @@ iconv.getEncoder = function getEncoder(encoding, options) {
115118
}
116119

117120
iconv.getDecoder = function getDecoder(encoding, options) {
118-
var codec = iconv.getCodec(encoding),
119-
decoder = new codec.decoder(options, codec, iconv.backend);
121+
const codec = iconv.getCodec(encoding);
122+
123+
let decoder = codec.createDecoder
124+
? codec.createDecoder(options, iconv)
125+
: new codec.decoder(options, codec, iconv.backend);
120126

121127
if (codec.bomAware && !(options && options.stripBOM === false))
122128
decoder = new bomHandling.StripBOM(decoder, options);

test/streams-test.js

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -213,17 +213,7 @@ describe("Streaming mode", function() {
213213
encoding: "ucs2",
214214
input: [[0x3D], [0xD8, 0x3B], [0xDE]], // U+1F63B, 😻, SMILING CAT FACE WITH HEART-SHAPED EYES
215215
outputType: false, // Don't concat
216-
checkOutput: function(res) {
217-
if (semver.satisfies(process.version, '>= 6.2.1 < 10.0.0')) {
218-
// After a string_decoder rewrite in https://github.com/nodejs/node/pull/6777, which
219-
// was merged in Node v6.2.1, we don't merge chunks anymore.
220-
// Not really correct, but it seems we cannot do anything with it.
221-
// Though it has been fixed again in Node v10.0.0
222-
assert.deepEqual(res, ["\uD83D", "\uDE3B"]);
223-
} else {
224-
assert.deepEqual(res, ["\uD83D\uDE3B"]); // We should have only 1 chunk.
225-
}
226-
},
216+
checkOutput: function(res) { assert.deepEqual(res, ["\uD83D\uDE3B"]); }, // We should have only 1 chunk.
227217
}));
228218

229219
it("Encoding using internal modules: utf8", checkEncodeStream({
@@ -264,13 +254,13 @@ describe("Streaming mode", function() {
264254

265255
it("Decoding of uneven length buffers from UTF-16BE - 2", checkDecodeStream({
266256
encoding: "UTF-16BE",
267-
input: [[0x00, 0x61, 0x00], [0x62, 0x00, 0x63]],
257+
input: [[0x00, 0x61, 0x00], [0x62, 0x00], [0x63]],
268258
output: "abc"
269259
}));
270260

271261
it("Decoding of uneven length buffers from UTF-16", checkDecodeStream({
272262
encoding: "UTF-16",
273-
input: [[0x61], [0x0], [0x20], [0x0]],
263+
input: [[0x61], [0x0, 0x20], [0x0]],
274264
output: "a "
275265
}));
276266

0 commit comments

Comments
 (0)