Skip to content

Commit 4bea821

Browse files
ChALkeR authored and aduh95 committed
lib: use utf8 fast path for streaming TextDecoder
PR-URL: #61549
Reviewed-By: Yagiz Nizipli <[email protected]>
Reviewed-By: Gürgün Dayıoğlu <[email protected]>
Reviewed-By: Colin Ihrig <[email protected]>
1 parent 7e3eab5 commit 4bea821

File tree

4 files changed

+138
-47
lines changed

4 files changed

+138
-47
lines changed

lib/internal/encoding.js

Lines changed: 53 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,13 @@ const kFlags = Symbol('flags');
2929
const kEncoding = Symbol('encoding');
3030
const kDecoder = Symbol('decoder');
3131
const kEncoder = Symbol('encoder');
32+
const kChunk = Symbol('chunk');
3233
const kFatal = Symbol('kFatal');
3334
const kUTF8FastPath = Symbol('kUTF8FastPath');
3435
const kIgnoreBOM = Symbol('kIgnoreBOM');
3536

3637
const { isSinglebyteEncoding, createSinglebyteDecoder } = require('internal/encoding/single-byte');
38+
const { unfinishedBytesUtf8, mergePrefixUtf8 } = require('internal/encoding/util');
3739

3840
const {
3941
getConstructorOf,
@@ -448,9 +450,11 @@ class TextDecoder {
448450
this[kUTF8FastPath] = false;
449451
this[kHandle] = undefined;
450452
this[kSingleByte] = undefined; // Does not care about streaming or BOM
453+
this[kChunk] = null; // A copy of previous streaming tail or null
451454

452455
if (enc === 'utf-8') {
453456
this[kUTF8FastPath] = true;
457+
this[kBOMSeen] = false;
454458
} else if (isSinglebyteEncoding(enc)) {
455459
this[kSingleByte] = createSinglebyteDecoder(enc, this[kFatal]);
456460
} else {
@@ -459,15 +463,14 @@ class TextDecoder {
459463
}
460464

461465
#prepareConverter() {
462-
if (this[kHandle] !== undefined) return;
463466
if (hasIntl) {
464467
let icuEncoding = this[kEncoding];
465468
if (icuEncoding === 'gbk') icuEncoding = 'gb18030'; // 10.1.1. GBK's decoder is gb18030's decoder
466469
const handle = icuGetConverter(icuEncoding, this[kFlags]);
467470
if (handle === undefined)
468471
throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]);
469472
this[kHandle] = handle;
470-
} else if (this[kEncoding] === 'utf-8' || this[kEncoding] === 'utf-16le') {
473+
} else if (this[kEncoding] === 'utf-16le') {
471474
if (this[kFatal]) throw new ERR_NO_ICU('"fatal" option');
472475
this[kHandle] = new (lazyStringDecoder())(this[kEncoding]);
473476
this[kBOMSeen] = false;
@@ -484,11 +487,55 @@ class TextDecoder {
484487

485488
const stream = options?.stream;
486489
if (this[kUTF8FastPath]) {
487-
if (!stream) return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
488-
this[kUTF8FastPath] = false;
489-
}
490+
const chunk = this[kChunk];
491+
const ignoreBom = this[kIgnoreBOM] || this[kBOMSeen];
492+
if (!stream) {
493+
this[kBOMSeen] = false;
494+
if (!chunk) return decodeUTF8(input, ignoreBom, this[kFatal]);
495+
}
496+
497+
let u = parseInput(input);
498+
if (u.length === 0 && stream) return ''; // no state change
499+
let prefix;
500+
if (chunk) {
501+
const merged = mergePrefixUtf8(u, this[kChunk]);
502+
if (u.length < 3) {
503+
u = merged; // Might be unfinished, but fully consumed old u
504+
} else {
505+
prefix = merged; // Stops at complete chunk
506+
const add = prefix.length - this[kChunk].length;
507+
if (add > 0) u = u.subarray(add);
508+
}
509+
510+
this[kChunk] = null;
511+
}
490512

491-
this.#prepareConverter();
513+
if (stream) {
514+
const trail = unfinishedBytesUtf8(u, u.length);
515+
if (trail > 0) {
516+
this[kChunk] = new FastBuffer(u.subarray(-trail)); // copy
517+
if (!prefix && trail === u.length) return ''; // No further state change
518+
u = u.subarray(0, -trail);
519+
}
520+
}
521+
522+
try {
523+
const res = (prefix ? decodeUTF8(prefix, ignoreBom, this[kFatal]) : '') +
524+
decodeUTF8(u, ignoreBom || prefix, this[kFatal]);
525+
526+
// "BOM seen" is set on the current decode call only if it did not error,
527+
// in "serialize I/O queue" after decoding
528+
// We don't get here if we had no complete data to process,
529+
// and we don't want BOM processing after that if streaming
530+
if (stream) this[kBOMSeen] = true;
531+
532+
return res;
533+
} catch (e) {
534+
this[kChunk] = null; // Reset unfinished chunk on errors
535+
// The correct way per spec seems to be not destroying the decoder state (aka BOM here) in stream mode
536+
throw e;
537+
}
538+
}
492539

493540
if (hasIntl) {
494541
const flags = stream ? 0 : CONVERTER_FLAGS_FLUSH;

lib/internal/encoding/util.js

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// From https://npmjs.com/package/@exodus/bytes
2+
// Copyright Exodus Movement. Licensed under MIT License.
3+
4+
'use strict';
5+
6+
const {
7+
Uint8Array,
8+
} = primordials;
9+
10+
11+
/**
 * Count how many bytes at the end of `data[0..len)` start a UTF-8 sequence
 * that is valid so far but still needs more bytes to form a codepoint.
 *
 * Only the last three bytes before `len` are inspected. A result of 0 means
 * the bytes before `len` do not end in such an unfinished sequence: the tail
 * is either complete or already invalid.
 * @param {Uint8Array} data Uint8Array of potentially UTF-8 bytes
 * @param {number} len Position to look behind from. A position outside of
 *   `data` yields 0.
 * @returns {number} Number (0-3) of unfinished, potentially valid UTF-8 bytes
 *   ending at position `len`
 */
function unfinishedBytesUtf8(data, len) {
  // Step back over at most two continuation bytes (10xxxxxx).
  let pos = 0;
  while (pos < 2 && pos < len && (data[len - pos - 1] & 0xc0) === 0x80) pos++;
  if (pos === len) return 0; // Only continuation bytes, no room for a lead

  const lead = data[len - pos - 1];
  // Positive range check on purpose: when `len` points outside of `data` the
  // lead is `undefined`, and `lead < 0xc2 || lead > 0xf4` would be false for
  // it, wrongly reporting one unfinished byte.
  if (!(lead >= 0xc2 && lead <= 0xf4)) return 0; // Not a lead byte
  if (pos === 0) return 1; // A bare lead byte; nothing further to validate

  // A 2-byte lead followed by a continuation byte, or a 3-byte lead followed
  // by two of them, is no longer "unfinished".
  if (lead < 0xe0 || (lead < 0xf0 && pos >= 2)) return 0;

  // The byte right after the lead has a narrower allowed range for some leads.
  const lower = lead === 0xf0 ? 0x90 : lead === 0xe0 ? 0xa0 : 0x80;
  const upper = lead === 0xf4 ? 0x8f : lead === 0xed ? 0x9f : 0xbf;
  const next = data[len - pos];
  return next >= lower && next <= upper ? pos + 1 : 0;
}
32+
33+
/**
 * Combine a pending prefix `chunk` with the beginning of `data`.
 *
 * With fewer than 3 bytes of `data`, all of `data` is appended and the result
 * may still end in an unfinished sequence. Otherwise only as many leading
 * bytes of `data` are taken as needed for the result to end without
 * unfinished bytes.
 * @param {Uint8Array} data Uint8Array of potentially UTF-8 bytes
 * @param {Uint8Array} chunk Prefix to prepend before `data`
 * @returns {Uint8Array} `chunk` itself when nothing is taken from `data`;
 *   concat(chunk, data) when data.length < 3; otherwise `chunk` followed by
 *   the first 1-3 bytes of `data`.
 */
function mergePrefixUtf8(data, chunk) {
  const dataLength = data.length;
  const chunkLength = chunk.length;

  if (dataLength === 0) return chunk;

  if (dataLength < 3) {
    // Too short to be worth probing offsets, and the sequence might not even
    // be finished by these bytes: plain concatenation.
    const joined = new Uint8Array(dataLength + chunkLength);
    joined.set(chunk);
    joined.set(data, chunkLength);
    return joined;
  }

  // Scratch buffer holding the prefix plus the first three bytes of data, so
  // the boundary can be decoded separately without copying all of data.
  const scratch = new Uint8Array(chunkLength + 3);
  scratch.set(chunk);
  scratch.set(data.subarray(0, 3), chunkLength);

  // Take 1, 2, then 3 bytes from data; stop at the first count where the
  // unfinished tail is empty or lies entirely within the bytes just taken.
  let taken = 1;
  while (taken <= 3) {
    const pending = unfinishedBytesUtf8(scratch, chunkLength + taken); // 0-3
    const consumed = taken - pending;
    if (consumed >= 0) {
      // Always satisfied by taken === 3, since pending never exceeds 3.
      return consumed === 0 ? chunk : scratch.subarray(0, chunkLength + consumed);
    }
    taken++;
  }

  // Unreachable
  return null;
}
71+
72+
module.exports = { unfinishedBytesUtf8, mergePrefixUtf8 };

test/parallel/test-bootstrap-modules.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ expected.beforePreExec = new Set([
9090
'Internal Binding fs',
9191
'NativeModule internal/encoding',
9292
'NativeModule internal/encoding/single-byte',
93+
'NativeModule internal/encoding/util',
9394
'NativeModule internal/blob',
9495
'NativeModule internal/fs/utils',
9596
'NativeModule fs',

test/parallel/test-whatwg-encoding-custom-textdecoder.js

Lines changed: 12 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -80,20 +80,8 @@ assert(TextDecoder);
8080

8181
['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => {
8282
const dec = new TextDecoder(i, { fatal: true });
83-
if (common.hasIntl) {
84-
dec.decode(buf.slice(0, 8), { stream: true });
85-
dec.decode(buf.slice(8));
86-
} else {
87-
assert.throws(
88-
() => {
89-
dec.decode(buf.slice(0, 8), { stream: true });
90-
},
91-
{
92-
code: 'ERR_NO_ICU',
93-
name: 'TypeError',
94-
message: '"fatal" option is not supported on Node.js compiled without ICU'
95-
});
96-
}
83+
dec.decode(buf.slice(0, 8), { stream: true });
84+
dec.decode(buf.slice(8));
9785
});
9886

9987
// Test TextDecoder, label undefined, options null
@@ -122,33 +110,16 @@ if (common.hasIntl) {
122110
// Test TextDecoder inspect with hidden fields
123111
{
124112
const dec = new TextDecoder('utf-8', { ignoreBOM: true });
125-
if (common.hasIntl) {
126-
assert.strictEqual(
127-
util.inspect(dec, { showHidden: true }),
128-
'TextDecoder {\n' +
129-
' encoding: \'utf-8\',\n' +
130-
' fatal: false,\n' +
131-
' ignoreBOM: true,\n' +
132-
' Symbol(flags): 4,\n' +
133-
' Symbol(handle): undefined\n' +
134-
'}'
135-
);
136-
} else {
137-
dec.decode(Uint8Array.of(0), { stream: true });
138-
assert.strictEqual(
139-
util.inspect(dec, { showHidden: true }),
140-
'TextDecoder {\n' +
141-
" encoding: 'utf-8',\n" +
142-
' fatal: false,\n' +
143-
' ignoreBOM: true,\n' +
144-
' Symbol(flags): 4,\n' +
145-
' Symbol(handle): StringDecoder {\n' +
146-
" encoding: 'utf8',\n" +
147-
' Symbol(kNativeDecoder): <Buffer 00 00 00 00 00 00 01>\n' +
148-
' }\n' +
149-
'}'
150-
);
151-
}
113+
assert.strictEqual(
114+
util.inspect(dec, { showHidden: true }),
115+
'TextDecoder {\n' +
116+
' encoding: \'utf-8\',\n' +
117+
' fatal: false,\n' +
118+
' ignoreBOM: true,\n' +
119+
' Symbol(flags): 4,\n' +
120+
' Symbol(handle): undefined\n' +
121+
'}'
122+
);
152123
}
153124

154125

0 commit comments

Comments
 (0)