Skip to content

Commit e36f068

Browse files
committed
implement utf8CountWasm()
1 parent 7694547 commit e36f068

File tree

5 files changed

+86
-14
lines changed

5 files changed

+86
-14
lines changed

assembly/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@
33
// memory is assumed:
44
// [input][output]
55
export { utf8DecodeToUint16Array } from "./utf8DecodeToUint16Array";
6+
export { utf8CountUint16Array } from "./utf8CountUint16Array";
67
export { malloc, free } from "./memory";

assembly/utf8CountUint16Array.ts

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/**
 * Counts how many bytes are needed to encode a UTF-16 code unit sequence as UTF-8.
 *
 * @param inputPtr address in linear memory of the first u16 code unit
 * @param strLength number of u16 code units (not bytes) to read
 * @returns the UTF-8 byte length; a high surrogate that is not followed by a
 *          low surrogate contributes nothing to the count
 */
export function utf8CountUint16Array(inputPtr: usize, strLength: usize): usize {
  const u16s = sizeof<u16>();

  let byteLength: usize = 0;
  let pos: usize = inputPtr;
  // `pos` and `end` are byte addresses, whereas `strLength` is a unit count.
  const end: usize = inputPtr + strLength * u16s;
  while (pos < end) {
    // Widened beyond u16: a combined surrogate pair is >= 0x10000 and does
    // not fit into 16 bits.
    let value: usize = load<u16>(pos);
    pos += u16s;

    if (value >= 0xd800 && value <= 0xdbff) {
      // high surrogate: look ahead for the matching low surrogate.
      // The bound must be `end` (an address); comparing the address with the
      // unit count would skip the look-ahead and miscount every pair.
      if (pos < end) {
        const extra: usize = load<u16>(pos);
        if ((extra & 0xfc00) === 0xdc00) {
          pos += u16s;
          value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
        }
      }
      if (value >= 0xd800 && value <= 0xdbff) {
        continue; // drop lone surrogate
      }
    }

    if ((value & 0xffffff80) === 0) {
      // 1-byte
      byteLength += 1;
    } else if ((value & 0xfffff800) === 0) {
      // 2-bytes
      byteLength += 2;
    } else if ((value & 0xffff0000) === 0) {
      // 3-byte
      byteLength += 3;
    } else if ((value & 0xffe00000) === 0) {
      // 4-byte
      byteLength += 4;
    } else {
      unreachable();
    }
  }
  return byteLength;
}

src/Encoder.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,11 @@ export class Encoder {
118118
}
119119

120120
encodeString(object: string) {
121-
const byteLength = utf8Count(object);
121+
const units = new Uint16Array(object.length);
122+
for (let i = 0; i < object.length; i++) {
123+
units[i] = object.charCodeAt(i);
124+
}
125+
const byteLength = utf8Count(units);
122126
if (byteLength < 32) {
123127
// fixstr
124128
this.writeU8(0xa0 + byteLength);
@@ -139,7 +143,7 @@ export class Encoder {
139143
}
140144

141145
this.ensureBufferSizeToWrite(byteLength);
142-
utf8Encode(object, this.view, this.pos);
146+
utf8Encode(units, this.view, this.pos);
143147
this.pos += byteLength;
144148
}
145149

src/utils/utf8.ts

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,22 @@
11
import { prettyByte } from "./prettyByte";
2-
import { WASM_AVAILABLE, WASM_DEBUG, utf8DecodeWasm } from "../wasmFunctions";
2+
import { WASM_AVAILABLE, WASM_DEBUG, utf8DecodeWasm, utf8CountWasm } from "../wasmFunctions";
33

44
const WASM_THRESHOLD = WASM_DEBUG ? 0 : 0x100;
55

6-
export function utf8Count(str: string): number {
6+
export function utf8Count(str: Uint16Array): number {
77
const strLength = str.length;
8+
if (WASM_AVAILABLE && strLength > WASM_THRESHOLD) {
9+
return utf8CountWasm(str);
10+
}
811

912
let byteLength = 0;
1013
let pos = 0;
1114
while (pos < strLength) {
12-
let value = str.charCodeAt(pos++);
15+
let value = str[pos++];
1316
if (value >= 0xd800 && value <= 0xdbff) {
1417
// high surrogate
1518
if (pos < strLength) {
16-
const extra = str.charCodeAt(pos);
19+
const extra = str[pos];
1720
if ((extra & 0xfc00) === 0xdc00) {
1821
++pos;
1922
value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;
@@ -44,16 +47,16 @@ export function utf8Count(str: string): number {
4447
return byteLength;
4548
}
4649

47-
export function utf8Encode(str: string, view: DataView, offset: number): void {
50+
export function utf8Encode(str: Uint16Array, view: DataView, offset: number): void {
4851
const strLength = str.length;
4952

5053
let pos = 0;
5154
while (pos < strLength) {
52-
let value = str.charCodeAt(pos++);
55+
let value = str[pos++];
5356
if (value >= 0xd800 && value <= 0xdbff) {
5457
// high surrogate
5558
if (pos < strLength) {
56-
const extra = str.charCodeAt(pos);
59+
const extra = str[pos];
5760
if ((extra & 0xfc00) === 0xdc00) {
5861
++pos;
5962
value = ((value & 0x3ff) << 10) + (extra & 0x3ff) + 0x10000;

src/wasmFunctions.ts

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,14 @@
33
// TODO: Use TypeScript built-in type
44
declare const WebAssembly: any;
55

6-
export const WASM_DEBUG = process.env.WASM_DEBUG === "true";
6+
const NO_WASM = process.env.NO_WASM === "true" || process.env.MSGPACK_NO_WASM === "true";
7+
export const WASM_DEBUG = process.env.WASM_DEBUG === "true" || process.env.MSGPACK_WASM_DEBUG === "true";
78

89
let { wasmModule } = (() => {
10+
if (NO_WASM) {
11+
return {};
12+
}
13+
914
try {
1015
if (WASM_DEBUG) {
1116
return require("../dist/wasm/untouched.wasm.js");
@@ -20,7 +25,10 @@ let { wasmModule } = (() => {
2025
}
2126
})();
2227

28+
export const WASM_AVAILABLE = !!wasmModule;
29+
2330
/**
 * Abort hook handed to the wasm module: always throws, reporting the
 * location values it was called with.
 *
 * @throws Error carrying `filename:line:column` in its message
 */
function abort(filename: number, line: number, column: number): void {
  // FIXME: filename is just a number (pointer?)
  const message = "abort called at " + filename + ":" + line + ":" + column;
  throw new Error(message);
}
2634

@@ -32,15 +40,28 @@ const defaultWasmInstance =
3240
},
3341
});
3442

35-
export const WASM_AVAILABLE = !!wasmModule && process.env.NO_WASM !== "true";
36-
3743
type pointer = number;
3844

39-
function setMemory(wasm: any, destPtr: pointer, src: Uint8Array, size: number) {
45+
/**
 * Copies `src` into the wasm instance's memory buffer.
 *
 * @param wasm instance whose `exports.memory.buffer` is written to
 * @param destPtr byte offset in the buffer where the copy starts
 * @param src bytes to copy
 * @param size length in bytes of the destination window; `src` must fit in it
 */
function setMemoryU8(wasm: any, destPtr: pointer, src: Uint8Array, size: number) {
  new Uint8Array(wasm.exports.memory.buffer, destPtr, size).set(src);
}
49+
/**
 * Copies 16-bit units from `src` into the wasm instance's memory buffer.
 *
 * @param wasm instance whose `exports.memory.buffer` is written to
 * @param destPtr byte offset in the buffer where the copy starts
 * @param src 16-bit units to copy
 * @param size length of the destination window, counted in 16-bit units
 */
function setMemoryU16(wasm: any, destPtr: pointer, src: Uint16Array, size: number) {
  new Uint16Array(wasm.exports.memory.buffer, destPtr, size).set(src);
}
53+
54+
/**
 * Wrapper around the wasm export `utf8CountUint16Array()`: copies `units`
 * into a buffer obtained from the instance's `malloc`, runs the count, and
 * releases the buffer again.
 *
 * @param units UTF-16 code units to measure
 * @param wasm instance to run on; defaults to the module-level instance
 * @returns whatever the wasm export returns for the copied units
 */
export function utf8CountWasm(units: Uint16Array, wasm = defaultWasmInstance): number {
  const unitCount = units.length;
  const inputPtr: pointer = wasm.exports.malloc(units.byteLength);
  try {
    setMemoryU16(wasm, inputPtr, units, unitCount);
    const byteLength: number = wasm.exports.utf8CountUint16Array(inputPtr, unitCount);
    return byteLength;
  } finally {
    // Runs on both the normal and the throwing path, so the buffer is always freed.
    wasm.exports.free(inputPtr);
  }
}
4363

64+
// A wrapper function for utf8DecodeToUint16Array()
4465
export function utf8DecodeWasm(
4566
bytes: Uint8Array,
4667
offset: number,
@@ -51,7 +72,7 @@ export function utf8DecodeWasm(
5172
// in worst case, the UTF-16 array uses the same as byteLength * 2
5273
const outputPtr: pointer = wasm.exports.malloc(byteLength * 2);
5374
try {
54-
setMemory(wasm, inputPtr, bytes.subarray(offset, offset + byteLength), byteLength);
75+
setMemoryU8(wasm, inputPtr, bytes.subarray(offset, offset + byteLength), byteLength);
5576

5677
const outputArraySize = wasm.exports.utf8DecodeToUint16Array(outputPtr, inputPtr, byteLength);
5778
const codepoints = new Uint16Array(wasm.exports.memory.buffer, outputPtr, outputArraySize);

0 commit comments

Comments
 (0)