未验证 提交 0dbbb581 编写于 作者: A Alexandru Dima 提交者: GitHub

Merge pull request #130211 from microsoft/dev/mjbvz/always-text-encoder

Remove strings.encodeUtf8
......@@ -4,13 +4,10 @@
*--------------------------------------------------------------------------------------------*/
import * as streams from 'vs/base/common/stream';
import * as strings from 'vs/base/common/strings';
// Declared as `any` so the (Node.js) `Buffer` global can be referenced in this module.
declare const Buffer: any;
// Feature flags, each computed once at module load with a `typeof` check on the global.
const hasBuffer = (typeof Buffer !== 'undefined');
const hasTextEncoder = (typeof TextEncoder !== 'undefined');
const hasTextDecoder = (typeof TextDecoder !== 'undefined');
// Shared encoder/decoder instances; the VSBuffer string conversions create them on first use.
let textEncoder: TextEncoder | null;
let textDecoder: TextDecoder | null;
......@@ -38,13 +35,11 @@ export class VSBuffer {
const dontUseNodeBuffer = options?.dontUseNodeBuffer || false;
if (!dontUseNodeBuffer && hasBuffer) {
return new VSBuffer(Buffer.from(source));
} else if (hasTextEncoder) {
} else {
if (!textEncoder) {
textEncoder = new TextEncoder();
}
return new VSBuffer(textEncoder.encode(source));
} else {
return new VSBuffer(strings.encodeUTF8(source));
}
}
......@@ -78,13 +73,11 @@ export class VSBuffer {
/**
 * Converts the wrapped bytes to a string.
 * When a global `Buffer` exists the underlying buffer's own `toString()` is used;
 * otherwise the bytes are decoded with a lazily created, shared `TextDecoder`.
 * NOTE(review): this listing appears to interleave both sides of a diff hunk (see the
 * consecutive `else` branches) - confirm the intended branch structure against the real file.
 */
toString(): string {
if (hasBuffer) {
return this.buffer.toString();
} else if (hasTextDecoder) {
} else {
if (!textDecoder) {
textDecoder = new TextDecoder();
}
return textDecoder.decode(this.buffer);
} else {
return strings.decodeUTF8(this.buffer);
}
}
......
......@@ -573,119 +573,6 @@ export function getCharContainingOffset(str: string, offset: number): [number, n
return _getCharContainingOffset(str, offset);
}
/**
 * Returns the Unicode scalar value that starts at UTF-16 code unit `offset` of `str`.
 * A well-formed surrogate pair is combined into a single supplementary code point.
 * A lone surrogate has no valid UTF-8 representation and is mapped to U+FFFD.
 */
function _utf8ScalarValueAt(str: string, offset: number): number {
	// `codePointAt` only yields `undefined` for out-of-range offsets, which are never passed here.
	const codePoint = str.codePointAt(offset) ?? 0xFFFD;
	const isLoneSurrogate = (codePoint >= 0xD800 && codePoint <= 0xDFFF);
	return isLoneSurrogate ? 0xFFFD : codePoint;
}

/**
 * A manual encoding of `str` to UTF8.
 * Use only in environments which do not offer native conversion methods!
 *
 * Unpaired surrogates are encoded as U+FFFD (EF BF BD) so that the output is always
 * well-formed UTF-8.
 *
 * @param str The string to encode.
 * @returns A freshly allocated array holding exactly the encoded bytes.
 */
export function encodeUTF8(str: string): Uint8Array {
	const strLen = str.length;
	// Code points from here on occupy two UTF-16 code units in `str`.
	const supplementaryPlaneBegin = 0x10000;

	// See https://en.wikipedia.org/wiki/UTF-8
	// first pass: establish the needed buffer size so the array is allocated only once
	let neededSize = 0;
	let strOffset = 0;
	while (strOffset < strLen) {
		const codePoint = _utf8ScalarValueAt(str, strOffset);
		strOffset += (codePoint >= supplementaryPlaneBegin ? 2 : 1);

		if (codePoint < 0x0080) {
			neededSize += 1;
		} else if (codePoint < 0x0800) {
			neededSize += 2;
		} else if (codePoint < 0x10000) {
			neededSize += 3;
		} else {
			neededSize += 4;
		}
	}

	// second pass: actually encode
	const arr = new Uint8Array(neededSize);
	let arrOffset = 0;
	strOffset = 0;
	while (strOffset < strLen) {
		const codePoint = _utf8ScalarValueAt(str, strOffset);
		strOffset += (codePoint >= supplementaryPlaneBegin ? 2 : 1);

		if (codePoint < 0x0080) {
			arr[arrOffset++] = codePoint;
		} else if (codePoint < 0x0800) {
			arr[arrOffset++] = 0b11000000 | (codePoint >>> 6);
			arr[arrOffset++] = 0b10000000 | (codePoint & 0b00111111);
		} else if (codePoint < 0x10000) {
			arr[arrOffset++] = 0b11100000 | (codePoint >>> 12);
			arr[arrOffset++] = 0b10000000 | ((codePoint >>> 6) & 0b00111111);
			arr[arrOffset++] = 0b10000000 | (codePoint & 0b00111111);
		} else {
			arr[arrOffset++] = 0b11110000 | (codePoint >>> 18);
			arr[arrOffset++] = 0b10000000 | ((codePoint >>> 12) & 0b00111111);
			arr[arrOffset++] = 0b10000000 | ((codePoint >>> 6) & 0b00111111);
			arr[arrOffset++] = 0b10000000 | (codePoint & 0b00111111);
		}
	}

	return arr;
}
/**
 * A manual decoding of a UTF8 string.
 * Use only in environments which do not offer native conversion methods!
 *
 * Malformed input (stray continuation bytes, overlong forms, encoded surrogates,
 * values above U+10FFFF, invalid lead bytes, truncated sequences) never throws;
 * each malformed sequence is replaced with U+FFFD and decoding resumes at the
 * first byte that could start a new sequence.
 *
 * @param buffer The UTF-8 encoded bytes.
 * @returns The decoded string.
 */
export function decodeUTF8(buffer: Uint8Array): string {
	// https://en.wikipedia.org/wiki/UTF-8
	// https://encoding.spec.whatwg.org/#utf-8-decoder
	const replacementChar = '\uFFFD';
	const len = buffer.byteLength;
	const result: string[] = [];

	let codePoint = 0;
	// Number of continuation bytes still expected for the sequence in progress.
	let bytesNeeded = 0;
	// Allowed range for the next continuation byte. It is narrowed right after certain
	// lead bytes to rule out overlong forms, surrogates and values above U+10FFFF.
	let lowerBoundary = 0x80;
	let upperBoundary = 0xBF;

	let offset = 0;
	while (offset < len) {
		const byte = buffer[offset];

		if (bytesNeeded === 0) {
			offset++;
			if (byte <= 0x7F) {
				// 1 byte
				result.push(String.fromCharCode(byte));
			} else if (byte >= 0xC2 && byte <= 0xDF) {
				// 2 bytes (0xC0 and 0xC1 could only start overlong forms)
				bytesNeeded = 1;
				codePoint = byte & 0b00011111;
			} else if (byte >= 0xE0 && byte <= 0xEF) {
				// 3 bytes
				if (byte === 0xE0) {
					lowerBoundary = 0xA0; // excludes overlong forms
				} else if (byte === 0xED) {
					upperBoundary = 0x9F; // excludes U+D800..U+DFFF
				}
				bytesNeeded = 2;
				codePoint = byte & 0b00001111;
			} else if (byte >= 0xF0 && byte <= 0xF4) {
				// 4 bytes
				if (byte === 0xF0) {
					lowerBoundary = 0x90; // excludes overlong forms
				} else if (byte === 0xF4) {
					upperBoundary = 0x8F; // excludes values above U+10FFFF
				}
				bytesNeeded = 3;
				codePoint = byte & 0b00000111;
			} else {
				// stray continuation byte or invalid lead byte
				result.push(replacementChar);
			}
			continue;
		}

		if (byte < lowerBoundary || byte > upperBoundary) {
			// The sequence in progress is malformed. `offset` is deliberately not advanced
			// so that this byte is examined again as the potential start of a new sequence.
			result.push(replacementChar);
			bytesNeeded = 0;
			lowerBoundary = 0x80;
			upperBoundary = 0xBF;
			continue;
		}

		offset++;
		lowerBoundary = 0x80;
		upperBoundary = 0xBF;
		codePoint = (codePoint << 6) | (byte & 0b00111111);
		bytesNeeded--;
		if (bytesNeeded === 0) {
			result.push(String.fromCodePoint(codePoint));
		}
	}

	if (bytesNeeded !== 0) {
		// input ended in the middle of a multi-byte sequence
		result.push(replacementChar);
	}

	return result.join('');
}
/**
* Generated using https://github.com/alexdima/unicode-utils/blob/master/generate-rtl-test.js
*/
......
......@@ -396,40 +396,6 @@ suite('Strings', () => {
assert.strictEqual(strings.getNLines('foo', 0), '');
});
// Round-trip checks for the manual UTF-8 encoder/decoder at the 1/2/3/4-byte boundaries.
test('encodeUTF8', function () {
	// Asserts that encoding `str` yields exactly the bytes in `expected`.
	function assertEncodeUTF8(str: string, expected: number[]): void {
		const actual = strings.encodeUTF8(str);
		// copy into a plain array so that deepStrictEqual compares like with like
		const actualArr: number[] = Array.from(actual);
		assert.deepStrictEqual(actualArr, expected);
	}

	// Asserts that decoding the bytes in `data` yields exactly `expected`.
	function assertDecodeUTF8(data: number[], expected: string): void {
		const actual = strings.decodeUTF8(new Uint8Array(data));
		assert.deepStrictEqual(actual, expected);
	}

	// Asserts both directions for a string and its UTF-8 byte sequence.
	function assertEncodeDecodeUTF8(str: string, buff: number[]): void {
		assertEncodeUTF8(str, buff);
		assertDecodeUTF8(buff, str);
	}

	assertEncodeDecodeUTF8('\u0000', [0]);
	assertEncodeDecodeUTF8('!', [33]);
	assertEncodeDecodeUTF8('\u007F', [127]);
	assertEncodeDecodeUTF8('\u0080', [194, 128]);
	assertEncodeDecodeUTF8('Ɲ', [198, 157]);
	assertEncodeDecodeUTF8('\u07FF', [223, 191]);
	assertEncodeDecodeUTF8('\u0800', [224, 160, 128]);
	// U+0B82 is written as an escape: as a literal it is invisible and easily lost.
	assertEncodeDecodeUTF8('\u0B82', [224, 174, 130]);
	assertEncodeDecodeUTF8('\uffff', [239, 191, 191]);
	// '\u10000' is the two characters U+1000 and '0' (hence the trailing byte 48).
	assertEncodeDecodeUTF8('\u10000', [225, 128, 128, 48]);
	assertEncodeDecodeUTF8('🧝', [240, 159, 167, 157]);
});
// Checks the grapheme break classification reported for code point 0xBC1.
test('getGraphemeBreakType', () => {
	const actualBreakType = strings.getGraphemeBreakType(0xBC1);
	const expectedBreakType = strings.GraphemeBreakType.SpacingMark;
	assert.strictEqual(actualBreakType, expectedBreakType);
});
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册