提交 f3b860d4 编写于 作者: B Benjamin Pasero

debt - merge node/mime.ts into node/encoding.ts

上级 351221b1
......@@ -172,6 +172,100 @@ export function toCanonicalName(enc: string): string {
}
}
/** Number of bytes to look at to decide about a file being binary or not. */
const ZERO_BYTE_DETECTION_BUFFER_MAX_LEN = 512;

/** When not auto guessing the encoding, a small number of bytes is enough. */
const NO_GUESS_BUFFER_MAX_LEN = 512;

/** With auto guessing we want a lot more content to be read for guessing. */
const AUTO_GUESS_BUFFER_MAX_LEN = 8 * 512;
/**
 * Returns how many bytes should be read from a file for encoding detection.
 *
 * @param arg1 either the auto-guess flag itself or an options object that
 * carries it in `autoGuessEncoding`; may be omitted.
 * @returns `AUTO_GUESS_BUFFER_MAX_LEN` when auto guessing is requested,
 * `NO_GUESS_BUFFER_MAX_LEN` otherwise.
 */
export function maxEncodingDetectionBufferLen(arg1?: DetectEncodingOption | boolean): number {
	// A plain boolean is the flag itself, anything else is treated as an options bag
	const wantsAutoGuess = typeof arg1 === 'boolean' ? arg1 : arg1 && arg1.autoGuessEncoding;

	if (wantsAutoGuess) {
		return AUTO_GUESS_BUFFER_MAX_LEN;
	}

	return NO_GUESS_BUFFER_MAX_LEN;
}
/**
 * Result of inspecting the leading bytes of a file, as returned by
 * `detectEncodingFromBuffer`.
 */
export interface IDetectedEncodingResult {
// Encoding derived from a BOM, from the UTF-16 zero-byte heuristic or from
// guessing. NOTE(review): typed as `string`, but the detection code checks
// `!encoding`, so callers should expect it to be unset when nothing matched.
encoding: string;
// `true` when zero bytes were found that fit neither the UTF-16 LE nor the
// UTF-16 BE layout.
seemsBinary: boolean;
}
/**
 * Options bag accepted by `maxEncodingDetectionBufferLen`.
 */
export interface DetectEncodingOption {
// When truthy, the larger auto-guess buffer length is used.
autoGuessEncoding?: boolean;
}
/**
 * Detects the encoding of a file from its leading bytes and decides whether
 * the content seems to be binary.
 *
 * Detection order:
 * 1. A BOM, if present, decides the encoding.
 * 2. Unless the BOM already says UTF-16, the first bytes are scanned for
 *    0-bytes. 0-bytes that consistently sit on odd (LE) or even (BE) indexes
 *    are taken as UTF-16 without BOM; any other 0-byte marks the content as
 *    binary.
 * 3. If auto guessing is requested and neither step produced an encoding (and
 *    the content is not binary), the encoding is guessed from the content.
 *
 * @param readResult the buffer to inspect and the number of bytes of it to consider.
 * @param autoGuessEncoding whether the encoding should be guessed from the
 * content when it could not be derived otherwise.
 * @returns the result itself when `autoGuessEncoding` is falsy, otherwise a
 * promise of the result. The promise is returned for every truthy
 * `autoGuessEncoding`, also when no guessing was necessary, so that the
 * runtime value matches the declared overload.
 */
export function detectEncodingFromBuffer(readResult: stream.ReadResult, autoGuessEncoding?: false): IDetectedEncodingResult;
export function detectEncodingFromBuffer(readResult: stream.ReadResult, autoGuessEncoding?: boolean): TPromise<IDetectedEncodingResult>;
export function detectEncodingFromBuffer({ buffer, bytesRead }: stream.ReadResult, autoGuessEncoding?: boolean): TPromise<IDetectedEncodingResult> | IDetectedEncodingResult {

	// Always first check for BOM to find out about encoding
	let encoding = detectEncodingByBOMFromBuffer(buffer, bytesRead);

	// Detect 0 bytes to see if file is binary or UTF-16 LE/BE
	// unless we already know that this file has a UTF-16 encoding
	let seemsBinary = false;
	if (encoding !== UTF16be && encoding !== UTF16le) {
		let couldBeUTF16LE = true; // e.g. 0xAA 0x00
		let couldBeUTF16BE = true; // e.g. 0x00 0xAA
		let containsZeroByte = false;

		// This is a simplified guess to detect UTF-16 BE or LE by just checking if
		// the first 512 bytes have the 0-byte at a specific location. For UTF-16 LE
		// this would be the odd byte index and for UTF-16 BE the even one.
		// Note: this can produce false positives (a binary file that uses a 2-byte
		// encoding of the same format as UTF-16) and false negatives (a UTF-16 file
		// that is using 4 bytes to encode a character).
		for (let i = 0; i < bytesRead && i < ZERO_BYTE_DETECTION_BUFFER_MAX_LEN; i++) {
			const isEndian = (i % 2 === 1); // assume 2-byte sequences typical for UTF-16
			const isZeroByte = (buffer.readInt8(i) === 0);

			if (isZeroByte) {
				containsZeroByte = true;
			}

			// UTF-16 LE: expect e.g. 0xAA 0x00
			if (couldBeUTF16LE && (isEndian && !isZeroByte || !isEndian && isZeroByte)) {
				couldBeUTF16LE = false;
			}

			// UTF-16 BE: expect e.g. 0x00 0xAA
			if (couldBeUTF16BE && (isEndian && isZeroByte || !isEndian && !isZeroByte)) {
				couldBeUTF16BE = false;
			}

			// Stop scanning if this is neither UTF16-LE nor UTF16-BE and thus treat as binary
			if (isZeroByte && !couldBeUTF16LE && !couldBeUTF16BE) {
				break;
			}
		}

		// Handle case of 0-byte included
		if (containsZeroByte) {
			if (couldBeUTF16LE) {
				encoding = UTF16le;
			} else if (couldBeUTF16BE) {
				encoding = UTF16be;
			} else {
				seemsBinary = true;
			}
		}
	}

	// Auto guess encoding if configured
	if (autoGuessEncoding && !seemsBinary && !encoding) {
		return guessEncodingByBuffer(buffer.slice(0, bytesRead)).then(guessedEncoding => {
			return {
				seemsBinary: false,
				encoding: guessedEncoding
			};
		});
	}

	const detected: IDetectedEncodingResult = { seemsBinary, encoding };

	// Callers that asked for auto guessing are typed to receive a promise, so
	// hand them one even when the result was available synchronously.
	return autoGuessEncoding ? TPromise.as(detected) : detected;
}
// https://ss64.com/nt/chcp.html
const windowsTerminalEncodings = {
'437': 'cp437', // United States
......@@ -256,4 +350,4 @@ export function resolveTerminalEncoding(verbose?: boolean): TPromise<string> {
return UTF8;
});
}
}
\ No newline at end of file
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
'use strict';
import * as mime from 'vs/base/common/mime';
import { TPromise } from 'vs/base/common/winjs.base';
import * as stream from 'vs/base/node/stream';
import * as encoding from 'vs/base/node/encoding';
/** Number of bytes to look at to decide about a file being binary or not. */
const ZERO_BYTE_DETECTION_BUFFER_MAX_LEN = 512;

/** When not auto guessing the encoding, a small number of bytes is enough. */
const NO_GUESS_BUFFER_MAX_LEN = 512;

/** With auto guessing we want a lot more content to be read for guessing. */
const AUTO_GUESS_BUFFER_MAX_LEN = 8 * 512;
/**
 * Returns how many bytes should be read from a file for mime and encoding
 * detection.
 *
 * @param arg1 either the auto-guess flag itself or an options object that
 * carries it in `autoGuessEncoding`; may be omitted.
 * @returns `AUTO_GUESS_BUFFER_MAX_LEN` when auto guessing is requested,
 * `NO_GUESS_BUFFER_MAX_LEN` otherwise.
 */
export function maxBufferLen(arg1?: DetectMimesOption | boolean): number {
	// A plain boolean is the flag itself, anything else is treated as an options bag
	const wantsAutoGuess = typeof arg1 === 'boolean' ? arg1 : arg1 && arg1.autoGuessEncoding;

	if (wantsAutoGuess) {
		return AUTO_GUESS_BUFFER_MAX_LEN;
	}

	return NO_GUESS_BUFFER_MAX_LEN;
}
/**
 * Result of inspecting the leading bytes of a file, as returned by
 * `detectMimeAndEncodingFromBuffer`.
 */
export interface IMimeAndEncoding {
// Encoding derived from a BOM, from the UTF-16 zero-byte heuristic or from
// guessing. NOTE(review): typed as `string`, but the detection code checks
// `!enc`, so callers should expect it to be unset when nothing matched.
encoding: string;
// Single-element list: the text mime when the content looks like text, the
// binary mime otherwise.
mimes: string[];
}
/**
 * Options bag accepted by `maxBufferLen`.
 */
export interface DetectMimesOption {
// When truthy, the larger auto-guess buffer length is used.
autoGuessEncoding?: boolean;
}
/**
 * Detects the encoding of a file from its leading bytes and reports whether
 * the content is text or binary as a mime type.
 *
 * Detection order:
 * 1. A BOM, if present, decides the encoding.
 * 2. Unless the BOM already says UTF-16, the first bytes are scanned for
 *    0-bytes. 0-bytes that consistently sit on odd (LE) or even (BE) indexes
 *    are taken as UTF-16 without BOM; any other 0-byte marks the content as
 *    binary.
 * 3. If auto guessing is requested and neither step produced an encoding (and
 *    the content is text), the encoding is guessed from the content.
 *
 * @param readResult the buffer to inspect and the number of bytes of it to consider.
 * @param autoGuessEncoding whether the encoding should be guessed from the
 * content when it could not be derived otherwise.
 * @returns the result itself when `autoGuessEncoding` is falsy, otherwise a
 * promise of the result. The promise is returned for every truthy
 * `autoGuessEncoding`, also when no guessing was necessary, so that the
 * runtime value matches the declared overload.
 */
export function detectMimeAndEncodingFromBuffer(readResult: stream.ReadResult, autoGuessEncoding?: false): IMimeAndEncoding;
export function detectMimeAndEncodingFromBuffer(readResult: stream.ReadResult, autoGuessEncoding?: boolean): TPromise<IMimeAndEncoding>;
export function detectMimeAndEncodingFromBuffer({ buffer, bytesRead }: stream.ReadResult, autoGuessEncoding?: boolean): TPromise<IMimeAndEncoding> | IMimeAndEncoding {

	// Always first check for BOM to find out about encoding
	let enc = encoding.detectEncodingByBOMFromBuffer(buffer, bytesRead);

	// Detect 0 bytes to see if file is binary or UTF-16 LE/BE
	// unless we already know that this file has a UTF-16 encoding
	let isText = true;
	if (enc !== encoding.UTF16be && enc !== encoding.UTF16le) {
		let couldBeUTF16LE = true; // e.g. 0xAA 0x00
		let couldBeUTF16BE = true; // e.g. 0x00 0xAA
		let containsZeroByte = false;

		// This is a simplified guess to detect UTF-16 BE or LE by just checking if
		// the first 512 bytes have the 0-byte at a specific location. For UTF-16 LE
		// this would be the odd byte index and for UTF-16 BE the even one.
		// Note: this can produce false positives (a binary file that uses a 2-byte
		// encoding of the same format as UTF-16) and false negatives (a UTF-16 file
		// that is using 4 bytes to encode a character).
		for (let i = 0; i < bytesRead && i < ZERO_BYTE_DETECTION_BUFFER_MAX_LEN; i++) {
			const isEndian = (i % 2 === 1); // assume 2-byte sequences typical for UTF-16
			const isZeroByte = (buffer.readInt8(i) === 0);

			if (isZeroByte) {
				containsZeroByte = true;
			}

			// UTF-16 LE: expect e.g. 0xAA 0x00
			if (couldBeUTF16LE && (isEndian && !isZeroByte || !isEndian && isZeroByte)) {
				couldBeUTF16LE = false;
			}

			// UTF-16 BE: expect e.g. 0x00 0xAA
			if (couldBeUTF16BE && (isEndian && isZeroByte || !isEndian && !isZeroByte)) {
				couldBeUTF16BE = false;
			}

			// Stop scanning if this is neither UTF16-LE nor UTF16-BE and thus treat as binary
			if (isZeroByte && !couldBeUTF16LE && !couldBeUTF16BE) {
				break;
			}
		}

		// Handle case of 0-byte included
		if (containsZeroByte) {
			if (couldBeUTF16LE) {
				enc = encoding.UTF16le;
			} else if (couldBeUTF16BE) {
				enc = encoding.UTF16be;
			} else {
				isText = false;
			}
		}
	}

	// Auto guess encoding if configured
	if (autoGuessEncoding && isText && !enc) {
		return encoding.guessEncodingByBuffer(buffer.slice(0, bytesRead)).then(guessedEncoding => {
			return {
				mimes: [mime.MIME_TEXT], // this branch is only entered for text content
				encoding: guessedEncoding
			};
		});
	}

	const detected: IMimeAndEncoding = {
		mimes: isText ? [mime.MIME_TEXT] : [mime.MIME_BINARY],
		encoding: enc
	};

	// Callers that asked for auto guessing are typed to receive a promise, so
	// hand them one even when the result was available synchronously.
	return autoGuessEncoding ? TPromise.as(detected) : detected;
}
\ No newline at end of file
......@@ -8,6 +8,7 @@
import * as assert from 'assert';
import * as encoding from 'vs/base/node/encoding';
import { readExactlyByFile } from 'vs/base/node/stream';
suite('Encoding', () => {
test('detectBOM UTF-8', () => {
......@@ -64,4 +65,89 @@ suite('Encoding', () => {
assert.equal(enc, 'utf16le');
});
});
// Text content must not be flagged as binary, whatever the file extension says
test('detectEncodingFromBuffer (JSON saved as PNG)', function () {
	const file = require.toUrl('./fixtures/some.json.png');

	return readExactlyByFile(file, 512).then(buffer => {
		const detected = encoding.detectEncodingFromBuffer(buffer);
		assert.strictEqual(detected.seemsBinary, false);
	});
});

// Binary content must be flagged as binary, whatever the file extension says
test('detectEncodingFromBuffer (PNG saved as TXT)', function () {
	const file = require.toUrl('./fixtures/some.png.txt');

	return readExactlyByFile(file, 512).then(buffer => {
		const detected = encoding.detectEncodingFromBuffer(buffer);
		assert.strictEqual(detected.seemsBinary, true);
	});
});

test('detectEncodingFromBuffer (XML saved as PNG)', function () {
	const file = require.toUrl('./fixtures/some.xml.png');

	return readExactlyByFile(file, 512).then(buffer => {
		const detected = encoding.detectEncodingFromBuffer(buffer);
		assert.strictEqual(detected.seemsBinary, false);
	});
});

test('detectEncodingFromBuffer (QWOFF saved as TXT)', function () {
	const file = require.toUrl('./fixtures/some.qwoff.txt');

	return readExactlyByFile(file, 512).then(buffer => {
		const detected = encoding.detectEncodingFromBuffer(buffer);
		assert.strictEqual(detected.seemsBinary, true);
	});
});

test('detectEncodingFromBuffer (CSS saved as QWOFF)', function () {
	const file = require.toUrl('./fixtures/some.css.qwoff');

	return readExactlyByFile(file, 512).then(buffer => {
		const detected = encoding.detectEncodingFromBuffer(buffer);
		assert.strictEqual(detected.seemsBinary, false);
	});
});

test('detectEncodingFromBuffer (PDF)', function () {
	const file = require.toUrl('./fixtures/some.pdf');

	return readExactlyByFile(file, 512).then(buffer => {
		const detected = encoding.detectEncodingFromBuffer(buffer);
		assert.strictEqual(detected.seemsBinary, true);
	});
});

// UTF-16 without BOM is recognized through the position of its 0-bytes
test('detectEncodingFromBuffer (guess UTF-16 LE from content without BOM)', function () {
	const file = require.toUrl('./fixtures/utf16_le_nobom.txt');

	return readExactlyByFile(file, 512).then(buffer => {
		const detected = encoding.detectEncodingFromBuffer(buffer);
		assert.strictEqual(detected.encoding, encoding.UTF16le);
		assert.strictEqual(detected.seemsBinary, false);
	});
});

test('detectEncodingFromBuffer (guess UTF-16 BE from content without BOM)', function () {
	const file = require.toUrl('./fixtures/utf16_be_nobom.txt');

	return readExactlyByFile(file, 512).then(buffer => {
		const detected = encoding.detectEncodingFromBuffer(buffer);
		assert.strictEqual(detected.encoding, encoding.UTF16be);
		assert.strictEqual(detected.seemsBinary, false);
	});
});

// Auto guessing reads a larger chunk (512 * 8 bytes) and resolves asynchronously
test('autoGuessEncoding (ShiftJIS)', function () {
	const file = require.toUrl('./fixtures/some.shiftjis.txt');

	return readExactlyByFile(file, 512 * 8).then(buffer => {
		return encoding.detectEncodingFromBuffer(buffer, true).then(detected => {
			assert.strictEqual(detected.encoding, 'shiftjis');
		});
	});
});

test('autoGuessEncoding (CP1252)', function () {
	const file = require.toUrl('./fixtures/some.cp1252.txt');

	return readExactlyByFile(file, 512 * 8).then(buffer => {
		return encoding.detectEncodingFromBuffer(buffer, true).then(detected => {
			assert.strictEqual(detected.encoding, 'windows1252');
		});
	});
});
});
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
'use strict';
import * as assert from 'assert';
import * as mimeCommon from 'vs/base/common/mime';
import * as mime from 'vs/base/node/mime';
import { readExactlyByFile } from 'vs/base/node/stream';
import { UTF16le, UTF16be } from 'vs/base/node/encoding';
// Tests for the buffer based mime and encoding detection of `vs/base/node/mime`.
suite('Mime', () => {

	// Reads the first 512 bytes of the fixture and runs the detection without auto guessing
	const detect = (fixture: string) => {
		return readExactlyByFile(require.toUrl(fixture), 512)
			.then(buffer => mime.detectMimeAndEncodingFromBuffer(buffer));
	};

	// Reads the first 512 * 8 bytes of the fixture and runs the detection with auto guessing
	const detectWithGuessing = (fixture: string) => {
		return readExactlyByFile(require.toUrl(fixture), 512 * 8)
			.then(buffer => mime.detectMimeAndEncodingFromBuffer(buffer, true));
	};

	test('detectMimesFromFile (JSON saved as PNG)', () => {
		return detect('./fixtures/some.json.png').then(detected => {
			assert.deepEqual(detected.mimes, ['text/plain']);
		});
	});

	test('detectMimesFromFile (PNG saved as TXT)', () => {
		mimeCommon.registerTextMime({ id: 'text', mime: 'text/plain', extension: '.txt' });

		return detect('./fixtures/some.png.txt').then(detected => {
			assert.deepEqual(detected.mimes, ['application/octet-stream']);
		});
	});

	test('detectMimesFromFile (XML saved as PNG)', () => {
		return detect('./fixtures/some.xml.png').then(detected => {
			assert.deepEqual(detected.mimes, ['text/plain']);
		});
	});

	test('detectMimesFromFile (QWOFF saved as TXT)', () => {
		return detect('./fixtures/some.qwoff.txt').then(detected => {
			assert.deepEqual(detected.mimes, ['application/octet-stream']);
		});
	});

	test('detectMimesFromFile (CSS saved as QWOFF)', () => {
		return detect('./fixtures/some.css.qwoff').then(detected => {
			assert.deepEqual(detected.mimes, ['text/plain']);
		});
	});

	test('detectMimesFromFile (PDF)', () => {
		return detect('./fixtures/some.pdf').then(detected => {
			assert.deepEqual(detected.mimes, ['application/octet-stream']);
		});
	});

	test('detectMimesFromFile (guess UTF-16 LE from content without BOM)', () => {
		mimeCommon.registerTextMime({ id: 'text', mime: 'text/plain', extension: '.txt' });

		return detect('./fixtures/utf16_le_nobom.txt').then(detected => {
			assert.equal(detected.encoding, UTF16le);
			assert.deepEqual(detected.mimes, ['text/plain']);
		});
	});

	test('detectMimesFromFile (guess UTF-16 BE from content without BOM)', () => {
		mimeCommon.registerTextMime({ id: 'text', mime: 'text/plain', extension: '.txt' });

		return detect('./fixtures/utf16_be_nobom.txt').then(detected => {
			assert.equal(detected.encoding, UTF16be);
			assert.deepEqual(detected.mimes, ['text/plain']);
		});
	});

	test('autoGuessEncoding (ShiftJIS)', () => {
		return detectWithGuessing('./fixtures/some.shiftjis.txt').then(detected => {
			assert.equal(detected.encoding, 'shiftjis');
		});
	});

	test('autoGuessEncoding (CP1252)', () => {
		return detectWithGuessing('./fixtures/some.cp1252.txt').then(detected => {
			assert.equal(detected.encoding, 'windows1252');
		});
	});
});
......@@ -13,7 +13,7 @@ import { IDisposable } from 'vs/base/common/lifecycle';
import { isFalsyOrEmpty, distinct } from 'vs/base/common/arrays';
import { Schemas } from 'vs/base/common/network';
import { Progress } from 'vs/platform/progress/common/progress';
import { decodeStream, encode, UTF8, UTF8_with_bom } from 'vs/base/node/encoding';
import { decodeStream, encode, UTF8, UTF8_with_bom, detectEncodingFromBuffer, maxEncodingDetectionBufferLen } from 'vs/base/node/encoding';
import { TernarySearchTree } from 'vs/base/common/map';
import { IConfigurationService } from 'vs/platform/configuration/common/configuration';
import { IWorkspaceContextService } from 'vs/platform/workspace/common/workspace';
......@@ -22,8 +22,6 @@ import { ILifecycleService } from 'vs/platform/lifecycle/common/lifecycle';
import { IStorageService } from 'vs/platform/storage/common/storage';
import { ITextResourceConfigurationService } from 'vs/editor/common/services/resourceConfiguration';
import { IExtensionService } from 'vs/workbench/services/extensions/common/extensions';
import { maxBufferLen, detectMimeAndEncodingFromBuffer } from 'vs/base/node/mime';
import { MIME_BINARY } from 'vs/base/common/mime';
import { localize } from 'vs/nls';
import { INotificationService } from 'vs/platform/notification/common/notification';
......@@ -250,7 +248,7 @@ export class RemoteFileService extends FileService {
}
const guessEncoding = options.autoGuessEncoding;
const count = maxBufferLen(options);
const count = maxEncodingDetectionBufferLen(options);
const chunks: Buffer[] = [];
return provider.read(
......@@ -258,11 +256,10 @@ export class RemoteFileService extends FileService {
0, count,
new Progress<Buffer>(chunk => chunks.push(chunk))
).then(bytesRead => {
// send to bla
return detectMimeAndEncodingFromBuffer({ bytesRead, buffer: Buffer.concat(chunks) }, guessEncoding);
return detectEncodingFromBuffer({ bytesRead, buffer: Buffer.concat(chunks) }, guessEncoding);
}).then(detected => {
if (options.acceptTextOnly && detected.mimes.indexOf(MIME_BINARY) >= 0) {
if (options.acceptTextOnly && detected.seemsBinary) {
return TPromise.wrapError<IStreamContent>(new FileOperationError(
localize('fileBinaryError', "File seems to be binary and cannot be opened as text"),
FileOperationResult.FILE_IS_BINARY,
......
......@@ -15,7 +15,6 @@ import { MAX_FILE_SIZE, MAX_HEAP_SIZE } from 'vs/platform/files/node/files';
import { isEqualOrParent } from 'vs/base/common/paths';
import { ResourceMap } from 'vs/base/common/map';
import * as arrays from 'vs/base/common/arrays';
import * as baseMime from 'vs/base/common/mime';
import { TPromise } from 'vs/base/common/winjs.base';
import * as objects from 'vs/base/common/objects';
import * as extfs from 'vs/base/node/extfs';
......@@ -27,7 +26,6 @@ import { dispose, IDisposable, toDisposable } from 'vs/base/common/lifecycle';
import { IWorkspaceContextService, WorkbenchState } from 'vs/platform/workspace/common/workspace';
import * as pfs from 'vs/base/node/pfs';
import * as encoding from 'vs/base/node/encoding';
import { detectMimeAndEncodingFromBuffer, IMimeAndEncoding } from 'vs/base/node/mime';
import * as flow from 'vs/base/node/flow';
import { FileWatcher as UnixWatcherService } from 'vs/workbench/services/files/node/watcher/unix/watcherService';
import { FileWatcher as WindowsWatcherService } from 'vs/workbench/services/files/node/watcher/win32/watcherService';
......@@ -492,12 +490,12 @@ export class FileService implements IFileService {
} else {
// when receiving the first chunk of data we need to create the
// decoding stream which is then used to drive the string stream.
TPromise.as(detectMimeAndEncodingFromBuffer(
TPromise.as(encoding.detectEncodingFromBuffer(
{ buffer: chunkBuffer, bytesRead },
options && options.autoGuessEncoding || this.configuredAutoGuessEncoding(resource)
)).then(value => {
)).then(detected => {
if (options && options.acceptTextOnly && value.mimes.indexOf(baseMime.MIME_BINARY) >= 0) {
if (options && options.acceptTextOnly && detected.seemsBinary) {
// Return error early if client only accepts text and this is not text
finish(new FileOperationError(
nls.localize('fileBinaryError', "File seems to be binary and cannot be opened as text"),
......@@ -506,7 +504,7 @@ export class FileService implements IFileService {
));
} else {
result.encoding = this.getEncoding(resource, this.getPeferredEncoding(resource, options, value));
result.encoding = this.getEncoding(resource, this.getPeferredEncoding(resource, options, detected));
result.stream = decoder = encoding.decodeStream(result.encoding);
resolve(result);
handleChunk(bytesRead);
......@@ -924,7 +922,7 @@ export class FileService implements IFileService {
});
}
private getPeferredEncoding(resource: uri, options: IResolveContentOptions, detected: IMimeAndEncoding): string {
private getPeferredEncoding(resource: uri, options: IResolveContentOptions, detected: encoding.IDetectedEncodingResult): string {
let preferredEncoding: string;
if (options && options.encoding) {
if (detected.encoding === encoding.UTF8 && options.encoding === encoding.UTF8) {
......
......@@ -13,9 +13,7 @@ import { onUnexpectedError } from 'vs/base/common/errors';
import * as strings from 'vs/base/common/strings';
import { TPromise } from 'vs/base/common/winjs.base';
import { LineMatch, FileMatch } from '../search';
import * as baseMime from 'vs/base/common/mime';
import { UTF16le, UTF16be, UTF8, UTF8_with_bom, encodingExists, decode, bomLength } from 'vs/base/node/encoding';
import { detectMimeAndEncodingFromBuffer } from 'vs/base/node/mime';
import { UTF16le, UTF16be, UTF8, UTF8_with_bom, encodingExists, decode, bomLength, detectEncodingFromBuffer } from 'vs/base/node/encoding';
import { ISearchWorker, ISearchWorkerSearchArgs, ISearchWorkerSearchResult } from './searchWorkerIpc';
......@@ -208,13 +206,13 @@ export class SearchWorkerEngine {
// Detect encoding and mime when this is the beginning of the file
if (isFirstRead) {
const mimeAndEncoding = detectMimeAndEncodingFromBuffer({ buffer, bytesRead }, false);
if (mimeAndEncoding.mimes[mimeAndEncoding.mimes.length - 1] !== baseMime.MIME_TEXT) {
const detected = detectEncodingFromBuffer({ buffer, bytesRead }, false);
if (detected.seemsBinary) {
return clb(null); // skip files that seem binary
}
// Check for BOM offset
switch (mimeAndEncoding.encoding) {
switch (detected.encoding) {
case UTF8:
pos = i = bomLength(UTF8);
options.encoding = UTF8;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册