提交 a9b9534b 编写于 作者: B Benjamin Pasero 提交者: GitHub

Merge pull request #21416 from katainaka0503/auto-detect-encoding

Auto guess encoding
......@@ -257,6 +257,11 @@
"from": "isobject@>=2.0.0 <3.0.0",
"resolved": "https://registry.npmjs.org/isobject/-/isobject-2.1.0.tgz"
},
"jschardet": {
"version": "1.4.2",
"from": "jschardet@>=1.4.2 <2.0.0",
"resolved": "https://registry.npmjs.org/jschardet/-/jschardet-1.4.2.tgz"
},
"kind-of": {
"version": "3.0.4",
"from": "kind-of@>=3.0.2 <4.0.0",
......
/**
 * Ambient typings for the subset of the `jschardet` module declared here:
 * the `detect` function, its result shape and the `Constants` object.
 */
declare module 'jschardet' {
	/** Constants object exposed by the module; only the threshold is typed. */
	export const Constants: {
		MINIMUM_THRESHOLD: number;
	};

	/** Result shape of `detect`: an encoding name paired with a confidence value. */
	export interface IDetectedMap {
		encoding: string;
		confidence: number;
	}

	/**
	 * Runs detection over the given buffer.
	 * @param buffer the bytes to analyse.
	 */
	export function detect(buffer: NodeBuffer): IDetectedMap;
}
\ No newline at end of file
......@@ -8,6 +8,7 @@
import stream = require('vs/base/node/stream');
import iconv = require('iconv-lite');
import { TPromise } from 'vs/base/common/winjs.base';
import jschardet = require('jschardet');
export const UTF8 = 'utf8';
export const UTF8_with_bom = 'utf8bom';
......@@ -91,7 +92,36 @@ export function detectEncodingByBOMFromBuffer(buffer: NodeBuffer, bytesRead: num
* If no BOM is detected, null will be passed to callback.
*/
export function detectEncodingByBOM(file: string): TPromise<string> {
	// A read of length 3 is requested from the file; the read result is then handed
	// to the buffer-based BOM check. (A second, unreachable copy of this return
	// statement that differed only in whitespace has been removed.)
	return stream.readExactlyByFile(file, 3).then(({ buffer, bytesRead }) => detectEncodingByBOMFromBuffer(buffer, bytesRead));
}
// Threshold value that is written into jschardet's Constants.MINIMUM_THRESHOLD below.
// NOTE(review): presumably jschardet drops guesses whose confidence is below this value — confirm against its docs.
const MINIMUM_THRESHOLD = 0.2; // TODO@Ben Decide how much this should be.
// Module-level side effect: overrides the library constant when this module is loaded.
jschardet.Constants.MINIMUM_THRESHOLD = MINIMUM_THRESHOLD;
// Lower-cased encoding names for which guessEncodingByBuffer discards the guess and returns null.
const IGNORE_ENCODINGS = ['ascii', 'utf-8', 'utf-16', 'utf-32'];
/**
 * Attempts to guess the character encoding of a buffer by running jschardet over it.
 *
 * @param buffer the bytes to inspect.
 * @returns the detected encoding name, lower-cased and stripped of every
 * non-alphanumeric character, or null when nothing was detected or the detected
 * name is listed in IGNORE_ENCODINGS.
 */
export function guessEncodingByBuffer(buffer: NodeBuffer): string {
	const detection = jschardet.detect(buffer);
	const detectedName = detection && detection.encoding;
	if (!detectedName) {
		return null;
	}

	// Ignore encodings that cannot guess correctly
	// (http://chardet.readthedocs.io/en/latest/supported-encodings.html)
	const isIgnored = IGNORE_ENCODINGS.indexOf(detectedName.toLowerCase()) !== -1;

	return isIgnored ? null : lowerCaseWithoutNonAlphaNumeric(detectedName);
}
/**
 * Normalizes an encoding name: removes every character that is not an ASCII
 * letter or digit, then lower-cases what is left (e.g. "SHIFT_JIS" -> "shiftjis").
 *
 * @param encodingName the encoding name to normalize.
 * @returns the normalized name.
 */
function lowerCaseWithoutNonAlphaNumeric(encodingName: string): string {
	const alphaNumericOnly = encodingName.replace(/[^a-zA-Z0-9]/g, '');

	return alphaNumericOnly.toLowerCase();
}
/**
......
......@@ -58,15 +58,23 @@ export interface IMimeAndEncoding {
mimes: string[];
}
function doDetectMimesFromStream(instream: streams.Readable): TPromise<IMimeAndEncoding> {
return stream.readExactlyByStream(instream, BUFFER_READ_MAX_LEN).then(detectMimeAndEncodingFromBuffer);
/**
 * Options accepted by the mime detection functions in this file.
 */
export interface DetectMimesOption {
	// When truthy, detectMimeAndEncodingFromBuffer asks encoding.guessEncodingByBuffer
	// for an encoding if the content is text and no encoding has been determined yet.
	autoGuessEncoding?: boolean;
}
function doDetectMimesFromFile(absolutePath: string): TPromise<IMimeAndEncoding> {
return stream.readExactlyByFile(absolutePath, BUFFER_READ_MAX_LEN).then(detectMimeAndEncodingFromBuffer);
/**
 * Reads up to BUFFER_READ_MAX_LEN from the stream and runs mime/encoding detection
 * on what was read.
 *
 * @param instream the readable stream to read from.
 * @param option optional detection options; only autoGuessEncoding is consulted.
 */
function doDetectMimesFromStream(instream: streams.Readable, option?: DetectMimesOption): TPromise<IMimeAndEncoding> {
	const pendingRead = stream.readExactlyByStream(instream, BUFFER_READ_MAX_LEN);

	return pendingRead.then((readResult: stream.ReadResult) => {
		// The flag is looked up only once the read has completed.
		const autoGuessEncoding = option && option.autoGuessEncoding;

		return detectMimeAndEncodingFromBuffer(readResult, autoGuessEncoding);
	});
}
export function detectMimeAndEncodingFromBuffer({buffer, bytesRead}: stream.ReadResult): IMimeAndEncoding {
/**
 * Reads up to BUFFER_READ_MAX_LEN from the file and runs mime/encoding detection
 * on what was read.
 *
 * @param absolutePath path of the file to read from.
 * @param option optional detection options; only autoGuessEncoding is consulted.
 */
function doDetectMimesFromFile(absolutePath: string, option?: DetectMimesOption): TPromise<IMimeAndEncoding> {
	const pendingRead = stream.readExactlyByFile(absolutePath, BUFFER_READ_MAX_LEN);

	return pendingRead.then((readResult: stream.ReadResult) => {
		// The flag is looked up only once the read has completed.
		const autoGuessEncoding = option && option.autoGuessEncoding;

		return detectMimeAndEncodingFromBuffer(readResult, autoGuessEncoding);
	});
}
export function detectMimeAndEncodingFromBuffer({ buffer, bytesRead }: stream.ReadResult, autoGuessEncoding?: boolean): IMimeAndEncoding {
let enc = encoding.detectEncodingByBOMFromBuffer(buffer, bytesRead);
// Detect 0 bytes to see if file is binary (ignore for UTF 16 though)
......@@ -79,6 +87,9 @@ export function detectMimeAndEncodingFromBuffer({buffer, bytesRead}: stream.Read
}
}
}
if (autoGuessEncoding && isText && !enc) {
enc = encoding.guessEncodingByBuffer(buffer);
}
return {
mimes: isText ? [mime.MIME_TEXT] : [mime.MIME_BINARY],
......@@ -116,8 +127,8 @@ function filterAndSortMimes(detectedMimes: string[], guessedMimes: string[]): st
* @param instream the readable stream to detect the mime types from.
* @param nameHint an additional hint that can be used to detect a mime from a file extension.
*/
export function detectMimesFromStream(instream: streams.Readable, nameHint: string): TPromise<IMimeAndEncoding> {
return doDetectMimesFromStream(instream).then(encoding =>
export function detectMimesFromStream(instream: streams.Readable, nameHint: string, option?: DetectMimesOption): TPromise<IMimeAndEncoding> {
	// `option` is forwarded untouched to the stream-based detection step.
	const detection = doDetectMimesFromStream(instream, option);

	// The detection result is combined with the name hint by handleMimeResult.
	return detection.then(detected => handleMimeResult(nameHint, detected));
}
......@@ -126,8 +137,8 @@ export function detectMimesFromStream(instream: streams.Readable, nameHint: stri
* Opens the given file to detect its mime type. Returns an array of mime types sorted from most specific to unspecific.
* @param absolutePath the absolute path of the file.
*/
export function detectMimesFromFile(absolutePath: string): TPromise<IMimeAndEncoding> {
return doDetectMimesFromFile(absolutePath).then(encoding =>
export function detectMimesFromFile(absolutePath: string, option?: DetectMimesOption): TPromise<IMimeAndEncoding> {
	// `option` is forwarded untouched to the file-based detection step.
	const detection = doDetectMimesFromFile(absolutePath, option);

	// The file path itself is passed to handleMimeResult as the name hint.
	return detection.then(detected => handleMimeResult(absolutePath, detected));
}
......
VSCODE͍ō̃GfB^B
\ No newline at end of file
......@@ -60,4 +60,12 @@ suite('Mime', () => {
done();
}, done);
});
// Expects the Shift JIS fixture to be reported as 'shiftjis' when autoGuessEncoding is enabled.
test('autoGuessEncoding (ShiftJIS)', function (done: () => void) {
	const fixturePath = require.toUrl('./fixtures/some.shiftjis.txt');
	const detection = mime.detectMimesFromFile(fixturePath, { autoGuessEncoding: true });

	detection.then(detected => {
		assert.equal(detected.encoding, 'shiftjis');
		done();
	}, done);
});
});
......@@ -495,6 +495,11 @@ export interface IResolveContentOptions {
* the contents of the file.
*/
encoding?: string;
/**
* The optional autoGuessEncoding parameter allows guessing the encoding from the content of the file.
*/
autoGuessEncoding?: boolean;
}
export interface IUpdateContentOptions {
......@@ -575,6 +580,7 @@ export interface IFilesConfiguration {
exclude: glob.IExpression;
watcherExclude: { [filepattern: string]: boolean };
encoding: string;
autoGuessEncoding: boolean;
defaultLanguage: string;
trimTrailingWhitespace: boolean;
autoSave: string;
......@@ -798,17 +804,17 @@ export const SUPPORTED_ENCODINGS: { [encoding: string]: { labelLong: string; lab
labelShort: 'ISO 8859-11',
order: 42
},
'koi8-ru': {
koi8ru: {
labelLong: 'Cyrillic (KOI8-RU)',
labelShort: 'KOI8-RU',
order: 43
},
'koi8-t': {
koi8t: {
labelLong: 'Tajik (KOI8-T)',
labelShort: 'KOI8-T',
order: 44
},
GB2312: {
gb2312: {
labelLong: 'Simplified Chinese (GB 2312)',
labelShort: 'GB 2312',
order: 45
......
......@@ -240,6 +240,7 @@ const configurationValueWhitelist = [
'editor.acceptSuggestionOnCommitCharacter',
'workbench.editor.showTabs',
'files.encoding',
'files.autoGuessEncoding',
'editor.quickSuggestionsDelay',
'editor.snippetSuggestions',
'editor.selectionHighlight',
......
......@@ -34,7 +34,7 @@ import { IEditor as IBaseEditor, IEditorInput } from 'vs/platform/editor/common/
import { IWorkbenchEditorService } from 'vs/workbench/services/editor/common/editorService';
import { IQuickOpenService, IPickOpenEntry, IFilePickOpenEntry } from 'vs/platform/quickOpen/common/quickOpen';
import { IWorkspaceConfigurationService } from 'vs/workbench/services/configuration/common/configuration';
import { IFilesConfiguration, SUPPORTED_ENCODINGS } from 'vs/platform/files/common/files';
import { IFilesConfiguration, SUPPORTED_ENCODINGS, IFileService } from 'vs/platform/files/common/files';
import { IInstantiationService } from 'vs/platform/instantiation/common/instantiation';
import { IModeService } from 'vs/editor/common/services/modeService';
import { IModelService } from 'vs/editor/common/services/modelService';
......@@ -1031,7 +1031,8 @@ export class ChangeEncodingAction extends Action {
actionLabel: string,
@IWorkbenchEditorService private editorService: IWorkbenchEditorService,
@IQuickOpenService private quickOpenService: IQuickOpenService,
@IWorkspaceConfigurationService private configurationService: IWorkspaceConfigurationService
@IWorkspaceConfigurationService private configurationService: IWorkspaceConfigurationService,
@IFileService private fileService: IFileService
) {
super(actionId, actionLabel);
}
......@@ -1072,51 +1073,69 @@ export class ChangeEncodingAction extends Action {
return undefined;
}
return TPromise.timeout(50 /* quick open is sensitive to being opened so soon after another */).then(() => {
const configuration = this.configurationService.getConfiguration<IFilesConfiguration>();
const isReopenWithEncoding = (action === reopenWithEncodingPick);
const configuredEncoding = configuration && configuration.files && configuration.files.encoding;
let directMatchIndex: number;
let aliasMatchIndex: number;
// All encodings are valid picks
const picks: IPickOpenEntry[] = Object.keys(SUPPORTED_ENCODINGS)
.sort((k1, k2) => {
if (k1 === configuredEncoding) {
return -1;
} else if (k2 === configuredEncoding) {
return 1;
}
return SUPPORTED_ENCODINGS[k1].order - SUPPORTED_ENCODINGS[k2].order;
})
.filter(k => {
return !isReopenWithEncoding || !SUPPORTED_ENCODINGS[k].encodeOnly; // hide those that can only be used for encoding if we are about to decode
})
.map((key, index) => {
if (key === encodingSupport.getEncoding()) {
directMatchIndex = index;
} else if (SUPPORTED_ENCODINGS[key].alias === encodingSupport.getEncoding()) {
aliasMatchIndex = index;
}
const guessEncoding = () => {
const resource = toResource(activeEditor.input);
return this.fileService.resolveContent(resource, { autoGuessEncoding: true, acceptTextOnly: true })
.then(content => content.encoding, err => null);
};
return { id: key, label: SUPPORTED_ENCODINGS[key].labelLong };
});
return TPromise.timeout(50 /* quick open is sensitive to being opened so soon after another */)
.then(guessEncoding)
.then(guessedEncoding => {
const configuration = this.configurationService.getConfiguration<IFilesConfiguration>();
const isReopenWithEncoding = (action === reopenWithEncodingPick);
const configuredEncoding = configuration && configuration.files && configuration.files.encoding;
let directMatchIndex: number;
let aliasMatchIndex: number;
// All encodings are valid picks
const picks: IPickOpenEntry[] = Object.keys(SUPPORTED_ENCODINGS)
.sort((k1, k2) => {
if (k1 === configuredEncoding) {
return -1;
} else if (k2 === configuredEncoding) {
return 1;
}
return SUPPORTED_ENCODINGS[k1].order - SUPPORTED_ENCODINGS[k2].order;
})
.filter(k => {
if (k === guessedEncoding && guessedEncoding !== configuredEncoding) {
return false; // do not show encoding if it is the guessed encoding that does not match the configured
}
return !isReopenWithEncoding || !SUPPORTED_ENCODINGS[k].encodeOnly; // hide those that can only be used for encoding if we are about to decode
})
.map((key, index) => {
if (key === encodingSupport.getEncoding()) {
directMatchIndex = index;
} else if (SUPPORTED_ENCODINGS[key].alias === encodingSupport.getEncoding()) {
aliasMatchIndex = index;
}
return { id: key, label: SUPPORTED_ENCODINGS[key].labelLong };
});
// If we have a guessed encoding, show it first unless it matches the configured encoding
if (guessedEncoding && configuredEncoding !== guessedEncoding && SUPPORTED_ENCODINGS[guessedEncoding]) {
picks[0].separator = { border: true };
picks.unshift({ id: guessedEncoding, label: SUPPORTED_ENCODINGS[guessedEncoding].labelLong, description: nls.localize('guessedEncoding', "Guessed from content") });
}
return this.quickOpenService.pick(picks, {
placeHolder: isReopenWithEncoding ? nls.localize('pickEncodingForReopen', "Select File Encoding to Reopen File") : nls.localize('pickEncodingForSave', "Select File Encoding to Save with"),
autoFocus: { autoFocusIndex: typeof directMatchIndex === 'number' ? directMatchIndex : typeof aliasMatchIndex === 'number' ? aliasMatchIndex : void 0 }
}).then(encoding => {
if (encoding) {
activeEditor = this.editorService.getActiveEditor();
encodingSupport = toEditorWithEncodingSupport(activeEditor.input);
if (encodingSupport && encodingSupport.getEncoding() !== encoding.id) {
encodingSupport.setEncoding(encoding.id, isReopenWithEncoding ? EncodingMode.Decode : EncodingMode.Encode); // Set new encoding
return this.quickOpenService.pick(picks, {
placeHolder: isReopenWithEncoding ? nls.localize('pickEncodingForReopen', "Select File Encoding to Reopen File") : nls.localize('pickEncodingForSave', "Select File Encoding to Save with"),
autoFocus: { autoFocusIndex: typeof directMatchIndex === 'number' ? directMatchIndex : typeof aliasMatchIndex === 'number' ? aliasMatchIndex : void 0 }
}).then(encoding => {
if (encoding) {
activeEditor = this.editorService.getActiveEditor();
encodingSupport = toEditorWithEncodingSupport(activeEditor.input);
if (encodingSupport && encodingSupport.getEncoding() !== encoding.id) {
encodingSupport.setEncoding(encoding.id, isReopenWithEncoding ? EncodingMode.Decode : EncodingMode.Encode); // Set new encoding
}
}
}
});
});
});
});
}
}
......@@ -212,6 +212,11 @@ configurationRegistry.registerConfiguration({
'default': 'utf8',
'description': nls.localize('encoding', "The default character set encoding to use when reading and writing files."),
},
'files.autoGuessEncoding': {
'type': 'boolean',
'default': false,
'description': nls.localize('autoGuessEncoding', "When enabled, will attempt to guess the character set encoding when opening files")
},
'files.eol': {
'type': 'string',
'enum': [
......
......@@ -81,6 +81,7 @@ export class FileService implements IFileService {
const fileServiceConfig: IFileServiceOptions = {
errorLogger: (msg: string) => this.onFileServiceError(msg),
encoding: configuration.files && configuration.files.encoding,
autoGuessEncoding: configuration.files && configuration.files.autoGuessEncoding,
encodingOverride,
watcherIgnoredPatterns,
verboseLogging: environmentService.verbose,
......
......@@ -28,7 +28,7 @@ import { dispose, IDisposable, toDisposable } from 'vs/base/common/lifecycle';
import pfs = require('vs/base/node/pfs');
import encoding = require('vs/base/node/encoding');
import mime = require('vs/base/node/mime');
import { IMimeAndEncoding, detectMimesFromFile } from 'vs/base/node/mime';
import flow = require('vs/base/node/flow');
import { FileWatcher as UnixWatcherService } from 'vs/workbench/services/files/node/watcher/unix/watcherService';
import { FileWatcher as WindowsWatcherService } from 'vs/workbench/services/files/node/watcher/win32/watcherService';
......@@ -44,6 +44,7 @@ export interface IFileServiceOptions {
tmpDir?: string;
errorLogger?: (msg: string) => void;
encoding?: string;
autoGuessEncoding?: boolean;
bom?: string;
encodingOverride?: IEncodingOverride[];
watcherIgnoredPatterns?: string[];
......@@ -205,7 +206,8 @@ export class FileService implements IFileService {
}
// 2.) detect mimes
return mime.detectMimesFromFile(absolutePath).then((detected: mime.IMimeAndEncoding) => {
const autoGuessEncoding = (options && options.autoGuessEncoding) || (this.options && this.options.autoGuessEncoding);
return detectMimesFromFile(absolutePath, { autoGuessEncoding }).then((detected: IMimeAndEncoding) => {
const isText = detected.mimes.indexOf(baseMime.MIME_BINARY) === -1;
// Return error early if client only accepts text and this is not text
......
......@@ -209,7 +209,7 @@ export class SearchWorkerEngine {
// Detect encoding and mime when this is the beginning of the file
if (isFirstRead) {
const mimeAndEncoding = detectMimeAndEncodingFromBuffer({ buffer, bytesRead });
const mimeAndEncoding = detectMimeAndEncodingFromBuffer({ buffer, bytesRead }, false);
if (mimeAndEncoding.mimes[mimeAndEncoding.mimes.length - 1] !== baseMime.MIME_TEXT) {
return clb(null); // skip files that seem binary
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册