From 519daf64e14823658d247dbf424041ce33feee7e Mon Sep 17 00:00:00 2001 From: tomoki1207 Date: Mon, 1 Aug 2016 18:31:24 +0900 Subject: [PATCH] detect encoding --- package.json | 1 + src/typings/jschardet.d.ts | 7 +++++++ src/vs/base/node/encoding.ts | 20 ++++++++++++++++++++ src/vs/base/node/mime.ts | 3 +++ 4 files changed, 31 insertions(+) create mode 100644 src/typings/jschardet.d.ts diff --git a/package.json b/package.json index 2b0d7bfc6f5..d646da18c5e 100644 --- a/package.json +++ b/package.json @@ -29,6 +29,7 @@ "http-proxy-agent": "0.2.7", "https-proxy-agent": "0.3.6", "iconv-lite": "0.4.15", + "jschardet": "^1.4.1", "minimist": "1.2.0", "native-keymap": "0.4.0", "node-pty": "0.6.2", diff --git a/src/typings/jschardet.d.ts b/src/typings/jschardet.d.ts new file mode 100644 index 00000000000..5389e32b3c0 --- /dev/null +++ b/src/typings/jschardet.d.ts @@ -0,0 +1,7 @@ +declare module 'jschardet' { + export interface IDetectedMap { + encoding: string, + confidence: number + } + export function detect(buffer: NodeBuffer): IDetectedMap; +} \ No newline at end of file diff --git a/src/vs/base/node/encoding.ts b/src/vs/base/node/encoding.ts index 4d880578013..d14fb61c41e 100644 --- a/src/vs/base/node/encoding.ts +++ b/src/vs/base/node/encoding.ts @@ -8,6 +8,7 @@ import stream = require('vs/base/node/stream'); import iconv = require('iconv-lite'); import { TPromise } from 'vs/base/common/winjs.base'; +import jschardet = require('jschardet'); export const UTF8 = 'utf8'; export const UTF8_with_bom = 'utf8bom'; @@ -94,6 +95,25 @@ export function detectEncodingByBOM(file: string): TPromise { return stream.readExactlyByFile(file, 3).then(({buffer, bytesRead}) => detectEncodingByBOMFromBuffer(buffer, bytesRead)); } +const IGNORE_ENCODINGS = ['ascii', 'utf-8', 'utf-16', 'utf-32']; +/** + * Detects the encoding from buffer. Returns null when jschardet reports no encoding or when the lower-cased result is listed in IGNORE_ENCODINGS; otherwise returns the encoding name exactly as jschardet reported it. 
+ */ +export function detectEncodingByBuffer(buffer: NodeBuffer): string { + let detected = jschardet.detect(buffer); + if (!detected || !detected.encoding) { + return null; + } + let enc = detected.encoding.toLowerCase(); + + // Ignore encodings that cannot detect correctly + // (http://chardet.readthedocs.io/en/latest/supported-encodings.html) + if (0 <= IGNORE_ENCODINGS.indexOf(enc)) { + return null; + } + + return detected.encoding; +} /** * The encodings that are allowed in a settings file don't match the canonical encoding labels specified by WHATWG. * See https://encoding.spec.whatwg.org/#names-and-labels diff --git a/src/vs/base/node/mime.ts b/src/vs/base/node/mime.ts index 975867a36ab..133f2f17e25 100644 --- a/src/vs/base/node/mime.ts +++ b/src/vs/base/node/mime.ts @@ -79,6 +79,9 @@ export function detectMimeAndEncodingFromBuffer({buffer, bytesRead}: stream.Read } } } + if (isText && !enc) { + enc = encoding.detectEncodingByBuffer(buffer); + } return { mimes: isText ? [mime.MIME_TEXT] : [mime.MIME_BINARY], -- GitLab