chardet.go 2.2 KB
Newer Older
martianzhang's avatar
martianzhang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
/*
 * Copyright 2018 Xiaomi, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package common

import (
	"github.com/saintfish/chardet"
)

// Chardet get best match charset
func Chardet(buf []byte) string {
martianzhang's avatar
martianzhang 已提交
25 26 27 28 29 30 31 32 33
	// check character set by file BOM
	charset := CheckCharsetByBOM(buf)
	if charset != "" {
		return charset
	}

	// use chardet pkg check file charset
	charset = "unknown"
	var confidence int
martianzhang's avatar
martianzhang 已提交
34
	detector := chardet.NewTextDetector()
martianzhang's avatar
martianzhang 已提交
35 36 37 38

	// detector.DetectBest is unstable
	// when the confidence value are equally, the best detect charset will be random
	result, err := detector.DetectAll(buf)
martianzhang's avatar
martianzhang 已提交
39
	if err != nil {
martianzhang's avatar
martianzhang 已提交
40 41 42 43 44
		return charset
	}

	// SOAR's main user speak Chinese, GB-18030, UTF-8 are higher suggested
	for _, r := range result {
L
liipx 已提交
45
		if confidence > r.Confidence && r.Confidence != 0 {
martianzhang's avatar
martianzhang 已提交
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
			return charset
		}
		confidence = r.Confidence
		if r.Charset == "GB-18030" || r.Charset == "UTF-8" {
			return r.Charset
		}
		charset = r.Charset
	}
	return charset
}

// CheckCharsetByBOM reports the character set indicated by a byte order mark
// at the start of buf. It returns "UTF-8" for the prefix EF BB BF, "GB-18030"
// for the prefix 84 31 95 33, and the empty string when neither prefix is
// present (including when buf is too short to hold one).
//
// ref: https://en.wikipedia.org/wiki/Byte_order_mark
func CheckCharsetByBOM(buf []byte) string {
	// TODO: There are many kind of BOM
	switch {
	// UTF-8	EF BB BF
	case len(buf) >= 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf:
		return "UTF-8"
	// GB-18030	84 31 95 33
	case len(buf) >= 4 && buf[0] == 0x84 && buf[1] == 0x31 && buf[2] == 0x95 && buf[3] == 0x33:
		return "GB-18030"
	}
	return ""
}
martianzhang's avatar
martianzhang 已提交
74

martianzhang's avatar
martianzhang 已提交
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
// RemoveBOM strips a leading byte order mark from buf.
//
// It returns the remaining content converted to a string and the BOM bytes
// that were removed. Only the UTF-8 BOM (EF BB BF) and the UTF-16 LE BOM
// (FF FE) are recognized. When no BOM is found, the whole buffer is returned
// as the string together with an empty, non-nil byte slice.
//
// The returned BOM slice shares its backing array with buf. The content is a
// plain byte-to-string conversion; no transcoding is performed.
func RemoveBOM(buf []byte) (string, []byte) {
	// ef bb bf, UTF-8 BOM
	// Use >= so that a buffer consisting of nothing but the BOM is stripped too.
	if len(buf) >= 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf {
		return string(buf[3:]), buf[:3]
	}
	// ff fe, UTF-16 (LE) BOM
	if len(buf) >= 2 && buf[0] == 0xff && buf[1] == 0xfe {
		return string(buf[2:]), buf[:2]
	}
	return string(buf), []byte{}
}