diff --git a/cmd/soar/soar.go b/cmd/soar/soar.go index 060ad9c31a03396195718d2e903975af9d2f9a88..d9409d333ff0abdd550362b43e62142702db8058 100644 --- a/cmd/soar/soar.go +++ b/cmd/soar/soar.go @@ -119,6 +119,10 @@ func main() { } } + // remove bom from file header + var bom []byte + sql, bom = common.RemoveBOM([]byte(sql)) + switch common.Config.ReportType { case "html": // HTML 格式输入 CSS 加载 @@ -129,13 +133,17 @@ func main() { fmt.Println(common.Markdown2HTML(sql)) return case "explain-digest": - // 当用户输入为 EXPLAIN信 息,只对 Explain 信息进行分析 + // 当用户输入为 EXPLAIN 信息,只对 Explain 信息进行分析 // 注意: 这里只能处理一条 SQL 的 EXPLAIN 信息,用户一次反馈多条 SQL 的 EXPLAIN 信息无法处理 advisor.DigestExplainText(sql) return case "chardet": // Get charset of input - fmt.Println(common.Chardet([]byte(sql))) + charset := common.CheckCharsetByBOM(bom) + if charset == "" { + charset = common.Chardet([]byte(sql)) + } + fmt.Println(charset) return case "remove-comment": fmt.Println(string(database.RemoveSQLComments([]byte(sql)))) diff --git a/common/chardet.go b/common/chardet.go index bcf4ce336bb3707f278030f95e86f16b93bd29db..4a6c9195fa17b27cdad3cf9d888d85b695c8c376 100644 --- a/common/chardet.go +++ b/common/chardet.go @@ -22,11 +22,69 @@ import ( // Chardet get best match charset func Chardet(buf []byte) string { + // check character set by file BOM + charset := CheckCharsetByBOM(buf) + if charset != "" { + return charset + } + + // use chardet pkg check file charset + charset = "unknown" + var confidence int detector := chardet.NewTextDetector() - result, err := detector.DetectBest(buf) + + // detector.DetectBest is unstable + // when the confidence value are equally, the best detect charset will be random + result, err := detector.DetectAll(buf) if err != nil { - return "unknown" + return charset + } + + // SOAR's main user speak Chinese, GB-18030, UTF-8 are higher suggested + for _, r := range result { + if confidence >= r.Confidence && r.Confidence != 0 { + return charset + } + confidence = r.Confidence + if r.Charset == "GB-18030" || r.Charset == "UTF-8" { + return r.Charset + } + charset = r.Charset + } + return charset +} + +// CheckCharsetByBOM ref: https://en.wikipedia.org/wiki/Byte_order_mark +func CheckCharsetByBOM(buf []byte) string { + // TODO: There are many kind of BOM + // UTF-8 EF BB BF + if len(buf) >= 3 { + if buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf { + return "UTF-8" + } + } + // GB-18030 84 31 95 33 + if len(buf) >= 4 { + if buf[0] == 0x84 && buf[1] == 0x31 && buf[2] == 0x95 && buf[3] == 0x33 { + return "GB-18030" + } } - return result.Charset + return "" +} +// RemoveBOM remove bom from file +func RemoveBOM(buf []byte) (string, []byte) { + // ef bb bf, UTF-8 BOM + if len(buf) > 3 { + if buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf { + return string(buf[3:]), buf[:3] + } + } + // ff fe, UTF-16 (LE) BOM + if len(buf) > 2 { + if buf[0] == 0xff && buf[1] == 0xfe { + return string(buf[2:]), buf[:2] + } + } + return string(buf), []byte{} } diff --git a/common/chardet_test.go b/common/chardet_test.go index 67a2a33f8aeb3d603a88e8df6ae24b959fe892ad..0d6e411f32a75d19d74879e7f375a0420a8c0c9f 100644 --- a/common/chardet_test.go +++ b/common/chardet_test.go @@ -17,6 +17,7 @@ package common import ( + "fmt" "io/ioutil" "testing" ) @@ -38,3 +39,26 @@ func TestChardet(t *testing.T) { } } } + +func TestRemoveBOM(t *testing.T) { + fileName := DevPath + "/common/testdata/UTF-8.bom.sql" + buf, err := ioutil.ReadFile(fileName) + if err != nil { + t.Errorf("ioutil.ReadFile %s, Error: %s", fileName, err.Error()) + } + GoldenDiff(func() { + fmt.Println(RemoveBOM(buf)) + }, t.Name(), update) +} + +func TestCheckCharsetByBOM(t *testing.T) { + fileName := DevPath + "/common/testdata/UTF-8.bom.sql" + buf, err := ioutil.ReadFile(fileName) + if err != nil { + t.Errorf("ioutil.ReadFile %s, Error: %s", fileName, err.Error()) + } + + if CheckCharsetByBOM(buf) != "UTF-8" { + t.Errorf("checkCharsetByBOM Want: UTF-8, Get: %s", CheckCharsetByBOM(buf)) + } +} diff --git a/common/testdata/TestRemoveBOM.golden b/common/testdata/TestRemoveBOM.golden new file mode 100644 index 0000000000000000000000000000000000000000..238f8c1699af79aad7496f7c4b43a82a4569dad9 --- /dev/null +++ b/common/testdata/TestRemoveBOM.golden @@ -0,0 +1,2 @@ +select col from tb c = 1; + [239 187 191] diff --git a/common/testdata/UTF-8.bom.sql b/common/testdata/UTF-8.bom.sql new file mode 100644 index 0000000000000000000000000000000000000000..a6c2ca6d7b718b32f268ccccf782b5f96a8ce99f --- /dev/null +++ b/common/testdata/UTF-8.bom.sql @@ -0,0 +1 @@ +select col from tb c = 1; diff --git a/database/explain.go b/database/explain.go index 658e50e56ad41aa69d65aaa3415b96b6f0ee135e..250ecec18a1c5e00d95842553698edaa5c0d2965 100644 --- a/database/explain.go +++ b/database/explain.go @@ -532,7 +532,7 @@ func (db *Connector) explainAbleSQL(sql string) (string, error) { // 执行explain请求,返回mysql.Result执行结果 func (db *Connector) executeExplain(sql string, explainType int, formatType int) (*QueryResult, error) { var err error - sql, _ = db.explainAbleSQL(sql) + sql, err = db.explainAbleSQL(sql) if sql == "" { return nil, err }