From 9d7ec0f37d5cbe7ebab3fb0c7e633611e0401e17 Mon Sep 17 00:00:00 2001 From: Leon Zhang Date: Tue, 20 Nov 2018 17:25:14 +0800 Subject: [PATCH] fix #110 remove bom before auditing --- cmd/soar/soar.go | 12 +++++- common/chardet.go | 64 ++++++++++++++++++++++++++-- common/chardet_test.go | 24 +++++++++++ common/testdata/TestRemoveBOM.golden | 2 + common/testdata/UTF-8.bom.sql | 1 + database/explain.go | 2 +- 6 files changed, 99 insertions(+), 6 deletions(-) create mode 100644 common/testdata/TestRemoveBOM.golden create mode 100644 common/testdata/UTF-8.bom.sql diff --git a/cmd/soar/soar.go b/cmd/soar/soar.go index 060ad9c..d9409d3 100644 --- a/cmd/soar/soar.go +++ b/cmd/soar/soar.go @@ -119,6 +119,10 @@ func main() { } } + // remove bom from file header + var bom []byte + sql, bom = common.RemoveBOM([]byte(sql)) + switch common.Config.ReportType { case "html": // HTML 格式输入 CSS 加载 @@ -129,13 +133,17 @@ func main() { fmt.Println(common.Markdown2HTML(sql)) return case "explain-digest": - // 当用户输入为 EXPLAIN信 息,只对 Explain 信息进行分析 + // 当用户输入为 EXPLAIN 信息,只对 Explain 信息进行分析 // 注意: 这里只能处理一条 SQL 的 EXPLAIN 信息,用户一次反馈多条 SQL 的 EXPLAIN 信息无法处理 advisor.DigestExplainText(sql) return case "chardet": // Get charset of input - fmt.Println(common.Chardet([]byte(sql))) + charset := common.CheckCharsetByBOM(bom) + if charset == "" { + charset = common.Chardet([]byte(sql)) + } + fmt.Println(charset) return case "remove-comment": fmt.Println(string(database.RemoveSQLComments([]byte(sql)))) diff --git a/common/chardet.go b/common/chardet.go index bcf4ce3..4a6c919 100644 --- a/common/chardet.go +++ b/common/chardet.go @@ -22,11 +22,69 @@ import ( // Chardet get best match charset func Chardet(buf []byte) string { + // check character set by file BOM + charset := CheckCharsetByBOM(buf) + if charset != "" { + return charset + } + + // use chardet pkg check file charset + charset = "unknown" + var confidence int detector := chardet.NewTextDetector() - result, err := detector.DetectBest(buf) + + // detector.DetectBest is unstable + // when the confidence value are equally, the best detect charset will be random + result, err := detector.DetectAll(buf) if err != nil { - return "unknown" + return charset + } + + // SOAR's main user speak Chinese, GB-18030, UTF-8 are higher suggested + for _, r := range result { + if confidence >= r.Confidence && r.Confidence != 0 { + return charset + } + confidence = r.Confidence + if r.Charset == "GB-18030" || r.Charset == "UTF-8" { + return r.Charset + } + charset = r.Charset + } + return charset +} + +// CheckCharsetByBOM ref: https://en.wikipedia.org/wiki/Byte_order_mark +func CheckCharsetByBOM(buf []byte) string { + // TODO: There are many kind of BOM + // UTF-8 EF BB BF + if len(buf) >= 3 { + if buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf { + return "UTF-8" + } + } + // GB-18030 84 31 95 33 + if len(buf) >= 4 { + if buf[0] == 0x84 && buf[1] == 0x31 && buf[2] == 0x95 && buf[3] == 0x33 { + return "GB-18030" + } } - return result.Charset + return "" +} +// RemoveBOM remove bom from file +func RemoveBOM(buf []byte) (string, []byte) { + // ef bb bf, UTF-8 BOM + if len(buf) > 3 { + if buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf { + return string(buf[3:]), buf[:3] + } + } + // ff fe, UTF-16 (LE) BOM + if len(buf) > 2 { + if buf[0] == 0xff && buf[1] == 0xfe { + return string(buf[2:]), buf[:2] + } + } + return string(buf), []byte{} } diff --git a/common/chardet_test.go b/common/chardet_test.go index 67a2a33..0d6e411 100644 --- a/common/chardet_test.go +++ b/common/chardet_test.go @@ -17,6 +17,7 @@ package common import ( + "fmt" "io/ioutil" "testing" ) @@ -38,3 +39,26 @@ func TestChardet(t *testing.T) { } } } + +func TestRemoveBOM(t *testing.T) { + fileName := DevPath + "/common/testdata/UTF-8.bom.sql" + buf, err := ioutil.ReadFile(fileName) + if err != nil { + t.Errorf("ioutil.ReadFile %s, Error: %s", fileName, err.Error()) + } + GoldenDiff(func() { + fmt.Println(RemoveBOM(buf)) + }, t.Name(), update) +} + +func TestCheckCharsetByBOM(t *testing.T) { + fileName := DevPath + "/common/testdata/UTF-8.bom.sql" + buf, err := ioutil.ReadFile(fileName) + if err != nil { + t.Errorf("ioutil.ReadFile %s, Error: %s", fileName, err.Error()) + } + + if CheckCharsetByBOM(buf) != "UTF-8" { + t.Errorf("checkCharsetByBOM Want: UTF-8, Get: %s", CheckCharsetByBOM(buf)) + } +} diff --git a/common/testdata/TestRemoveBOM.golden b/common/testdata/TestRemoveBOM.golden new file mode 100644 index 0000000..238f8c1 --- /dev/null +++ b/common/testdata/TestRemoveBOM.golden @@ -0,0 +1,2 @@ +select col from tb c = 1; + [239 187 191] diff --git a/common/testdata/UTF-8.bom.sql b/common/testdata/UTF-8.bom.sql new file mode 100644 index 0000000..a6c2ca6 --- /dev/null +++ b/common/testdata/UTF-8.bom.sql @@ -0,0 +1 @@ +select col from tb c = 1; diff --git a/database/explain.go b/database/explain.go index 658e50e..250ecec 100644 --- a/database/explain.go +++ b/database/explain.go @@ -532,7 +532,7 @@ func (db *Connector) explainAbleSQL(sql string) (string, error) { // 执行explain请求,返回mysql.Result执行结果 func (db *Connector) executeExplain(sql string, explainType int, formatType int) (*QueryResult, error) { var err error - sql, _ = db.explainAbleSQL(sql) + sql, err = db.explainAbleSQL(sql) if sql == "" { return nil, err } -- GitLab