support `-report-type chardet`

chardet detects the character set of the input SQL

support `-report-type chardet`
chardet detects the character set of the input SQL
47bf9643 · martianzhang · 378dbf76 · 47bf9643 · 47bf9643 · 47bf9643
21 changed files
--- a/cmd/soar/soar.go
+++ b/cmd/soar/soar.go
@@ -133,6 +133,10 @@ func main() {
 		// 注意： 这里只能处理一条 SQL 的 EXPLAIN 信息，用户一次反馈多条 SQL 的 EXPLAIN 信息无法处理
 		advisor.DigestExplainText(sql)
 		return
+	case "chardet":
+		// Get charset of input
+		fmt.Println(common.Chardet([]byte(sql)))
+		return
 	case "remove-comment":
 		fmt.Println(string(database.RemoveSQLComments([]byte(sql))))
 		return

--- a/common/chardet.go
+++ b/common/chardet.go
+/*
+ * Copyright 2018 Xiaomi, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package common
+
+import (
+	"github.com/saintfish/chardet"
+)
+
+// Chardet returns the name of the charset that best matches buf, or "unknown"
+// when the detector reports an error.
+func Chardet(buf []byte) string {
+	best, err := chardet.NewTextDetector().DetectBest(buf)
+	if err == nil {
+		return best.Charset
+	}
+	return "unknown"
+}
--- a/common/chardet_test.go
+++ b/common/chardet_test.go
+/*
+ * Copyright 2018 Xiaomi, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package common
+
+import (
+	"io/ioutil"
+	"testing"
+)
+
+// TestChardet checks that Chardet reports the expected charset name for each
+// fixture file common/testdata/chardet_<charset>.txt under DevPath.
+func TestChardet(t *testing.T) {
+	charsets := []string{
+		"GB-18030",
+		"UTF-8",
+	}
+	for _, c := range charsets {
+		fileName := DevPath + "/common/testdata/chardet_" + c + ".txt"
+		buf, err := ioutil.ReadFile(fileName)
+		if err != nil {
+			t.Errorf("ioutil.ReadFile %s, Error: %s", fileName, err.Error())
+			// Without the fixture there is nothing to detect; skip the
+			// comparison so a read failure is not reported twice.
+			continue
+		}
+		name := Chardet(buf)
+		if name != c {
+			t.Errorf("file: %s, Want: %s, Get: %s", fileName, c, name)
+		}
+	}
+}
--- a/common/config.go
+++ b/common/config.go
@@ -828,6 +828,11 @@ from
 		Description: "去除SQL语句中的注释，支持单行多行注释的去除",
 		Example:     `echo "select/*comment*/ * from film" | soar -report-type remove-comment`,
 	},
+	{
+		Name:        "chardet",
+		Description: "猜测输入的 SQL 使用的字符集",
+		Example:     "echo '中文' | soar -report-type chardet",
+	},
 }

 // ListReportTypes 查看所有支持的report-type

--- a/common/testdata/TestListReportTypes.golden
+++ b/common/testdata/TestListReportTypes.golden
@@ -131,3 +131,11 @@ echo "select * from film" | soar -report-type pretty
 ```bash
 echo "select/*comment*/ * from film" | soar -report-type remove-comment
 ```
+## chardet
+* **Description**:猜测输入的 SQL 使用的字符集
+
+* **Example**:
+
+```bash
+echo '中文' | soar -report-type chardet
+```
--- a/common/testdata/chardet_BIG5.txt
+++ b/common/testdata/chardet_BIG5.txt
+¤¤¤å
--- a/common/testdata/chardet_GB-18030.txt
+++ b/common/testdata/chardet_GB-18030.txt
+
--- a/common/testdata/chardet_UTF-8.txt
+++ b/common/testdata/chardet_UTF-8.txt
+中文
--- a/doc/report_type.md
+++ b/doc/report_type.md
@@ -131,3 +131,11 @@ echo "select * from film" | soar -report-type pretty
 ```bash
 echo "select/*comment*/ * from film" | soar -report-type remove-comment
 ```
+## chardet
+* **Description**:猜测输入的 SQL 使用的字符集
+
+* **Example**:
+
+```bash
+echo '中文' | soar -report-type chardet
+```
--- a/vendor/github.com/saintfish/chardet/2022.go
+++ b/vendor/github.com/saintfish/chardet/2022.go
+package chardet
+
+import (
+	"bytes"
+)
+
+// recognizer2022 is a recognizer that scores input by the escape sequences it
+// contains; the constructors in this file use it for the ISO-2022-JP, -KR and
+// -CN charset names.
+type recognizer2022 struct {
+	// charset is the name reported in the recognizer output.
+	charset string
+	// escapes holds the byte sequences that count as a hit when they directly
+	// follow an ESC (0x1B) byte.
+	escapes [][]byte
+}
+
+// Match reports r.charset with the confidence that matchConfidence computes
+// for input.input. The Language field is left empty.
+func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
+	return recognizerOutput{
+		Charset:    r.charset,
+		Confidence: r.matchConfidence(input.input),
+	}
+}
+
+// matchConfidence returns a score for how strongly input looks like the
+// charset of r, based on the escape sequences found in it.
+//
+// Every ESC byte (0x1B) followed by one of r.escapes counts as a hit, every
+// other ESC byte as a miss, and every 0x0E or 0x0F byte as a shift. With no
+// hits the result is 0. Otherwise it is (100*hits - 100*misses)/(hits+misses),
+// reduced by 10 for each unit by which hits+shifts falls short of 5, and never
+// less than 0.
+func (r *recognizer2022) matchConfidence(input []byte) int {
+	var hits, misses, shifts int
+input:
+	for i := 0; i < len(input); i++ {
+		c := input[i]
+		if c == 0x1B {
+			for _, esc := range r.escapes {
+				if bytes.HasPrefix(input[i+1:], esc) {
+					hits++
+					// Move to the last byte of the matched sequence; the loop
+					// increment then steps past it.
+					i += len(esc)
+					continue input
+				}
+			}
+			// ESC byte that is not followed by any known sequence.
+			misses++
+		} else if c == 0x0E || c == 0x0F {
+			shifts++
+		}
+	}
+	if hits == 0 {
+		return 0
+	}
+	quality := (100*hits - 100*misses) / (hits + misses)
+	// Penalize inputs that offer only a few hits and shifts as evidence.
+	if hits+shifts < 5 {
+		quality -= (5 - (hits + shifts)) * 10
+	}
+	if quality < 0 {
+		quality = 0
+	}
+	return quality
+}
+
+// escapeSequences_2022JP lists the sequences that newRecognizer_2022JP hands to
+// recognizer2022. Each entry omits the leading ESC byte, which matchConfidence
+// checks separately.
+var escapeSequences_2022JP = [][]byte{
+	{0x24, 0x28, 0x43}, // KS X 1001:1992
+	{0x24, 0x28, 0x44}, // JIS X 212-1990
+	{0x24, 0x40},       // JIS C 6226-1978
+	{0x24, 0x41},       // GB 2312-80
+	{0x24, 0x42},       // JIS X 208-1983
+	{0x26, 0x40},       // JIS X 208 1990, 1997
+	{0x28, 0x42},       // ASCII
+	{0x28, 0x48},       // JIS-Roman
+	{0x28, 0x49},       // Half-width katakana
+	{0x28, 0x4a},       // JIS-Roman
+	{0x2e, 0x41},       // ISO 8859-1
+	{0x2e, 0x46},       // ISO 8859-7
+}
+
+// escapeSequences_2022KR lists the single sequence that newRecognizer_2022KR
+// hands to recognizer2022. The leading ESC byte is omitted; matchConfidence
+// checks it separately.
+var escapeSequences_2022KR = [][]byte{
+	{0x24, 0x29, 0x43},
+}
+
+// escapeSequences_2022CN lists the sequences that newRecognizer_2022CN hands to
+// recognizer2022. Each entry omits the leading ESC byte, which matchConfidence
+// checks separately.
+var escapeSequences_2022CN = [][]byte{
+	{0x24, 0x29, 0x41}, // GB 2312-80
+	{0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
+	{0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
+	{0x24, 0x29, 0x45}, // ISO-IR-165
+	{0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
+	{0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
+	{0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
+	{0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
+	{0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
+	{0x4e},             // SS2
+	{0x4f},             // SS3
+}
+
+// newRecognizer_2022JP returns a recognizer2022 that reports "ISO-2022-JP" and
+// matches against escapeSequences_2022JP.
+func newRecognizer_2022JP() *recognizer2022 {
+	return &recognizer2022{
+		"ISO-2022-JP",
+		escapeSequences_2022JP,
+	}
+}
+
+// newRecognizer_2022KR returns a recognizer2022 that reports "ISO-2022-KR" and
+// matches against escapeSequences_2022KR.
+func newRecognizer_2022KR() *recognizer2022 {
+	return &recognizer2022{
+		"ISO-2022-KR",
+		escapeSequences_2022KR,
+	}
+}
+
+// newRecognizer_2022CN returns a recognizer2022 that reports "ISO-2022-CN" and
+// matches against escapeSequences_2022CN.
+func newRecognizer_2022CN() *recognizer2022 {
+	return &recognizer2022{
+		"ISO-2022-CN",
+		escapeSequences_2022CN,
+	}
+}
--- a/vendor/github.com/saintfish/chardet/AUTHORS
+++ b/vendor/github.com/saintfish/chardet/AUTHORS
+Sheng Yu (yusheng dot sjtu at gmail dot com)
--- a/vendor/github.com/saintfish/chardet/LICENSE
+++ b/vendor/github.com/saintfish/chardet/LICENSE
+Copyright (c) 2012 chardet Authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+Partial of the Software is derived from ICU project. See icu-license.html for
+license of the derivative portions.
--- a/vendor/github.com/saintfish/chardet/README.md
+++ b/vendor/github.com/saintfish/chardet/README.md
+# chardet
+
+chardet is a library to automatically detect
+[charset](http://en.wikipedia.org/wiki/Character_encoding) of texts for [Go
+programming language](http://golang.org/). It's based on the algorithm and data
+in [ICU](http://icu-project.org/)'s implementation.
+
+## Documentation and Usage
+
+See [pkgdoc](http://go.pkgdoc.org/github.com/saintfish/chardet)
--- a/vendor/github.com/saintfish/chardet/detector.go
+++ b/vendor/github.com/saintfish/chardet/detector.go
+// Package chardet ports character set detection from ICU.
+package chardet
+
+import (
+	"errors"
+	"sort"
+)
+
+// Result describes one charset candidate produced by a Detector.
+type Result struct {
+	// IANA name of the detected charset.
+	Charset string
+	// IANA name of the detected language. It may be empty for some charsets.
+	Language string
+	// Confidence of the Result. Scale from 1 to 100. The bigger, the more
+	// confident. DetectAll only returns Results whose Confidence is above zero.
+	Confidence int
+}
+
+// Detector implements charset detection.
+type Detector struct {
+	// recognizers is the set of recognizers that DetectAll runs on the input.
+	recognizers []recognizer
+	// stripTag is passed to newRecognizerInput to request removal of
+	// '<'...'>' markup before matching.
+	stripTag bool
+}
+
+// recognizers is the package-level list of charset recognizers. Both
+// NewTextDetector and NewHtmlDetector hand this same slice to the Detector
+// they create.
+var recognizers = []recognizer{
+	newRecognizer_utf8(),
+	newRecognizer_utf16be(),
+	newRecognizer_utf16le(),
+	newRecognizer_utf32be(),
+	newRecognizer_utf32le(),
+	newRecognizer_8859_1_en(),
+	newRecognizer_8859_1_da(),
+	newRecognizer_8859_1_de(),
+	newRecognizer_8859_1_es(),
+	newRecognizer_8859_1_fr(),
+	newRecognizer_8859_1_it(),
+	newRecognizer_8859_1_nl(),
+	newRecognizer_8859_1_no(),
+	newRecognizer_8859_1_pt(),
+	newRecognizer_8859_1_sv(),
+	newRecognizer_8859_2_cs(),
+	newRecognizer_8859_2_hu(),
+	newRecognizer_8859_2_pl(),
+	newRecognizer_8859_2_ro(),
+	newRecognizer_8859_5_ru(),
+	newRecognizer_8859_6_ar(),
+	newRecognizer_8859_7_el(),
+	newRecognizer_8859_8_I_he(),
+	newRecognizer_8859_8_he(),
+	newRecognizer_windows_1251(),
+	newRecognizer_windows_1256(),
+	newRecognizer_KOI8_R(),
+	newRecognizer_8859_9_tr(),
+
+	newRecognizer_sjis(),
+	newRecognizer_gb_18030(),
+	newRecognizer_euc_jp(),
+	newRecognizer_euc_kr(),
+	newRecognizer_big5(),
+
+	newRecognizer_2022JP(),
+	newRecognizer_2022KR(),
+	newRecognizer_2022CN(),
+
+	newRecognizer_IBM424_he_rtl(),
+	newRecognizer_IBM424_he_ltr(),
+	newRecognizer_IBM420_ar_rtl(),
+	newRecognizer_IBM420_ar_ltr(),
+}
+
+// NewTextDetector creates a Detector for plain text. It uses the package-level
+// recognizers list and leaves tag stripping off.
+func NewTextDetector() *Detector {
+	return &Detector{recognizers, false}
+}
+
+// NewHtmlDetector creates a Detector for Html. It uses the package-level
+// recognizers list and turns tag stripping on, so markup between '<' and '>'
+// may be removed before matching (see mayStripInput).
+func NewHtmlDetector() *Detector {
+	return &Detector{recognizers, true}
+}
+
+var (
+	// NotDetectedError is returned by DetectAll (and therefore DetectBest)
+	// when no recognizer reports a Confidence above zero.
+	NotDetectedError = errors.New("Charset not detected.")
+)
+
+// DetectBest returns the Result with highest Confidence, i.e. the first element
+// of what DetectAll returns for b. When DetectAll fails, r is nil and err is
+// the error from DetectAll.
+func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
+	var all []Result
+	if all, err = d.DetectAll(b); err == nil {
+		r = &all[0]
+	}
+	return
+}
+
+// DetectAll returns all Results which have non-zero Confidence. The Results are
+// sorted by Confidence in descending order and contain each charset name at
+// most once. It returns NotDetectedError when no recognizer reports a
+// Confidence above zero.
+//
+// Each recognizer runs in its own goroutine; DetectAll waits until it has
+// received one output per recognizer before returning.
+func (d *Detector) DetectAll(b []byte) ([]Result, error) {
+	input := newRecognizerInput(b, d.stripTag)
+	outputChan := make(chan recognizerOutput)
+	for _, r := range d.recognizers {
+		go matchHelper(r, input, outputChan)
+	}
+	outputs := make([]recognizerOutput, 0, len(d.recognizers))
+	// Receive exactly one output per started goroutine so none stays blocked
+	// on the unbuffered channel.
+	for i := 0; i < len(d.recognizers); i++ {
+		o := <-outputChan
+		if o.Confidence > 0 {
+			outputs = append(outputs, o)
+		}
+	}
+	if len(outputs) == 0 {
+		return nil, NotDetectedError
+	}
+
+	sort.Sort(recognizerOutputs(outputs))
+	// Several recognizers can report the same charset name; keep only the
+	// first one in sorted order for each name.
+	dedupOutputs := make([]Result, 0, len(outputs))
+	foundCharsets := make(map[string]struct{}, len(outputs))
+	for _, o := range outputs {
+		if _, found := foundCharsets[o.Charset]; !found {
+			dedupOutputs = append(dedupOutputs, Result(o))
+			foundCharsets[o.Charset] = struct{}{}
+		}
+	}
+	// outputs is non-empty here, so the loop above appended at least one
+	// element; this check is purely defensive.
+	if len(dedupOutputs) == 0 {
+		return nil, NotDetectedError
+	}
+	return dedupOutputs, nil
+}
+
+// matchHelper runs r.Match on input and sends the output on outputChan.
+// DetectAll starts it in one goroutine per recognizer.
+func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
+	outputChan <- r.Match(input)
+}
+
+// recognizerOutputs implements sort.Interface over recognizer outputs. Less
+// compares with '>', so sorting puts the highest Confidence first.
+type recognizerOutputs []recognizerOutput
+
+func (r recognizerOutputs) Len() int           { return len(r) }
+func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
+func (r recognizerOutputs) Swap(i, j int)      { r[i], r[j] = r[j], r[i] }
--- a/vendor/github.com/saintfish/chardet/icu-license.html
+++ b/vendor/github.com/saintfish/chardet/icu-license.html
+<html>
+
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=us-ascii"></meta>
+<title>ICU License - ICU 1.8.1 and later</title>
+</head>
+
+<body BGCOLOR="#ffffff">
+<h2>ICU License - ICU 1.8.1 and later</h2>
+
+<p>COPYRIGHT AND PERMISSION NOTICE</p>
+
+<p>
+Copyright (c) 1995-2012 International Business Machines Corporation and others
+</p>
+<p>
+All rights reserved.
+</p>
+<p>
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, and/or sell
+copies of the Software, and to permit persons
+to whom the Software is furnished to do so, provided that the above
+copyright notice(s) and this permission notice appear in all copies
+of the Software and that both the above copyright notice(s) and this
+permission notice appear in supporting documentation.
+</p>
+<p>
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL
+THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM,
+OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
+RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE
+USE OR PERFORMANCE OF THIS SOFTWARE.
+</p>
+<p>
+Except as contained in this notice, the name of a copyright holder shall not be
+used in advertising or otherwise to promote the sale, use or other dealings in
+this Software without prior written authorization of the copyright holder.
+</p>
+
+<hr>
+<p><small>
+All trademarks and registered trademarks mentioned herein are the property of their respective owners.
+</small></p>
+</body>
+</html>
--- a/vendor/github.com/saintfish/chardet/multi_byte.go
+++ b/vendor/github.com/saintfish/chardet/multi_byte.go
+package chardet
+
+import (
+	"errors"
+	"math"
+)
+
+// recognizerMultiByte is a recognizer for multi-byte charsets. It decodes the
+// raw input with decoder and derives a confidence from the counts of decoded,
+// bad and common characters (see matchConfidence).
+type recognizerMultiByte struct {
+	// charset and language are copied into the recognizer output.
+	charset     string
+	language    string
+	// decoder splits the input into characters for this charset.
+	decoder     charDecoder
+	// commonChars is looked up with binarySearch, so it has to be sorted in
+	// ascending order. matchConfidence uses a simpler formula when it is nil.
+	commonChars []uint16
+}
+
+// charDecoder decodes one character from the front of a byte slice. It returns
+// the character value c, the bytes that remain after it, and a non-nil err
+// when the bytes could not be decoded.
+type charDecoder interface {
+	DecodeOneChar([]byte) (c uint16, remain []byte, err error)
+}
+
+// Match reports r.charset and r.language with the confidence that
+// matchConfidence computes for input.
+func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {
+	return recognizerOutput{
+		Charset:    r.charset,
+		Language:   r.language,
+		Confidence: r.matchConfidence(input),
+	}
+}
+
+// matchConfidence decodes input.raw with r.decoder and returns a confidence
+// between 0 and 100.
+//
+// Decoded characters are classified as bad (decoder error), single-byte
+// (value <= 0xFF) or double-byte; double-byte values found in r.commonChars
+// are also counted as common. The result is:
+//   - 0 as soon as there are at least 2 bad characters and
+//     5*bad >= double-byte count;
+//   - with at most 10 double-byte characters and no bad ones: 0 if there are
+//     no double-byte characters and fewer than 10 characters in total,
+//     otherwise 10;
+//   - 0 if the double-byte count is below 20*bad;
+//   - without a commonChars table: 30 + double-byte count - 20*bad, capped
+//     at 100;
+//   - otherwise a log-scaled value derived from the common count, clamped to
+//     the range 0-100.
+func (r *recognizerMultiByte) matchConfidence(input *recognizerInput) int {
+	raw := input.raw
+	var c uint16
+	var err error
+	var totalCharCount, badCharCount, singleByteCharCount, doubleByteCharCount, commonCharCount int
+	// NOTE(review): the loop condition is evaluated after each decode, so the
+	// character whose decoding leaves no bytes behind (the last one) is never
+	// counted by the loop body.
+	for c, raw, err = r.decoder.DecodeOneChar(raw); len(raw) > 0; c, raw, err = r.decoder.DecodeOneChar(raw) {
+		totalCharCount++
+		if err != nil {
+			badCharCount++
+		} else if c <= 0xFF {
+			singleByteCharCount++
+		} else {
+			doubleByteCharCount++
+			if r.commonChars != nil && binarySearch(r.commonChars, c) {
+				commonCharCount++
+			}
+		}
+		// Bail out early once bad characters are frequent.
+		if badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount {
+			return 0
+		}
+	}
+
+	// Too few double-byte characters to say much.
+	if doubleByteCharCount <= 10 && badCharCount == 0 {
+		if doubleByteCharCount == 0 && totalCharCount < 10 {
+			return 0
+		} else {
+			return 10
+		}
+	}
+
+	if doubleByteCharCount < 20*badCharCount {
+		return 0
+	}
+	if r.commonChars == nil {
+		confidence := 30 + doubleByteCharCount - 20*badCharCount
+		if confidence > 100 {
+			confidence = 100
+		}
+		return confidence
+	}
+	// Scale the number of common characters logarithmically against the
+	// number of double-byte characters.
+	maxVal := math.Log(float64(doubleByteCharCount) / 4)
+	scaleFactor := 90 / maxVal
+	confidence := int(math.Log(float64(commonCharCount)+1)*scaleFactor + 10)
+	if confidence > 100 {
+		confidence = 100
+	}
+	if confidence < 0 {
+		confidence = 0
+	}
+	return confidence
+}
+
+// binarySearch reports whether c occurs in l. It halves the search interval on
+// every step, so l has to be sorted in ascending order.
+func binarySearch(l []uint16, c uint16) bool {
+	start := 0
+	end := len(l) - 1
+	for start <= end {
+		mid := (start + end) / 2
+		if c == l[mid] {
+			return true
+		} else if c < l[mid] {
+			end = mid - 1
+		} else {
+			start = mid + 1
+		}
+	}
+	return false
+}
+
+// eobError signals that a decoder ran out of input bytes.
+var eobError = errors.New("End of input buffer")
+// badCharError signals that a decoder met bytes that do not form a valid
+// character.
+var badCharError = errors.New("Decode a bad char")
+
+// charDecoder_sjis is the charDecoder used by newRecognizer_sjis.
+type charDecoder_sjis struct {
+}
+
+// DecodeOneChar decodes one character from the front of input.
+//
+// Empty input yields eobError. A first byte in 0x00-0x7F or 0xA1-0xDF is
+// returned as a single-byte character. For any other first byte a second byte
+// is required: if it is missing, the first byte is returned with
+// badCharError; otherwise c is first<<8|second, with badCharError when the
+// second byte lies outside 0x40-0xFE.
+func (charDecoder_sjis) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
+	if len(input) == 0 {
+		return 0, nil, eobError
+	}
+	first := input[0]
+	c = uint16(first)
+	remain = input[1:]
+	if first <= 0x7F || (first > 0xA0 && first <= 0xDF) {
+		return
+	}
+	if len(remain) == 0 {
+		return c, remain, badCharError
+	}
+	second := remain[0]
+	remain = remain[1:]
+	c = c<<8 | uint16(second)
+	if (second >= 0x40 && second <= 0x7F) || (second >= 0x80 && second <= 0xFE) {
+		// Valid second byte; nothing to flag.
+	} else {
+		err = badCharError
+	}
+	return
+}
+
+// commonChars_sjis is the table of two-byte values that newRecognizer_sjis
+// passes as commonChars. It must stay sorted in ascending order because
+// matchConfidence looks values up with binarySearch.
+var commonChars_sjis = []uint16{
+	0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
+	0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
+	0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
+	0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
+	0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
+	0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa,
+}
+
+// newRecognizer_sjis returns a recognizerMultiByte that reports charset
+// "Shift_JIS" and language "ja", using charDecoder_sjis and commonChars_sjis.
+func newRecognizer_sjis() *recognizerMultiByte {
+	return &recognizerMultiByte{
+		"Shift_JIS",
+		"ja",
+		charDecoder_sjis{},
+		commonChars_sjis,
+	}
+}
+
+// charDecoder_euc is the charDecoder shared by newRecognizer_euc_jp and
+// newRecognizer_euc_kr.
+type charDecoder_euc struct {
+}
+
+// DecodeOneChar decodes one character from the front of input.
+//
+// Empty input, or input that ends before a required second or third byte,
+// yields (0, nil, eobError). A first byte <= 0x8D is a single-byte character.
+// Otherwise c starts as first<<8|second:
+//   - first in 0xA1-0xFE or first == 0x8E: badCharError if second < 0xA1;
+//   - first == 0x8F: a third byte is consumed as well, with badCharError if it
+//     is < 0xA1;
+//   - any other first byte: the two-byte value is returned without error.
+func (charDecoder_euc) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
+	if len(input) == 0 {
+		return 0, nil, eobError
+	}
+	first := input[0]
+	remain = input[1:]
+	c = uint16(first)
+	if first <= 0x8D {
+		return uint16(first), remain, nil
+	}
+	if len(remain) == 0 {
+		return 0, nil, eobError
+	}
+	second := remain[0]
+	remain = remain[1:]
+	c = c<<8 | uint16(second)
+	if first >= 0xA1 && first <= 0xFE {
+		if second < 0xA1 {
+			err = badCharError
+		}
+		return
+	}
+	if first == 0x8E {
+		if second < 0xA1 {
+			err = badCharError
+		}
+		return
+	}
+	if first == 0x8F {
+		if len(remain) == 0 {
+			return 0, nil, eobError
+		}
+		third := remain[0]
+		remain = remain[1:]
+		// NOTE(review): the shift by zero is a no-op, so the third byte is
+		// OR-ed into the low byte that already holds the second byte.
+		// Presumably c cannot hold three bytes in a uint16 - confirm against
+		// upstream before changing.
+		c = c<<0 | uint16(third)
+		if third < 0xa1 {
+			err = badCharError
+		}
+	}
+	return
+}
+
+// commonChars_euc_jp is the table of two-byte values that newRecognizer_euc_jp
+// passes as commonChars. It must stay sorted in ascending order because
+// matchConfidence looks values up with binarySearch.
+var commonChars_euc_jp = []uint16{
+	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
+	0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
+	0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
+	0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
+	0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
+	0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
+	0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
+	0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
+	0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
+	0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1,
+}
+
+// commonChars_euc_kr is the table of two-byte values that newRecognizer_euc_kr
+// passes as commonChars. It must stay sorted in ascending order because
+// matchConfidence looks values up with binarySearch.
+var commonChars_euc_kr = []uint16{
+	0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
+	0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
+	0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
+	0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
+	0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
+	0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
+	0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
+	0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
+	0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
+	0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad,
+}
+
+// newRecognizer_euc_jp returns a recognizerMultiByte that reports charset
+// "EUC-JP" and language "ja", using charDecoder_euc and commonChars_euc_jp.
+func newRecognizer_euc_jp() *recognizerMultiByte {
+	return &recognizerMultiByte{
+		"EUC-JP",
+		"ja",
+		charDecoder_euc{},
+		commonChars_euc_jp,
+	}
+}
+
+// newRecognizer_euc_kr returns a recognizerMultiByte that reports charset
+// "EUC-KR" and language "ko", using charDecoder_euc and commonChars_euc_kr.
+func newRecognizer_euc_kr() *recognizerMultiByte {
+	return &recognizerMultiByte{
+		"EUC-KR",
+		"ko",
+		charDecoder_euc{},
+		commonChars_euc_kr,
+	}
+}
+
+// charDecoder_big5 is the charDecoder used by newRecognizer_big5.
+type charDecoder_big5 struct {
+}
+
+// DecodeOneChar decodes one character from the front of input.
+//
+// Empty input yields eobError. A first byte <= 0x7F or equal to 0xFF is
+// returned as a single-byte character. For any other first byte a second byte
+// is required: if it is missing, the first byte is returned with a nil
+// remainder and eobError; otherwise c is first<<8|second, with badCharError
+// when the second byte is < 0x40, 0x7F or 0xFF.
+func (charDecoder_big5) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
+	if len(input) == 0 {
+		return 0, nil, eobError
+	}
+	first := input[0]
+	remain = input[1:]
+	c = uint16(first)
+	if first <= 0x7F || first == 0xFF {
+		return
+	}
+	if len(remain) == 0 {
+		return c, nil, eobError
+	}
+	second := remain[0]
+	remain = remain[1:]
+	c = c<<8 | uint16(second)
+	if second < 0x40 || second == 0x7F || second == 0xFF {
+		err = badCharError
+	}
+	return
+}
+
+// commonChars_big5 is the table of two-byte values that newRecognizer_big5
+// passes as commonChars. It must stay sorted in ascending order because
+// matchConfidence looks values up with binarySearch.
+var commonChars_big5 = []uint16{
+	0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
+	0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
+	0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
+	0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
+	0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
+	0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
+	0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
+	0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
+	0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
+	0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f,
+}
+
+// newRecognizer_big5 returns a recognizerMultiByte that reports charset "Big5"
+// and language "zh", using charDecoder_big5 and commonChars_big5.
+func newRecognizer_big5() *recognizerMultiByte {
+	return &recognizerMultiByte{
+		"Big5",
+		"zh",
+		charDecoder_big5{},
+		commonChars_big5,
+	}
+}
+
+// charDecoder_gb_18030 is the charDecoder used by newRecognizer_gb_18030.
+type charDecoder_gb_18030 struct {
+}
+
+// DecodeOneChar decodes one character from the front of input.
+//
+// Empty input, or input that ends before a required further byte, yields
+// (0, nil, eobError). A first byte <= 0x80 is a single-byte character.
+// For a first byte in 0x81-0xFE:
+//   - a second byte in 0x40-0x7E or 0x80-0xFE gives the two-byte value
+//     first<<8|second;
+//   - a second byte in 0x30-0x39 starts a four-byte form, accepted when the
+//     third byte is in 0x81-0xFE and the fourth in 0x30-0x39;
+//   - anything else yields badCharError.
+//
+// A first byte of 0xFF falls through and is returned as first<<8|second
+// without error.
+func (charDecoder_gb_18030) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
+	if len(input) == 0 {
+		return 0, nil, eobError
+	}
+	first := input[0]
+	remain = input[1:]
+	c = uint16(first)
+	if first <= 0x80 {
+		return
+	}
+	if len(remain) == 0 {
+		return 0, nil, eobError
+	}
+	second := remain[0]
+	remain = remain[1:]
+	c = c<<8 | uint16(second)
+	if first >= 0x81 && first <= 0xFE {
+		if (second >= 0x40 && second <= 0x7E) || (second >= 0x80 && second <= 0xFE) {
+			return
+		}
+
+		if second >= 0x30 && second <= 0x39 {
+			if len(remain) == 0 {
+				return 0, nil, eobError
+			}
+			third := remain[0]
+			remain = remain[1:]
+			if third >= 0x81 && third <= 0xFE {
+				if len(remain) == 0 {
+					return 0, nil, eobError
+				}
+				fourth := remain[0]
+				remain = remain[1:]
+				if fourth >= 0x30 && fourth <= 0x39 {
+					// NOTE(review): c is a uint16, so c<<16 is always 0 and
+					// the result keeps only the third and fourth bytes.
+					c = c<<16 | uint16(third)<<8 | uint16(fourth)
+					return
+				}
+			}
+		}
+		err = badCharError
+	}
+	return
+}
+
+// commonChars_gb_18030 is the table of two-byte values that
+// newRecognizer_gb_18030 passes as commonChars. It must stay sorted in
+// ascending order because matchConfidence looks values up with binarySearch.
+var commonChars_gb_18030 = []uint16{
+	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
+	0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
+	0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
+	0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
+	0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
+	0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
+	0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
+	0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
+	0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
+	0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0,
+}
+
+// newRecognizer_gb_18030 returns a recognizerMultiByte that reports charset
+// "GB-18030" and language "zh", using charDecoder_gb_18030 and
+// commonChars_gb_18030.
+func newRecognizer_gb_18030() *recognizerMultiByte {
+	return &recognizerMultiByte{
+		"GB-18030",
+		"zh",
+		charDecoder_gb_18030{},
+		commonChars_gb_18030,
+	}
+}
--- a/vendor/github.com/saintfish/chardet/recognizer.go
+++ b/vendor/github.com/saintfish/chardet/recognizer.go
+package chardet
+
+// recognizer is implemented by every charset recognizer. Match inspects the
+// prepared input and returns the charset it stands for together with a
+// confidence; DetectAll discards outputs whose Confidence is not above zero.
+type recognizer interface {
+	Match(*recognizerInput) recognizerOutput
+}
+
+// recognizerOutput is what a recognizer returns. It has the same underlying
+// type as Result, so DetectAll converts it with Result(o).
+type recognizerOutput Result
+
+// recognizerInput is the prepared input shared by all recognizers of one
+// detection run.
+type recognizerInput struct {
+	// raw is the byte slice handed to the detector, unmodified.
+	raw         []byte
+	// input is the result of mayStripInput: raw truncated to at most 8192
+	// bytes, or raw with markup removed.
+	input       []byte
+	// tagStripped reports whether input is the markup-stripped form.
+	tagStripped bool
+	// byteStats holds, for each of the 256 byte values, how often it occurs
+	// in input.
+	byteStats   []int
+	// hasC1Bytes reports whether input contains a byte in 0x80-0x9F.
+	hasC1Bytes  bool
+}
+
+// newRecognizerInput prepares raw for the recognizers: it derives the
+// (optionally tag-stripped) input with mayStripInput and precomputes the byte
+// statistics over that input.
+func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput {
+	input, stripped := mayStripInput(raw, stripTag)
+	byteStats := computeByteStats(input)
+	return &recognizerInput{
+		raw:         raw,
+		input:       input,
+		tagStripped: stripped,
+		byteStats:   byteStats,
+		hasC1Bytes:  computeHasC1Bytes(byteStats),
+	}
+}
+
+// mayStripInput returns the bytes the recognizers should look at and whether
+// markup was stripped from them.
+//
+// With stripTag set, it copies the bytes of raw that lie outside '<'...'>'
+// into out, stopping at 8192 bytes. The stripped form is discarded in favour
+// of a copy of the first 8192 bytes of raw (fewer if raw is shorter), with
+// stripped == false, when fewer than 5 tags were opened, when openTags/5 is
+// below the number of '<' bytes seen inside an open tag, or when less than
+// 100 bytes survived from a raw input longer than 600 bytes. Without
+// stripTag no tags are counted, so that fallback always applies.
+func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
+	const inputBufferSize = 8192
+	out = make([]byte, 0, inputBufferSize)
+	var badTags, openTags int32
+	var inMarkup bool = false
+	stripped = false
+	if stripTag {
+		stripped = true
+		for _, c := range raw {
+			if c == '<' {
+				// A '<' inside an already open tag counts as a bad tag.
+				if inMarkup {
+					badTags += 1
+				}
+				inMarkup = true
+				openTags += 1
+			}
+			if !inMarkup {
+				out = append(out, c)
+				if len(out) >= inputBufferSize {
+					break
+				}
+			}
+			if c == '>' {
+				inMarkup = false
+			}
+		}
+	}
+	// Fall back to the unstripped prefix of raw when stripping was not
+	// requested or its result does not look usable.
+	if openTags < 5 || openTags/5 < badTags || (len(out) < 100 && len(raw) > 600) {
+		limit := len(raw)
+		if limit > inputBufferSize {
+			limit = inputBufferSize
+		}
+		out = make([]byte, limit)
+		copy(out, raw[:limit])
+		stripped = false
+	}
+	return
+}
+
+// computeByteStats returns a 256-element slice whose entry i is the number of
+// times byte value i occurs in input.
+func computeByteStats(input []byte) []int {
+	r := make([]int, 256)
+	for _, c := range input {
+		r[c] += 1
+	}
+	return r
+}
+
+// computeHasC1Bytes reports whether byteStats records at least one occurrence
+// of a byte value in 0x80-0x9F. byteStats must have at least 0xA0 entries, as
+// the slices built by computeByteStats do.
+func computeHasC1Bytes(byteStats []int) bool {
+	for _, count := range byteStats[0x80 : 0x9F+1] {
+		if count > 0 {
+			return true
+		}
+	}
+	return false
+}
--- a/vendor/github.com/saintfish/chardet/single_byte.go
+++ b/vendor/github.com/saintfish/chardet/single_byte.go
--- a/vendor/github.com/saintfish/chardet/unicode.go
+++ b/vendor/github.com/saintfish/chardet/unicode.go
+package chardet
+
+import (
+	"bytes"
+)
+
+// Byte order marks that the UTF-16 and UTF-32 recognizers look for at the
+// start of the raw input.
+var (
+	utf16beBom = []byte{0xFE, 0xFF}
+	utf16leBom = []byte{0xFF, 0xFE}
+	utf32beBom = []byte{0x00, 0x00, 0xFE, 0xFF}
+	utf32leBom = []byte{0xFF, 0xFE, 0x00, 0x00}
+)
+
+// recognizerUtf16be recognizes UTF-16BE input by its byte order mark only.
+type recognizerUtf16be struct {
+}
+
+// newRecognizer_utf16be returns a recognizerUtf16be.
+func newRecognizer_utf16be() *recognizerUtf16be {
+	return &recognizerUtf16be{}
+}
+
+// Match reports "UTF-16BE" with Confidence 100 when input.raw starts with
+// utf16beBom, and with Confidence 0 otherwise.
+func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
+	output = recognizerOutput{
+		Charset: "UTF-16BE",
+	}
+	if bytes.HasPrefix(input.raw, utf16beBom) {
+		output.Confidence = 100
+	}
+	return
+}
+
+// recognizerUtf16le recognizes UTF-16LE input by its byte order mark only.
+type recognizerUtf16le struct {
+}
+
+// newRecognizer_utf16le returns a recognizerUtf16le.
+func newRecognizer_utf16le() *recognizerUtf16le {
+	return &recognizerUtf16le{}
+}
+
+// Match reports "UTF-16LE" with Confidence 100 when input.raw starts with
+// utf16leBom but not with utf32leBom (which begins with the same two bytes),
+// and with Confidence 0 otherwise.
+func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
+	output = recognizerOutput{
+		Charset: "UTF-16LE",
+	}
+	if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
+		output.Confidence = 100
+	}
+	return
+}
+
+// recognizerUtf32 recognizes one byte order of UTF-32.
+type recognizerUtf32 struct {
+	// name is the charset name reported in the output.
+	name       string
+	// bom is the byte order mark expected at the start of the input.
+	bom        []byte
+	// decodeChar turns the first four bytes of its argument into a value in
+	// this recognizer's byte order.
+	decodeChar func(input []byte) uint32
+}
+
+// decodeUtf32be combines the first four bytes of input into a uint32, most
+// significant byte first. input must hold at least four bytes.
+func decodeUtf32be(input []byte) uint32 {
+	return uint32(input[0])<<24 | uint32(input[1])<<16 | uint32(input[2])<<8 | uint32(input[3])
+}
+
+// decodeUtf32le combines the first four bytes of input into a uint32, least
+// significant byte first. input must hold at least four bytes.
+func decodeUtf32le(input []byte) uint32 {
+	return uint32(input[3])<<24 | uint32(input[2])<<16 | uint32(input[1])<<8 | uint32(input[0])
+}
+
+// newRecognizer_utf32be returns a recognizerUtf32 that reports "UTF-32BE" and
+// uses utf32beBom and decodeUtf32be.
+func newRecognizer_utf32be() *recognizerUtf32 {
+	return &recognizerUtf32{
+		"UTF-32BE",
+		utf32beBom,
+		decodeUtf32be,
+	}
+}
+
+// newRecognizer_utf32le returns a recognizerUtf32 that reports "UTF-32LE" and
+// uses utf32leBom and decodeUtf32le.
+func newRecognizer_utf32le() *recognizerUtf32 {
+	return &recognizerUtf32{
+		"UTF-32LE",
+		utf32leBom,
+		decodeUtf32le,
+	}
+}
+
+// Match scores input.raw as r.name. It decodes the input four bytes at a time
+// (a trailing group shorter than four bytes is ignored) and counts a value as
+// invalid when it is >= 0x10FFFF or lies in 0xD800-0xDFFF.
+//
+// Confidence is 100 with a BOM and no invalid values, 80 with a BOM and
+// valid > 10*invalid, 100 with more than three valid values and no invalid
+// ones, 80 with at least one valid value and no invalid ones, 25 when only
+// valid > 10*invalid holds, and 0 otherwise.
+func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
+	output = recognizerOutput{
+		Charset: r.name,
+	}
+	hasBom := bytes.HasPrefix(input.raw, r.bom)
+	var numValid, numInvalid uint32
+	for b := input.raw; len(b) >= 4; b = b[4:] {
+		if c := r.decodeChar(b); c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
+			numInvalid++
+		} else {
+			numValid++
+		}
+	}
+	if hasBom && numInvalid == 0 {
+		output.Confidence = 100
+	} else if hasBom && numValid > numInvalid*10 {
+		output.Confidence = 80
+	} else if numValid > 3 && numInvalid == 0 {
+		output.Confidence = 100
+	} else if numValid > 0 && numInvalid == 0 {
+		output.Confidence = 80
+	} else if numValid > numInvalid*10 {
+		output.Confidence = 25
+	}
+	return
+}
--- a/vendor/github.com/saintfish/chardet/utf8.go
+++ b/vendor/github.com/saintfish/chardet/utf8.go
+package chardet
+
+import (
+	"bytes"
+)
+
+// utf8Bom is the byte order mark that recognizerUtf8.Match looks for at the
+// start of the raw input.
+var utf8Bom = []byte{0xEF, 0xBB, 0xBF}
+
+// recognizerUtf8 recognizes UTF-8 input.
+type recognizerUtf8 struct {
+}
+
+// newRecognizer_utf8 returns a recognizerUtf8.
+func newRecognizer_utf8() *recognizerUtf8 {
+	return &recognizerUtf8{}
+}
+
+// Match scores input.raw as "UTF-8" by counting valid and invalid multi-byte
+// sequences. Bytes below 0x80 are skipped. A lead byte announces one, two or
+// three trail bytes; a sequence is valid when all announced trail bytes have
+// the form 10xxxxxx. Scanning stops once more than five bytes that are not a
+// recognized lead byte have been seen.
+//
+// Confidence is 100 with a BOM and no invalid sequences, 80 with a BOM and
+// valid > 10*invalid, 100 with more than three valid sequences and no invalid
+// ones, 80 with at least one valid sequence and no invalid ones, 10 for input
+// without any multi-byte sequence (plain ASCII), 25 when only
+// valid > 10*invalid holds, and 0 otherwise.
+func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
+	output = recognizerOutput{
+		Charset: "UTF-8",
+	}
+	hasBom := bytes.HasPrefix(input.raw, utf8Bom)
+	inputLen := len(input.raw)
+	var numValid, numInvalid uint32
+	var trailBytes uint8
+	for i := 0; i < inputLen; i++ {
+		c := input.raw[i]
+		if c&0x80 == 0 {
+			continue
+		}
+		// Derive the number of expected trail bytes from the lead byte.
+		if c&0xE0 == 0xC0 {
+			trailBytes = 1
+		} else if c&0xF0 == 0xE0 {
+			trailBytes = 2
+		} else if c&0xF8 == 0xF0 {
+			trailBytes = 3
+		} else {
+			numInvalid++
+			if numInvalid > 5 {
+				// Leaves the outer scanning loop.
+				break
+			}
+			trailBytes = 0
+		}
+
+		// NOTE(review): after an unrecognized lead byte trailBytes is 0 and
+		// the uint8 decrement below wraps around, so this loop keeps consuming
+		// bytes until it meets one that is not a trail byte (counted as a
+		// further invalid) or the input ends.
+		for i++; i < inputLen; i++ {
+			c = input.raw[i]
+			if c&0xC0 != 0x80 {
+				numInvalid++
+				break
+			}
+			if trailBytes--; trailBytes == 0 {
+				numValid++
+				break
+			}
+		}
+	}
+
+	if hasBom && numInvalid == 0 {
+		output.Confidence = 100
+	} else if hasBom && numValid > numInvalid*10 {
+		output.Confidence = 80
+	} else if numValid > 3 && numInvalid == 0 {
+		output.Confidence = 100
+	} else if numValid > 0 && numInvalid == 0 {
+		output.Confidence = 80
+	} else if numValid == 0 && numInvalid == 0 {
+		// Plain ASCII
+		output.Confidence = 10
+	} else if numValid > numInvalid*10 {
+		output.Confidence = 25
+	}
+	return
+}
--- a/vendor/vendor.json
+++ b/vendor/vendor.json
@@ -417,12 +417,6 @@
 			"revision": "6d0bcf50ac9eb25d2e6f8fbd686d488a701eba55",
 			"revisionTime": "2018-08-20T15:09:00Z"
 		},
-		{
-			"checksumSHA1": "11D2ZLtDH4gRO/S6BhYcDgsTiCs=",
-			"path": "github.com/pingcap/tidb/util/charset",
-			"revision": "6d0bcf50ac9eb25d2e6f8fbd686d488a701eba55",
-			"revisionTime": "2018-08-20T15:09:00Z"
-		},
 		{
 			"checksumSHA1": "5yYzS0BTxlmELR9P/OB4RV/yw0o=",
 			"path": "github.com/pingcap/tidb/util/chunk",
@@ -546,6 +540,12 @@
 			"revision": "11635eb403ff09dbc3a6b5a007ab5ab09151c229",
 			"revisionTime": "2018-04-28T10:25:19Z"
 		},
+		{
+			"checksumSHA1": "EEXppdradk5G/UaQUQ95BQ9n7uw=",
+			"path": "github.com/saintfish/chardet",
+			"revision": "3af4cd4741ca4f3eb0c407c034571a6fb0ea529c",
+			"revisionTime": "2012-08-16T06:12:21Z"
+		},
 		{
 			"checksumSHA1": "p7FQsKPOhh5D4XkBZ7VKDU/rkLI=",
 			"origin": "github.com/pingcap/tidb/vendor/github.com/sirupsen/logrus",