提交 47bf9643 编写于 作者: martianzhang's avatar martianzhang

support `-report-type chardet`

  chardet will detect input sql charset
上级 378dbf76
......@@ -133,6 +133,10 @@ func main() {
// 注意: 这里只能处理一条 SQL 的 EXPLAIN 信息,用户一次反馈多条 SQL 的 EXPLAIN 信息无法处理
advisor.DigestExplainText(sql)
return
case "chardet":
// Get charset of input
fmt.Println(common.Chardet([]byte(sql)))
return
case "remove-comment":
fmt.Println(string(database.RemoveSQLComments([]byte(sql))))
return
......
/*
* Copyright 2018 Xiaomi, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package common
import (
"github.com/saintfish/chardet"
)
// Chardet guesses the character set of buf and returns the charset name of
// the best match. It returns "unknown" when the detector reports an error.
func Chardet(buf []byte) string {
	best, err := chardet.NewTextDetector().DetectBest(buf)
	if err == nil {
		return best.Charset
	}
	return "unknown"
}
/*
* Copyright 2018 Xiaomi, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package common
import (
"io/ioutil"
"testing"
)
// TestChardet checks that Chardet reports the expected charset name for each
// sample file chardet_<charset>.txt under common/testdata.
func TestChardet(t *testing.T) {
	charsets := []string{
		"GB-18030",
		"UTF-8",
	}
	for _, c := range charsets {
		fileName := DevPath + "/common/testdata/chardet_" + c + ".txt"
		buf, err := ioutil.ReadFile(fileName)
		if err != nil {
			t.Errorf("ioutil.ReadFile %s, Error: %s", fileName, err.Error())
			// There is nothing to detect without the sample; skip it so the
			// read failure is not followed by a misleading mismatch report.
			continue
		}
		name := Chardet(buf)
		if name != c {
			t.Errorf("file: %s, Want: %s, Get: %s", fileName, c, name)
		}
	}
}
......@@ -828,6 +828,11 @@ from
Description: "去除SQL语句中的注释,支持单行多行注释的去除",
Example: `echo "select/*comment*/ * from film" | soar -report-type remove-comment`,
},
{
Name: "chardet",
Description: "猜测输入的 SQL 使用的字符集",
Example: "echo '中文' | soar -report-type chardet",
},
}
// ListReportTypes 查看所有支持的report-type
......
......@@ -131,3 +131,11 @@ echo "select * from film" | soar -report-type pretty
```bash
echo "select/*comment*/ * from film" | soar -report-type remove-comment
```
## chardet
* **Description**:猜测输入的 SQL 使用的字符集
* **Example**:
```bash
echo '中文' | soar -report-type chardet
```
......@@ -131,3 +131,11 @@ echo "select * from film" | soar -report-type pretty
```bash
echo "select/*comment*/ * from film" | soar -report-type remove-comment
```
## chardet
* **Description**:猜测输入的 SQL 使用的字符集
* **Example**:
```bash
echo '中文' | soar -report-type chardet
```
package chardet
import (
"bytes"
)
// recognizer2022 recognizes one charset by counting the escape sequences that
// follow ESC (0x1B) bytes in the input.
type recognizer2022 struct {
// charset is the name reported in the recognizer output.
charset string
// escapes lists the byte sequences accepted directly after an ESC byte.
escapes [][]byte
}
// Match scores the tag-stripped input against this recognizer's escape
// sequences and reports the result under the recognizer's charset name.
func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
	output.Charset = r.charset
	output.Confidence = r.matchConfidence(input.input)
	return output
}
// matchConfidence returns a score in [0, 100] describing how well input
// matches this recognizer's escape sequences.
//
// Every ESC byte (0x1B) followed by a known sequence is a hit, every other ESC
// byte is a miss, and SO/SI bytes (0x0E/0x0F) are counted as shifts. Without
// any hit the score is 0; otherwise it is the hit/miss balance in percent,
// reduced by 10 for each hit-or-shift short of five.
func (r *recognizer2022) matchConfidence(input []byte) int {
	var matched, unmatched, shiftCount int
	for pos := 0; pos < len(input); pos++ {
		switch input[pos] {
		case 0x1B:
			rest := input[pos+1:]
			known := false
			for _, esc := range r.escapes {
				if bytes.HasPrefix(rest, esc) {
					// Skip the sequence; the loop increment skips its last byte.
					pos += len(esc)
					known = true
					break
				}
			}
			if known {
				matched++
			} else {
				unmatched++
			}
		case 0x0E, 0x0F:
			shiftCount++
		}
	}
	if matched == 0 {
		return 0
	}
	score := 100 * (matched - unmatched) / (matched + unmatched)
	if seen := matched + shiftCount; seen < 5 {
		score -= (5 - seen) * 10
	}
	if score < 0 {
		return 0
	}
	return score
}
// escapeSequences_2022JP holds the byte sequences that may follow an ESC byte
// (0x1B) for the ISO-2022-JP recognizer. The ESC byte itself is not included.
var escapeSequences_2022JP = [][]byte{
{0x24, 0x28, 0x43}, // KS X 1001:1992
{0x24, 0x28, 0x44}, // JIS X 212-1990
{0x24, 0x40}, // JIS C 6226-1978
{0x24, 0x41}, // GB 2312-80
{0x24, 0x42}, // JIS X 208-1983
{0x26, 0x40}, // JIS X 208 1990, 1997
{0x28, 0x42}, // ASCII
{0x28, 0x48}, // JIS-Roman
{0x28, 0x49}, // Half-width katakana
{0x28, 0x4a}, // JIS-Roman
{0x2e, 0x41}, // ISO 8859-1
{0x2e, 0x46}, // ISO 8859-7
}
// escapeSequences_2022KR holds the byte sequences that may follow an ESC byte
// (0x1B) for the ISO-2022-KR recognizer.
var escapeSequences_2022KR = [][]byte{
{0x24, 0x29, 0x43},
}
// escapeSequences_2022CN holds the byte sequences that may follow an ESC byte
// (0x1B) for the ISO-2022-CN recognizer.
var escapeSequences_2022CN = [][]byte{
{0x24, 0x29, 0x41}, // GB 2312-80
{0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
{0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
{0x24, 0x29, 0x45}, // ISO-IR-165
{0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
{0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
{0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
{0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
{0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
{0x4e}, // SS2
{0x4f}, // SS3
}
// newRecognizer_2022JP builds the recognizer for ISO-2022-JP.
func newRecognizer_2022JP() *recognizer2022 {
	return &recognizer2022{
		charset: "ISO-2022-JP",
		escapes: escapeSequences_2022JP,
	}
}

// newRecognizer_2022KR builds the recognizer for ISO-2022-KR.
func newRecognizer_2022KR() *recognizer2022 {
	return &recognizer2022{
		charset: "ISO-2022-KR",
		escapes: escapeSequences_2022KR,
	}
}

// newRecognizer_2022CN builds the recognizer for ISO-2022-CN.
func newRecognizer_2022CN() *recognizer2022 {
	return &recognizer2022{
		charset: "ISO-2022-CN",
		escapes: escapeSequences_2022CN,
	}
}
Sheng Yu (yusheng dot sjtu at gmail dot com)
Copyright (c) 2012 chardet Authors
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Partial of the Software is derived from ICU project. See icu-license.html for
license of the derivative portions.
# chardet
chardet is a library to automatically detect
[charset](http://en.wikipedia.org/wiki/Character_encoding) of texts for [Go
programming language](http://golang.org/). It's based on the algorithm and data
in [ICU](http://icu-project.org/)'s implementation.
## Documentation and Usage
See [pkgdoc](http://go.pkgdoc.org/github.com/saintfish/chardet)
// Package chardet ports character set detection from ICU.
package chardet
import (
"errors"
"sort"
)
// Result contains all the information that charset detector gives.
type Result struct {
// IANA name of the detected charset.
Charset string
// Language of the detected text as set by the matching recognizer (the
// multi-byte recognizers use short codes such as "ja", "ko" and "zh").
// It may be empty for some charsets.
Language string
// Confidence of the Result. Scale from 1 to 100. The bigger, the more confident.
Confidence int
}
// Detector implements charset detection.
type Detector struct {
// recognizers are the charset recognizers run against every input.
recognizers []recognizer
// stripTag is passed to newRecognizerInput to request markup stripping
// before matching; NewHtmlDetector sets it, NewTextDetector does not.
stripTag bool
}
// recognizers is the list of charset recognizers shared by the Detectors
// returned from NewTextDetector and NewHtmlDetector.
var recognizers = []recognizer{
newRecognizer_utf8(),
newRecognizer_utf16be(),
newRecognizer_utf16le(),
newRecognizer_utf32be(),
newRecognizer_utf32le(),
newRecognizer_8859_1_en(),
newRecognizer_8859_1_da(),
newRecognizer_8859_1_de(),
newRecognizer_8859_1_es(),
newRecognizer_8859_1_fr(),
newRecognizer_8859_1_it(),
newRecognizer_8859_1_nl(),
newRecognizer_8859_1_no(),
newRecognizer_8859_1_pt(),
newRecognizer_8859_1_sv(),
newRecognizer_8859_2_cs(),
newRecognizer_8859_2_hu(),
newRecognizer_8859_2_pl(),
newRecognizer_8859_2_ro(),
newRecognizer_8859_5_ru(),
newRecognizer_8859_6_ar(),
newRecognizer_8859_7_el(),
newRecognizer_8859_8_I_he(),
newRecognizer_8859_8_he(),
newRecognizer_windows_1251(),
newRecognizer_windows_1256(),
newRecognizer_KOI8_R(),
newRecognizer_8859_9_tr(),
newRecognizer_sjis(),
newRecognizer_gb_18030(),
newRecognizer_euc_jp(),
newRecognizer_euc_kr(),
newRecognizer_big5(),
newRecognizer_2022JP(),
newRecognizer_2022KR(),
newRecognizer_2022CN(),
newRecognizer_IBM424_he_rtl(),
newRecognizer_IBM424_he_ltr(),
newRecognizer_IBM420_ar_rtl(),
newRecognizer_IBM420_ar_ltr(),
}
// NewTextDetector creates a Detector for plain text; markup stripping is
// disabled.
func NewTextDetector() *Detector {
	return &Detector{
		recognizers: recognizers,
		stripTag:    false,
	}
}

// NewHtmlDetector creates a Detector for Html; markup stripping is requested
// before matching.
func NewHtmlDetector() *Detector {
	return &Detector{
		recognizers: recognizers,
		stripTag:    true,
	}
}
var (
// NotDetectedError is returned by DetectAll (and therefore DetectBest) when
// no recognizer reports a non-zero confidence.
NotDetectedError = errors.New("Charset not detected.")
)
// DetectBest returns the Result with highest Confidence, i.e. the first entry
// of DetectAll. When DetectAll fails, the Result is nil and its error is
// returned unchanged.
func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
	all, err := d.DetectAll(b)
	if err != nil {
		return nil, err
	}
	return &all[0], nil
}
// DetectAll returns all Results which have non-zero Confidence. The Results
// are sorted by Confidence in descending order; Results with equal Confidence
// keep the order in which their recognizers are registered, so the outcome is
// the same on every call. Only the highest-ranked Result is kept for each
// charset name. NotDetectedError is returned when no recognizer matches.
func (d *Detector) DetectAll(b []byte) ([]Result, error) {
	input := newRecognizerInput(b, d.stripTag)

	// Each recognizer runs in its own goroutine and reports on its own
	// one-slot channel. Reading the channels in registration order makes the
	// collected slice independent of which goroutine happens to finish first.
	chans := make([]chan recognizerOutput, len(d.recognizers))
	for i, r := range d.recognizers {
		chans[i] = make(chan recognizerOutput, 1)
		go matchHelper(r, input, chans[i])
	}
	outputs := make([]recognizerOutput, 0, len(d.recognizers))
	for _, ch := range chans {
		if o := <-ch; o.Confidence > 0 {
			outputs = append(outputs, o)
		}
	}
	if len(outputs) == 0 {
		return nil, NotDetectedError
	}

	// A stable sort preserves registration order among equal confidences.
	sort.Stable(recognizerOutputs(outputs))

	// Several recognizers can share a charset name (e.g. one per language);
	// keep only the first, highest-confidence entry for each name.
	dedupOutputs := make([]Result, 0, len(outputs))
	foundCharsets := make(map[string]struct{}, len(outputs))
	for _, o := range outputs {
		if _, found := foundCharsets[o.Charset]; !found {
			dedupOutputs = append(dedupOutputs, Result(o))
			foundCharsets[o.Charset] = struct{}{}
		}
	}
	return dedupOutputs, nil
}
// matchHelper runs a single recognizer against input and sends its output on
// outputChan. DetectAll starts it as a goroutine, one per recognizer.
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
	result := r.Match(input)
	outputChan <- result
}
// recognizerOutputs implements sort.Interface, ordering outputs by Confidence
// from highest to lowest.
type recognizerOutputs []recognizerOutput

// Len reports the number of outputs.
func (r recognizerOutputs) Len() int {
	return len(r)
}

// Less reports whether output i is more confident than output j.
func (r recognizerOutputs) Less(i, j int) bool {
	return r[j].Confidence < r[i].Confidence
}

// Swap exchanges outputs i and j.
func (r recognizerOutputs) Swap(i, j int) {
	tmp := r[i]
	r[i] = r[j]
	r[j] = tmp
}
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii"></meta>
<title>ICU License - ICU 1.8.1 and later</title>
</head>
<body BGCOLOR="#ffffff">
<h2>ICU License - ICU 1.8.1 and later</h2>
<p>COPYRIGHT AND PERMISSION NOTICE</p>
<p>
Copyright (c) 1995-2012 International Business Machines Corporation and others
</p>
<p>
All rights reserved.
</p>
<p>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Software, and to permit persons
to whom the Software is furnished to do so, provided that the above
copyright notice(s) and this permission notice appear in all copies
of the Software and that both the above copyright notice(s) and this
permission notice appear in supporting documentation.
</p>
<p>
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL
THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM,
OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE
USE OR PERFORMANCE OF THIS SOFTWARE.
</p>
<p>
Except as contained in this notice, the name of a copyright holder shall not be
used in advertising or otherwise to promote the sale, use or other dealings in
this Software without prior written authorization of the copyright holder.
</p>
<hr>
<p><small>
All trademarks and registered trademarks mentioned herein are the property of their respective owners.
</small></p>
</body>
</html>
package chardet
import (
"errors"
"math"
)
// recognizerMultiByte recognizes a multi-byte charset by decoding the raw
// input character by character and scoring the decoded characters.
type recognizerMultiByte struct {
// charset and language are copied into the recognizer output.
charset string
language string
// decoder splits the input into characters of this charset.
decoder charDecoder
// commonChars is looked up with binarySearch, so it must be sorted in
// ascending order. When nil, scoring uses character counts only.
commonChars []uint16
}
// charDecoder decodes the first character of a byte slice for one charset.
type charDecoder interface {
// DecodeOneChar returns the decoded character value c, the bytes remaining
// after it, and an error. The decoders in this file return eobError when
// the input is empty (and, for most of them, when it ends inside a
// character) and badCharError for an invalid byte sequence.
DecodeOneChar([]byte) (c uint16, remain []byte, err error)
}
// Match scores the input for this multi-byte charset and reports the result
// together with the recognizer's charset and language.
func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {
	output.Charset = r.charset
	output.Language = r.language
	output.Confidence = r.matchConfidence(input)
	return output
}
// matchConfidence decodes input.raw with the recognizer's decoder and returns
// a confidence in [0, 100] that the bytes are text in this charset.
//
// Every decoded character is counted, including the last one in the input.
// Decoding stops when the decoder reports eobError, i.e. the input is
// exhausted or ends inside a character. Characters that fail to decode count
// as bad; characters above 0xFF count as double-byte and, when a commonChars
// table is present, are looked up in it.
func (r *recognizerMultiByte) matchConfidence(input *recognizerInput) int {
	raw := input.raw
	var totalCharCount, badCharCount, doubleByteCharCount, commonCharCount int
	for len(raw) > 0 {
		var c uint16
		var err error
		c, raw, err = r.decoder.DecodeOneChar(raw)
		if err == eobError {
			// Truncated trailing character: nothing more to count.
			break
		}
		totalCharCount++
		switch {
		case err != nil:
			badCharCount++
		case c <= 0xFF:
			// Single-byte character: contributes to the total only.
		default:
			doubleByteCharCount++
			if r.commonChars != nil && binarySearch(r.commonChars, c) {
				commonCharCount++
			}
		}
		// Bail out early once bad characters are too frequent.
		if badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount {
			return 0
		}
	}

	if doubleByteCharCount <= 10 && badCharCount == 0 {
		// Too few multi-byte characters to judge. Very short pure
		// single-byte input gets no score; anything else a token score.
		if doubleByteCharCount == 0 && totalCharCount < 10 {
			return 0
		}
		return 10
	}
	if doubleByteCharCount < 20*badCharCount {
		return 0
	}
	if r.commonChars == nil {
		// No frequency table: score on valid versus bad characters only.
		confidence := 30 + doubleByteCharCount - 20*badCharCount
		if confidence > 100 {
			confidence = 100
		}
		return confidence
	}

	// Scale the number of common characters logarithmically against the
	// number of double-byte characters seen.
	maxVal := math.Log(float64(doubleByteCharCount) / 4)
	scaleFactor := 90 / maxVal
	confidence := int(math.Log(float64(commonCharCount)+1)*scaleFactor + 10)
	if confidence > 100 {
		confidence = 100
	}
	if confidence < 0 {
		confidence = 0
	}
	return confidence
}
// binarySearch reports whether c occurs in l. The slice l must be sorted in
// ascending order.
func binarySearch(l []uint16, c uint16) bool {
	lo, hi := 0, len(l)-1
	for lo <= hi {
		mid := lo + (hi-lo)/2
		switch probe := l[mid]; {
		case probe == c:
			return true
		case c < probe:
			hi = mid - 1
		default:
			lo = mid + 1
		}
	}
	return false
}
var (
	// eobError reports that the input buffer is exhausted.
	eobError = errors.New("End of input buffer")
	// badCharError reports a byte sequence that is not a valid character.
	badCharError = errors.New("Decode a bad char")
)

// charDecoder_sjis decodes Shift_JIS characters.
type charDecoder_sjis struct{}

// DecodeOneChar decodes the first character of input.
//
// Bytes up to 0x7F and bytes in 0xA1..0xDF are single-byte characters. Any
// other byte leads a two-byte character whose value is lead<<8 | trail; the
// trail byte must lie in 0x40..0xFE, otherwise badCharError is returned along
// with the two-byte value. A lead byte at the very end of input yields
// badCharError, and empty input yields eobError.
func (charDecoder_sjis) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
	if len(input) == 0 {
		return 0, nil, eobError
	}
	lead := input[0]
	if lead <= 0x7F || (lead > 0xA0 && lead <= 0xDF) {
		return uint16(lead), input[1:], nil
	}
	if len(input) == 1 {
		return uint16(lead), input[1:], badCharError
	}
	trail := input[1]
	c = uint16(lead)<<8 | uint16(trail)
	remain = input[2:]
	if trail < 0x40 || trail == 0xFF {
		err = badCharError
	}
	return c, remain, err
}
// commonChars_sjis is the table of double-byte values counted as common
// characters by the Shift_JIS recognizer. It is searched with binarySearch
// and therefore kept in ascending order.
var commonChars_sjis = []uint16{
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa,
}
// newRecognizer_sjis builds the recognizer for Shift_JIS (language "ja").
func newRecognizer_sjis() *recognizerMultiByte {
	return &recognizerMultiByte{
		charset:     "Shift_JIS",
		language:    "ja",
		decoder:     charDecoder_sjis{},
		commonChars: commonChars_sjis,
	}
}
// charDecoder_euc decodes EUC characters; it is shared by the EUC-JP and
// EUC-KR recognizers.
type charDecoder_euc struct{}

// DecodeOneChar decodes the first character of input.
//
// Bytes up to 0x8D are single-byte characters. Lead bytes 0xA1..0xFE and 0x8E
// start a two-byte character whose second byte must be at least 0xA1. Lead
// byte 0x8F starts a three-byte character whose third byte must be at least
// 0xA1; its third byte is OR-ed into the low byte of the 16-bit value. Any
// other lead byte is returned as a two-byte value without validation.
// eobError is returned (with a zero value and nil remainder) when the input
// is empty or ends inside a character; badCharError marks an invalid byte.
func (charDecoder_euc) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
	if len(input) == 0 {
		return 0, nil, eobError
	}
	lead := input[0]
	if lead <= 0x8D {
		return uint16(lead), input[1:], nil
	}
	if len(input) < 2 {
		return 0, nil, eobError
	}
	second := input[1]
	c = uint16(lead)<<8 | uint16(second)
	remain = input[2:]

	switch {
	case lead == 0x8E, lead >= 0xA1 && lead <= 0xFE:
		if second < 0xA1 {
			err = badCharError
		}
	case lead == 0x8F:
		if len(remain) == 0 {
			return 0, nil, eobError
		}
		third := remain[0]
		remain = remain[1:]
		// The value stays 16 bits wide, so the third byte is merged into
		// the low byte rather than appended.
		c |= uint16(third)
		if third < 0xA1 {
			err = badCharError
		}
	}
	return c, remain, err
}
// commonChars_euc_jp is the table of double-byte values counted as common
// characters by the EUC-JP recognizer. It is searched with binarySearch and
// therefore kept in ascending order.
var commonChars_euc_jp = []uint16{
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1,
}
// commonChars_euc_kr is the table of double-byte values counted as common
// characters by the EUC-KR recognizer. It is searched with binarySearch and
// therefore kept in ascending order.
var commonChars_euc_kr = []uint16{
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad,
}
// newRecognizer_euc_jp builds the recognizer for EUC-JP (language "ja").
func newRecognizer_euc_jp() *recognizerMultiByte {
	return &recognizerMultiByte{
		charset:     "EUC-JP",
		language:    "ja",
		decoder:     charDecoder_euc{},
		commonChars: commonChars_euc_jp,
	}
}

// newRecognizer_euc_kr builds the recognizer for EUC-KR (language "ko").
func newRecognizer_euc_kr() *recognizerMultiByte {
	return &recognizerMultiByte{
		charset:     "EUC-KR",
		language:    "ko",
		decoder:     charDecoder_euc{},
		commonChars: commonChars_euc_kr,
	}
}
// charDecoder_big5 decodes Big5 characters.
type charDecoder_big5 struct{}

// DecodeOneChar decodes the first character of input.
//
// Bytes up to 0x7F and the byte 0xFF are single-byte characters. Any other
// byte leads a two-byte character whose value is lead<<8 | trail; a trail byte
// below 0x40 or equal to 0x7F or 0xFF yields badCharError. eobError is
// returned for empty input and when a lead byte is the last byte of input (in
// that case c holds the lead byte).
func (charDecoder_big5) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
	if len(input) == 0 {
		return 0, nil, eobError
	}
	lead := input[0]
	if lead <= 0x7F || lead == 0xFF {
		return uint16(lead), input[1:], nil
	}
	if len(input) == 1 {
		return uint16(lead), nil, eobError
	}
	trail := input[1]
	c = uint16(lead)<<8 | uint16(trail)
	remain = input[2:]
	if trail < 0x40 || trail == 0x7F || trail == 0xFF {
		err = badCharError
	}
	return c, remain, err
}
// commonChars_big5 is the table of double-byte values counted as common
// characters by the Big5 recognizer. It is searched with binarySearch and
// therefore kept in ascending order.
var commonChars_big5 = []uint16{
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f,
}
// newRecognizer_big5 builds the recognizer for Big5 (language "zh").
func newRecognizer_big5() *recognizerMultiByte {
	return &recognizerMultiByte{
		charset:     "Big5",
		language:    "zh",
		decoder:     charDecoder_big5{},
		commonChars: commonChars_big5,
	}
}
// charDecoder_gb_18030 decodes GB-18030 characters.
type charDecoder_gb_18030 struct{}

// DecodeOneChar decodes the first character of input.
//
// Bytes up to 0x80 are single-byte characters. For lead bytes 0x81..0xFE a
// second byte in 0x40..0x7E or 0x80..0xFE completes a two-byte character. A
// second byte in 0x30..0x39 starts a four-byte character, which also needs a
// third byte in 0x81..0xFE and a fourth byte in 0x30..0x39. Every other
// continuation yields badCharError together with the two-byte value. Lead
// byte 0xFF is returned as a two-byte value without validation. eobError is
// returned (with a zero value and nil remainder) when the input is empty or
// ends inside a character.
func (charDecoder_gb_18030) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
	if len(input) == 0 {
		return 0, nil, eobError
	}
	b1 := input[0]
	if b1 <= 0x80 {
		return uint16(b1), input[1:], nil
	}
	if len(input) < 2 {
		return 0, nil, eobError
	}
	b2 := input[1]
	c = uint16(b1)<<8 | uint16(b2)
	remain = input[2:]

	if b1 == 0xFF {
		// Outside the 0x81..0xFE lead range: passed through unchecked.
		return c, remain, nil
	}
	if (b2 >= 0x40 && b2 <= 0x7E) || (b2 >= 0x80 && b2 <= 0xFE) {
		return c, remain, nil
	}
	if b2 >= 0x30 && b2 <= 0x39 {
		if len(remain) == 0 {
			return 0, nil, eobError
		}
		b3 := remain[0]
		remain = remain[1:]
		if b3 >= 0x81 && b3 <= 0xFE {
			if len(remain) == 0 {
				return 0, nil, eobError
			}
			b4 := remain[0]
			remain = remain[1:]
			if b4 >= 0x30 && b4 <= 0x39 {
				// Only 16 bits are available, so a four-byte character
				// is represented by its last two bytes.
				return uint16(b3)<<8 | uint16(b4), remain, nil
			}
		}
	}
	return c, remain, badCharError
}
// commonChars_gb_18030 is the table of double-byte values counted as common
// characters by the GB-18030 recognizer. It is searched with binarySearch and
// therefore kept in ascending order.
var commonChars_gb_18030 = []uint16{
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0,
}
// newRecognizer_gb_18030 builds the recognizer for GB-18030 (language "zh").
func newRecognizer_gb_18030() *recognizerMultiByte {
	return &recognizerMultiByte{
		charset:     "GB-18030",
		language:    "zh",
		decoder:     charDecoder_gb_18030{},
		commonChars: commonChars_gb_18030,
	}
}
package chardet
// recognizer scores an input for one particular charset.
type recognizer interface {
// Match returns the charset (and possibly language) this recognizer
// stands for, together with its confidence for the given input.
Match(*recognizerInput) recognizerOutput
}
// recognizerOutput is the internal form of Result produced by recognizers.
type recognizerOutput Result
// recognizerInput bundles the data handed to every recognizer.
type recognizerInput struct {
// raw is the caller's input, unmodified.
raw []byte
// input is the output of mayStripInput: at most 8192 bytes, with markup
// removed when stripping was requested and considered successful.
input []byte
// tagStripped reports whether input is the markup-stripped form.
tagStripped bool
// byteStats holds, for each of the 256 byte values, its count in input.
byteStats []int
// hasC1Bytes reports whether input contains any byte in 0x80..0x9F.
hasC1Bytes bool
}
// newRecognizerInput prepares raw for the recognizers: it optionally strips
// markup, then derives the byte statistics and the C1-byte flag from the
// prepared input.
func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput {
	in := &recognizerInput{raw: raw}
	in.input, in.tagStripped = mayStripInput(raw, stripTag)
	in.byteStats = computeByteStats(in.input)
	in.hasC1Bytes = computeHasC1Bytes(in.byteStats)
	return in
}
// mayStripInput returns the bytes the recognizers should examine and whether
// markup was stripped from them.
//
// With stripTag set, everything between '<' and '>' (inclusive) is dropped
// and at most 8192 bytes are kept. The stripped form is discarded in favour of
// a copy of the first 8192 raw bytes when fewer than five tags were opened,
// when too many tags were opened inside another tag, or when stripping left
// fewer than 100 bytes of an input longer than 600 bytes. Without stripTag
// the result is always that plain copy.
func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
	const inputBufferSize = 8192

	var badTags, openTags int32
	if stripTag {
		stripped = true
		out = make([]byte, 0, inputBufferSize)
		inMarkup := false
	scan:
		for _, c := range raw {
			if c == '<' {
				if inMarkup {
					// A '<' inside an open tag marks malformed markup.
					badTags++
				}
				inMarkup = true
				openTags++
			}
			if !inMarkup {
				out = append(out, c)
				if len(out) >= inputBufferSize {
					break scan
				}
			}
			if c == '>' {
				inMarkup = false
			}
		}
	}

	tooFewTags := openTags < 5
	tooManyBadTags := openTags/5 < badTags
	mostlyMarkup := len(out) < 100 && len(raw) > 600
	if tooFewTags || tooManyBadTags || mostlyMarkup {
		limit := len(raw)
		if limit > inputBufferSize {
			limit = inputBufferSize
		}
		out = make([]byte, limit)
		copy(out, raw)
		stripped = false
	}
	return out, stripped
}
// computeByteStats returns a 256-entry histogram: entry b is the number of
// times the byte value b occurs in input.
func computeByteStats(input []byte) []int {
	stats := make([]int, 256)
	for i := 0; i < len(input); i++ {
		stats[input[i]]++
	}
	return stats
}
// computeHasC1Bytes reports whether the histogram byteStats records at least
// one byte in the range 0x80..0x9F.
func computeHasC1Bytes(byteStats []int) bool {
	c1 := byteStats[0x80 : 0x9F+1]
	for i := range c1 {
		if c1[i] > 0 {
			return true
		}
	}
	return false
}
此差异已折叠。
package chardet
import (
"bytes"
)
// Byte order marks checked at the start of the raw input by the UTF-16 and
// UTF-32 recognizers.
var (
utf16beBom = []byte{0xFE, 0xFF}
utf16leBom = []byte{0xFF, 0xFE}
utf32beBom = []byte{0x00, 0x00, 0xFE, 0xFF}
utf32leBom = []byte{0xFF, 0xFE, 0x00, 0x00}
)
// recognizerUtf16be recognizes UTF-16BE by its byte order mark only.
type recognizerUtf16be struct{}

// newRecognizer_utf16be builds the UTF-16BE recognizer.
func newRecognizer_utf16be() *recognizerUtf16be {
	return new(recognizerUtf16be)
}

// Match reports confidence 100 when the raw input starts with the UTF-16BE
// byte order mark and 0 otherwise.
func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
	output.Charset = "UTF-16BE"
	if bytes.HasPrefix(input.raw, utf16beBom) {
		output.Confidence = 100
	}
	return output
}
// recognizerUtf16le recognizes UTF-16LE by its byte order mark only.
type recognizerUtf16le struct{}

// newRecognizer_utf16le builds the UTF-16LE recognizer.
func newRecognizer_utf16le() *recognizerUtf16le {
	return new(recognizerUtf16le)
}

// Match reports confidence 100 when the raw input starts with the UTF-16LE
// byte order mark, unless those bytes are the start of the UTF-32LE byte order
// mark; otherwise the confidence is 0.
func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
	output.Charset = "UTF-16LE"
	if !bytes.HasPrefix(input.raw, utf16leBom) {
		return output
	}
	if bytes.HasPrefix(input.raw, utf32leBom) {
		// FF FE 00 00 is the UTF-32LE mark, not UTF-16LE.
		return output
	}
	output.Confidence = 100
	return output
}
// recognizerUtf32 recognizes one byte order of UTF-32.
type recognizerUtf32 struct {
// name is the charset name reported in the output.
name string
// bom is the byte order mark expected at the start of the raw input.
bom []byte
// decodeChar decodes the first four bytes of its argument as one code
// unit in this recognizer's byte order.
decodeChar func(input []byte) uint32
}
// decodeUtf32be reads the first four bytes of input as a big-endian 32-bit
// value. input must hold at least four bytes.
func decodeUtf32be(input []byte) uint32 {
	b0, b1, b2, b3 := uint32(input[0]), uint32(input[1]), uint32(input[2]), uint32(input[3])
	return b0<<24 | b1<<16 | b2<<8 | b3
}

// decodeUtf32le reads the first four bytes of input as a little-endian 32-bit
// value. input must hold at least four bytes.
func decodeUtf32le(input []byte) uint32 {
	v := uint32(input[3])
	v = v<<8 | uint32(input[2])
	v = v<<8 | uint32(input[1])
	v = v<<8 | uint32(input[0])
	return v
}
// newRecognizer_utf32be builds the recognizer for UTF-32BE.
func newRecognizer_utf32be() *recognizerUtf32 {
	return &recognizerUtf32{
		name:       "UTF-32BE",
		bom:        utf32beBom,
		decodeChar: decodeUtf32be,
	}
}

// newRecognizer_utf32le builds the recognizer for UTF-32LE.
func newRecognizer_utf32le() *recognizerUtf32 {
	return &recognizerUtf32{
		name:       "UTF-32LE",
		bom:        utf32leBom,
		decodeChar: decodeUtf32le,
	}
}
// Match decodes the raw input four bytes at a time and scores it by the
// presence of the byte order mark and the share of valid code units. A unit
// is invalid when it is 0x10FFFF or larger, or falls in the surrogate range
// 0xD800..0xDFFF. Trailing bytes that do not fill a unit are ignored.
func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
	output.Charset = r.name

	var valid, invalid uint32
	for rest := input.raw; len(rest) >= 4; rest = rest[4:] {
		cp := r.decodeChar(rest)
		if cp < 0x10FFFF && (cp < 0xD800 || cp > 0xDFFF) {
			valid++
		} else {
			invalid++
		}
	}

	bom := bytes.HasPrefix(input.raw, r.bom)
	switch {
	case bom && invalid == 0:
		output.Confidence = 100
	case bom && valid > invalid*10:
		output.Confidence = 80
	case valid > 3 && invalid == 0:
		output.Confidence = 100
	case valid > 0 && invalid == 0:
		output.Confidence = 80
	case valid > invalid*10:
		output.Confidence = 25
	}
	return output
}
package chardet
import (
"bytes"
)
// utf8Bom is the byte order mark checked at the start of the raw input by the
// UTF-8 recognizer.
var utf8Bom = []byte{0xEF, 0xBB, 0xBF}
// recognizerUtf8 recognizes UTF-8 by validating multi-byte sequences.
type recognizerUtf8 struct {
}
// newRecognizer_utf8 builds the UTF-8 recognizer.
func newRecognizer_utf8() *recognizerUtf8 {
return &recognizerUtf8{}
}
// Match scans the raw input for UTF-8 multi-byte sequences and scores it by
// the presence of the byte order mark and the numbers of valid and invalid
// sequences. Pure ASCII input without a byte order mark scores 10.
//
// A byte that cannot start a sequence is counted as one invalid sequence and
// scanning resumes at the next byte; the scan stops early once more than five
// such bytes have been seen. A sequence whose trail bytes are missing or
// malformed is counted as invalid at the first offending byte.
func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
	output = recognizerOutput{
		Charset: "UTF-8",
	}
	hasBom := bytes.HasPrefix(input.raw, utf8Bom)
	inputLen := len(input.raw)
	var numValid, numInvalid uint32
	var trailBytes uint8
scan:
	for i := 0; i < inputLen; i++ {
		c := input.raw[i]
		switch {
		case c&0x80 == 0:
			// ASCII
			continue
		case c&0xE0 == 0xC0:
			trailBytes = 1
		case c&0xF0 == 0xE0:
			trailBytes = 2
		case c&0xF8 == 0xF0:
			trailBytes = 3
		default:
			numInvalid++
			if numInvalid > 5 {
				break scan
			}
			// Not a lead byte, so there are no trail bytes to verify.
			// Entering the loop below with a zero count would wrap the
			// counter and swallow the following bytes.
			continue
		}
		// Verify that the sequence has the announced number of trail bytes.
		for i++; i < inputLen; i++ {
			c = input.raw[i]
			if c&0xC0 != 0x80 {
				numInvalid++
				break
			}
			if trailBytes--; trailBytes == 0 {
				numValid++
				break
			}
		}
	}

	switch {
	case hasBom && numInvalid == 0:
		output.Confidence = 100
	case hasBom && numValid > numInvalid*10:
		output.Confidence = 80
	case numValid > 3 && numInvalid == 0:
		output.Confidence = 100
	case numValid > 0 && numInvalid == 0:
		output.Confidence = 80
	case numValid == 0 && numInvalid == 0:
		// Plain ASCII
		output.Confidence = 10
	case numValid > numInvalid*10:
		output.Confidence = 25
	}
	return output
}
......@@ -417,12 +417,6 @@
"revision": "6d0bcf50ac9eb25d2e6f8fbd686d488a701eba55",
"revisionTime": "2018-08-20T15:09:00Z"
},
{
"checksumSHA1": "11D2ZLtDH4gRO/S6BhYcDgsTiCs=",
"path": "github.com/pingcap/tidb/util/charset",
"revision": "6d0bcf50ac9eb25d2e6f8fbd686d488a701eba55",
"revisionTime": "2018-08-20T15:09:00Z"
},
{
"checksumSHA1": "5yYzS0BTxlmELR9P/OB4RV/yw0o=",
"path": "github.com/pingcap/tidb/util/chunk",
......@@ -546,6 +540,12 @@
"revision": "11635eb403ff09dbc3a6b5a007ab5ab09151c229",
"revisionTime": "2018-04-28T10:25:19Z"
},
{
"checksumSHA1": "EEXppdradk5G/UaQUQ95BQ9n7uw=",
"path": "github.com/saintfish/chardet",
"revision": "3af4cd4741ca4f3eb0c407c034571a6fb0ea529c",
"revisionTime": "2012-08-16T06:12:21Z"
},
{
"checksumSHA1": "p7FQsKPOhh5D4XkBZ7VKDU/rkLI=",
"origin": "github.com/pingcap/tidb/vendor/github.com/sirupsen/logrus",
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册