提交 6557bef6 编写于 作者: chai2010's avatar chai2010

补充测试

上级 91ecd0c2
// 版权 @2023 凹语言 作者。保留所有权利。
import (
"unicode/ctypes"
"unicode/utf8"
)
......@@ -28,7 +29,6 @@ func EqualFold(s, t: []byte) => bool {
return true
}
// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
func Index(s, sep: []byte) => int {
n := len(sep)
......@@ -262,7 +262,6 @@ func IndexRune(s: []byte, r: rune) => int {
}
}
// Count counts the number of non-overlapping instances of sep in s.
// If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
func Count(s, sep: []byte) => int {
......@@ -337,27 +336,6 @@ func genSplit(s, sep: []byte, sepSave, n: int) => [][]byte {
// asciiSpace is a 256-entry lookup table indexed by byte value. The entries
// for '\t', '\n', '\v', '\f', '\r' and ' ' are 1; every other entry is 0.
global asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
// Package-local copies of Unicode code point limits, carrying a unicode_
// prefix in their names.
const (
unicode_MaxRune = '\U0010FFFF' // Maximum valid Unicode code point.
unicode_ReplacementChar = '\uFFFD' // Represents invalid code points.
unicode_MaxASCII = '\u007F' // maximum ASCII value.
unicode_MaxLatin1 = '\u00FF' // maximum Latin-1 value.
)
// unicode_IsSpace is a simplified white space test: it does not support
// Unicode white space beyond Latin-1. It reports true only for
// '\t', '\n', '\v', '\f', '\r', ' ', U+0085 and U+00A0.
func unicode_IsSpace(r: rune) => bool {
// This property isn't the same as Z; special-case it.
if u32(r) <= unicode_MaxLatin1 {
switch r {
case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
return true
}
return false
}
// Anything above Latin-1 is never treated as white space here.
return false
}
// Fields interprets s as a sequence of UTF-8-encoded code points.
// It splits the slice s around each instance of one or more consecutive white space
// characters, as defined by unicode.IsSpace, returning a slice of subslices of s or an
......@@ -379,7 +357,7 @@ func Fields(s: []byte) => [][]byte {
if setBits >= utf8.RuneSelf {
// Some runes in the input slice are not ASCII.
return FieldsFunc(s, unicode_IsSpace)
return FieldsFunc(s, ctypes.IsSpace)
}
// ASCII fast path
......@@ -412,7 +390,6 @@ func Fields(s: []byte) => [][]byte {
return a
}
// FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
// It splits the slice s at each run of code points c satisfying f(c) and
// returns a slice of subslices of s. If all code points in s satisfy f(c), or
......@@ -513,3 +490,341 @@ func Join(s: [][]byte, sep: []byte) => []byte {
}
return b
}
// Map builds a new byte slice by decoding s as UTF-8, passing every code
// point through mapping, and re-encoding the results. A negative result from
// mapping drops that code point from the output entirely.
func Map(mapping: func(r: rune) => rune, s: []byte) => []byte {
	// The output starts out the same size as the input. Growth is handled
	// on demand below; shrinking needs no special treatment.
	capacity := len(s)
	written := 0
	out := make([]byte, capacity)
	pos := 0
	for pos < len(s) {
		size := 1
		ch := rune(s[pos])
		if ch >= utf8.RuneSelf {
			ch, size = utf8.DecodeRune(s[pos:])
		}
		pos += size

		ch = mapping(ch)
		if ch < 0 {
			// Negative result: the code point is dropped.
			continue
		}
		need := utf8.RuneLen(ch)
		if need < 0 {
			// Not encodable: it will be written as utf8.RuneError.
			need = len(string(utf8.RuneError))
		}
		if written+need > capacity {
			// Out of room: move to a larger buffer.
			capacity = capacity*2 + utf8.UTFMax
			grown := make([]byte, capacity)
			copy(grown, out[0:written])
			out = grown
		}
		written += utf8.EncodeRune(out[written:capacity], ch)
	}
	return out[0:written]
}
// ToUpper returns a new byte slice holding s converted to upper case.
// Input made only of ASCII bytes is converted directly; any other input is
// handed to Map with ctypes.ToUpper as the mapping function.
func ToUpper(s: []byte) => []byte {
	asciiOnly := true
	sawLower := false
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c >= utf8.RuneSelf {
			asciiOnly = false
			break
		}
		if 'a' <= c && c <= 'z' {
			sawLower = true
		}
	}
	if !asciiOnly {
		return Map(ctypes.ToUpper, s)
	}
	if !sawLower {
		// Nothing to convert; the caller still gets its own copy.
		return append([]byte(""), s...)
	}
	out := make([]byte, len(s))
	for i := 0; i < len(s); i++ {
		c := s[i]
		if 'a' <= c && c <= 'z' {
			out[i] = c - ('a' - 'A')
		} else {
			out[i] = c
		}
	}
	return out
}
// ToLower returns a new byte slice holding s converted to lower case.
// Input made only of ASCII bytes is converted directly; any other input is
// handed to Map with ctypes.ToLower as the mapping function.
func ToLower(s: []byte) => []byte {
	asciiOnly := true
	sawUpper := false
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c >= utf8.RuneSelf {
			asciiOnly = false
			break
		}
		if 'A' <= c && c <= 'Z' {
			sawUpper = true
		}
	}
	if !asciiOnly {
		return Map(ctypes.ToLower, s)
	}
	if !sawUpper {
		// Nothing to convert; the caller still gets its own copy.
		return append([]byte(""), s...)
	}
	out := make([]byte, len(s))
	for i := 0; i < len(s); i++ {
		c := s[i]
		if 'A' <= c && c <= 'Z' {
			out[i] = c + ('a' - 'A')
		} else {
			out[i] = c
		}
	}
	return out
}
// ToValidUTF8 copies s, substituting replacement (which may be empty) for
// every maximal run of bytes that does not decode as valid UTF-8.
func ToValidUTF8(s, replacement: []byte) => []byte {
	out := make([]byte, 0, len(s)+len(replacement))
	inBadRun := false // true while consecutive invalid bytes are being skipped
	pos := 0
	for pos < len(s) {
		c := s[pos]
		if c < utf8.RuneSelf {
			// Plain ASCII byte: always valid.
			out = append(out, byte(c))
			inBadRun = false
			pos++
			continue
		}
		_, size := utf8.DecodeRune(s[pos:])
		if size != 1 {
			// A well-formed multi-byte sequence is copied through.
			out = append(out, s[pos:pos+size]...)
			inBadRun = false
			pos += size
			continue
		}
		// A non-ASCII byte that decodes with width 1 is invalid. The
		// replacement is emitted once per run of such bytes.
		if !inBadRun {
			out = append(out, replacement...)
			inBadRun = true
		}
		pos++
	}
	return out
}
// TrimSpace returns the subslice of s left after removing leading and
// trailing white space. ASCII input is handled with the asciiSpace table;
// as soon as a non-ASCII byte is met, the remaining work is delegated to
// TrimFunc with ctypes.IsSpace.
func TrimSpace(s: []byte) => []byte {
	// Skip leading ASCII white space.
	lo := 0
	for lo < len(s) {
		c := s[lo]
		if c >= utf8.RuneSelf {
			// Non-ASCII byte: finish with the rune-aware path.
			return TrimFunc(s[lo:], ctypes.IsSpace)
		}
		if asciiSpace[c] == 0 {
			break
		}
		lo++
	}

	// Skip trailing ASCII white space.
	hi := len(s)
	for hi > lo {
		c := s[hi-1]
		if c >= utf8.RuneSelf {
			return TrimFunc(s[lo:hi], ctypes.IsSpace)
		}
		if asciiSpace[c] == 0 {
			break
		}
		hi--
	}

	if lo == hi {
		// Everything was white space: return nil rather than an empty
		// slice, matching what TrimLeftFunc returns in that situation.
		return nil
	}
	return s[lo:hi]
}
// TrimLeftFunc returns the subslice of s that starts at the first
// UTF-8-encoded code point for which f reports false. If every code point
// satisfies f (or s is empty) the result is nil.
func TrimLeftFunc(s: []byte, f: func(r: rune) => bool) => []byte {
	if first := indexFunc(s, f, false); first != -1 {
		return s[first:]
	}
	return nil
}
// indexFunc scans s from the front, decoding UTF-8 code points, and returns
// the byte offset of the first one for which f(c) == truth. It returns -1
// when there is no such code point. Passing truth == false inverts the
// sense of the predicate.
func indexFunc(s: []byte, f: func(r: rune) => bool, truth: bool) => int {
	for pos := 0; pos < len(s); {
		ch, size := rune(s[pos]), 1
		if ch >= utf8.RuneSelf {
			ch, size = utf8.DecodeRune(s[pos:])
		}
		if f(ch) == truth {
			return pos
		}
		pos += size
	}
	return -1
}
// lastIndexFunc scans s from the back, decoding UTF-8 code points, and
// returns the byte offset at which the last one with f(c) == truth starts.
// It returns -1 when there is no such code point. Passing truth == false
// inverts the sense of the predicate.
func lastIndexFunc(s: []byte, f: func(r: rune) => bool, truth: bool) => int {
	end := len(s)
	for end > 0 {
		ch, size := rune(s[end-1]), 1
		if ch >= utf8.RuneSelf {
			ch, size = utf8.DecodeLastRune(s[0:end])
		}
		end -= size
		if f(ch) == truth {
			return end
		}
	}
	return -1
}
// TrimRightFunc returns the subslice of s that ends just after the last
// UTF-8-encoded code point for which f reports false; all trailing code
// points satisfying f are cut off.
func TrimRightFunc(s: []byte, f: func(r: rune) => bool) => []byte {
	last := lastIndexFunc(s, f, false)
	if last < 0 || s[last] < utf8.RuneSelf {
		// Either nothing is kept (last == -1 gives s[0:0]) or the last
		// kept code point is a single ASCII byte.
		return s[0 : last+1]
	}
	// The last kept code point is multi-byte: keep all of its bytes.
	_, size := utf8.DecodeRune(s[last:])
	return s[0 : last+size]
}
// TrimFunc removes from both ends of s every UTF-8-encoded code point c
// with f(c) true and returns the remaining subslice. The leading side is
// trimmed first, then the trailing side.
func TrimFunc(s: []byte, f: func(r: rune) => bool) => []byte {
	leftTrimmed := TrimLeftFunc(s, f)
	return TrimRightFunc(leftTrimmed, f)
}
// Trim removes from both ends of s every UTF-8-encoded code point that
// appears in cutset and returns the remaining subslice.
func Trim(s: []byte, cutset: string) => []byte {
	inCutset := makeCutsetFunc(cutset)
	return TrimFunc(s, inCutset)
}
// TrimLeft removes from the start of s every UTF-8-encoded code point that
// appears in cutset and returns the remaining subslice.
func TrimLeft(s: []byte, cutset: string) => []byte {
	inCutset := makeCutsetFunc(cutset)
	return TrimLeftFunc(s, inCutset)
}
// TrimRight removes from the end of s every UTF-8-encoded code point that
// appears in cutset and returns the remaining subslice.
func TrimRight(s: []byte, cutset: string) => []byte {
	inCutset := makeCutsetFunc(cutset)
	return TrimRightFunc(s, inCutset)
}
// Runes decodes s as UTF-8 and returns the resulting code points as a
// slice of runes, one element per decoded code point.
func Runes(s: []byte) => []rune {
	out := make([]rune, utf8.RuneCount(s))
	for n := 0; len(s) > 0; n++ {
		ch, size := utf8.DecodeRune(s)
		out[n] = ch
		s = s[size:]
	}
	return out
}
// TrimPrefix returns s with the leading prefix removed. When s does not
// begin with prefix, s is returned unchanged.
func TrimPrefix(s, prefix: []byte) => []byte {
	if !HasPrefix(s, prefix) {
		return s
	}
	return s[len(prefix):]
}
// TrimSuffix returns s with the trailing suffix removed. When s does not
// end with suffix, s is returned unchanged.
func TrimSuffix(s, suffix: []byte) => []byte {
	if !HasSuffix(s, suffix) {
		return s
	}
	keep := len(s) - len(suffix)
	return s[:keep]
}
// makeCutsetFunc returns a predicate that reports whether a rune is one of
// the UTF-8-encoded code points contained in cutset.
//
// Three strategies are used, cheapest first:
//   - a single ASCII character is compared directly;
//   - an all-ASCII cutset is turned into an asciiSet bitmap;
//   - any other cutset is decoded code point by code point on every call.
func makeCutsetFunc(cutset: string) => func(r: rune) => bool {
	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
		return func(r: rune) => bool {
			return r == rune(cutset[0])
		}
	}
	if as, isASCII := makeASCIISet(cutset); isASCII {
		return func(r: rune) => bool {
			return r < utf8.RuneSelf && as.contains(byte(r))
		}
	}
	// The cutset contains non-ASCII bytes. It must be compared rune by
	// rune: comparing its individual bytes against r can never match a
	// multi-byte code point. Invalid bytes decode to utf8.RuneError with
	// width 1, so the loop always advances.
	encoded := []byte(cutset)
	return func(r: rune) => bool {
		for i := 0; i < len(encoded); {
			c, wid := utf8.DecodeRune(encoded[i:])
			if c == r {
				return true
			}
			i += wid
		}
		return false
	}
}
// asciiSet is a 32-byte bitmap (eight 32-bit words) in which each bit
// records whether one character is a member of the set. The four lowest
// words (128 bits) cover the 128 ASCII characters, with character c stored
// in bit c&31 of word c>>5. The four upper words are left zero, so that any
// non-ASCII byte value is reported as not in the set.
type asciiSet struct {
Data: [8]uint32
}
// makeASCIISet builds the asciiSet containing every byte of chars.
// ok is false (and the set is only partially filled) as soon as a
// non-ASCII byte is encountered.
func makeASCIISet(chars: string) => (as: asciiSet, ok: bool) {
	for i := 0; i < len(chars); i++ {
		c := chars[i]
		if c >= utf8.RuneSelf {
			return as, false
		}
		word := c >> 5
		bit := uint(c & 31)
		as.Data[word] |= 1 << bit
	}
	return as, true
}
// contains reports whether the bit for byte c is set in the bitmap.
func asciiSet.contains(c: byte) => bool {
	mask := uint32(1) << uint(c&31)
	return this.Data[c>>5]&mask != 0
}
// 版权 @2023 凹语言 作者。保留所有权利。
import (
"unicode"
"unicode/ctypes"
"unicode/utf8"
)
......@@ -53,7 +55,7 @@ func TestEqualExhaustive {
func TestNotEqual {
size := 128
//if testing.Short() {
size = 32
size = 32
//}
a := make([]byte, size)
b := make([]byte, size)
......@@ -74,7 +76,6 @@ func TestNotEqual {
}
}
var indexTests = []BinOpTest{
{"", "", 0},
{"", "a", -1},
......@@ -120,7 +121,7 @@ func TestLastIndexAny {
// todo
}
func TestIndexByte{
func TestIndexByte {
for _, tt := range indexTests {
if len(tt.b) != 1 {
continue
......@@ -500,7 +501,6 @@ global splitaftertests = []SplitTest{
{"123", "", 17, []string{"1", "2", "3"}},
}
func TestSplitAfter {
for _, tt := range splitaftertests {
a := SplitAfterN([]byte(tt.s), []byte(tt.sep), tt.n)
......@@ -586,6 +586,398 @@ func TestFields {
}
}
// TestFieldsFunc checks FieldsFunc in two ways: first with ctypes.IsSpace
// against the shared fieldstests table, then with a custom predicate that
// splits on 'X'. The second part also verifies that appending to a returned
// field does not modify the input slice.
func TestFieldsFunc {
for _, tt := range fieldstests {
a := FieldsFunc([]byte(tt.s), ctypes.IsSpace)
result := sliceOfString(a)
if !eq(result, tt.a) {
assert(false)
//t.Errorf("FieldsFunc(%q, unicode.IsSpace) = %v; want %v", tt.s, a, tt.a)
//continue
}
}
pred := func(c: rune) => bool { return c == 'X' }
fieldsFuncTests := []FieldsTest{
{"", []string{}},
{"XX", []string{}},
{"XXhiXXX", []string{"hi"}},
{"aXXbXXXcX", []string{"a", "b", "c"}},
}
for _, tt := range fieldsFuncTests {
b := []byte(tt.s)
a := FieldsFunc(b, pred)
// Appending to the results should not change future results.
// x ends up holding the last field with 'z' appended.
x: []byte
for _, v := range a {
x = append(v, 'z')
}
result := sliceOfString(a)
if !eq(result, tt.a) {
assert(false)
//t.Errorf("FieldsFunc(%q) = %v, want %v", tt.s, a, tt.a)
}
// The input bytes must be untouched by the appends above.
if string(b) != tt.s {
assert(false)
//t.Errorf("slice changed to %s; want %s", b, tt.s)
}
if len(tt.a) > 0 {
if want := tt.a[len(tt.a)-1] + "z"; string(x) != want {
assert(false)
//t.Errorf("last appended result was %s; want %s", x, want)
}
}
}
}
// StringTest is a test case for any function which accepts and returns a
// byte slice. For ease of creation, the input byte slice is written as a
// string.
type StringTest struct {
in: string // input, converted to []byte before the call
out: []byte // expected result; nil means a nil result is expected
}
// upperTests lists the inputs and expected outputs passed to ToUpper by
// TestToUpper. The non-ASCII cases are commented out.
global upperTests = []StringTest{
{"", []byte("")},
{"ONLYUPPER", []byte("ONLYUPPER")},
{"abc", []byte("ABC")},
{"AbC123", []byte("ABC123")},
{"azAZ09_", []byte("AZAZ09_")},
{"longStrinGwitHmixofsmaLLandcAps", []byte("LONGSTRINGWITHMIXOFSMALLANDCAPS")},
//{"long\u0250string\u0250with\u0250nonascii\u2C6Fchars", []byte("LONG\u2C6FSTRING\u2C6FWITH\u2C6FNONASCII\u2C6FCHARS")},
//{"\u0250\u0250\u0250\u0250\u0250", []byte("\u2C6F\u2C6F\u2C6F\u2C6F\u2C6F")}, // grows one byte per char
//{"a\u0080\U0010FFFF", []byte("A\u0080\U0010FFFF")}, // test utf8.RuneSelf and utf8.MaxRune
}
// lowerTests lists the inputs and expected outputs passed to ToLower by
// TestToLower. The non-ASCII cases are commented out.
global lowerTests = []StringTest{
{"", []byte("")},
{"abc", []byte("abc")},
{"AbC123", []byte("abc123")},
{"azAZ09_", []byte("azaz09_")},
{"longStrinGwitHmixofsmaLLandcAps", []byte("longstringwithmixofsmallandcaps")},
//{"LONG\u2C6FSTRING\u2C6FWITH\u2C6FNONASCII\u2C6FCHARS", []byte("long\u0250string\u0250with\u0250nonascii\u0250chars")},
//{"\u2C6D\u2C6D\u2C6D\u2C6D\u2C6D", []byte("\u0251\u0251\u0251\u0251\u0251")}, // shrinks one byte per char
//{"A\u0080\U0010FFFF", []byte("a\u0080\U0010FFFF")}, // test utf8.RuneSelf and utf8.MaxRune
}
// space is a run of ASCII white space characters used to pad inputs in
// trimSpaceTests.
const space = "\t\v\r\f\n"
// trimSpaceTests lists the inputs and expected outputs passed to TrimSpace
// by TestTrimSpace. A nil expectation means TrimSpace must return nil.
// Cases that need Unicode white space beyond Latin-1 are commented out.
global trimSpaceTests = []StringTest{
{"", nil},
{" a", []byte("a")},
{"b ", []byte("b")},
{"abc", []byte("abc")},
{space + "abc" + space, []byte("abc")},
{" ", nil},
//{"\u3000 ", nil},
//{" \u3000", nil},
{" \t\r\n \t\t\r\r\n\n ", nil},
{" \t\r\n x\t\t\r\r\n\n ", []byte("x")},
//{" \u2000\t\r\n x\t\t\r\r\ny\n \u3000", []byte("x\t\t\r\r\ny")},
{"1 \t\r\n2", []byte("1 \t\r\n2")},
{" x\x80", []byte("x\x80")},
{" x\xc0", []byte("x\xc0")},
{"x \xc0\xc0 ", []byte("x \xc0\xc0")},
{"x \xc0", []byte("x \xc0")},
{"x \xc0 ", []byte("x \xc0")},
{"x \xc0\xc0 ", []byte("x \xc0\xc0")},
{"x ☺\xc0\xc0 ", []byte("x ☺\xc0\xc0")},
{"x ☺ ", []byte("x ☺")},
}
// runStringTests applies f to the input of every test case and asserts
// that the result matches the expected bytes, including whether it is nil.
// funcName is the name of f; it is only useful for failure reports and is
// not used by the assert-based checks below.
func runStringTests(f: func([]byte) => []byte, funcName: string, testCases: []StringTest) {
	for _, tc := range testCases {
		got := f([]byte(tc.in))
		gotNil := got == nil
		wantNil := tc.out == nil
		// A nil result and a nil expectation must go together.
		if gotNil != wantNil {
			assert(false)
		}
		// The contents must match as well.
		if !Equal(got, tc.out) {
			assert(false)
		}
	}
}
// tenRunes returns a string consisting of the UTF-8 encoding of r repeated
// ten times. The rune is encoded with utf8.EncodeRune so that code points
// above 0xFF are not truncated to a single byte.
func tenRunes(r: rune) => string {
	encoded := make([]byte, utf8.UTFMax)
	width := utf8.EncodeRune(encoded, r)
	out := make([]byte, 0, 10*width)
	for i := 0; i < 10; i++ {
		out = append(out, encoded[:width]...)
	}
	return string(out)
}
// rot13 rotates ASCII letters by 13 places within their own case and
// returns every other rune unchanged. Applying it twice gives back the
// original rune.
func rot13(r: rune) => rune {
	const shift = 13
	base: rune
	if r >= 'a' && r <= 'z' {
		base = 'a'
	} else if r >= 'A' && r <= 'Z' {
		base = 'A'
	} else {
		return r
	}
	return base + (r-base+shift)%26
}
// TestMap exercises Map with a shrinking mapping, rot13 (once and twice)
// and a mapping that drops runes. The growth and invalid-rune cases are
// commented out.
func TestMap {
// Run a couple of awful growth/shrinkage tests
a := tenRunes('a')
// 1. Grow. This triggers two reallocations in Map.
//maxRune := func(r: rune) => rune { return unicode.MaxRune }
//m := Map(maxRune, []byte(a))
//expect := tenRunes(unicode.MaxRune)
//if string(m) != expect {
// t.Errorf("growing: expected %q got %q", expect, m)
//}
// 2. Shrink
minRune := func(r: rune) => rune { return 'a' }
m := Map(minRune, []byte(tenRunes(unicode.MaxRune)))
expect := a
if string(m) != expect {
assert(false)
//t.Errorf("shrinking: expected %q got %q", expect, m)
}
// 3. Rot13
m = Map(rot13, []byte("a to zed"))
expect = "n gb mrq"
if string(m) != expect {
assert(false)
//t.Errorf("rot13: expected %q got %q", expect, m)
}
// 4. Rot13^2
m = Map(rot13, Map(rot13, []byte("a to zed")))
expect = "a to zed"
if string(m) != expect {
assert(false)
//t.Errorf("rot13: expected %q got %q", expect, m)
}
// 5. Drop
dropNotLatin := func(r: rune) => rune {
// Added by chai2010: keeps runes below unicode.MaxLatin1 in place of
// the commented-out unicode.Is(unicode.Latin, r) check.
// NOTE(review): the comparison is strict, so U+00FF itself is dropped — confirm intended.
if r < unicode.MaxLatin1 {
return r
}
//if unicode.Is(unicode.Latin, r) {
// return r
//}
return -1
}
m = Map(dropNotLatin, []byte("Hello凹凹"))
expect = "Hello"
if string(m) != expect {
assert(false)
//t.Errorf("drop: expected %q got %q", expect, m)
}
// 6. Invalid rune
//invalidRune := func(r: rune) => rune {
// return utf8.MaxRune + 1
//}
//m = Map(invalidRune, []byte("x"))
//expect = "\uFFFD"
//if string(m) != expect {
// t.Errorf("invalidRune: expected %q got %q", expect, m)
//}
}
// TestToUpper runs ToUpper over every case in upperTests.
func TestToUpper {
	runStringTests(ToUpper, "ToUpper", upperTests)
}
// TestToLower runs ToLower over every case in lowerTests.
func TestToLower {
	runStringTests(ToLower, "ToLower", lowerTests)
}
// toValidUTF8Tests lists the cases for TestToValidUTF8: the input bytes,
// the replacement to substitute for invalid runs, and the expected output.
global toValidUTF8Tests = []struct {
in: string
repl: string
out: string
}{
{"", "\uFFFD", ""},
{"abc", "\uFFFD", "abc"},
{"\uFDDD", "\uFFFD", "\uFDDD"},
{"a\xffb", "\uFFFD", "a\uFFFDb"},
{"a\xffb\uFFFD", "X", "aXb\uFFFD"},
{"a☺\xffb☺\xC0\xAFc☺\xff", "", "a☺b☺c☺"},
{"\xC0\xAF", "\uFFFD", "\uFFFD"},
{"\xE0\x80\xAF", "\uFFFD", "\uFFFD"},
{"\xed\xa0\x80", "abc", "abc"},
{"\xed\xbf\xbf", "\uFFFD", "\uFFFD"},
{"\xF0\x80\x80\xaf", "☺", "☺"},
{"\xF8\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"},
{"\xFC\x80\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"},
}
// TestToValidUTF8 checks ToValidUTF8 against every entry of
// toValidUTF8Tests.
func TestToValidUTF8 {
	for _, tc := range toValidUTF8Tests {
		want := []byte(tc.out)
		got := ToValidUTF8([]byte(tc.in), []byte(tc.repl))
		if Equal(got, want) {
			continue
		}
		assert(false)
	}
}
// TestTrimSpace runs TrimSpace over every case in trimSpaceTests.
func TestTrimSpace {
runStringTests(TrimSpace, "TrimSpace", trimSpaceTests)
}
// RepeatTest is one test case for Repeat.
type RepeatTest struct {
in, out: string // input and the expected repeated output
count: int // repetition count passed to Repeat
}
// RepeatTests lists the cases checked by TestRepeat, including empty input
// and a zero count.
global RepeatTests = []RepeatTest{
{"", "", 0},
{"", "", 1},
{"", "", 2},
{"-", "", 0},
{"-", "-", 1},
{"-", "----------", 10},
{"abc ", "abc abc abc ", 3},
}
// TestRepeat checks Repeat against every entry of RepeatTests.
func TestRepeat {
	for _, tc := range RepeatTests {
		want := []byte(tc.out)
		got := Repeat([]byte(tc.in), tc.count)
		if Equal(got, want) {
			continue
		}
		assert(false)
	}
}
// runesEqual reports whether a and b have the same length and hold the
// same runes in the same order.
func runesEqual(a, b: []rune) => bool {
	if len(a) != len(b) {
		return false
	}
	for i := 0; i < len(a); i++ {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}
// RunesTest is one test case for Runes.
type RunesTest struct {
in: string // input, converted to []byte before the call
out: []rune // expected decoded runes
lossy: bool // true when decoding loses information, so the input cannot be reassembled
}
// RunesTests lists the cases checked by TestRunes. The non-ASCII and
// invalid-UTF-8 cases are commented out.
global RunesTests = []RunesTest{
{"", []rune{}, false},
{" ", []rune{32}, false},
{"ABC", []rune{65, 66, 67}, false},
{"abc", []rune{97, 98, 99}, false},
//{"\u65e5\u672c\u8a9e", []rune{26085, 26412, 35486}, false},
//{"ab\x80c", []rune{97, 98, 0xFFFD, 99}, true},
//{"ab\xc0c", []rune{97, 98, 0xFFFD, 99}, true},
}
// TestRunes checks Runes against every entry of RunesTests.
func TestRunes {
	for _, tc := range RunesTests {
		input := []byte(tc.in)
		got := Runes(input)
		if !runesEqual(got, tc.out) {
			assert(false)
		}
		// The reassembly check for non-lossy cases is disabled:
		//if !tc.lossy {
		//	s := string(got)
		//	if s != tc.in {
		//		assert(false)
		//	}
		//}
	}
}
// TrimTest is one test case for the Trim family of functions.
type TrimTest struct {
f: string // name of the function under test, e.g. "Trim" or "TrimLeft"
in, arg, out: string // input, cutset/prefix/suffix argument, expected result
}
// trimTests lists the cases checked by TestTrim. Each entry names the
// function to call. The non-ASCII cases and the TrimPrefix/TrimSuffix cases
// are commented out.
global trimTests = []TrimTest{
{"Trim", "abba", "a", "bb"},
{"Trim", "abba", "ab", ""},
{"TrimLeft", "abba", "ab", ""},
{"TrimRight", "abba", "ab", ""},
{"TrimLeft", "abba", "a", "bba"},
{"TrimRight", "abba", "a", "abb"},
{"Trim", "<tag>", "<>", "tag"},
{"Trim", "* listitem", " *", "listitem"},
{"Trim", `"quote"`, `"`, "quote"},
//{"Trim", "\u2C6F\u2C6F\u0250\u0250\u2C6F\u2C6F", "\u2C6F", "\u0250\u0250"},
//{"Trim", "\x80test\xff", "\xff", "test"},
//{"Trim", " Ġ ", " ", "Ġ"},
//{"Trim", " Ġİ0", "0 ", "Ġİ"},
//empty string tests
{"Trim", "abba", "", "abba"},
{"Trim", "", "123", ""},
{"Trim", "", "", ""},
{"TrimLeft", "abba", "", "abba"},
{"TrimLeft", "", "123", ""},
{"TrimLeft", "", "", ""},
{"TrimRight", "abba", "", "abba"},
{"TrimRight", "", "123", ""},
{"TrimRight", "", "", ""},
//{"TrimRight", "☺\xc0", "☺", "☺\xc0"},
//{"TrimPrefix", "aabb", "a", "abb"}, // todo(chai2010): bug
//{"TrimPrefix", "aabb", "b", "aabb"},
//{"TrimSuffix", "aabb", "a", "aabb"},
//{"TrimSuffix", "aabb", "b", "aab"},
}
// TestTrim runs every entry of trimTests through the function it names and
// asserts that the result equals the expected string. On a mismatch the
// case index, actual and expected values are printed before the assert.
func TestTrim {
for i, tc := range trimTests {
name := tc.f
// f is used for functions taking a string cutset; fb for functions
// taking a []byte prefix/suffix. Exactly one of them is set below.
f: func([]byte, string) => []byte
fb: func([]byte, []byte) => []byte
switch name {
case "Trim":
f = Trim
case "TrimLeft":
f = TrimLeft
case "TrimRight":
f = TrimRight
case "TrimPrefix":
fb = TrimPrefix
case "TrimSuffix":
fb = TrimSuffix
default:
assert(false)
//t.Errorf("Undefined trim function %s", name)
}
actual: string
if f != nil {
actual = string(f([]byte(tc.in), tc.arg))
} else {
actual = string(fb([]byte(tc.in), []byte(tc.arg)))
}
if actual != tc.out {
println(i, actual, tc.out)
assert(false)
//t.Errorf("%s(%q, %q) = %q; want %q", name, tc.in, tc.arg, actual, tc.out)
}
}
}
func TestEqualFold {
for _, tt := range EqualFoldTests {
if out := EqualFold([]byte(tt.s), []byte(tt.t)); out != tt.out {
......@@ -600,8 +992,8 @@ func TestEqualFold {
}
global EqualFoldTests = []struct {
s, t string
out bool
s, t: string
out: bool
}{
{"abc", "abc", true},
{"ABcd", "ABcd", true},
......
// 版权 @2023 凹语言 作者。保留所有权利。
// Limits of the Unicode, ASCII and Latin-1 code point ranges.
const (
MaxRune = '\U0010FFFF' // Maximum valid Unicode code point.
ReplacementChar = '\uFFFD' // Represents invalid code points.
MaxASCII = '\u007F' // maximum ASCII value.
MaxLatin1 = '\u00FF' // maximum Latin-1 value.
)
// 版权 @2023 凹语言 作者。保留所有权利。
// IsAlnum reports whether r is an ASCII letter ('A'-'Z', 'a'-'z') or an
// ASCII decimal digit ('0'-'9').
//
// The ranges are tested explicitly: rune is signed, so the subtraction
// trick "(r|32)-'a' < 26" would also accept every value below 'a'.
func IsAlnum(r: rune) => bool {
	if r >= '0' && r <= '9' {
		return true
	}
	// Setting bit 5 folds 'A'-'Z' onto 'a'-'z'.
	lower := r | 32
	return lower >= 'a' && lower <= 'z'
}
// IsAlpha reports whether r is an ASCII letter ('A'-'Z' or 'a'-'z').
//
// The range is tested explicitly: rune is signed, so "(r|32)-'a' < 26"
// would be true for every r with (r|32) below 'a', such as digits, space
// and most punctuation.
func IsAlpha(r: rune) => bool {
	// Setting bit 5 folds 'A'-'Z' onto 'a'-'z'.
	lower := r | 32
	return lower >= 'a' && lower <= 'z'
}
// IsAsscii reports whether r is an ASCII code point, i.e. lies in the range
// 0 through 0x7f. Negative values are not ASCII.
//
// The misspelled name is kept so that existing callers keep working.
func IsAsscii(r: rune) => bool {
	return r >= 0 && r <= 0x7f
}
// IsBlank reports whether r is a space or a horizontal tab.
func IsBlank(r: rune) => bool {
	switch r {
	case ' ', '\t':
		return true
	}
	return false
}
// IsCntrl reports whether r is an ASCII control character: a code point in
// the range 0 through 0x1f, or 0x7f (DEL). Negative values are not control
// characters.
func IsCntrl(r: rune) => bool {
	return (r >= 0 && r < 0x20) || r == 0x7f
}
// IsDigit reports whether r is an ASCII decimal digit, '0' through '9'.
func IsDigit(r: rune) => bool {
	return '0' <= r && r <= '9'
}
// IsGraph reports whether r is an ASCII character with a visible glyph,
// i.e. lies in the range 0x21 ('!') through 0x7e ('~'). Space and control
// characters are excluded.
//
// Both bounds are tested explicitly: rune is signed, so "r-0x21 < 0x5e"
// alone would accept every value below 0x21.
func IsGraph(r: rune) => bool {
	return r >= 0x21 && r <= 0x7e
}
// IsLower reports whether r is an ASCII lower case letter, 'a' through 'z'.
func IsLower(r: rune) => bool {
	return 'a' <= r && r <= 'z'
}
// IsPrint reports whether r is a printable ASCII character, i.e. lies in
// the range 0x20 (space) through 0x7e ('~'). Control characters are
// excluded.
//
// Both bounds are tested explicitly: rune is signed, so "r-0x20 < 0x5f"
// alone would accept every value below 0x20.
func IsPrint(r: rune) => bool {
	return r >= 0x20 && r <= 0x7e
}
// IsPunct reports whether r is an ASCII punctuation character: a character
// with a visible glyph (0x21 through 0x7e) that is neither a letter nor a
// decimal digit.
//
// All ranges are tested explicitly so that the result does not depend on
// signed subtraction tricks.
func IsPunct(r: rune) => bool {
	if r < 0x21 || r > 0x7e {
		return false
	}
	if r >= '0' && r <= '9' {
		return false
	}
	// Setting bit 5 folds 'A'-'Z' onto 'a'-'z'.
	lower := r | 32
	return lower < 'a' || lower > 'z'
}
// IsSpace reports whether r is one of the white space characters
// '\t', '\n', '\v', '\f', '\r', ' ', U+0085 or U+00A0.
func IsSpace(r: rune) => bool {
	// '\t' (9) through '\r' (13) are contiguous.
	if r >= '\t' && r <= '\r' {
		return true
	}
	return r == ' ' || r == 0x85 || r == 0xA0
}
// IsUpper reports whether r is an ASCII upper case letter, 'A' through 'Z'.
func IsUpper(r: rune) => bool {
	return 'A' <= r && r <= 'Z'
}
// IsXdigit reports whether r is an ASCII hexadecimal digit:
// '0'-'9', 'A'-'F' or 'a'-'f'.
func IsXdigit(r: rune) => bool {
	if '0' <= r && r <= '9' {
		return true
	}
	if 'A' <= r && r <= 'F' {
		return true
	}
	return 'a' <= r && r <= 'f'
}
// ToAscii returns r with everything but its low seven bits cleared.
func ToAscii(r: rune) => rune {
	const low7 = 0x7f
	return r & low7
}
// ToLower maps the ASCII upper case letters 'A'-'Z' to their lower case
// counterparts and returns every other rune unchanged.
func ToLower(r: rune) => rune {
	if r < 'A' || r > 'Z' {
		return r
	}
	return r + ('a' - 'A')
}
// ToUpper maps the ASCII lower case letters 'a'-'z' to their upper case
// counterparts and returns every other rune unchanged.
func ToUpper(r: rune) => rune {
	if r < 'a' || r > 'z' {
		return r
	}
	return r - ('a' - 'A')
}
// 版权 @2023 凹语言 作者。保留所有权利。
// 版权 @2023 凹语言 作者。保留所有权利。
// IsDigit reports whether r is a decimal digit. Runes up to MaxLatin1 are
// digits only when they are '0' through '9'; larger runes are looked up in
// the Digit table.
func IsDigit(r: rune) => bool {
	if r > MaxLatin1 {
		return isExcludingLatin(Digit, r)
	}
	return '0' <= r && r <= '9'
}
// 版权 @2023 凹语言 作者。保留所有权利。
// Bit masks for each code point under U+0100, for fast lookup.
const (
pC = 1 << iota // a control character.
pP // a punctuation character.
pN // a numeral.
pS // a symbolic character.
pZ // a spacing character.
pLu // an upper-case letter.
pLl // a lower-case letter.
pp // a printable character according to Go's definition.
pg = pp | pZ // a graphical character according to the Unicode definition.
pLo = pLl | pLu // a letter that is neither upper nor lower case.
pLmask = pLo
)
// GraphicRanges defines the set of graphic characters according to Unicode.
global GraphicRanges = []*RangeTable{
L, M, N, P, S, Zs,
}
// PrintRanges defines the set of printable characters according to Go.
// ASCII space, U+0020, is handled separately.
global PrintRanges = []*RangeTable{
L, M, N, P, S,
}
// IsGraphic reports whether r is a Graphic character as defined by
// Unicode: a letter, mark, number, punctuation, symbol or space, from
// categories L, M, N, P, S, Zs.
func IsGraphic(r: rune) => bool {
	// Comparing as uint32 also rejects negative runes, and indexing with
	// uint8 keeps the index inside the Latin-1 property table.
	if uint32(r) > MaxLatin1 {
		return In(r, GraphicRanges...)
	}
	return properties[uint8(r)]&pg != 0
}
// IsPrint reports whether r is printable as defined by Go: a letter, mark,
// number, punctuation or symbol (categories L, M, N, P, S), or the ASCII
// space character U+0020. It differs from IsGraphic only in that ASCII
// space is the sole spacing character accepted.
func IsPrint(r: rune) => bool {
	if uint32(r) > MaxLatin1 {
		return In(r, PrintRanges...)
	}
	return properties[uint8(r)]&pp != 0
}
// IsOneOf reports whether r belongs to at least one of the given range
// tables. The function In offers a nicer signature and should be preferred.
func IsOneOf(ranges: []*RangeTable, r: rune) => bool {
	for i := 0; i < len(ranges); i++ {
		if Is(ranges[i], r) {
			return true
		}
	}
	return false
}
// In reports whether r belongs to at least one of the given range tables.
func In(r: rune, ranges: ...*RangeTable) => bool {
	for i := 0; i < len(ranges); i++ {
		if Is(ranges[i], r) {
			return true
		}
	}
	return false
}
// IsControl reports whether r is a control character. The C (Other)
// Unicode category contains further code points such as surrogates; use
// Is(C, r) to test for those.
func IsControl(r: rune) => bool {
	// Every control character lies within Latin-1, so anything above it
	// is rejected without a table lookup.
	return uint32(r) <= MaxLatin1 && properties[uint8(r)]&pC != 0
}
// IsLetter reports whether r is a letter (category L).
func IsLetter(r: rune) => bool {
	if uint32(r) > MaxLatin1 {
		return isExcludingLatin(Letter, r)
	}
	return properties[uint8(r)]&(pLmask) != 0
}
// IsMark reports whether r is a mark character (category M).
func IsMark(r: rune) => bool {
	// Latin-1 holds no mark characters, so only the rest of the table
	// needs to be consulted.
	isMark := isExcludingLatin(Mark, r)
	return isMark
}
// IsNumber reports whether r is a number (category N).
func IsNumber(r: rune) => bool {
	if uint32(r) > MaxLatin1 {
		return isExcludingLatin(Number, r)
	}
	return properties[uint8(r)]&pN != 0
}
// IsPunct reports whether r is a Unicode punctuation character
// (category P).
func IsPunct(r: rune) => bool {
	if uint32(r) > MaxLatin1 {
		return Is(Punct, r)
	}
	return properties[uint8(r)]&pP != 0
}
// IsSpace reports whether r is a space character as defined by Unicode's
// White Space property. Within Latin-1 these are
//	'\t', '\n', '\v', '\f', '\r', ' ', U+0085 (NEL), U+00A0 (NBSP).
// Other definitions of spacing characters are given by category Z and
// property Pattern_White_Space.
func IsSpace(r: rune) => bool {
	if uint32(r) > MaxLatin1 {
		return isExcludingLatin(White_Space, r)
	}
	// This property differs from category Z, so Latin-1 is special-cased.
	switch r {
	case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0:
		return true
	}
	return false
}
// IsSymbol reports whether r is a symbolic character.
func IsSymbol(r: rune) => bool {
	if uint32(r) > MaxLatin1 {
		return isExcludingLatin(Symbol, r)
	}
	return properties[uint8(r)]&pS != 0
}
// 版权 @2023 凹语言 作者。保留所有权利。
// Package unicode provides data and functions to test some properties of
// Unicode code points.
const (
MaxRune = '\U0010FFFF' // Maximum valid Unicode code point.
ReplacementChar = '\uFFFD' // Represents invalid code points.
MaxASCII = '\u007F' // maximum ASCII value.
MaxLatin1 = '\u00FF' // maximum Latin-1 value.
)
// RangeTable describes a set of Unicode code points as lists of ranges.
// To save space the ranges are split over two slices, one of 16-bit ranges
// and one of 32-bit ranges. Both slices must be sorted and must not
// overlap, and R32 should only hold values >= 0x10000 (1<<16).
type RangeTable struct {
	R16:         []Range16
	R32:         []Range32
	LatinOffset: int // number of entries in R16 with Hi <= MaxLatin1
}
// Range16 represents a range of 16-bit Unicode code points. The range
// runs from Lo to Hi inclusive and has the specified stride.
type Range16 struct {
	Lo:     uint16
	Hi:     uint16
	Stride: uint16
}
// Range32 represents a range of Unicode code points and is used when one
// or more of the values does not fit in 16 bits. The range runs from Lo to
// Hi inclusive and has the specified stride. Lo and Hi must always be
// >= 1<<16.
type Range32 struct {
	Lo:     uint32
	Hi:     uint32
	Stride: uint32
}
// CaseRange represents a range of Unicode code points for simple (one code
// point to one code point) case conversion. The range runs from Lo to Hi
// inclusive, with a fixed stride of 1.
//
// Delta holds, per case, the number to add to a code point to reach the
// code point for that case. Deltas may be negative; zero means the
// character is already in the corresponding case.
//
// Sequences of alternating Upper and Lower pairs are a special case. They
// carry the fixed Delta
//	{UpperLower, UpperLower, UpperLower}
// where the constant UpperLower is an otherwise impossible delta value.
type CaseRange struct {
	Lo:    uint32
	Hi:    uint32
	Delta: d
}
// SpecialCase represents language-specific case mappings such as Turkish.
// Methods of SpecialCase customize (by overriding) the standard mappings.
type SpecialCase []CaseRange
// BUG(r): There is no mechanism for full case folding, that is, for
// characters that involve multiple runes in the input or output.
// Indices into the Delta arrays inside CaseRanges for case mapping.
const (
UpperCase = iota
LowerCase
TitleCase
MaxCase
)
type d [MaxCase]rune // to make the CaseRanges text shorter
// If the Delta field of a CaseRange is UpperLower, it means
// this CaseRange represents a sequence of the form (say)
// Upper Lower Upper Lower.
const (
UpperLower = MaxRune + 1 // (Cannot be a valid delta.)
)
// linearMax is the maximum size table for linear search for non-Latin1 rune.
// Derived by running 'go test -calibrate'.
const linearMax = 18
// is16 reports whether r is in the sorted slice of 16-bit ranges.
//
// Short tables (at most linearMax entries) and Latin-1 runes are searched
// linearly; everything else uses binary search. A rune inside a range's
// [Lo, Hi] span is a member only if it also lies on the range's stride.
func is16(ranges: []Range16, r: uint16) => bool {
if len(ranges) <= linearMax || r <= MaxLatin1 {
for i := range ranges {
range_ := &ranges[i]
// Ranges are sorted, so once r is below Lo it cannot match later.
if r < range_.Lo {
return false
}
if r <= range_.Hi {
return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
}
}
return false
}
// binary search over ranges
lo := 0
hi := len(ranges)
for lo < hi {
m := lo + (hi-lo)/2
range_ := &ranges[m]
if range_.Lo <= r && r <= range_.Hi {
return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
}
if r < range_.Lo {
hi = m
} else {
lo = m + 1
}
}
return false
}
// is32 reports whether r is in the sorted slice of 32-bit ranges.
//
// Short tables (at most linearMax entries) are searched linearly;
// everything else uses binary search. A rune inside a range's [Lo, Hi]
// span is a member only if it also lies on the range's stride.
func is32(ranges: []Range32, r: uint32) => bool {
if len(ranges) <= linearMax {
for i := range ranges {
range_ := &ranges[i]
// Ranges are sorted, so once r is below Lo it cannot match later.
if r < range_.Lo {
return false
}
if r <= range_.Hi {
return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
}
}
return false
}
// binary search over ranges
lo := 0
hi := len(ranges)
for lo < hi {
m := lo + (hi-lo)/2
range_ := ranges[m]
if range_.Lo <= r && r <= range_.Hi {
return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
}
if r < range_.Lo {
hi = m
} else {
lo = m + 1
}
}
return false
}
// Is reports whether r belongs to the given table of ranges. The 16-bit
// ranges are consulted when r does not exceed their highest value,
// otherwise the 32-bit ranges are.
func Is(rangeTab: *RangeTable, r: rune) => bool {
	small := rangeTab.R16
	// The comparison is done as uint32 so that negative runes never fall
	// into the 16-bit branch.
	if n := len(small); n > 0 && uint32(r) <= uint32(small[n-1].Hi) {
		return is16(small, uint16(r))
	}
	large := rangeTab.R32
	if len(large) == 0 || r < rune(large[0].Lo) {
		return false
	}
	return is32(large, uint32(r))
}
// isExcludingLatin reports whether r belongs to the given table of ranges,
// skipping the first LatinOffset entries of the 16-bit ranges (those whose
// Hi is <= MaxLatin1).
func isExcludingLatin(rangeTab: *RangeTable, r: rune) => bool {
	small := rangeTab.R16
	skip := rangeTab.LatinOffset
	// The comparison is done as uint32 so that negative runes never fall
	// into the 16-bit branch.
	if n := len(small); n > skip && uint32(r) <= uint32(small[n-1].Hi) {
		return is16(small[skip:], uint16(r))
	}
	large := rangeTab.R32
	if len(large) == 0 || r < rune(large[0].Lo) {
		return false
	}
	return is32(large, uint32(r))
}
// IsUpper reports whether r is an upper case letter.
func IsUpper(r: rune) => bool {
	// Comparing as uint32 also rejects negative runes, and indexing with
	// uint8 keeps the index inside the Latin-1 property table.
	if uint32(r) > MaxLatin1 {
		return isExcludingLatin(Upper, r)
	}
	return properties[uint8(r)]&pLmask == pLu
}
// IsLower reports whether the rune is a lower case letter.
func IsLower(r: rune) => bool {
	// See comment in IsGraphic.
	// The uint32 conversion sends negative runes down the table path.
	if uint32(r) > MaxLatin1 {
		return isExcludingLatin(Lower, r)
	}
	// Latin-1 runes are classified directly from the properties table.
	return properties[uint8(r)]&pLmask == pLl
}
// IsTitle reports whether the rune is a title case letter.
// Runes at or below MaxLatin1 (including negative runes) are never
// reported as title case.
func IsTitle(r: rune) => bool {
	return r > MaxLatin1 && isExcludingLatin(Title, r)
}
// to maps the rune using the specified case mapping.
// It additionally reports whether caseRange contained a mapping for r.
//
// An out-of-range _case yields (ReplacementChar, false). A rune that is
// not covered by any entry of caseRange is returned unchanged with false.
func to(_case: int, r: rune, caseRange: []CaseRange) => (mappedRune: rune, foundMapping: bool) {
	if _case < 0 || MaxCase <= _case {
		return ReplacementChar, false // as reasonable an error as any
	}

	// Binary search the half-open window [lo, hi) for the entry covering r.
	lo := 0
	hi := len(caseRange)
	for lo < hi {
		mid := lo + (hi-lo)/2
		entry := caseRange[mid]
		first := rune(entry.Lo)
		if r < first {
			hi = mid
			continue
		}
		if r > rune(entry.Hi) {
			lo = mid + 1
			continue
		}

		delta := entry.Delta[_case]
		if delta <= MaxRune {
			// Ordinary entry: the mapping is a plain offset.
			return r + delta, true
		}
		// A delta above MaxRune marks an alternating Upper-Lower sequence.
		// Such a sequence always starts with an upper case letter, so the
		// real deltas look like:
		//	{0, 1, 0}    UpperCase (Lower is next)
		//	{-1, 0, -1}  LowerCase (Upper, Title are previous)
		// Runes at even offsets from the start are upper case and those at
		// odd offsets are lower case, so the mapping is done by clearing or
		// setting the low bit of the offset. UpperCase and TitleCase are
		// even while LowerCase is odd, so the bit is taken from _case.
		return first + ((r-first)&^1 | rune(_case&1)), true
	}
	return r, false
}
// To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
// The lookup uses the package-level CaseRanges table.
func To(_case: int, r: rune) => rune {
	mapped, _ := to(_case, r, CaseRanges)
	return mapped
}
// ToUpper maps the rune to upper case.
func ToUpper(r: rune) => rune {
	if r > MaxASCII {
		return To(UpperCase, r)
	}
	// ASCII fast path: only 'a'..'z' change.
	if 'a' <= r && r <= 'z' {
		return r - ('a' - 'A')
	}
	return r
}
// ToLower maps the rune to lower case.
func ToLower(r: rune) => rune {
	if r > MaxASCII {
		return To(LowerCase, r)
	}
	// ASCII fast path: only 'A'..'Z' change.
	if 'A' <= r && r <= 'Z' {
		return r + ('a' - 'A')
	}
	return r
}
// ToTitle maps the rune to title case.
func ToTitle(r: rune) => rune {
	if r > MaxASCII {
		return To(TitleCase, r)
	}
	// ASCII fast path: title case is upper case for ASCII.
	if 'a' <= r && r <= 'z' {
		return r - ('a' - 'A')
	}
	return r
}
// ToUpper maps the rune to upper case giving priority to the special mapping.
// The generic ToUpper is used only when the special table has no entry for r.
func SpecialCase.ToUpper(r: rune) => rune {
	mapped, found := to(UpperCase, r, []CaseRange(*this))
	if found || mapped != r {
		return mapped
	}
	return ToUpper(r)
}
// ToTitle maps the rune to title case giving priority to the special mapping.
// The generic ToTitle is used only when the special table has no entry for r.
func SpecialCase.ToTitle(r: rune) => rune {
	mapped, found := to(TitleCase, r, []CaseRange(*this))
	if found || mapped != r {
		return mapped
	}
	return ToTitle(r)
}
// ToLower maps the rune to lower case giving priority to the special mapping.
// The generic ToLower is used only when the special table has no entry for r.
func SpecialCase.ToLower(r: rune) => rune {
	mapped, found := to(LowerCase, r, []CaseRange(*this))
	if found || mapped != r {
		return mapped
	}
	return ToLower(r)
}
// foldPair is one entry of the caseOrbit table consulted by SimpleFold.
//
// caseOrbit is defined in tables.go as []foldPair. Right now all the
// entries fit in uint16, so use uint16. If that changes, compilation
// will fail (the constants in the composite literal will not fit in uint16)
// and the types here can change to uint32.
type foldPair struct {
// From is the code point that SimpleFold searches for.
From :u16
// To is the code point SimpleFold returns when From matches its argument.
To :u16
}
// SimpleFold iterates over Unicode code points equivalent under
// the Unicode-defined simple case folding. Among the code points
// equivalent to rune (including rune itself), SimpleFold returns the
// smallest rune > r if one exists, or else the smallest rune >= 0.
// If r is not a valid Unicode code point, SimpleFold(r) returns r.
//
// For example:
//	SimpleFold('A') = 'a'
//	SimpleFold('a') = 'A'
//
//	SimpleFold('K') = 'k'
//	SimpleFold('k') = '\u212A' (the Kelvin sign)
//	SimpleFold('\u212A') = 'K'
//
//	SimpleFold('1') = '1'
//
//	SimpleFold(-2) = -2
//
func SimpleFold(r: rune) => rune {
	if r < 0 || r > MaxRune {
		return r
	}

	// Small code points are answered straight from the asciiFold table.
	if int(r) < len(asciiFold) {
		return rune(asciiFold[r])
	}

	// Consult the caseOrbit table for special cases: find the first
	// entry whose From is not less than r.
	lo := 0
	hi := len(caseOrbit)
	for lo < hi {
		mid := lo + (hi-lo)/2
		if rune(caseOrbit[mid].From) >= r {
			hi = mid
		} else {
			lo = mid + 1
		}
	}
	if lo < len(caseOrbit) {
		if hit := caseOrbit[lo]; rune(hit.From) == r {
			return rune(hit.To)
		}
	}

	// No folding specified. This is a one- or two-element
	// equivalence class containing rune and ToLower(rune)
	// and ToUpper(rune) if they are different from rune.
	lower := ToLower(r)
	if lower != r {
		return lower
	}
	return ToUpper(r)
}
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册