bytes 包补充函数和测试

69aaa0d6 · chai2010 · 34dbf326 · 69aaa0d6 · 69aaa0d6
隐藏空白更改
内联并排

Showing 2 changed files with 241 additions and 0 deletions

waroot/src/bytes/bytes.wa waroot/src/bytes/bytes.wa +138 -0

waroot/src/bytes/bytes_test.wa waroot/src/bytes/bytes_test.wa +103 -0

未找到文件。
--- a/waroot/src/bytes/bytes.wa
+++ b/waroot/src/bytes/bytes.wa
 // 版权 @2023 凹语言 作者。保留所有权利。

 import (
+	//"unicode" => _
 	"unicode/utf8"
 )

@@ -295,6 +296,17 @@ func SplitN(s, sep: []byte, n: int) => [][]byte {
 	return genSplit(s, sep, 0, n)
 }

+// SplitAfterN slices s into subslices after each instance of sep and
+// returns a slice of those subslices.
+// If sep is empty, SplitAfterN splits after each UTF-8 sequence.
+// The count n determines the number of subslices to return:
+//   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
+//   n == 0: the result is nil (zero subslices).
+//   n < 0: all subslices.
+func SplitAfterN(s, sep: []byte, n: int) => [][]byte {
+	return genSplit(s, sep, len(sep), n) // sepSave = len(sep): each subslice keeps its trailing separator
+}
+
 // Generic split: splits after each instance of sep,
 // including sepSave bytes of sep in the subslices.
 func genSplit(s, sep: []byte, sepSave, n: int) => [][]byte {
@@ -324,6 +336,132 @@ func genSplit(s, sep: []byte, sepSave, n: int) => [][]byte {
 	return a[:i+1]
 }

+global asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1} // byte-indexed table: 1 marks the six ASCII white space bytes listed here, every other entry is 0
+
+
+// Fields interprets s as a sequence of UTF-8-encoded code points.
+// It is intended to split the slice s around each instance of one or more consecutive white space
+// characters, as defined by unicode.IsSpace, returning a slice of subslices of s or an empty slice if s contains only white space.
+// NOTE: the implementation below is currently commented out (it references the unicode package); as written, Fields always returns nil.
+func Fields(s: []byte) => [][]byte {
+	if true { // stub: always taken, so nothing below this block runs
+		return nil
+	}
+
+	/*
+	// First count the fields.
+	// This is an exact count if s is ASCII, otherwise it is an approximation.
+	n := 0
+	wasSpace := 1
+	// setBits is used to track which bits are set in the bytes of s.
+	setBits := uint8(0)
+	for i := 0; i < len(s); i++ {
+		r := s[i]
+		setBits |= r
+		isSpace := int(asciiSpace[r])
+		n += wasSpace & ^isSpace
+		wasSpace = isSpace
+	}
+
+	if setBits >= utf8.RuneSelf {
+		// Some runes in the input slice are not ASCII.
+		return FieldsFunc(s, unicode.IsSpace)
+	}
+
+	// ASCII fast path
+	a := make([][]byte, n)
+	na := 0
+	fieldStart := 0
+	i := 0
+	// Skip spaces in the front of the input.
+	for i < len(s) && asciiSpace[s[i]] != 0 {
+		i++
+	}
+	fieldStart = i
+	for i < len(s) {
+		if asciiSpace[s[i]] == 0 {
+			i++
+			continue
+		}
+		a[na] = s[fieldStart:i:i]
+		na++
+		i++
+		// Skip spaces in between fields.
+		for i < len(s) && asciiSpace[s[i]] != 0 {
+			i++
+		}
+		fieldStart = i
+	}
+	if fieldStart < len(s) { // Last field might end at EOF.
+		a[na] = s[fieldStart:len(s):len(s)]
+	}
+	return a
+
+	*/
+	return nil // never reached while the stub above is in place
+}
+
+
+// FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
+// It is intended to split the slice s at each run of code points c satisfying f(c) and
+// return a slice of subslices of s (an empty slice if all code points in s satisfy f(c), or
+// len(s) == 0), without guaranteeing the order of calls to f(c) and assuming f is deterministic for a given c.
+//
+// NOTE: the implementation below is currently commented out; as written, FieldsFunc
+// never calls f and always returns nil.
+func FieldsFunc(s: []byte, f: func(rune) => bool) => [][]byte {
+	if true { // stub: always taken, so nothing below this block runs
+		return nil
+	}
+	/*
+	// A span is used to record a slice of s of the form s[start:end].
+	// The start index is inclusive and the end index is exclusive.
+	type span struct {
+		start: int
+		end:   int
+	}
+	spans := make([]span, 0, 32)
+
+	// Find the field start and end indices.
+	// Doing this in a separate pass (rather than slicing the string s
+	// and collecting the result substrings right away) is significantly
+	// more efficient, possibly due to cache effects.
+	start := -1 // valid span start if >= 0
+	for i := 0; i < len(s); {
+		size := 1
+		r := rune(s[i])
+		if r >= utf8.RuneSelf {
+			r, size = utf8.DecodeRune(s[i:])
+		}
+		if f(r) {
+			if start >= 0 {
+				spans = append(spans, span{start, i})
+				start = -1
+			}
+		} else {
+			if start < 0 {
+				start = i
+			}
+		}
+		i += size
+	}
+
+	// Last field might end at EOF.
+	if start >= 0 {
+		spans = append(spans, span{start, len(s)})
+	}
+
+	// Create subslices from recorded field indices.
+	a := make([][]byte, len(spans))
+	for i, span := range spans {
+		a[i] = s[span.start:span.end:span.end]
+	}
+
+	return a
+	*/
+
+	return nil // never reached while the stub above is in place
+}

 // explode splits s into a slice of UTF-8 sequences, one per Unicode code point (still slices of bytes),
 // up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.

--- a/waroot/src/bytes/bytes_test.wa
+++ b/waroot/src/bytes/bytes_test.wa
@@ -484,6 +484,109 @@ func TestSplit {
 	}
 }

+global splitaftertests = []SplitTest{ // cases for SplitAfterN; presumably in SplitTest field order s, sep, n, a (input, separator, count, expected pieces) — confirm against SplitTest
+	{abcd, "a", -1, []string{"a", "bcd"}},
+	{abcd, "z", -1, []string{"abcd"}},
+	{abcd, "", -1, []string{"a", "b", "c", "d"}},
+	{commas, ",", -1, []string{"1,", "2,", "3,", "4"}},
+	{dots, "...", -1, []string{"1...", ".2...", ".3...", ".4"}},
+	{faces, "☹", -1, []string{"☺☻☹", ""}},
+	{faces, "~", -1, []string{faces}},
+	{faces, "", -1, []string{"☺", "☻", "☹"}},
+	{"1 2 3 4", " ", 3, []string{"1 ", "2 ", "3 4"}},
+	{"1 2 3", " ", 3, []string{"1 ", "2 ", "3"}},
+	{"1 2", " ", 3, []string{"1 ", "2"}},
+	{"123", "", 2, []string{"1", "23"}},
+	{"123", "", 17, []string{"1", "2", "3"}},
+}
+
+
+func TestSplitAfter { // runs SplitAfterN over every entry of splitaftertests
+	for _, tt := range splitaftertests {
+		a := SplitAfterN([]byte(tt.s), []byte(tt.sep), tt.n)
+
+		// Appending to the results should not change future results.
+		x: []byte
+		for _, v := range a {
+			x = append(v, 'z') // after the loop x holds the last subslice plus 'z'
+		}
+
+		result := sliceOfString(a)
+		if !eq(result, tt.a) {
+			assert(false) // pieces differ from the expected list
+			//t.Errorf(`Split(%q, %q, %d) = %v; want %v`, tt.s, tt.sep, tt.n, result, tt.a)
+			//continue
+		}
+
+		if want := tt.a[len(tt.a)-1] + "z"; string(x) != want {
+			assert(false) // the append above did not yield last expected piece + "z"
+			//t.Errorf("last appended result was %s; want %s", x, want)
+		}
+
+		s := Join(a, nil) // no separator: the pieces already carry theirs, so this should rebuild the input
+		if string(s) != tt.s {
+			assert(false)
+			//t.Errorf(`Join(Split(%q, %q, %d), %q) = %q`, tt.s, tt.sep, tt.n, tt.sep, s)
+		}
+		if tt.n < 0 { // cross-check against SplitAfter is disabled (body is commented out)
+			//b := SplitAfter([]byte(tt.s), []byte(tt.sep))
+			//if !reflect.DeepEqual(a, b) {
+			//	t.Errorf("SplitAfter disagrees with SplitAfterN(%q, %q, %d) = %v; want %v", tt.s, tt.sep, tt.n, b, a)
+			//}
+		}
+	}
+}
+
+type FieldsTest struct { // one Fields test case
+	s: string // input passed to Fields (as []byte)
+	a: []string // expected fields, compared with the stringified result
+}
+
+global fieldstests = []FieldsTest{ // cases for Fields: empty and all-space inputs, ASCII separators, and non-ASCII separators (U+2000..U+2002)
+	{"", []string{}},
+	{" ", []string{}},
+	{" \t ", []string{}},
+	{"  abc  ", []string{"abc"}},
+	{"1 2 3 4", []string{"1", "2", "3", "4"}},
+	{"1  2  3  4", []string{"1", "2", "3", "4"}},
+	{"1\t\t2\t\t3\t4", []string{"1", "2", "3", "4"}},
+	{"1\u20002\u20013\u20024", []string{"1", "2", "3", "4"}},
+	{"\u2000\u2001\u2002", []string{}},
+	{"\n™\t™\n", []string{"™", "™"}},
+	{faces, []string{faces}},
+}
+
+func _TestFields { // test for Fields over fieldstests; the leading underscore presumably keeps it from being run — confirm
+	for _, tt := range fieldstests {
+		b := []byte(tt.s)
+		a := Fields(b)
+
+		// Appending to the results should not change future results.
+		x: []byte
+		for _, v := range a {
+			x = append(v, 'z') // after the loop x holds the last field plus 'z'
+		}
+
+		result := sliceOfString(a)
+		if !eq(result, tt.a) {
+			assert(false) // fields differ from the expected list
+			//t.Errorf("Fields(%q) = %v; want %v", tt.s, a, tt.a)
+			//continue
+		}
+
+		if string(b) != tt.s { // input-mutation check; its failure report is commented out, so this branch does nothing
+			//t.Errorf("slice changed to %s; want %s", string(b), tt.s)
+		}
+		if len(tt.a) > 0 {
+			assert(false) // NOTE(review): fails unconditionally whenever fields are expected — looks like a placeholder while Fields is stubbed; confirm
+			if want := tt.a[len(tt.a)-1] + "z"; string(x) != want {
+				assert(false)
+				// t.Errorf("last appended result was %s; want %s", x, want)
+			}
+		}
+	}
+}
+
 func TestEqualFold {
 	for _, tt := range EqualFoldTests {
 		if out := EqualFold([]byte(tt.s), []byte(tt.t)); out != tt.out {