contract.go 9.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package build

import (
	"fmt"
	"io"
	"reflect"
	"sort"
	"strings"

	"golang.org/x/text/internal/colltab"
)

// This file contains code for detecting contractions and generating
// the necessary tables.
// Any Unicode Collation Algorithm (UCA) table entry that has more than
// one rune one the left-hand side is called a contraction.
// See https://www.unicode.org/reports/tr10/#Contractions for more details.
//
// We define the following terms:
//   initial:     a rune that appears as the first rune in a contraction.
//   suffix:      a sequence of runes succeeding the initial rune
//                in a given contraction.
//   non-initial: a rune that appears in a suffix.
//
// A rune may be both an initial and a non-initial and may be so in
// many contractions.  An initial may typically also appear by itself.
// In case of ambiguities, the UCA requires we match the longest
// contraction.
//
// Many contraction rules share the same set of possible suffixes.
// We store sets of suffixes in a trie that associates an index with
// each suffix in the set.  This index can be used to look up a
// collation element associated with the (starter rune, suffix) pair.
//
// The trie is defined on a UTF-8 byte sequence.
// The overall trie is represented as an array of ctEntries.  Each node of the trie
// is represented as a subsequence of ctEntries, where each entry corresponds to
// a possible match of a next character in the search string.  An entry
// also includes the length and offset to the next sequence of entries
// to check in case of a match.

const (
	final   = 0
	noIndex = 0xFF
)

// ctEntry associates to a matching byte an offset and/or next sequence of
// bytes to check. A ctEntry c is called final if a match means that the
// longest suffix has been found.  An entry c is final if c.N == 0.
// A single final entry can match a range of characters to an offset.
// A non-final entry always matches a single byte. Note that a non-final
// entry might still resemble a completed suffix.
// Examples:
// The suffix strings "ab" and "ac" can be represented as:
// []ctEntry{
//     {'a', 1, 1, noIndex},  // 'a' by itself does not match, so i is 0xFF.
//     {'b', 'c', 0, 1},   // "ab" -> 1, "ac" -> 2
// }
//
// The suffix strings "ab", "abc", "abd", and "abcd" can be represented as:
// []ctEntry{
//     {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'.
//     {'b', 1, 2, 1},    // "ab" -> 1, may be followed by 'c' or 'd'.
//     {'d', 'd', final, 3},  // "abd" -> 3
//     {'c', 4, 1, 2},    // "abc" -> 2, may be followed by 'd'.
//     {'d', 'd', final, 4},  // "abcd" -> 4
// }
// See genStateTests in contract_test.go for more examples.
type ctEntry struct {
	L uint8 // non-final: byte value to match; final: lowest match in range.
	H uint8 // non-final: relative index to next block; final: highest match in range.
	N uint8 // non-final: length of next block; final: final
	I uint8 // result offset. Will be noIndex if more bytes are needed to complete.
}

// contractTrieSet holds a set of contraction tries. The tries are stored
// consecutively in the entry field.
type contractTrieSet []struct{ l, h, n, i uint8 }

// ctHandle is used to identify a trie in the trie set, consisting in an offset
// in the array and the size of the first node.
type ctHandle struct {
	index, n int
}

// appendTrie adds a new trie for the given suffixes to the trie set and returns
// a handle to it.  The handle will be invalid on error.
func appendTrie(ct *colltab.ContractTrieSet, suffixes []string) (ctHandle, error) {
	es := make([]stridx, len(suffixes))
	for i, s := range suffixes {
		es[i].str = s
	}
	sort.Sort(offsetSort(es))
	for i := range es {
		es[i].index = i + 1
	}
	sort.Sort(genidxSort(es))
	i := len(*ct)
	n, err := genStates(ct, es)
	if err != nil {
		*ct = (*ct)[:i]
		return ctHandle{}, err
	}
	return ctHandle{i, n}, nil
}

// genStates generates ctEntries for a given suffix set and returns
// the number of entries for the first node.
func genStates(ct *colltab.ContractTrieSet, sis []stridx) (int, error) {
	if len(sis) == 0 {
		return 0, fmt.Errorf("genStates: list of suffices must be non-empty")
	}
	start := len(*ct)
	// create entries for differing first bytes.
	for _, si := range sis {
		s := si.str
		if len(s) == 0 {
			continue
		}
		added := false
		c := s[0]
		if len(s) > 1 {
			for j := len(*ct) - 1; j >= start; j-- {
				if (*ct)[j].L == c {
					added = true
					break
				}
			}
			if !added {
				*ct = append(*ct, ctEntry{L: c, I: noIndex})
			}
		} else {
			for j := len(*ct) - 1; j >= start; j-- {
				// Update the offset for longer suffixes with the same byte.
				if (*ct)[j].L == c {
					(*ct)[j].I = uint8(si.index)
					added = true
				}
				// Extend range of final ctEntry, if possible.
				if (*ct)[j].H+1 == c {
					(*ct)[j].H = c
					added = true
				}
			}
			if !added {
				*ct = append(*ct, ctEntry{L: c, H: c, N: final, I: uint8(si.index)})
			}
		}
	}
	n := len(*ct) - start
	// Append nodes for the remainder of the suffixes for each ctEntry.
	sp := 0
	for i, end := start, len(*ct); i < end; i++ {
		fe := (*ct)[i]
		if fe.H == 0 { // uninitialized non-final
			ln := len(*ct) - start - n
			if ln > 0xFF {
				return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln)
			}
			fe.H = uint8(ln)
			// Find first non-final strings with same byte as current entry.
			for ; sis[sp].str[0] != fe.L; sp++ {
			}
			se := sp + 1
			for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.L; se++ {
			}
			sl := sis[sp:se]
			sp = se
			for i, si := range sl {
				sl[i].str = si.str[1:]
			}
			nn, err := genStates(ct, sl)
			if err != nil {
				return 0, err
			}
			fe.N = uint8(nn)
			(*ct)[i] = fe
		}
	}
	sort.Sort(entrySort((*ct)[start : start+n]))
	return n, nil
}

// There may be both a final and non-final entry for a byte if the byte
// is implied in a range of matches in the final entry.
// We need to ensure that the non-final entry comes first in that case.
type entrySort colltab.ContractTrieSet

func (fe entrySort) Len() int      { return len(fe) }
func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] }
func (fe entrySort) Less(i, j int) bool {
	return fe[i].L > fe[j].L
}

// stridx is used for sorting suffixes and their associated offsets.
type stridx struct {
	str   string
	index int
}

// For computing the offsets, we first sort by size, and then by string.
// This ensures that strings that only differ in the last byte by 1
// are sorted consecutively in increasing order such that they can
// be packed as a range in a final ctEntry.
type offsetSort []stridx

func (si offsetSort) Len() int      { return len(si) }
func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
func (si offsetSort) Less(i, j int) bool {
	if len(si[i].str) != len(si[j].str) {
		return len(si[i].str) > len(si[j].str)
	}
	return si[i].str < si[j].str
}

// For indexing, we want to ensure that strings are sorted in string order, where
// for strings with the same prefix, we put longer strings before shorter ones.
type genidxSort []stridx

func (si genidxSort) Len() int      { return len(si) }
func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] }
func (si genidxSort) Less(i, j int) bool {
	if strings.HasPrefix(si[j].str, si[i].str) {
		return false
	}
	if strings.HasPrefix(si[i].str, si[j].str) {
		return true
	}
	return si[i].str < si[j].str
}

// lookup matches the longest suffix in str and returns the associated offset
// and the number of bytes consumed.
func lookup(ct *colltab.ContractTrieSet, h ctHandle, str []byte) (index, ns int) {
	states := (*ct)[h.index:]
	p := 0
	n := h.n
	for i := 0; i < n && p < len(str); {
		e := states[i]
		c := str[p]
		if c >= e.L {
			if e.L == c {
				p++
				if e.I != noIndex {
					index, ns = int(e.I), p
				}
				if e.N != final {
					// set to new state
					i, states, n = 0, states[int(e.H)+n:], int(e.N)
				} else {
					return
				}
				continue
			} else if e.N == final && c <= e.H {
				p++
				return int(c-e.L) + int(e.I), p
			}
		}
		i++
	}
	return
}

// print writes the contractTrieSet t as compilable Go code to w. It returns
// the total number of bytes written and the size of the resulting data structure in bytes.
func print(t *colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
	update3 := func(nn, sz int, e error) {
		n += nn
		if err == nil {
			err = e
		}
		size += sz
	}
	update2 := func(nn int, e error) { update3(nn, 0, e) }

	update3(printArray(*t, w, name))
	update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name))
	update3(printStruct(*t, w, name))
	update2(fmt.Fprintln(w))
	return
}

func printArray(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
	p := func(f string, a ...interface{}) {
		nn, e := fmt.Fprintf(w, f, a...)
		n += nn
		if err == nil {
			err = e
		}
	}
	size = len(ct) * 4
	p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size)
	p("var %sCTEntries = [%d]struct{L,H,N,I uint8}{\n", name, len(ct))
	for _, fe := range ct {
		p("\t{0x%X, 0x%X, %d, %d},\n", fe.L, fe.H, fe.N, fe.I)
	}
	p("}\n")
	return
}

func printStruct(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) {
	n, err = fmt.Fprintf(w, "colltab.ContractTrieSet( %sCTEntries[:] )", name)
	size = int(reflect.TypeOf(ct).Size())
	return
}