prop.go 5.7 KB
Newer Older
W
wangkang101 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package bidi

import "unicode/utf8"

// Properties provides access to BiDi properties of runes.
type Properties struct {
	entry uint8
	last  uint8
}

var trie = newBidiTrie(0)

// TODO: using this for bidirule reduces the running time by about 5%. Consider
// if this is worth exposing or if we can find a way to speed up the Class
// method.
//
// // CompactClass is like Class, but maps all of the BiDi control classes
// // (LRO, RLO, LRE, RLE, PDF, LRI, RLI, FSI, PDI) to the class Control.
// func (p Properties) CompactClass() Class {
// 	return Class(p.entry & 0x0F)
// }

// Class returns the Bidi class for p.
func (p Properties) Class() Class {
	c := Class(p.entry & 0x0F)
	if c == Control {
		c = controlByteToClass[p.last&0xF]
	}
	return c
}

// IsBracket reports whether the rune is a bracket.
func (p Properties) IsBracket() bool { return p.entry&0xF0 != 0 }

// IsOpeningBracket reports whether the rune is an opening bracket.
// IsBracket must return true.
func (p Properties) IsOpeningBracket() bool { return p.entry&openMask != 0 }

// TODO: find a better API and expose.
func (p Properties) reverseBracket(r rune) rune {
	return xorMasks[p.entry>>xorMaskShift] ^ r
}

var controlByteToClass = [16]Class{
	0xD: LRO, // U+202D LeftToRightOverride,
	0xE: RLO, // U+202E RightToLeftOverride,
	0xA: LRE, // U+202A LeftToRightEmbedding,
	0xB: RLE, // U+202B RightToLeftEmbedding,
	0xC: PDF, // U+202C PopDirectionalFormat,
	0x6: LRI, // U+2066 LeftToRightIsolate,
	0x7: RLI, // U+2067 RightToLeftIsolate,
	0x8: FSI, // U+2068 FirstStrongIsolate,
	0x9: PDI, // U+2069 PopDirectionalIsolate,
}

// LookupRune returns properties for r.
func LookupRune(r rune) (p Properties, size int) {
	var buf [4]byte
	n := utf8.EncodeRune(buf[:], r)
	return Lookup(buf[:n])
}

// TODO: these lookup methods are based on the generated trie code. The returned
// sizes have slightly different semantics from the generated code, in that it
// always returns size==1 for an illegal UTF-8 byte (instead of the length
// of the maximum invalid subsequence). Most Transformers, like unicode/norm,
// leave invalid UTF-8 untouched, in which case it has performance benefits to
// do so (without changing the semantics). Bidi requires the semantics used here
// for the bidirule implementation to be compatible with the Go semantics.
//  They ultimately should perhaps be adopted by all trie implementations, for
// convenience sake.
// This unrolled code also boosts performance of the secure/bidirule package by
// about 30%.
// So, to remove this code:
//   - add option to trie generator to define return type.
//   - always return 1 byte size for ill-formed UTF-8 runes.

// Lookup returns properties for the first rune in s and the width in bytes of
// its encoding. The size will be 0 if s does not hold enough bytes to complete
// the encoding.
func Lookup(s []byte) (p Properties, sz int) {
	c0 := s[0]
	switch {
	case c0 < 0x80: // is ASCII
		return Properties{entry: bidiValues[c0]}, 1
	case c0 < 0xC2:
		return Properties{}, 1
	case c0 < 0xE0: // 2-byte UTF-8
		if len(s) < 2 {
			return Properties{}, 0
		}
		i := bidiIndex[c0]
		c1 := s[1]
		if c1 < 0x80 || 0xC0 <= c1 {
			return Properties{}, 1
		}
		return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
	case c0 < 0xF0: // 3-byte UTF-8
		if len(s) < 3 {
			return Properties{}, 0
		}
		i := bidiIndex[c0]
		c1 := s[1]
		if c1 < 0x80 || 0xC0 <= c1 {
			return Properties{}, 1
		}
		o := uint32(i)<<6 + uint32(c1)
		i = bidiIndex[o]
		c2 := s[2]
		if c2 < 0x80 || 0xC0 <= c2 {
			return Properties{}, 1
		}
		return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
	case c0 < 0xF8: // 4-byte UTF-8
		if len(s) < 4 {
			return Properties{}, 0
		}
		i := bidiIndex[c0]
		c1 := s[1]
		if c1 < 0x80 || 0xC0 <= c1 {
			return Properties{}, 1
		}
		o := uint32(i)<<6 + uint32(c1)
		i = bidiIndex[o]
		c2 := s[2]
		if c2 < 0x80 || 0xC0 <= c2 {
			return Properties{}, 1
		}
		o = uint32(i)<<6 + uint32(c2)
		i = bidiIndex[o]
		c3 := s[3]
		if c3 < 0x80 || 0xC0 <= c3 {
			return Properties{}, 1
		}
		return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
	}
	// Illegal rune
	return Properties{}, 1
}

// LookupString returns properties for the first rune in s and the width in
// bytes of its encoding. The size will be 0 if s does not hold enough bytes to
// complete the encoding.
func LookupString(s string) (p Properties, sz int) {
	c0 := s[0]
	switch {
	case c0 < 0x80: // is ASCII
		return Properties{entry: bidiValues[c0]}, 1
	case c0 < 0xC2:
		return Properties{}, 1
	case c0 < 0xE0: // 2-byte UTF-8
		if len(s) < 2 {
			return Properties{}, 0
		}
		i := bidiIndex[c0]
		c1 := s[1]
		if c1 < 0x80 || 0xC0 <= c1 {
			return Properties{}, 1
		}
		return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2
	case c0 < 0xF0: // 3-byte UTF-8
		if len(s) < 3 {
			return Properties{}, 0
		}
		i := bidiIndex[c0]
		c1 := s[1]
		if c1 < 0x80 || 0xC0 <= c1 {
			return Properties{}, 1
		}
		o := uint32(i)<<6 + uint32(c1)
		i = bidiIndex[o]
		c2 := s[2]
		if c2 < 0x80 || 0xC0 <= c2 {
			return Properties{}, 1
		}
		return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3
	case c0 < 0xF8: // 4-byte UTF-8
		if len(s) < 4 {
			return Properties{}, 0
		}
		i := bidiIndex[c0]
		c1 := s[1]
		if c1 < 0x80 || 0xC0 <= c1 {
			return Properties{}, 1
		}
		o := uint32(i)<<6 + uint32(c1)
		i = bidiIndex[o]
		c2 := s[2]
		if c2 < 0x80 || 0xC0 <= c2 {
			return Properties{}, 1
		}
		o = uint32(i)<<6 + uint32(c2)
		i = bidiIndex[o]
		c3 := s[3]
		if c3 < 0x80 || 0xC0 <= c3 {
			return Properties{}, 1
		}
		return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4
	}
	// Illegal rune
	return Properties{}, 1
}