json.go 8.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
/* Copyright 2016-2017 Vector Creations Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gomatrixserverlib

import (
	"encoding/binary"
	"sort"
	"unicode/utf8"
T
tanggen 已提交
22 23 24

	"github.com/pkg/errors"
	"github.com/tidwall/gjson"
25 26
)

27
// CanonicalJSON re-encodes the JSON in a canonical encoding. The encoding is
28 29 30
// the shortest possible encoding using integer values with sorted object keys.
// https://matrix.org/docs/spec/server_server/unstable.html#canonical-json
func CanonicalJSON(input []byte) ([]byte, error) {
T
tanggen 已提交
31 32
	if !gjson.Valid(string(input)) {
		return nil, errors.Errorf("invalid json")
33
	}
T
tanggen 已提交
34 35 36 37 38 39 40 41 42

	return CanonicalJSONAssumeValid(input), nil
}

// CanonicalJSONAssumeValid is the same as CanonicalJSON, but assumes the
// input is valid JSON
func CanonicalJSONAssumeValid(input []byte) []byte {
	input = CompactJSON(input, make([]byte, 0, len(input)))
	return SortJSON(input, make([]byte, 0, len(input)))
43 44 45 46
}

// SortJSON reencodes the JSON with the object keys sorted by lexicographically
// by codepoint. The input must be valid JSON.
T
tanggen 已提交
47 48 49 50 51
func SortJSON(input, output []byte) []byte {
	result := gjson.ParseBytes(input)

	rawJSON := rawJSONFromResult(result, input)
	return sortJSONValue(result, rawJSON, output)
52 53
}

T
tanggen 已提交
54 55 56 57 58 59 60 61 62
// sortJSONValue takes a gjson.Result and sorts it. inputJSON must be the
// raw JSON bytes that gjson.Result points to.
func sortJSONValue(input gjson.Result, inputJSON, output []byte) []byte {
	if input.IsArray() {
		return sortJSONArray(input, inputJSON, output)
	}

	if input.IsObject() {
		return sortJSONObject(input, inputJSON, output)
63
	}
T
tanggen 已提交
64 65 66 67

	// If its neither an object nor an array then there is no sub structure
	// to sort, so just append the raw bytes.
	return append(output, inputJSON...)
68 69
}

T
tanggen 已提交
70 71 72
// sortJSONArray takes a gjson.Result and sorts it, assuming its an array.
// inputJSON must be the raw JSON bytes that gjson.Result points to.
func sortJSONArray(input gjson.Result, inputJSON, output []byte) []byte {
73
	sep := byte('[')
T
tanggen 已提交
74 75 76

	// Iterate over each value in the array and sort it.
	input.ForEach(func(_, value gjson.Result) bool {
77 78
		output = append(output, sep)
		sep = ','
T
tanggen 已提交
79 80 81 82 83 84 85

		rawJSON := rawJSONFromResult(value, inputJSON)
		output = sortJSONValue(value, rawJSON, output)

		return true // keep iterating
	})

86 87 88 89 90 91 92 93
	if sep == '[' {
		// If sep is still '[' then the array was empty and we never wrote the
		// initial '[', so we write it now along with the closing ']'.
		output = append(output, '[', ']')
	} else {
		// Otherwise we end the array by writing a single ']'
		output = append(output, ']')
	}
T
tanggen 已提交
94
	return output
95 96
}

T
tanggen 已提交
97 98 99 100 101 102 103
// sortJSONObject takes a gjson.Result and sorts it, assuming its an object.
// inputJSON must be the raw JSON bytes that gjson.Result points to.
func sortJSONObject(input gjson.Result, inputJSON, output []byte) []byte {
	type entry struct {
		key    string // The parsed key string
		rawKey []byte // The raw, unparsed key JSON string
		value  gjson.Result
104
	}
T
tanggen 已提交
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123

	var entries []entry

	// Iterate over each key/value pair and add it to a slice
	// that we can sort
	input.ForEach(func(key, value gjson.Result) bool {
		entries = append(entries, entry{
			key:    key.String(),
			rawKey: rawJSONFromResult(key, inputJSON),
			value:  value,
		})
		return true // keep iterating
	})

	// Sort the slice based on the *parsed* key
	sort.Slice(entries, func(a, b int) bool {
		return entries[a].key < entries[b].key
	})

124
	sep := byte('{')
T
tanggen 已提交
125 126

	for _, entry := range entries {
127 128
		output = append(output, sep)
		sep = ','
T
tanggen 已提交
129 130 131

		// Append the raw unparsed JSON key, *not* the parsed key
		output = append(output, entry.rawKey...)
132
		output = append(output, ':')
T
tanggen 已提交
133 134 135 136

		rawJSON := rawJSONFromResult(entry.value, inputJSON)

		output = sortJSONValue(entry.value, rawJSON, output)
137 138 139 140 141 142 143 144 145
	}
	if sep == '{' {
		// If sep is still '{' then the object was empty and we never wrote the
		// initial '{', so we write it now along with the closing '}'.
		output = append(output, '{', '}')
	} else {
		// Otherwise we end the object by writing a single '}'
		output = append(output, '}')
	}
T
tanggen 已提交
146
	return output
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
}

// CompactJSON makes the encoded JSON as small as possible by removing
// whitespace and unneeded unicode escapes
func CompactJSON(input, output []byte) []byte {
	var i int
	for i < len(input) {
		c := input[i]
		i++
		// The valid whitespace characters are all less than or equal to SPACE 0x20.
		// The valid non-white characters are all greater than SPACE 0x20.
		// So we can check for whitespace by comparing against SPACE 0x20.
		if c <= ' ' {
			// Skip over whitespace.
			continue
		}
		// Add the non-whitespace character to the output.
		output = append(output, c)
		if c == '"' {
			// We are inside a string.
			for i < len(input) {
				c = input[i]
				i++
				// Check if this is an escape sequence.
				if c == '\\' {
					escape := input[i]
					i++
					if escape == 'u' {
						// If this is a unicode escape then we need to handle it specially
						output, i = compactUnicodeEscape(input, output, i)
					} else if escape == '/' {
						// JSON does not require escaping '/', but allows encoders to escape it as a special case.
						// Since the escape isn't required we remove it.
						output = append(output, escape)
					} else {
						// All other permitted escapes are single charater escapes that are already in their shortest form.
						output = append(output, '\\', escape)
					}
				} else {
					output = append(output, c)
				}
				if c == '"' {
					break
				}
			}
		}
	}
	return output
}

// compactUnicodeEscape unpacks a 4 byte unicode escape starting at index.
// If the escape is a surrogate pair then decode the 6 byte \uXXXX escape
// that follows. Returns the output slice and a new input index.
func compactUnicodeEscape(input, output []byte, index int) ([]byte, int) {
	const (
		ESCAPES = "uuuuuuuubtnufruuuuuuuuuuuuuuuuuu"
		HEX     = "0123456789ABCDEF"
	)
	// If there aren't enough bytes to decode the hex escape then return.
	if len(input)-index < 4 {
		return output, len(input)
	}
	// Decode the 4 hex digits.
	c := readHexDigits(input[index:])
	index += 4
	if c < ' ' {
		// If the character is less than SPACE 0x20 then it will need escaping.
		escape := ESCAPES[c]
		output = append(output, '\\', escape)
		if escape == 'u' {
			output = append(output, '0', '0', byte('0'+(c>>4)), HEX[c&0xF])
		}
	} else if c == '\\' || c == '"' {
		// Otherwise the character only needs escaping if it is a QUOTE '"' or BACKSLASH '\\'.
		output = append(output, '\\', byte(c))
	} else if c < 0xD800 || c >= 0xE000 {
		// If the character isn't a surrogate pair then encoded it directly as UTF-8.
		var buffer [4]byte
		n := utf8.EncodeRune(buffer[:], rune(c))
		output = append(output, buffer[:n]...)
	} else {
		// Otherwise the escaped character was the first part of a UTF-16 style surrogate pair.
		// The next 6 bytes MUST be a '\uXXXX'.
		// If there aren't enough bytes to decode the hex escape then return.
		if len(input)-index < 6 {
			return output, len(input)
		}
		// Decode the 4 hex digits from the '\uXXXX'.
		surrogate := readHexDigits(input[index+2:])
		index += 6
		// Reconstruct the UCS4 codepoint from the surrogates.
		codepoint := 0x10000 + (((c & 0x3FF) << 10) | (surrogate & 0x3FF))
		// Encode the charater as UTF-8.
		var buffer [4]byte
		n := utf8.EncodeRune(buffer[:], rune(codepoint))
		output = append(output, buffer[:n]...)
	}
	return output, index
}

// Read 4 hex digits from the input slice.
// Taken from https://github.com/NegativeMjark/indolentjson-rust/blob/8b959791fe2656a88f189c5d60d153be05fe3deb/src/readhex.rs#L21
func readHexDigits(input []byte) uint32 {
	hex := binary.BigEndian.Uint32(input)
251
	// subtract '0'
252 253 254 255 256 257 258 259 260 261 262 263 264
	hex -= 0x30303030
	// strip the higher bits, maps 'a' => 'A'
	hex &= 0x1F1F1F1F
	mask := hex & 0x10101010
	// subtract 'A' - 10 - '9' - 9 = 7 from the letters.
	hex -= mask >> 1
	hex += mask >> 4
	// collect the nibbles
	hex |= hex >> 4
	hex &= 0xFF00FF
	hex |= hex >> 8
	return hex & 0xFFFF
}
T
tanggen 已提交
265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280

// rawJSONFromResult extracts the raw JSON bytes pointed to by result.
// input must be the json bytes that were used to generate result
func rawJSONFromResult(result gjson.Result, input []byte) (rawJSON []byte) {
	// This is lifted from gjson README. Basically, result.Raw is a copy of
	// the bytes we want, but its more efficient to take a slice.
	// If Index is 0 then for some reason we can't extract it from the original
	// JSON bytes.
	if result.Index > 0 {
		rawJSON = input[result.Index : result.Index+len(result.Raw)]
	} else {
		rawJSON = []byte(result.Raw)
	}

	return
}