/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2s digest algorithm, ARM scalar implementation
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	// Registers used to hold message words temporarily.  There aren't
	// enough ARM registers to hold the whole message block, so we have to
	// load the words on-demand.  _blake2s_quarterround reloads both aliases
	// from the stack copy of the message block each time it needs a pair of
	// message words, so neither alias carries a value from one quarter-round
	// to the next.
	M_0		.req	r12
	M_1		.req	r14

// BLAKE2s initialization vector, IV[0..7], one 32-bit word per entry.
// blake2s_compress reads this table in order to build v[8..15].
.Lblake2s_IV:
	.word	0x6A09E667		// IV[0]
	.word	0xBB67AE85		// IV[1]
	.word	0x3C6EF372		// IV[2]
	.word	0xA54FF53A		// IV[3]
	.word	0x510E527F		// IV[4]
	.word	0x9B05688C		// IV[5]
	.word	0x1F83D9AB		// IV[6]
	.word	0x5BE0CD19		// IV[7]

// Load the two consecutive 32-bit words at [src + offset] and
// [src + offset + 4] into registers a and b respectively.  When building for
// an architecture level below 6 this is done with two single-word loads;
// otherwise a single ldrd is emitted.  Every use in this file passes an
// even/odd consecutive register pair (r0/r1, r8/r9 or r10/r11).
.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ < 6
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#else
	ldrd		\a, \b, [\src, #\offset]
#endif
.endm

// Store registers a and b to the two consecutive 32-bit words at
// [dst + offset] and [dst + offset + 4] respectively.  When building for an
// architecture level below 6 this is done with two single-word stores;
// otherwise a single strd is emitted.  Every use in this file passes an
// even/odd consecutive register pair (r8/r9 or r10/r11).
.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ < 6
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#else
	strd		\a, \b, [\dst, #\offset]
#endif
.endm

// Convert the 32-bit word in register a between little-endian and CPU byte
// order, in place.  This expands to nothing unless __ARMEB__ is defined; when
// it is, the swap is delegated to rev_l with tmp as its second argument.
// rev_l is not defined in this file (presumably it comes from
// <asm/assembler.h> -- confirm there whether and how it uses tmp).
.macro _le32_bswap	a, tmp
#if defined(__ARMEB__)
	rev_l		\a, \tmp
#endif
.endm

// Apply _le32_bswap to each of the eight registers a-h in turn, passing the
// same tmp register to every invocation.
.macro _le32_bswap_8x	a, b, c, d, e, f, g, h,  tmp
	.irp	reg, \a, \b, \c, \d, \e, \f, \g, \h
	_le32_bswap	\reg, \tmp
	.endr
.endm

// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
// columns/diagonals.  s0-s1 are the word offsets to the message words the first
// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
//
// Note that to save instructions, the rotations don't happen when the
// pseudocode says they should, but rather they are delayed until the values are
// used.  See the comment above _blake2s_round().
//
// Concretely: on entry the 'b' registers still need a right-rotate by 'brot'
// and the 'd' registers by 'drot' (both are assembler symbols set by the
// caller).  Each 'eor' below stores its result unrotated, and the next
// instruction that reads that register applies the pending rotation through
// its shifted operand.  On exit the 'b' registers are left needing a
// right-rotate by 7 and the 'd' registers by 8.  The two columns/diagonals
// are independent, and their instructions are interleaved.
.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3

	// Fetch the first message word for each of the two columns/diagonals.
	ldr		M_0, [sp, #32 + 4 * \s0]
	ldr		M_1, [sp, #32 + 4 * \s2]

	// a += b + m[blake2s_sigma[r][2*i + 0]];
	// (b's rotation pending from the caller is applied here.)
	add		\a0, \a0, \b0, ror #brot
	add		\a1, \a1, \b1, ror #brot
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 16);
	// (Only the XOR is done; d is now owed a rotate by 16.)
	eor		\d0, \a0, \d0, ror #drot
	eor		\d1, \a1, \d1, ror #drot

	// c += d;
	add		\c0, \c0, \d0, ror #16
	add		\c1, \c1, \d1, ror #16

	// b = ror32(b ^ c, 12);
	// (Only the XOR is done; b is now owed a rotate by 12.)
	eor		\b0, \c0, \b0, ror #brot
	eor		\b1, \c1, \b1, ror #brot

	// Fetch the second message word for each of the two columns/diagonals.
	ldr		M_0, [sp, #32 + 4 * \s1]
	ldr		M_1, [sp, #32 + 4 * \s3]

	// a += b + m[blake2s_sigma[r][2*i + 1]];
	add		\a0, \a0, \b0, ror #12
	add		\a1, \a1, \b1, ror #12
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 8);
	// (Only the XOR is done; d leaves this macro owed a rotate by 8.)
	eor		\d0, \a0, \d0, ror#16
	eor		\d1, \a1, \d1, ror#16

	// c += d;
	add		\c0, \c0, \d0, ror#8
	add		\c1, \c1, \d1, ror#8

	// b = ror32(b ^ c, 7);
	// (Only the XOR is done; b leaves this macro owed a rotate by 7.)
	eor		\b0, \c0, \b0, ror#12
	eor		\b1, \c1, \b1, ror#12
.endm

// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
// spilling v[8..9], then to v[10..15], then to the message block.  r10-r12 and
// r14 are free to use.  The macro arguments s0-s15 give the order in which the
// message words are used in this round.
//
// Stack offsets used by this macro:
//	sp +  0: scratch for v[8], v[9]
//	sp +  8: v[10], v[11]
//	sp + 16: v[12], v[13]
//	sp + 24: v[14], v[15]
//	sp + 32: message block (read by _blake2s_quarterround)
//
// On exit v[0..9] are again in r0..r9 and v[10..15] are on the stack.
//
// All rotates are performed using the implicit rotate operand accepted by the
// 'add' and 'eor' instructions.  This is faster than using explicit rotate
// instructions.  To make this work, we allow the values in the second and last
// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
// wrong rotation amount.  The rotation amount is then fixed up just in time
// when the values are used.  'brot' is the number of bits the values in row 'b'
// need to be rotated right to arrive at the correct values, and 'drot'
// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
// that they end up as (7, 8) after every round.
.macro	_blake2s_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15

	// Mix first two columns:
	// (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
	__ldrd		r10, r11, sp, 16	// load v[12] and v[13]
	_blake2s_quarterround	r0, r4, r8, r10,  r1, r5, r9, r11, \
				\s0, \s1, \s2, \s3
	__strd		r8, r9, sp, 0		// spill v[8] and v[9]
	__strd		r10, r11, sp, 16	// store v[12] and v[13]

	// Mix second two columns:
	// (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
	__ldrd		r8, r9, sp, 8		// load v[10] and v[11]
	__ldrd		r10, r11, sp, 24	// load v[14] and v[15]
	_blake2s_quarterround	r2, r6, r8, r10,  r3, r7, r9, r11, \
				\s4, \s5, \s6, \s7
	str		r10, [sp, #24]		// store v[14]
	// v[10], v[11], and v[15] are used below, so no need to store them yet.

	// Every quarter-round leaves rows 'b' and 'd' owing rotations of 7 and
	// 8 bits, so from here on those are the pending amounts.
	.set brot, 7
	.set drot, 8

	// Mix first two diagonals:
	// (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
	ldr		r10, [sp, #16]		// load v[12]
	_blake2s_quarterround	r0, r5, r8, r11,  r1, r6, r9, r10, \
				\s8, \s9, \s10, \s11
	__strd		r8, r9, sp, 8		// store v[10] and v[11]
	str		r11, [sp, #28]		// store v[15]
	str		r10, [sp, #16]		// store v[12]

	// Mix second two diagonals:
	// (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
	__ldrd		r8, r9, sp, 0		// load v[8] and v[9]
	__ldrd		r10, r11, sp, 20	// load v[13] and v[14]
	_blake2s_quarterround	r2, r7, r8, r10,  r3, r4, r9, r11, \
				\s12, \s13, \s14, \s15
	__strd		r10, r11, sp, 20	// store v[13] and v[14]
.endm

//
// void blake2s_compress(struct blake2s_state *state,
//			 const u8 *block, size_t nblocks, u32 inc);
//
// Compress 'nblocks' consecutive 64-byte message blocks starting at 'block'
// into state->h.  The 64-bit counter state->t is advanced by 'inc' before the
// first block and by 64 before each later block, so 'inc' must be 64 whenever
// nblocks > 1.  If nblocks is 0 the function returns without reading 'block'
// or modifying the state.  'block' may have any alignment.
//
// Only the first three fields of struct blake2s_state are used:
//	u32 h[8];	(inout)		at offset 0
//	u32 t[2];	(inout)		at offset 32
//	u32 f[2];	(in)		at offset 40
//
// Stack layout while the rounds execute (byte offsets from sp):
//	  0	scratch space for spilling v[8..9]
//	  8	v[10..15]
//	 32	copy of the message block as 16 CPU-endian words
//	 96	saved r0-r2 (state, block, nblocks), then saved r4-r11 and lr
//
	.align		5
ENTRY(blake2s_compress)
	push		{r0-r2,r4-r11,lr}	// keep this an even number

	// Nothing to do for an empty request.  The loop below only tests
	// nblocks after decrementing it, so without this check a count of zero
	// would wrap around and keep consuming memory.
	cmp		r2, #0
	beq		.Lreturn

.Lnext_block:
	// r0 is 'state'
	// r1 is 'block'
	// r3 is 'inc'

	// Load and increment the counter t[0..1].
	__ldrd		r10, r11, r0, 32
	adds		r10, r10, r3
	adc		r11, r11, #0
	__strd		r10, r11, r0, 32

	// _blake2s_round is very short on registers, so copy the message block
	// to the stack to save a register during the rounds.  This also has the
	// advantage that misalignment only needs to be dealt with in one place.
	sub		sp, sp, #64
	mov		r12, sp
	tst		r1, #3
	bne		.Lcopy_block_misaligned
	ldmia		r1!, {r2-r9}
	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
	stmia		r12!, {r2-r9}
	ldmia		r1!, {r2-r9}
	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
	stmia		r12, {r2-r9}
.Lcopy_block_done:
	// r1 now points just past the block that was copied.  Write it back to
	// the saved-r1 slot (64 bytes of block copy + 4 bytes of saved r0 above
	// sp) so the next iteration reloads the advanced pointer.
	str		r1, [sp, #68]		// Update message pointer

	// Calculate v[8..15].  Push v[10..15] onto the stack, and leave space
	// for spilling v[8..9].  Leave v[8..9] in r8-r9.
	mov		r14, r0			// r14 = state
	adr		r12, .Lblake2s_IV
	ldmia		r12!, {r8-r9}		// load IV[0..1]
	__ldrd		r0, r1, r14, 40		// load f[0..1]
	ldm		r12, {r2-r7}		// load IV[2..7]
	eor		r4, r4, r10		// v[12] = IV[4] ^ t[0]
	eor		r5, r5, r11		// v[13] = IV[5] ^ t[1]
	eor		r6, r6, r0		// v[14] = IV[6] ^ f[0]
	eor		r7, r7, r1		// v[15] = IV[7] ^ f[1]
	push		{r2-r7}			// push v[10..15]
	sub		sp, sp, #8		// leave space for v[8..9]

	// Load h[0..7] == v[0..7].
	ldm		r14, {r0-r7}

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words.
	.set brot, 0
	.set drot, 0
	_blake2s_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2s_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2s_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2s_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2s_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2s_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2s_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2s_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2s_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2s_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	// v[4..7] (r4-r7) and v[12..15] (on the stack) still owe their pending
	// rotations, so 'brot' and 'drot' are applied as they are consumed.
	ldr		r14, [sp, #96]		// r14 = &h[0]
	add		sp, sp, #8		// v[8..9] are already loaded.
	pop		{r10-r11}		// load v[10..11]
	eor		r0, r0, r8
	eor		r1, r1, r9
	eor		r2, r2, r10
	eor		r3, r3, r11
	ldm		r14, {r8-r11}		// load h[0..3]
	eor		r0, r0, r8
	eor		r1, r1, r9
	eor		r2, r2, r10
	eor		r3, r3, r11
	stmia		r14!, {r0-r3}		// store new h[0..3]
	ldm		r14, {r0-r3}		// load old h[4..7]
	pop		{r8-r11}		// load v[12..15]
	eor		r0, r0, r4, ror #brot
	eor		r1, r1, r5, ror #brot
	eor		r2, r2, r6, ror #brot
	eor		r3, r3, r7, ror #brot
	eor		r0, r0, r8, ror #drot
	eor		r1, r1, r9, ror #drot
	eor		r2, r2, r10, ror #drot
	eor		r3, r3, r11, ror #drot
	add		sp, sp, #64		// skip copy of message block
	stm		r14, {r0-r3}		// store new h[4..7]

	// Advance to the next block, if there is one.  Note that if there are
	// multiple blocks, then 'inc' (the counter increment amount) must be
	// 64.  So we can simply set it to 64 without re-loading it.
	ldm		sp, {r0, r1, r2}	// load (state, block, nblocks)
	mov		r3, #64			// set 'inc'
	subs		r2, r2, #1		// nblocks--
	str		r2, [sp, #8]
	bne		.Lnext_block		// nblocks != 0?

.Lreturn:
	pop		{r0-r2,r4-r11,pc}

	// The next message block (pointed to by r1) isn't 4-byte aligned, so it
	// can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
	// by r12) using an alternative method.  r2-r9 are free to use.
	// r2 counts the bytes that remain to be copied.
.Lcopy_block_misaligned:
	mov		r2, #64
1:
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	// One unaligned word load, then fix up the byte order if needed.
	ldr		r3, [r1], #4
	_le32_bswap	r3, r4
#else
	// Assemble the little-endian word from four single-byte loads.
	ldrb		r3, [r1, #0]
	ldrb		r4, [r1, #1]
	ldrb		r5, [r1, #2]
	ldrb		r6, [r1, #3]
	add		r1, r1, #4
	orr		r3, r3, r4, lsl #8
	orr		r3, r3, r5, lsl #16
	orr		r3, r3, r6, lsl #24
#endif
	subs		r2, r2, #4
	str		r3, [r12], #4
	bne		1b
	b		.Lcopy_block_done
ENDPROC(blake2s_compress)