// aes-i586-asm_32.S
// -------------------------------------------------------------------------
// Copyright (c) 2001, Dr Brian Gladman <                 >, Worcester, UK.
// All rights reserved.
//
// LICENSE TERMS
//
// The free distribution and use of this software in both source and binary 
// form is allowed (with or without changes) provided that:
//
//   1. distributions of this source code include the above copyright 
//      notice, this list of conditions and the following disclaimer;
//
//   2. distributions in binary form include the above copyright
//      notice, this list of conditions and the following disclaimer
//      in the documentation and/or other associated materials;
//
//   3. the copyright holder's name is not used to endorse products 
//      built using this software without specific written permission.
//
//
// ALTERNATIVELY, provided that this notice is retained in full, this product
// may be distributed under the terms of the GNU General Public License (GPL),
// in which case the provisions of the GPL apply INSTEAD OF those given above.
//
// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org>
// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>

// DISCLAIMER
//
// This software is provided 'as is' with no explicit or implied warranties
// in respect of its properties including, but not limited to, correctness 
// and fitness for purpose.
// -------------------------------------------------------------------------
// Issue Date: 29/07/2002

.file "aes-i586-asm.S"
.text

#include <asm/asm-offsets.h>

// Byte stride between the four consecutive lookup tables that make up one
// table set; the column macros address them as table, table+tlen,
// table+2*tlen and table+3*tlen.
#define tlen 1024   // length of each of 4 'xor' arrays (256 32-bit words)

/* offsets to parameters with one register pushed onto stack */
// These are %esp-relative byte offsets that are valid directly after the
// first push in each subroutine; every further push adds 4 to the offset
// that has to be used (see in_blk+4 and out_blk+12 in the subroutines).
#define ctx 8
#define out_blk 12
#define in_blk 16

/* offsets in crypto_aes_ctx structure */
// NOTE(review): these byte offsets are hard-coded here rather than taken
// from the included asm-offsets.h, so they must be kept in step with the
// layout of struct crypto_aes_ctx by hand - confirm against its definition.
//   klen  field compared against 24 to choose the number of rounds
//   ekey  start of the round keys used by aes_enc_blk
//   dkey  start of the round keys used by aes_dec_blk
#define klen (480)
#define ekey (0)
#define dkey (240)

// Register mapping for the encrypt and decrypt subroutines.
// r0..r3 are the four registers for which low/high byte names are defined
// below; r4 and r5 (esi, edi) have no such byte names here, so they can
// hold state but cannot be passed as the idx argument of the column macros.

#define r0  eax
#define r1  ebx
#define r2  ecx
#define r3  edx
#define r4  esi
#define r5  edi

// Byte-register names, spelled as the 32-bit register name with an l or h
// suffix so that they can be produced by token pasting (eaxl is al, eaxh
// is ah, and so on).
#define eaxl  al
#define eaxh  ah
#define ebxl  bl
#define ebxh  bh
#define ecxl  cl
#define ecxh  ch
#define edxl  dl
#define edxh  dh

// h(reg) and l(reg) give the high and low byte register of reg.  The extra
// level of indirection through _h/_l makes the preprocessor expand reg
// first (r0 becomes eax) before the suffix is pasted on, so h(r0) turns
// into eaxh and finally ah.
#define _h(reg) reg##h
#define h(reg) _h(reg)

#define _l(reg) reg##l
#define l(reg) _l(reg)

// do_col(table, a1,a2,a3,a4, idx, tmp)
//
// Takes a 32-bit word representing a column (in idx) and uses each of
// its four bytes to index one of four consecutive tables of 256 32-bit
// words (the tables are tlen bytes apart, starting at table).  The four
// looked-up words are xored into the four output registers.
//
// Parameters:
//   table  base address of the first of the four tables
//   a1     register xored with the entry of table 0 selected by byte 0
//          (the lowest byte) of idx
//   a2     register xored with the entry of table 1 selected by byte 1
//   a3     register xored with the entry of table 2 selected by byte 2
//   a4     register xored with the entry of table 3 selected by byte 3
//          (the highest byte) of idx
//   idx    input column for the round.  Must be one of the registers
//          that has l()/h() byte names (eax, ebx, ecx, edx).  Destroyed:
//          it is shifted right by 16 and ends up holding byte 3 only.
//   tmp    scratch register used as the table index (clobbered)
//
// The shr and xor instructions also change the flags.

#define do_col(table, a1,a2,a3,a4, idx, tmp)	\
	movzx   %l(idx),%tmp;			\
	xor     table(,%tmp,4),%a1;		\
	movzx   %h(idx),%tmp;			\
	shr     $16,%idx;			\
	xor     table+tlen(,%tmp,4),%a2;	\
	movzx   %l(idx),%tmp;			\
	movzx   %h(idx),%idx;			\
	xor     table+2*tlen(,%tmp,4),%a3;	\
	xor     table+3*tlen(,%idx,4),%a4;

// do_fcol(table, a1,a2,a3,a4, idx, tmp, sched)
//
// First column step of a forward round.  It performs the same four table
// lookups on the bytes of idx as do_col, but each output register is
// first loaded from the round key at sched instead of being accumulated
// into, so this macro initialises the output registers:
//   a1 = word at sched+0  xor table 0 entry for byte 0 of idx
//   a2 = word at sched+12 xor table 1 entry for byte 1 of idx
//   a3 = word at sched+8  xor table 2 entry for byte 2 of idx
//   a4 = word at sched+4  xor table 3 entry for byte 3 of idx
//
// sched is a memory operand such as -64(%ebp), (%ebp) or +16(%ebp).  The
// word offset is written directly in front of it (for example 12 sched
// becomes 12 -64(%ebp)), which relies on the assembler folding the two
// numbers into one displacement; sched therefore has to start with a sign
// or with the opening parenthesis.
//
// NB1: the original value of a3 is copied into idx just before a3 is
//      overwritten, so on exit idx holds that value ready to be used as
//      the next input column.
// NB2: the original values of a1, a2 and a4 are not used.
// tmp is clobbered and the flags are changed.
#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
	mov     0 sched,%a1;			\
	movzx   %l(idx),%tmp;			\
	mov     12 sched,%a2;			\
	xor     table(,%tmp,4),%a1;		\
	mov     4 sched,%a4;			\
	movzx   %h(idx),%tmp;			\
	shr     $16,%idx;			\
	xor     table+tlen(,%tmp,4),%a2;	\
	movzx   %l(idx),%tmp;			\
	movzx   %h(idx),%idx;			\
	xor     table+3*tlen(,%idx,4),%a4;	\
	mov     %a3,%idx;			\
	mov     8 sched,%a3;			\
	xor     table+2*tlen(,%tmp,4),%a3;

// do_icol(table, a1,a2,a3,a4, idx, tmp, sched)
//
// First column step of an inverse round.  It performs the four table
// lookups on the bytes of idx and initialises each output register from
// the round key at sched before xoring the looked-up word into it:
//   a1 = word at sched+0  xor table 0 entry for byte 0 of idx
//   a2 = word at sched+4  xor table 1 entry for byte 1 of idx
//   a3 = word at sched+8  xor table 2 entry for byte 2 of idx
//   a4 = word at sched+12 xor table 3 entry for byte 3 of idx
// Compared with do_fcol only the round-key offsets loaded into a2 and a4
// are exchanged (4 and 12 instead of 12 and 4).
//
// sched is a memory operand such as -64(%ebp), (%ebp) or +16(%ebp).  The
// word offset is written directly in front of it (for example 12 sched
// becomes 12 -64(%ebp)), which relies on the assembler folding the two
// numbers into one displacement; sched therefore has to start with a sign
// or with the opening parenthesis.
//
// NB1: the original value of a3 is copied into idx just before a3 is
//      overwritten, so on exit idx holds that value ready to be used as
//      the next input column.
// NB2: the original values of a1, a2 and a4 are not used.
// tmp is clobbered and the flags are changed.
#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
	mov     0 sched,%a1;			\
	movzx   %l(idx),%tmp;			\
	mov     4 sched,%a2;			\
	xor     table(,%tmp,4),%a1;		\
	mov     12 sched,%a4;			\
	movzx   %h(idx),%tmp;			\
	shr     $16,%idx;			\
	xor     table+tlen(,%tmp,4),%a2;	\
	movzx   %l(idx),%tmp;			\
	movzx   %h(idx),%idx;			\
	xor     table+3*tlen(,%idx,4),%a4;	\
	mov     %a3,%idx;			\
	mov     8 sched,%a3;			\
	xor     table+2*tlen(,%tmp,4),%a3;


// Spill and reload helpers that use 32-bit slots addressed from %esp.
// (The original Gladman code had conditional saves to MMX regs.)
//
//   save(slot, reg)     store reg into the slot at 4*slot(%esp)
//   restore(reg, slot)  load reg from the slot at 4*slot(%esp)
//
// Note that the argument order is reversed between the two macros.  The
// slots are addressed relative to %esp, so the space must already be
// reserved and %esp must not move between a save and its restore.
#define save(a1, a2)		\
	mov     %a2,4*a1(%esp)

#define restore(a1, a2)		\
	mov     4*a2(%esp),%a1

// Forward (encryption) round macros.  fwd_rnd1 and fwd_rnd2 are meant to
// be used alternately: a fwd_rnd1/fwd_rnd2 pair is entered with the
// previous round column values in r0,r1,r4,r5 and exits with the new
// values in the same registers.  A single macro moves the first column
// from r0 to r2 (fwd_rnd1) or from r2 back to r0 (fwd_rnd2).  Stack
// slots 0 and 1 at %esp are used for temporary storage.
//
// fwd_rnd1(arg, table)
//   arg    memory operand addressing the 16 bytes of round key that are
//          loaded for this round (passed on as sched to do_fcol)
//   table  base of the four lookup tables, passed to do_fcol and do_col
//
// r1 and r5 are spilled first because do_fcol overwrites them with round
// key words before they have been consumed as input columns; they are
// reloaded into r0 when their turn comes.  r4 needs no spill because
// do_fcol hands its old value over in the idx register.
//
// round column values
// on entry: r0,r1,r4,r5
// on exit:  r2,r1,r4,r5
// clobbers: r0 (used as idx), r3 (used as tmp), flags
#define fwd_rnd1(arg, table)						\
	save   (0,r1);							\
	save   (1,r5);							\
									\
	/* compute new column values */					\
	do_fcol(table, r2,r5,r4,r1, r0,r3, arg);	/* idx=r0 */	\
	do_col (table, r4,r1,r2,r5, r0,r3);		/* idx=r4 */	\
	restore(r0,0);							\
	do_col (table, r1,r2,r5,r4, r0,r3);		/* idx=r1 */	\
	restore(r0,1);							\
	do_col (table, r5,r4,r1,r2, r0,r3);		/* idx=r5 */

// fwd_rnd2(arg, table)
//
// Forward round with the first column entering in r2 and leaving in r0,
// i.e. the counterpart that follows a round which left its first column
// in r2.  Apart from r0 and r2 exchanging roles the sequence is the same:
// r1 and r5 are spilled to stack slots 0 and 1 because do_fcol overwrites
// them, and they are reloaded into r2 to be used as the idx register.
//
//   arg    memory operand addressing the 16 bytes of round key that are
//          loaded for this round (passed on as sched to do_fcol)
//   table  base of the four lookup tables, passed to do_fcol and do_col
//
// round column values
// on entry: r2,r1,r4,r5
// on exit:  r0,r1,r4,r5
// clobbers: r2 (used as idx), r3 (used as tmp), flags
#define fwd_rnd2(arg, table)						\
	save   (0,r1);							\
	save   (1,r5);							\
									\
	/* compute new column values */					\
	do_fcol(table, r0,r5,r4,r1, r2,r3, arg);	/* idx=r2 */	\
	do_col (table, r4,r1,r0,r5, r2,r3);		/* idx=r4 */	\
	restore(r2,0);							\
	do_col (table, r1,r0,r5,r4, r2,r3);		/* idx=r1 */	\
	restore(r2,1);							\
	do_col (table, r5,r4,r1,r0, r2,r3);		/* idx=r5 */

// Inverse (decryption) round macros.  inv_rnd1 and inv_rnd2 are meant to
// be used alternately: an inv_rnd1/inv_rnd2 pair is entered with the
// previous round column values in r0,r1,r4,r5 and exits with the new
// values in the same registers.  A single macro moves the first column
// from r0 to r2 (inv_rnd1) or from r2 back to r0 (inv_rnd2).  Stack
// slots 0 and 1 at %esp are used for temporary storage.
//
// inv_rnd1(arg, table)
//   arg    memory operand addressing the 16 bytes of round key that are
//          loaded for this round (passed on as sched to do_icol)
//   table  base of the four lookup tables, passed to do_icol and do_col
//
// r1 and r5 are spilled first because do_icol overwrites them with round
// key words before they have been consumed as input columns; they are
// reloaded into r0 when their turn comes.  r4 needs no spill because
// do_icol hands its old value over in the idx register.
//
// round column values
// on entry: r0,r1,r4,r5
// on exit:  r2,r1,r4,r5
// clobbers: r0 (used as idx), r3 (used as tmp), flags
#define inv_rnd1(arg, table)						\
	save    (0,r1);							\
	save    (1,r5);							\
									\
	/* compute new column values */					\
	do_icol(table, r2,r1,r4,r5, r0,r3, arg);	/* idx=r0 */	\
	do_col (table, r4,r5,r2,r1, r0,r3);		/* idx=r4 */	\
	restore(r0,0);							\
	do_col (table, r1,r4,r5,r2, r0,r3);		/* idx=r1 */	\
	restore(r0,1);							\
	do_col (table, r5,r2,r1,r4, r0,r3);		/* idx=r5 */

// inv_rnd2(arg, table)
//
// Inverse round with the first column entering in r2 and leaving in r0,
// i.e. the counterpart that follows a round which left its first column
// in r2.  Apart from r0 and r2 exchanging roles the sequence is the same:
// r1 and r5 are spilled to stack slots 0 and 1 because do_icol overwrites
// them, and they are reloaded into r2 to be used as the idx register.
//
//   arg    memory operand addressing the 16 bytes of round key that are
//          loaded for this round (passed on as sched to do_icol)
//   table  base of the four lookup tables, passed to do_icol and do_col
//
// round column values
// on entry: r2,r1,r4,r5
// on exit:  r0,r1,r4,r5
// clobbers: r2 (used as idx), r3 (used as tmp), flags
#define inv_rnd2(arg, table)						\
	save    (0,r1);							\
	save    (1,r5);							\
									\
	/* compute new column values */					\
	do_icol(table, r0,r1,r4,r5, r2,r3, arg);	/* idx=r2 */	\
	do_col (table, r4,r5,r0,r1, r2,r3);		/* idx=r4 */	\
	restore(r2,0);							\
	do_col (table, r1,r4,r5,r0, r2,r3);		/* idx=r1 */	\
	restore(r2,1);							\
	do_col (table, r5,r0,r1,r4, r2,r3);		/* idx=r5 */

// AES (Rijndael) Encryption Subroutine
/* void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
//
// Encrypts the 16 bytes at in_blk using the round keys that start at
// offset ekey in *ctx and stores the 16-byte result at out_blk.
//
// In:     all three arguments are read from the stack (ctx, out_blk and
//         in_blk are %esp offsets valid after one push)
// Out:    16 bytes written to out_blk; nothing is returned
// Saved:  ebp, ebx, esi, edi (pushed on entry, popped on exit)
// Clobb:  eax, ecx, edx, flags
// Stack:  four saved registers plus 8 bytes of scratch for save/restore
// The number of rounds (10, 12 or 14) is chosen from the klen field of
// *ctx: below 24, equal to 24, or above 24.

.global  aes_enc_blk

.extern  crypto_ft_tab
.extern  crypto_fl_tab

.align 4

aes_enc_blk:
	push    %ebp
	mov     ctx(%esp),%ebp		// ebp = ctx

// CAUTION: the order and the values used in these assigns
// rely on the register mappings

1:	push    %ebx
	mov     in_blk+4(%esp),%r2	// +4: a second register has been pushed
	push    %esi
	mov     klen(%ebp),%r3   // key size
	push    %edi
#if ekey != 0
	lea     ekey(%ebp),%ebp  // key pointer
#endif

// input four columns and xor in first round key

	mov     (%r2),%r0
	mov     4(%r2),%r1
	mov     8(%r2),%r4
	mov     12(%r2),%r5
	xor     (%ebp),%r0
	xor     4(%ebp),%r1
	xor     8(%ebp),%r4
	xor     12(%ebp),%r5

	sub     $8,%esp		// space for register saves on stack
	add     $16,%ebp	// increment to next round key

// Pick the entry point into the unrolled rounds from the key size.  ebp
// is advanced with lea rather than add so that the flags set by the cmp
// are still intact for the je.  After the adjustment the final round key
// is always at +144(%ebp), whichever entry point is taken.

	cmp     $24,%r3
	jb      4f		// 10 rounds for 128-bit key
	lea     32(%ebp),%ebp
	je      3f		// 12 rounds for 192-bit key
	lea     32(%ebp),%ebp

2:	fwd_rnd1( -64(%ebp), crypto_ft_tab)	// 14 rounds for 256-bit key
	fwd_rnd2( -48(%ebp), crypto_ft_tab)
3:	fwd_rnd1( -32(%ebp), crypto_ft_tab)	// 12 rounds for 192-bit key
	fwd_rnd2( -16(%ebp), crypto_ft_tab)
4:	fwd_rnd1(    (%ebp), crypto_ft_tab)	// 10 rounds for 128-bit key
	fwd_rnd2( +16(%ebp), crypto_ft_tab)
	fwd_rnd1( +32(%ebp), crypto_ft_tab)
	fwd_rnd2( +48(%ebp), crypto_ft_tab)
	fwd_rnd1( +64(%ebp), crypto_ft_tab)
	fwd_rnd2( +80(%ebp), crypto_ft_tab)
	fwd_rnd1( +96(%ebp), crypto_ft_tab)
	fwd_rnd2(+112(%ebp), crypto_ft_tab)
	fwd_rnd1(+128(%ebp), crypto_ft_tab)
	fwd_rnd2(+144(%ebp), crypto_fl_tab)	// last round uses a different table

// move final values to the output array.  CAUTION: the
// order of these assigns relies on the register mappings:
// each saved register is popped only after the state word
// that lives in it has been stored.

	add     $8,%esp			// drop the save/restore scratch space
	mov     out_blk+12(%esp),%ebp	// +12: ebx, esi, edi are still pushed
	mov     %r5,12(%ebp)
	pop     %edi
	mov     %r4,8(%ebp)
	pop     %esi
	mov     %r1,4(%ebp)
	pop     %ebx
	mov     %r0,(%ebp)
	pop     %ebp
	ret

// AES (Rijndael) Decryption Subroutine
/* void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
//
// Decrypts the 16 bytes at in_blk using the round keys that start at
// offset dkey in *ctx and stores the 16-byte result at out_blk.
//
// In:     all three arguments are read from the stack (ctx, out_blk and
//         in_blk are %esp offsets valid after one push)
// Out:    16 bytes written to out_blk; nothing is returned
// Saved:  ebp, ebx, esi, edi (pushed on entry, popped on exit)
// Clobb:  eax, ecx, edx, flags
// Stack:  four saved registers plus 8 bytes of scratch for save/restore
// The number of rounds (10, 12 or 14) is chosen from the klen field of
// *ctx: below 24, equal to 24, or above 24.

.global  aes_dec_blk

.extern  crypto_it_tab
.extern  crypto_il_tab

.align 4

aes_dec_blk:
	push    %ebp
	mov     ctx(%esp),%ebp		// ebp = ctx

// CAUTION: the order and the values used in these assigns
// rely on the register mappings

1:	push    %ebx
	mov     in_blk+4(%esp),%r2	// +4: a second register has been pushed
	push    %esi
	mov     klen(%ebp),%r3   // key size
	push    %edi
#if dkey != 0
	lea     dkey(%ebp),%ebp  // key pointer
#endif

// input four columns and xor in first round key

	mov     (%r2),%r0
	mov     4(%r2),%r1
	mov     8(%r2),%r4
	mov     12(%r2),%r5
	xor     (%ebp),%r0
	xor     4(%ebp),%r1
	xor     8(%ebp),%r4
	xor     12(%ebp),%r5

	sub     $8,%esp		// space for register saves on stack
	add     $16,%ebp	// increment to next round key

// Pick the entry point into the unrolled rounds from the key size.  ebp
// is advanced with lea rather than add so that the flags set by the cmp
// are still intact for the je.  After the adjustment the final round key
// is always at +144(%ebp), whichever entry point is taken.

	cmp     $24,%r3
	jb      4f		// 10 rounds for 128-bit key
	lea     32(%ebp),%ebp
	je      3f		// 12 rounds for 192-bit key
	lea     32(%ebp),%ebp

2:	inv_rnd1( -64(%ebp), crypto_it_tab)	// 14 rounds for 256-bit key
	inv_rnd2( -48(%ebp), crypto_it_tab)
3:	inv_rnd1( -32(%ebp), crypto_it_tab)	// 12 rounds for 192-bit key
	inv_rnd2( -16(%ebp), crypto_it_tab)
4:	inv_rnd1(    (%ebp), crypto_it_tab)	// 10 rounds for 128-bit key
	inv_rnd2( +16(%ebp), crypto_it_tab)
	inv_rnd1( +32(%ebp), crypto_it_tab)
	inv_rnd2( +48(%ebp), crypto_it_tab)
	inv_rnd1( +64(%ebp), crypto_it_tab)
	inv_rnd2( +80(%ebp), crypto_it_tab)
	inv_rnd1( +96(%ebp), crypto_it_tab)
	inv_rnd2(+112(%ebp), crypto_it_tab)
	inv_rnd1(+128(%ebp), crypto_it_tab)
	inv_rnd2(+144(%ebp), crypto_il_tab)	// last round uses a different table

// move final values to the output array.  CAUTION: the
// order of these assigns relies on the register mappings:
// each saved register is popped only after the state word
// that lives in it has been stored.

	add     $8,%esp			// drop the save/restore scratch space
	mov     out_blk+12(%esp),%ebp	// +12: ebx, esi, edi are still pushed
	mov     %r5,12(%ebp)
	pop     %edi
	mov     %r4,8(%ebp)
	pop     %esi
	mov     %r1,4(%ebp)
	pop     %ebx
	mov     %r0,(%ebp)
	pop     %ebp
	ret