From f22e1e4dd2bf02390385f6cb5bfabe7c5b74b397 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 2 Jan 2003 17:40:33 +0000 Subject: [PATCH] UltraSPARC assembler DES implementation tune-up. The code can be compiled for any SPARC CPU (UltraSPARC performance is *not* affected), can be compiled for 64-bit ABI and is position-independent. --- crypto/des/asm/des_enc.m4 | 570 ++++++++++++++++++++++++-------------- 1 file changed, 356 insertions(+), 214 deletions(-) diff --git a/crypto/des/asm/des_enc.m4 b/crypto/des/asm/des_enc.m4 index 8db30f33c4..ab99660ad5 100644 --- a/crypto/des/asm/des_enc.m4 +++ b/crypto/des/asm/des_enc.m4 @@ -7,6 +7,11 @@ ! ! June 8, 2000. ! +! Version 2.0. 32/64-bit, PIC-ification, blended CPU adaptation +! by Andy Polyakov. +! +! January 1, 2003. +! ! Assembler version: Copyright Svend Olaf Mikkelsen. ! ! Original C code: Copyright Eric A. Young. @@ -27,9 +32,45 @@ ! Assemble through gcc: gcc -c -mcpu=ultrasparc -o des_enc.o des_enc.S ! ! Assemble through cc: cc -c -xarch=v8plusa -o des_enc.o des_enc.S +! +! Performance improvement according to './apps/openssl speed des' +! +! 32-bit build: +! 23% faster than cc-5.2 -xarch=v8plus -xO5 +! 115% faster than gcc-3.2.1 -m32 -mcpu=ultrasparc -O5 +! 64-bit build: +! 50% faster than cc-5.2 -xarch=v9 -xO5 +! 100% faster than gcc-3.2.1 -m64 -mcpu=ultrasparc -O5 +! - -.ident "des_enc.m4 1.0" +.ident "des_enc.m4 2.0" + +#if defined(__SUNPRO_C) && defined(__sparcv9) +# define ABI64 /* They've said -xarch=v9 at command line */ +#elif defined(__GNUC__) && defined(__arch64__) +# define ABI64 /* They've said -m64 at command line */ +#endif + +#ifdef ABI64 + .register %g2,#scratch + .register %g3,#scratch +# define FRAME -192 +# define BIAS 2047 +# define LDPTR ldx +# define STPTR stx +# define ARG0 128 +# define ARGSZ 8 +# ifndef OPENSSL_SYSNAME_ULTRASPARC +# define OPENSSL_SYSNAME_ULTRASPARC +# endif +#else +# define FRAME -96 +# define BIAS 0 +# define LDPTR ld +# define STPTR st +# define ARG0 68 +# define ARGSZ 4 +#endif #define LOOPS 7 @@ -125,13 +166,13 @@ define(ip_macro, { srl $1, 16, local4 xor $2, local1, $2 - ifelse($9,1,{ld KS3, in4},{}) + ifelse($9,1,{LDPTR KS3, in4},{}) xor local4, $2, local4 - sethi %hi(des_SPtrans), global1 ! sbox addr + nop !sethi %hi(DES_SPtrans), global1 ! sbox addr - ifelse($9,1,{ld KS2, in3},{}) + ifelse($9,1,{LDPTR KS2, in3},{}) and local4, local2, local4 - or global1, %lo(des_SPtrans), global1 ! sbox addr + nop !or global1, %lo(DES_SPtrans), global1 ! sbox addr sll local4, 16, local1 xor $2, local4, $2 @@ -260,7 +301,7 @@ define(rounds_macro, { xor $2, out0, local1 ld [out2+284], local5 ! 0x0000FC00 - ba,pt %icc, $4 + ba $4 and local1, 252, local1 .align 32 @@ -273,16 +314,16 @@ $4: ld [global1+local1], local1 xor $2, out1, out1 ! 8642 xor $2, out0, out0 ! 7531 - fxor %f0, %f0, %f0 ! fxor used for alignment + fmovs %f0, %f0 ! fxor used for alignment srl out1, 4, local0 ! rotate 4 right and out0, local5, local3 ! 3 - fxor %f0, %f0, %f0 + fmovs %f0, %f0 ld [$5+$3*8], local7 ! key 7531 next round srl local3, 8, local3 ! 3 and local0, 252, local2 ! 2 - fxor %f0, %f0, %f0 + fmovs %f0, %f0 ld [global3+local3],local3 ! 3 sll out1, 28, out1 ! rotate @@ -380,7 +421,11 @@ $4: xor $2, local1, $2 ! 1 finished xor $2, local2, $2 ! 3 finished +#ifdef OPENSSL_SYSNAME_ULTRASPARC bne,pt %icc, $4 +#else + bne $4 +#endif and local4, 252, local1 ! sbox 1 next round ! two rounds more: @@ -551,10 +596,10 @@ define(fp_macro, { sethi %hi(0x0f0f0f0f), local4 sll local3, 2, local2 - ifelse($4,1, {ld INPUT, local5}) + ifelse($4,1, {LDPTR INPUT, local5}) xor $1, local3, $1 - ifelse($4,1, {ld OUTPUT, local7}) + ifelse($4,1, {LDPTR OUTPUT, local7}) srl $1, 16, local3 xor $2, local2, $2 xor local3, $2, local3 @@ -702,7 +747,7 @@ define(fp_ip_macro, { sll temp1, 4, temp2 xor $1, temp1, $1 - ifelse($5,1,{ld KS2, in4}) + ifelse($5,1,{LDPTR KS2, in4}) sll $4, 3, local2 xor local4, temp2, $2 @@ -713,7 +758,7 @@ define(fp_ip_macro, { srl $3, 29, local0 ifelse($5,1,{add in4, 120, in4}) - ifelse($5,1,{ld KS1, in3}) + ifelse($5,1,{LDPTR KS1, in3}) srl $4, 29, local7 or local0, local5, $4 @@ -738,6 +783,7 @@ define(load_little_endian, { ! first in memory to rightmost in register +#ifdef OPENSSL_SYSNAME_ULTRASPARC andcc $1, 3, global0 bne,pn %icc, $5 nop @@ -747,6 +793,7 @@ define(load_little_endian, { ba,pt %icc, $5a lda [$4] 0x88, $3 +#endif $5: ldub [$1+3], $2 @@ -799,6 +846,7 @@ define(load_little_endian_inc, { ! first in memory to rightmost in register +#ifdef OPENSSL_SYSNAME_ULTRASPARC andcc $1, 3, global0 bne,pn %icc, $5 nop @@ -809,6 +857,7 @@ define(load_little_endian_inc, { lda [$1] 0x88, $3 ba,pt %icc, $5a add $1, 4, $1 +#endif $5: ldub [$1+3], $2 @@ -863,17 +912,17 @@ define(load_n_bytes, { ! {load_n_bytes} ! $1 $2 $5 $6 $7 $8 $7 $8 $9 +$7.0: call .+8 sll $2, 2, $6 - sethi %hi($7.jmp.table), $5 - or $5, %lo($7.jmp.table), $5 + add %o7,$7.jmp.table-$7.0,$5 add $5, $6, $5 mov 0, $4 ld [$5], $5 - jmp $5 + jmp %o7+$5 mov 0, $3 $7.7: @@ -901,20 +950,20 @@ $7.2: or $4, $5, $4 $7.1: ldub [$1+0], $5 - ba,pt %icc, $8 + ba $8 or $4, $5, $4 .align 4 $7.jmp.table: .word 0 - .word $7.1 - .word $7.2 - .word $7.3 - .word $7.4 - .word $7.5 - .word $7.6 - .word $7.7 + .word $7.1-$7.0 + .word $7.2-$7.0 + .word $7.3-$7.0 + .word $7.4-$7.0 + .word $7.5-$7.0 + .word $7.6-$7.0 + .word $7.7-$7.0 }) @@ -932,6 +981,7 @@ define(store_little_endian, { ! rightmost in register to first in memory +#ifdef OPENSSL_SYSNAME_ULTRASPARC andcc $1, 3, global0 bne,pn %icc, $5 nop @@ -941,6 +991,7 @@ define(store_little_endian, { ba,pt %icc, $5a sta $3, [$4] 0x88 +#endif $5: and $2, 255, $4 @@ -995,15 +1046,16 @@ define(store_n_bytes, { ! {store_n_bytes} ! $1 $2 $5 $6 $7 $8 $7 $8 $9 +$7.0: call .+8 sll $2, 2, $6 - sethi %hi($7.jmp.table), $5 - or $5, %lo($7.jmp.table), $5 + add %o7,$7.jmp.table-$7.0,$5 add $5, $6, $5 ld [$5], $5 - jmp $5 + + jmp %o7+$5 nop $7.7: @@ -1032,7 +1084,7 @@ $7.1: and $4, 0xff, $5 - ba,pt %icc, $8 + ba $8 stub $5, [$1] .align 4 @@ -1040,13 +1092,13 @@ $7.1: $7.jmp.table: .word 0 - .word $7.1 - .word $7.2 - .word $7.3 - .word $7.4 - .word $7.5 - .word $7.6 - .word $7.7 + .word $7.1-$7.0 + .word $7.2-$7.0 + .word $7.3-$7.0 + .word $7.4-$7.0 + .word $7.5-$7.0 + .word $7.6-$7.0 + .word $7.7-$7.0 }) @@ -1089,64 +1141,6 @@ define(register_init, { }) - .global .des_and -.section ".rodata" - .align 8 - .type .des_and,#object - .size .des_and,284 - -.des_and: - -! This table is used for AND 0xFC when it is known that register -! bits 8-31 are zero. Makes it possible to do three arithmetic -! operations in one cycle. - - .byte 0, 0, 0, 0, 4, 4, 4, 4 - .byte 8, 8, 8, 8, 12, 12, 12, 12 - .byte 16, 16, 16, 16, 20, 20, 20, 20 - .byte 24, 24, 24, 24, 28, 28, 28, 28 - .byte 32, 32, 32, 32, 36, 36, 36, 36 - .byte 40, 40, 40, 40, 44, 44, 44, 44 - .byte 48, 48, 48, 48, 52, 52, 52, 52 - .byte 56, 56, 56, 56, 60, 60, 60, 60 - .byte 64, 64, 64, 64, 68, 68, 68, 68 - .byte 72, 72, 72, 72, 76, 76, 76, 76 - .byte 80, 80, 80, 80, 84, 84, 84, 84 - .byte 88, 88, 88, 88, 92, 92, 92, 92 - .byte 96, 96, 96, 96, 100, 100, 100, 100 - .byte 104, 104, 104, 104, 108, 108, 108, 108 - .byte 112, 112, 112, 112, 116, 116, 116, 116 - .byte 120, 120, 120, 120, 124, 124, 124, 124 - .byte 128, 128, 128, 128, 132, 132, 132, 132 - .byte 136, 136, 136, 136, 140, 140, 140, 140 - .byte 144, 144, 144, 144, 148, 148, 148, 148 - .byte 152, 152, 152, 152, 156, 156, 156, 156 - .byte 160, 160, 160, 160, 164, 164, 164, 164 - .byte 168, 168, 168, 168, 172, 172, 172, 172 - .byte 176, 176, 176, 176, 180, 180, 180, 180 - .byte 184, 184, 184, 184, 188, 188, 188, 188 - .byte 192, 192, 192, 192, 196, 196, 196, 196 - .byte 200, 200, 200, 200, 204, 204, 204, 204 - .byte 208, 208, 208, 208, 212, 212, 212, 212 - .byte 216, 216, 216, 216, 220, 220, 220, 220 - .byte 224, 224, 224, 224, 228, 228, 228, 228 - .byte 232, 232, 232, 232, 236, 236, 236, 236 - .byte 240, 240, 240, 240, 244, 244, 244, 244 - .byte 248, 248, 248, 248, 252, 252, 252, 252 - - ! 5 numbers for initil/final permutation - - .word 0x0f0f0f0f ! offset 256 - .word 0x0000ffff ! 260 - .word 0x33333333 ! 264 - .word 0x00ff00ff ! 268 - .word 0x55555555 ! 272 - - .word 0 ! 276 - .word LOOPS ! 280 - .word 0x0000FC00 ! 284 - - .section ".text" .align 32 @@ -1173,24 +1167,29 @@ define(register_init, { -! void des_encrypt(data, ks, enc) +! void DES_encrypt1(data, ks, enc) ! ******************************* .align 32 - .global des_encrypt - .type des_encrypt,#function + .global DES_encrypt1 + .type DES_encrypt1,#function + +DES_encrypt1: -des_encrypt: + save %sp, FRAME, %sp - save %sp, -96, %sp + call .PIC.me.up + mov .PIC.me.up-(.-4),out0 ld [in0], in5 ! left - sethi %hi(.des_and), out2 ! address constants cmp in2, 0 ! enc - ld [in0+4], out5 ! right +#ifdef OPENSSL_SYSNAME_ULTRASPARC be,pn %icc, .encrypt.dec ! enc/dec - or out2, %lo(.des_and), out2 ! address constants +#else + be .encrypt.dec +#endif + ld [in0+4], out5 ! right ! parameter 6 1/2 for include encryption/decryption ! parameter 7 1 for move in1 to in3 @@ -1198,12 +1197,12 @@ des_encrypt: ip_macro(in5, out5, in5, out5, in3, 0, 1, 1) - rounds_macro(in5, out5, 1, .des_encrypt.1, in3, in4) ! in4 not used + rounds_macro(in5, out5, 1, .des_encrypt1.1, in3, in4) ! in4 not used fp_macro(in5, out5, 1) ! 1 for store to [in0] - return in7+8 - nop + ret + restore .encrypt.dec: @@ -1217,34 +1216,35 @@ des_encrypt: fp_macro(out5, in5, 1) ! 1 for store to [in0] - return in7+8 - nop + ret + restore -.des_encrypt.end: - .size des_encrypt,.des_encrypt.end-des_encrypt +.DES_encrypt1.end: + .size DES_encrypt1,.DES_encrypt1.end-DES_encrypt1 -! void des_encrypt2(data, ks, enc) +! void DES_encrypt2(data, ks, enc) !********************************* ! encrypts/decrypts without initial/final permutation .align 32 - .global des_encrypt2 - .type des_encrypt2,#function + .global DES_encrypt2 + .type DES_encrypt2,#function -des_encrypt2: +DES_encrypt2: - save %sp, -112, %sp + save %sp, FRAME, %sp + + call .PIC.me.up + mov .PIC.me.up-(.-4),out0 ! Set sbox address 1 to 6 and rotate halfs 3 left ! Errors caught by destest? Yes. Still? *NO* - sethi %hi(des_SPtrans), global1 ! address sbox 1 - sethi %hi(.des_and), out2 ! address constants + !sethi %hi(DES_SPtrans), global1 ! address sbox 1 - or global1, %lo(des_SPtrans), global1 ! sbox 1 - or out2, %lo(.des_and), out2 ! adress constants + !or global1, %lo(DES_SPtrans), global1 ! sbox 1 add global1, 256, global2 ! sbox 2 add global1, 512, global3 ! sbox 3 @@ -1273,8 +1273,12 @@ des_encrypt2: ! we use our own stackframe +#ifdef OPENSSL_SYSNAME_ULTRASPARC be,pn %icc, .encrypt2.dec ! decryption - st in0, [%sp+68] +#else + be .encrypt2.dec +#endif + STPTR in0, [%sp+BIAS+ARG0+0*ARGSZ] ld [in3], out0 ! key 7531 first round mov LOOPS, out4 ! loop counter @@ -1291,13 +1295,13 @@ des_encrypt2: sll out5, 29, in1 add in5, in0, in5 srl out5, 3, out5 - ld [%sp+68], in0 + LDPTR [%sp+BIAS+ARG0+0*ARGSZ], in0 add out5, in1, out5 st in5, [in0] st out5, [in0+4] - return in7+8 - nop + ret + restore .encrypt2.dec: @@ -1324,36 +1328,37 @@ des_encrypt2: sll out5, 29, in1 add in5, in0, in5 srl out5, 3, out5 - ld [%sp+68], in0 + LDPTR [%sp+BIAS+ARG0+0*ARGSZ], in0 add out5, in1, out5 st out5, [in0] st in5, [in0+4] - return in7+8 - nop + ret + restore -.des_encrypt2.end: - .size des_encrypt2, .des_encrypt2.end-des_encrypt2 +.DES_encrypt2.end: + .size DES_encrypt2, .DES_encrypt2.end-DES_encrypt2 -! void des_encrypt3(data, ks1, ks2, ks3) +! void DES_encrypt3(data, ks1, ks2, ks3) ! ************************************** .align 32 - .global des_encrypt3 - .type des_encrypt3,#function + .global DES_encrypt3 + .type DES_encrypt3,#function -des_encrypt3: +DES_encrypt3: - save %sp, -96, %sp + save %sp, FRAME, %sp + call .PIC.me.up + mov .PIC.me.up-(.-4),out0 + ld [in0], in5 ! left add in2, 120, in4 ! ks2 - sethi %hi(.des_and), out2 ! address constants ld [in0+4], out5 ! right mov in3, in2 ! save ks3 - or out2, %lo(.des_and), out2 ! address constants ! parameter 6 1/2 for include encryption/decryption ! parameter 7 1 for mov in1 to in3 @@ -1370,31 +1375,32 @@ des_encrypt3: fp_macro(in5, out5, 1) - return in7+8 - nop + ret + restore -.des_encrypt3.end: - .size des_encrypt3,.des_encrypt3.end-des_encrypt3 +.DES_encrypt3.end: + .size DES_encrypt3,.DES_encrypt3.end-DES_encrypt3 -! void des_decrypt3(data, ks1, ks2, ks3) +! void DES_decrypt3(data, ks1, ks2, ks3) ! ************************************** .align 32 - .global des_decrypt3 - .type des_decrypt3,#function + .global DES_decrypt3 + .type DES_decrypt3,#function -des_decrypt3: +DES_decrypt3: - save %sp, -96, %sp + save %sp, FRAME, %sp + call .PIC.me.up + mov .PIC.me.up-(.-4),out0 + ld [in0], in5 ! left add in3, 120, in4 ! ks3 - sethi %hi(.des_and), out2 ld [in0+4], out5 ! right mov in2, in3 ! ks2 - or out2, %lo(.des_and), out2 ! parameter 6 1/2 for include encryption/decryption ! parameter 7 1 for mov in1 to in3 @@ -1411,44 +1417,128 @@ des_decrypt3: fp_macro(out5, in5, 1) - return in7+8 - nop + ret + restore + +.DES_decrypt3.end: + .size DES_decrypt3,.DES_decrypt3.end-DES_decrypt3 + +! input: out0 offset between .PIC.me.up and caller +! output: out0 pointer to .PIC.me.up +! out2 pointer to .des_and +! global1 pointer to DES_SPtrans + .align 32 +.PIC.me.up: + add out0,%o7,out0 ! pointer to .PIC.me.up + +#ifdef __PIC__ + sethi %hi(DES_SPtrans),global1 + or global1,%lo(DES_SPtrans),global1 + sethi %hi(_GLOBAL_OFFSET_TABLE_-(.PIC.me.up-.)),out2 + add global1,out0,global1 + add out2,%lo(_GLOBAL_OFFSET_TABLE_-(.PIC.me.up-.)),out2 + LDPTR [out2+global1],global1 +#else + setn DES_SPtrans,out2,global1 ! synthetic instruction ! +#endif + + retl + add out0,.des_and-.PIC.me.up,out2 + + .align 256 + .type .des_and,#object + .size .des_and,284 + +.des_and: + +! This table is used for AND 0xFC when it is known that register +! bits 8-31 are zero. Makes it possible to do three arithmetic +! operations in one cycle. + + .byte 0, 0, 0, 0, 4, 4, 4, 4 + .byte 8, 8, 8, 8, 12, 12, 12, 12 + .byte 16, 16, 16, 16, 20, 20, 20, 20 + .byte 24, 24, 24, 24, 28, 28, 28, 28 + .byte 32, 32, 32, 32, 36, 36, 36, 36 + .byte 40, 40, 40, 40, 44, 44, 44, 44 + .byte 48, 48, 48, 48, 52, 52, 52, 52 + .byte 56, 56, 56, 56, 60, 60, 60, 60 + .byte 64, 64, 64, 64, 68, 68, 68, 68 + .byte 72, 72, 72, 72, 76, 76, 76, 76 + .byte 80, 80, 80, 80, 84, 84, 84, 84 + .byte 88, 88, 88, 88, 92, 92, 92, 92 + .byte 96, 96, 96, 96, 100, 100, 100, 100 + .byte 104, 104, 104, 104, 108, 108, 108, 108 + .byte 112, 112, 112, 112, 116, 116, 116, 116 + .byte 120, 120, 120, 120, 124, 124, 124, 124 + .byte 128, 128, 128, 128, 132, 132, 132, 132 + .byte 136, 136, 136, 136, 140, 140, 140, 140 + .byte 144, 144, 144, 144, 148, 148, 148, 148 + .byte 152, 152, 152, 152, 156, 156, 156, 156 + .byte 160, 160, 160, 160, 164, 164, 164, 164 + .byte 168, 168, 168, 168, 172, 172, 172, 172 + .byte 176, 176, 176, 176, 180, 180, 180, 180 + .byte 184, 184, 184, 184, 188, 188, 188, 188 + .byte 192, 192, 192, 192, 196, 196, 196, 196 + .byte 200, 200, 200, 200, 204, 204, 204, 204 + .byte 208, 208, 208, 208, 212, 212, 212, 212 + .byte 216, 216, 216, 216, 220, 220, 220, 220 + .byte 224, 224, 224, 224, 228, 228, 228, 228 + .byte 232, 232, 232, 232, 236, 236, 236, 236 + .byte 240, 240, 240, 240, 244, 244, 244, 244 + .byte 248, 248, 248, 248, 252, 252, 252, 252 -.des_decrypt3.end: - .size des_decrypt3,.des_decrypt3.end-des_decrypt3 + ! 5 numbers for initil/final permutation + .word 0x0f0f0f0f ! offset 256 + .word 0x0000ffff ! 260 + .word 0x33333333 ! 264 + .word 0x00ff00ff ! 268 + .word 0x55555555 ! 272 + .word 0 ! 276 + .word LOOPS ! 280 + .word 0x0000FC00 ! 284 -! void des_ncbc_encrypt(input, output, length, schedule, ivec, enc) +! void DES_ncbc_encrypt(input, output, length, schedule, ivec, enc) ! ***************************************************************** .align 32 - .global des_ncbc_encrypt - .type des_ncbc_encrypt,#function + .global DES_ncbc_encrypt + .type DES_ncbc_encrypt,#function -des_ncbc_encrypt: +DES_ncbc_encrypt: - save %sp, -96, %sp + save %sp, FRAME, %sp - define({INPUT}, { [%sp+68] }) - define({OUTPUT}, { [%sp+72] }) - define({IVEC}, { [%sp+84] }) + define({INPUT}, { [%sp+BIAS+ARG0+0*ARGSZ] }) + define({OUTPUT}, { [%sp+BIAS+ARG0+1*ARGSZ] }) + define({IVEC}, { [%sp+BIAS+ARG0+4*ARGSZ] }) + + call .PIC.me.up + mov .PIC.me.up-(.-4),out0 cmp in5, 0 ! enc - sethi %hi(.des_and), out2 ! address constants +#ifdef OPENSSL_SYSNAME_ULTRASPARC be,pn %icc, .ncbc.dec - st in4, IVEC +#else + be .ncbc.dec +#endif + STPTR in4, IVEC ! addr left right temp label load_little_endian(in4, in5, out5, local3, .LLE1) ! iv addcc in2, -8, in2 ! bytes missing when first block done - mov in3, in4 ! schedule +#ifdef OPENSSL_SYSNAME_ULTRASPARC bl,pn %icc, .ncbc.enc.seven.or.less - or out2, %lo(.des_and), out2 +#else + bl .ncbc.enc.seven.or.less +#endif + mov in3, in4 ! schedule .ncbc.enc.next.block: @@ -1471,7 +1561,11 @@ des_ncbc_encrypt: rounds_macro(in5, out5, 1, .ncbc.enc.1, in3, in4) ! include encryption ks in3 +#ifdef OPENSSL_SYSNAME_ULTRASPARC bl,pn %icc, .ncbc.enc.next.block_fp +#else + bl .ncbc.enc.next.block_fp +#endif add in0, 8, in0 ! input address ! If 8 or more bytes are to be encrypted after this block, @@ -1501,7 +1595,7 @@ des_ncbc_encrypt: add global1, 512, global3 ! address sbox 3 since register used xor global4, local1, out5 ! iv xor next block - ba,pt %icc, .ncbc.enc.next.block_2 + ba .ncbc.enc.next.block_2 add in1, 8, in1 ! output adress .ncbc.enc.next.block_fp: @@ -1512,14 +1606,22 @@ des_ncbc_encrypt: addcc in2, -8, in2 ! bytes missing when next block done +#ifdef OPENSSL_SYSNAME_ULTRASPARC bpos,pt %icc, .ncbc.enc.next.block ! also jumps if 0 +#else + bpos .ncbc.enc.next.block +#endif add in1, 8, in1 .ncbc.enc.seven.or.less: cmp in2, -8 +#ifdef OPENSSL_SYSNAME_ULTRASPARC ble,pt %icc, .ncbc.enc.finish +#else + ble .ncbc.enc.finish +#endif nop add in2, 8, local1 ! bytes to load @@ -1532,25 +1634,28 @@ des_ncbc_encrypt: .ncbc.enc.finish: - ld IVEC, local4 + LDPTR IVEC, local4 store_little_endian(local4, in5, out5, local5, .SLE2) ! ivec - return in7+8 - nop + ret + restore .ncbc.dec: - st in0, INPUT + STPTR in0, INPUT cmp in2, 0 ! length add in3, 120, in3 - ld IVEC, local7 ! ivec + LDPTR IVEC, local7 ! ivec +#ifdef OPENSSL_SYSNAME_ULTRASPARC ble,pn %icc, .ncbc.dec.finish +#else + ble .ncbc.dec.finish +#endif mov in3, in4 ! schedule - st in1, OUTPUT - or out2, %lo(.des_and), out2 ! address constants low part + STPTR in1, OUTPUT mov in0, local5 ! input load_little_endian(local7, in0, in1, local3, .LLE3) ! ivec @@ -1571,7 +1676,11 @@ des_ncbc_encrypt: ! in2 is compared to 8 in the rounds xor out5, in0, out4 ! iv xor +#ifdef OPENSSL_SYSNAME_ULTRASPARC bl,pn %icc, .ncbc.dec.seven.or.less +#else + bl .ncbc.dec.seven.or.less +#endif xor in5, in1, global4 ! iv xor ! Load ivec next block now, since input and output address might be the same. @@ -1580,23 +1689,27 @@ des_ncbc_encrypt: store_little_endian(local7, out4, global4, local3, .SLE3) - st local5, INPUT + STPTR local5, INPUT add local7, 8, local7 addcc in2, -8, in2 +#ifdef OPENSSL_SYSNAME_ULTRASPARC bg,pt %icc, .ncbc.dec.next.block - st local7, OUTPUT +#else + bg .ncbc.dec.next.block +#endif + STPTR local7, OUTPUT .ncbc.dec.store.iv: - ld IVEC, local4 ! ivec + LDPTR IVEC, local4 ! ivec store_little_endian(local4, in0, in1, local5, .SLE4) .ncbc.dec.finish: - return in7+8 - nop + ret + restore .ncbc.dec.seven.or.less: @@ -1605,45 +1718,52 @@ des_ncbc_encrypt: store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB1, .ncbc.dec.store.iv) -.des_ncbc_encrypt.end: - .size des_ncbc_encrypt, .des_ncbc_encrypt.end-des_ncbc_encrypt +.DES_ncbc_encrypt.end: + .size DES_ncbc_encrypt, .DES_ncbc_encrypt.end-DES_ncbc_encrypt -! void des_ede3_cbc_encrypt(input, output, lenght, ks1, ks2, ks3, ivec, enc) +! void DES_ede3_cbc_encrypt(input, output, lenght, ks1, ks2, ks3, ivec, enc) ! ************************************************************************** .align 32 - .global des_ede3_cbc_encrypt - .type des_ede3_cbc_encrypt,#function + .global DES_ede3_cbc_encrypt + .type DES_ede3_cbc_encrypt,#function -des_ede3_cbc_encrypt: +DES_ede3_cbc_encrypt: - save %sp, -96, %sp + save %sp, FRAME, %sp - define({LENGTH},{ [%sp+76] }) - define({KS1}, { [%sp+80] }) - define({KS2}, { [%sp+84] }) - define({KS3}, { [%sp+88] }) + define({KS1}, { [%sp+BIAS+ARG0+3*ARGSZ] }) + define({KS2}, { [%sp+BIAS+ARG0+4*ARGSZ] }) + define({KS3}, { [%sp+BIAS+ARG0+5*ARGSZ] }) - ld [%fp+96], local3 ! enc - sethi %hi(.des_and), out2 + call .PIC.me.up + mov .PIC.me.up-(.-4),out0 - ld [%fp+92], local4 ! ivec - or out2, %lo(.des_and), out2 + LDPTR [%fp+BIAS+ARG0+7*ARGSZ], local3 ! enc + LDPTR [%fp+BIAS+ARG0+6*ARGSZ], local4 ! ivec cmp local3, 0 ! enc +#ifdef OPENSSL_SYSNAME_ULTRASPARC be,pn %icc, .ede3.dec - st in4, KS2 +#else + be .ede3.dec +#endif + STPTR in4, KS2 - st in5, KS3 + STPTR in5, KS3 load_little_endian(local4, in5, out5, local3, .LLE6) ! ivec addcc in2, -8, in2 ! bytes missing after next block +#ifdef OPENSSL_SYSNAME_ULTRASPARC bl,pn %icc, .ede3.enc.seven.or.less - st in3, KS1 +#else + bl .ede3.enc.seven.or.less +#endif + STPTR in3, KS1 .ede3.enc.next.block: @@ -1651,11 +1771,11 @@ des_ede3_cbc_encrypt: .ede3.enc.next.block_1: - ld KS2, in4 + LDPTR KS2, in4 xor in5, out4, in5 ! iv xor xor out5, global4, out5 ! iv xor - ld KS1, in3 + LDPTR KS1, in3 add in4, 120, in4 ! for decryption we use last subkey first nop @@ -1667,12 +1787,16 @@ des_ede3_cbc_encrypt: nop call .des_dec ! ks2 in4 - ld KS3, in3 + LDPTR KS3, in3 call .des_enc ! ks3 in3 compares in2 to 8 nop +#ifdef OPENSSL_SYSNAME_ULTRASPARC bl,pn %icc, .ede3.enc.next.block_fp +#else + bl .ede3.enc.next.block_fp +#endif add in0, 8, in0 ! If 8 or more bytes are to be encrypted after this block, @@ -1703,7 +1827,7 @@ des_ede3_cbc_encrypt: ld [in3+4], out1 ! key 8642 add global1, 768, global4 ! address sbox 4 - ba,pt %icc, .ede3.enc.next.block_2 + ba .ede3.enc.next.block_2 add in1, 8, in1 .ede3.enc.next.block_fp: @@ -1714,14 +1838,22 @@ des_ede3_cbc_encrypt: addcc in2, -8, in2 ! bytes missing when next block done +#ifdef OPENSSL_SYSNAME_ULTRASPARC bpos,pt %icc, .ede3.enc.next.block +#else + bpos .ede3.enc.next.block +#endif add in1, 8, in1 .ede3.enc.seven.or.less: cmp in2, -8 +#ifdef OPENSSL_SYSNAME_ULTRASPARC ble,pt %icc, .ede3.enc.finish +#else + ble .ede3.enc.finish +#endif nop add in2, 8, local1 ! bytes to load @@ -1731,29 +1863,32 @@ des_ede3_cbc_encrypt: .ede3.enc.finish: - ld [%fp+92], local4 ! ivec + LDPTR [%fp+BIAS+ARG0+6*ARGSZ], local4 ! ivec store_little_endian(local4, in5, out5, local5, .SLE6) ! ivec - return in7+8 - nop - + ret + restore .ede3.dec: - st in0, INPUT + STPTR in0, INPUT add in5, 120, in5 - st in1, OUTPUT + STPTR in1, OUTPUT mov in0, local5 add in3, 120, in3 - st in3, KS1 + STPTR in3, KS1 cmp in2, 0 +#ifdef OPENSSL_SYSNAME_ULTRASPARC ble %icc, .ede3.dec.finish - st in5, KS3 +#else + ble .ede3.dec.finish +#endif + STPTR in5, KS3 - ld [%fp+92], local7 ! iv + LDPTR [%fp+BIAS+ARG0+6*ARGSZ], local7 ! iv load_little_endian(local7, in0, in1, local3, .LLE8) .ede3.dec.next.block: @@ -1768,7 +1903,7 @@ des_ede3_cbc_encrypt: ip_macro(in5, out5, out5, in5, in4, 2, 0, 0, 1) ! inc .des_dec ks3 in4 call .des_enc ! ks2 in3 - ld KS1, in4 + LDPTR KS1, in4 call .des_dec ! ks1 in4 nop @@ -1779,30 +1914,37 @@ des_ede3_cbc_encrypt: ! in2 is compared to 8 in the rounds xor out5, in0, out4 +#ifdef OPENSSL_SYSNAME_ULTRASPARC bl,pn %icc, .ede3.dec.seven.or.less +#else + bl .ede3.dec.seven.or.less +#endif xor in5, in1, global4 load_little_endian_inc(local5, in0, in1, local3, .LLE10) ! iv next block store_little_endian(local7, out4, global4, local3, .SLE7) ! block - st local5, INPUT + STPTR local5, INPUT addcc in2, -8, in2 add local7, 8, local7 +#ifdef OPENSSL_SYSNAME_ULTRASPARC bg,pt %icc, .ede3.dec.next.block - st local7, OUTPUT +#else + bg .ede3.dec.next.block +#endif + STPTR local7, OUTPUT .ede3.dec.store.iv: - ld [%fp+92], local4 ! ivec + LDPTR [%fp+BIAS+ARG0+6*ARGSZ], local4 ! ivec store_little_endian(local4, in0, in1, local5, .SLE8) ! ivec .ede3.dec.finish: - return in7+8 - nop - + ret + restore .ede3.dec.seven.or.less: @@ -1811,5 +1953,5 @@ des_ede3_cbc_encrypt: store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB2, .ede3.dec.store.iv) -.des_ede3_cbc_encrypt.end: - .size des_ede3_cbc_encrypt,.des_ede3_cbc_encrypt.end-des_ede3_cbc_encrypt +.DES_ede3_cbc_encrypt.end: + .size DES_ede3_cbc_encrypt,.DES_ede3_cbc_encrypt.end-DES_ede3_cbc_encrypt -- GitLab