Commit 2d22e080 authored by Andy Polyakov

ARM assembler pack: reschedule instructions for dual-issue pipeline.

Modest improvement coefficients mean that the code already had some
parallelism and there was not very much room for improvement. Special
thanks to Ted Krovetz for benchmarking the code with such patience.
Parent 0852f90c
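To illustrate the kind of rescheduling involved, here is the byte-gathering sequence at the top of AES_encrypt before and after the change (a sketch of the pattern only; register names as in the perlasm source). Cortex A8 is a dual-issue in-order core, so a load can normally pair with an independent ALU instruction, and the patch therefore pulls independent ldrb loads in between the dependent orr chain instead of letting the orrs issue back to back:

    @ before: the orr chain issues alone, the next loads are bunched after it
    orr  $s0,$s0,$t1,lsl#8
    orr  $s0,$s0,$t2,lsl#16
    orr  $s0,$s0,$t3,lsl#24
    ldrb $s1,[$rounds,#7]
    ldrb $t1,[$rounds,#6]

    @ after: independent loads interleaved, each load can dual-issue with an orr
    orr  $s0,$s0,$t1,lsl#8
    ldrb $s1,[$rounds,#7]
    orr  $s0,$s0,$t2,lsl#16
    ldrb $t1,[$rounds,#6]
    orr  $s0,$s0,$t3,lsl#24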
@@ -16,12 +16,17 @@
# allows to merge logical or arithmetic operation with shift or rotate
# in one instruction and emit combined result every cycle. The module
# is endian-neutral. The performance is ~42 cycles/byte for 128-bit
# key [on single-issue Xscale PXA250 core].
# May 2007.
#
# AES_set_[en|de]crypt_key is added.
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 12% improvement on
# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
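The "merge logical or arithmetic operation with shift or rotate in one instruction" that the comment above relies on looks like this in the encrypt round further down (a two-line sketch): the table word is rotated into position and folded into the state without a separate rotate instruction ever being issued:

    ldr  $i1,[$tbl,$i1,lsl#2]    @ Te1[s1>>16], index scaled by 4 inside the load
    eor  $s0,$s0,$i1,ror#8       @ rotate and XOR combined in a single ALU op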
@@ -167,24 +172,24 @@ AES_encrypt:
ldrb $t2,[$rounds,#1] ldrb $t2,[$rounds,#1]
ldrb $t3,[$rounds,#0] ldrb $t3,[$rounds,#0]
orr $s0,$s0,$t1,lsl#8 orr $s0,$s0,$t1,lsl#8
orr $s0,$s0,$t2,lsl#16
orr $s0,$s0,$t3,lsl#24
ldrb $s1,[$rounds,#7] ldrb $s1,[$rounds,#7]
orr $s0,$s0,$t2,lsl#16
ldrb $t1,[$rounds,#6] ldrb $t1,[$rounds,#6]
orr $s0,$s0,$t3,lsl#24
ldrb $t2,[$rounds,#5] ldrb $t2,[$rounds,#5]
ldrb $t3,[$rounds,#4] ldrb $t3,[$rounds,#4]
orr $s1,$s1,$t1,lsl#8 orr $s1,$s1,$t1,lsl#8
orr $s1,$s1,$t2,lsl#16
orr $s1,$s1,$t3,lsl#24
ldrb $s2,[$rounds,#11] ldrb $s2,[$rounds,#11]
orr $s1,$s1,$t2,lsl#16
ldrb $t1,[$rounds,#10] ldrb $t1,[$rounds,#10]
orr $s1,$s1,$t3,lsl#24
ldrb $t2,[$rounds,#9] ldrb $t2,[$rounds,#9]
ldrb $t3,[$rounds,#8] ldrb $t3,[$rounds,#8]
orr $s2,$s2,$t1,lsl#8 orr $s2,$s2,$t1,lsl#8
orr $s2,$s2,$t2,lsl#16
orr $s2,$s2,$t3,lsl#24
ldrb $s3,[$rounds,#15] ldrb $s3,[$rounds,#15]
orr $s2,$s2,$t2,lsl#16
ldrb $t1,[$rounds,#14] ldrb $t1,[$rounds,#14]
orr $s2,$s2,$t3,lsl#24
ldrb $t2,[$rounds,#13] ldrb $t2,[$rounds,#13]
ldrb $t3,[$rounds,#12] ldrb $t3,[$rounds,#12]
orr $s3,$s3,$t1,lsl#8 orr $s3,$s3,$t1,lsl#8
@@ -199,24 +204,24 @@ AES_encrypt:
mov $t3,$s0,lsr#8 mov $t3,$s0,lsr#8
strb $t1,[$rounds,#0] strb $t1,[$rounds,#0]
strb $t2,[$rounds,#1] strb $t2,[$rounds,#1]
strb $t3,[$rounds,#2]
strb $s0,[$rounds,#3]
mov $t1,$s1,lsr#24 mov $t1,$s1,lsr#24
strb $t3,[$rounds,#2]
mov $t2,$s1,lsr#16 mov $t2,$s1,lsr#16
strb $s0,[$rounds,#3]
mov $t3,$s1,lsr#8 mov $t3,$s1,lsr#8
strb $t1,[$rounds,#4] strb $t1,[$rounds,#4]
strb $t2,[$rounds,#5] strb $t2,[$rounds,#5]
strb $t3,[$rounds,#6]
strb $s1,[$rounds,#7]
mov $t1,$s2,lsr#24 mov $t1,$s2,lsr#24
strb $t3,[$rounds,#6]
mov $t2,$s2,lsr#16 mov $t2,$s2,lsr#16
strb $s1,[$rounds,#7]
mov $t3,$s2,lsr#8 mov $t3,$s2,lsr#8
strb $t1,[$rounds,#8] strb $t1,[$rounds,#8]
strb $t2,[$rounds,#9] strb $t2,[$rounds,#9]
strb $t3,[$rounds,#10]
strb $s2,[$rounds,#11]
mov $t1,$s3,lsr#24 mov $t1,$s3,lsr#24
strb $t3,[$rounds,#10]
mov $t2,$s3,lsr#16 mov $t2,$s3,lsr#16
strb $s2,[$rounds,#11]
mov $t3,$s3,lsr#8 mov $t3,$s3,lsr#8
strb $t1,[$rounds,#12] strb $t1,[$rounds,#12]
strb $t2,[$rounds,#13] strb $t2,[$rounds,#13]
@@ -233,141 +238,137 @@ AES_encrypt:
.align 2 .align 2
_armv4_AES_encrypt: _armv4_AES_encrypt:
str lr,[sp,#-4]! @ push lr str lr,[sp,#-4]! @ push lr
ldr $t1,[$key],#16 ldmia $key!,{$t1-$i1}
ldr $t2,[$key,#-12]
ldr $t3,[$key,#-8]
ldr $i1,[$key,#-4]
ldr $rounds,[$key,#240-16]
eor $s0,$s0,$t1 eor $s0,$s0,$t1
ldr $rounds,[$key,#240-16]
eor $s1,$s1,$t2 eor $s1,$s1,$t2
eor $s2,$s2,$t3 eor $s2,$s2,$t3
eor $s3,$s3,$i1 eor $s3,$s3,$i1
sub $rounds,$rounds,#1 sub $rounds,$rounds,#1
mov lr,#255 mov lr,#255
.Lenc_loop: and $i1,lr,$s0
and $i2,lr,$s0,lsr#8 and $i2,lr,$s0,lsr#8
and $i3,lr,$s0,lsr#16 and $i3,lr,$s0,lsr#16
and $i1,lr,$s0
mov $s0,$s0,lsr#24 mov $s0,$s0,lsr#24
.Lenc_loop:
ldr $t1,[$tbl,$i1,lsl#2] @ Te3[s0>>0] ldr $t1,[$tbl,$i1,lsl#2] @ Te3[s0>>0]
ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24]
ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8]
ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16]
and $i1,lr,$s1,lsr#16 @ i0 and $i1,lr,$s1,lsr#16 @ i0
ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8]
and $i2,lr,$s1 and $i2,lr,$s1
ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16]
and $i3,lr,$s1,lsr#8 and $i3,lr,$s1,lsr#8
ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24]
mov $s1,$s1,lsr#24 mov $s1,$s1,lsr#24
ldr $i1,[$tbl,$i1,lsl#2] @ Te1[s1>>16] ldr $i1,[$tbl,$i1,lsl#2] @ Te1[s1>>16]
ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24]
ldr $i2,[$tbl,$i2,lsl#2] @ Te3[s1>>0] ldr $i2,[$tbl,$i2,lsl#2] @ Te3[s1>>0]
ldr $i3,[$tbl,$i3,lsl#2] @ Te2[s1>>8] ldr $i3,[$tbl,$i3,lsl#2] @ Te2[s1>>8]
eor $s0,$s0,$i1,ror#8 eor $s0,$s0,$i1,ror#8
eor $s1,$s1,$t1,ror#24 ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24]
eor $t2,$t2,$i2,ror#8
eor $t3,$t3,$i3,ror#8
and $i1,lr,$s2,lsr#8 @ i0 and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$t2,$i2,ror#8
and $i2,lr,$s2,lsr#16 @ i1 and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$t3,$i3,ror#8
and $i3,lr,$s2 and $i3,lr,$s2
mov $s2,$s2,lsr#24 eor $s1,$s1,$t1,ror#24
ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
mov $s2,$s2,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16] ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
eor $s0,$s0,$i1,ror#16 eor $s0,$s0,$i1,ror#16
eor $s1,$s1,$i2,ror#8 ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
eor $s2,$s2,$t2,ror#16
eor $t3,$t3,$i3,ror#16
and $i1,lr,$s3 @ i0 and $i1,lr,$s3 @ i0
eor $s1,$s1,$i2,ror#8
and $i2,lr,$s3,lsr#8 @ i1 and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,ror#16
and $i3,lr,$s3,lsr#16 @ i2 and $i3,lr,$s3,lsr#16 @ i2
mov $s3,$s3,lsr#24 eor $s2,$s2,$t2,ror#16
ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
mov $s3,$s3,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8] ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s0,$s0,$i1,ror#24 eor $s0,$s0,$i1,ror#24
ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s1,$s1,$i2,ror#16 eor $s1,$s1,$i2,ror#16
ldr $i1,[$key],#16
eor $s2,$s2,$i3,ror#8 eor $s2,$s2,$i3,ror#8
ldr $t1,[$key,#-12]
eor $s3,$s3,$t3,ror#8 eor $s3,$s3,$t3,ror#8
ldr $t1,[$key],#16 ldr $t2,[$key,#-8]
ldr $t2,[$key,#-12] eor $s0,$s0,$i1
ldr $t3,[$key,#-8] ldr $t3,[$key,#-4]
ldr $i1,[$key,#-4] and $i1,lr,$s0
eor $s0,$s0,$t1 eor $s1,$s1,$t1
eor $s1,$s1,$t2 and $i2,lr,$s0,lsr#8
eor $s2,$s2,$t3 eor $s2,$s2,$t2
eor $s3,$s3,$i1 and $i3,lr,$s0,lsr#16
eor $s3,$s3,$t3
mov $s0,$s0,lsr#24
subs $rounds,$rounds,#1 subs $rounds,$rounds,#1
bne .Lenc_loop bne .Lenc_loop
add $tbl,$tbl,#2 add $tbl,$tbl,#2
and $i1,lr,$s0
and $i2,lr,$s0,lsr#8
and $i3,lr,$s0,lsr#16
mov $s0,$s0,lsr#24
ldrb $t1,[$tbl,$i1,lsl#2] @ Te4[s0>>0] ldrb $t1,[$tbl,$i1,lsl#2] @ Te4[s0>>0]
ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24]
ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8]
ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16]
and $i1,lr,$s1,lsr#16 @ i0 and $i1,lr,$s1,lsr#16 @ i0
ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8]
and $i2,lr,$s1 and $i2,lr,$s1
ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16]
and $i3,lr,$s1,lsr#8 and $i3,lr,$s1,lsr#8
ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24]
mov $s1,$s1,lsr#24 mov $s1,$s1,lsr#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s1>>16] ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s1>>16]
ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24]
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s1>>0] ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s1>>0]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s1>>8] ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s1>>8]
eor $s0,$i1,$s0,lsl#8 eor $s0,$i1,$s0,lsl#8
eor $s1,$t1,$s1,lsl#24 ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24]
eor $t2,$i2,$t2,lsl#8
eor $t3,$i3,$t3,lsl#8
and $i1,lr,$s2,lsr#8 @ i0 and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$i2,$t2,lsl#8
and $i2,lr,$s2,lsr#16 @ i1 and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s2 and $i3,lr,$s2
mov $s2,$s2,lsr#24 eor $s1,$t1,$s1,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
mov $s2,$s2,lsr#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16] ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
eor $s0,$i1,$s0,lsl#8 eor $s0,$i1,$s0,lsl#8
eor $s1,$s1,$i2,lsl#16 ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
eor $s2,$t2,$s2,lsl#24
eor $t3,$i3,$t3,lsl#8
and $i1,lr,$s3 @ i0 and $i1,lr,$s3 @ i0
eor $s1,$s1,$i2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1 and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s3,lsr#16 @ i2 and $i3,lr,$s3,lsr#16 @ i2
mov $s3,$s3,lsr#24 eor $s2,$t2,$s2,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
mov $s3,$s3,lsr#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8] ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
eor $s0,$i1,$s0,lsl#8 eor $s0,$i1,$s0,lsl#8
ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
ldr $i1,[$key,#0]
eor $s1,$s1,$i2,lsl#8 eor $s1,$s1,$i2,lsl#8
ldr $t1,[$key,#4]
eor $s2,$s2,$i3,lsl#16 eor $s2,$s2,$i3,lsl#16
ldr $t2,[$key,#8]
eor $s3,$t3,$s3,lsl#24 eor $s3,$t3,$s3,lsl#24
ldr $t3,[$key,#12]
ldr lr,[sp],#4 @ pop lr eor $s0,$s0,$i1
ldr $t1,[$key,#0] eor $s1,$s1,$t1
ldr $t2,[$key,#4] eor $s2,$s2,$t2
ldr $t3,[$key,#8] eor $s3,$s3,$t3
ldr $i1,[$key,#12]
eor $s0,$s0,$t1
eor $s1,$s1,$t2
eor $s2,$s2,$t3
eor $s3,$s3,$i1
sub $tbl,$tbl,#2 sub $tbl,$tbl,#2
mov pc,lr @ return ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt .size _armv4_AES_encrypt,.-_armv4_AES_encrypt
.global AES_set_encrypt_key .global AES_set_encrypt_key
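Two of the smaller changes in _armv4_AES_encrypt above are easy to miss among the reordering (shown here in isolation as a sketch): the four separate round-key loads at function entry are collapsed into a single load-multiple, and the return reloads pc straight from the stack instead of popping lr and branching through it:

    ldmia $key!,{$t1-$i1}        @ rk[0..3] in one instruction, $key post-incremented
    ...
    ldr   pc,[sp],#4             @ pop the saved lr directly into pc and return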
@@ -402,31 +403,31 @@ AES_set_encrypt_key:
ldrb $t2,[$rounds,#1] ldrb $t2,[$rounds,#1]
ldrb $t3,[$rounds,#0] ldrb $t3,[$rounds,#0]
orr $s0,$s0,$t1,lsl#8 orr $s0,$s0,$t1,lsl#8
orr $s0,$s0,$t2,lsl#16
orr $s0,$s0,$t3,lsl#24
ldrb $s1,[$rounds,#7] ldrb $s1,[$rounds,#7]
orr $s0,$s0,$t2,lsl#16
ldrb $t1,[$rounds,#6] ldrb $t1,[$rounds,#6]
orr $s0,$s0,$t3,lsl#24
ldrb $t2,[$rounds,#5] ldrb $t2,[$rounds,#5]
ldrb $t3,[$rounds,#4] ldrb $t3,[$rounds,#4]
orr $s1,$s1,$t1,lsl#8 orr $s1,$s1,$t1,lsl#8
orr $s1,$s1,$t2,lsl#16
orr $s1,$s1,$t3,lsl#24
ldrb $s2,[$rounds,#11] ldrb $s2,[$rounds,#11]
orr $s1,$s1,$t2,lsl#16
ldrb $t1,[$rounds,#10] ldrb $t1,[$rounds,#10]
orr $s1,$s1,$t3,lsl#24
ldrb $t2,[$rounds,#9] ldrb $t2,[$rounds,#9]
ldrb $t3,[$rounds,#8] ldrb $t3,[$rounds,#8]
orr $s2,$s2,$t1,lsl#8 orr $s2,$s2,$t1,lsl#8
orr $s2,$s2,$t2,lsl#16
orr $s2,$s2,$t3,lsl#24
ldrb $s3,[$rounds,#15] ldrb $s3,[$rounds,#15]
orr $s2,$s2,$t2,lsl#16
ldrb $t1,[$rounds,#14] ldrb $t1,[$rounds,#14]
orr $s2,$s2,$t3,lsl#24
ldrb $t2,[$rounds,#13] ldrb $t2,[$rounds,#13]
ldrb $t3,[$rounds,#12] ldrb $t3,[$rounds,#12]
orr $s3,$s3,$t1,lsl#8 orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
str $s0,[$key],#16 str $s0,[$key],#16
orr $s3,$s3,$t2,lsl#16
str $s1,[$key,#-12] str $s1,[$key,#-12]
orr $s3,$s3,$t3,lsl#24
str $s2,[$key,#-8] str $s2,[$key,#-8]
str $s3,[$key,#-4] str $s3,[$key,#-4]
@@ -440,27 +441,26 @@ AES_set_encrypt_key:
.L128_loop: .L128_loop:
and $t2,lr,$s3,lsr#24 and $t2,lr,$s3,lsr#24
and $i1,lr,$s3,lsr#16 and $i1,lr,$s3,lsr#16
and $i2,lr,$s3,lsr#8
and $i3,lr,$s3
ldrb $t2,[$tbl,$t2] ldrb $t2,[$tbl,$t2]
and $i2,lr,$s3,lsr#8
ldrb $i1,[$tbl,$i1] ldrb $i1,[$tbl,$i1]
and $i3,lr,$s3
ldrb $i2,[$tbl,$i2] ldrb $i2,[$tbl,$i2]
ldrb $i3,[$tbl,$i3]
ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i1,lsl#24 orr $t2,$t2,$i1,lsl#24
ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i2,lsl#16 orr $t2,$t2,$i2,lsl#16
ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i3,lsl#8 orr $t2,$t2,$i3,lsl#8
eor $t2,$t2,$t1 eor $t2,$t2,$t1
eor $s0,$s0,$t2 @ rk[4]=rk[0]^... eor $s0,$s0,$t2 @ rk[4]=rk[0]^...
eor $s1,$s1,$s0 @ rk[5]=rk[1]^rk[4] eor $s1,$s1,$s0 @ rk[5]=rk[1]^rk[4]
eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5]
eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6]
str $s0,[$key],#16 str $s0,[$key],#16
eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5]
str $s1,[$key,#-12] str $s1,[$key,#-12]
eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6]
str $s2,[$key,#-8] str $s2,[$key,#-8]
str $s3,[$key,#-4]
subs $rounds,$rounds,#1 subs $rounds,$rounds,#1
str $s3,[$key,#-4]
bne .L128_loop bne .L128_loop
sub r2,$key,#176 sub r2,$key,#176
b .Ldone b .Ldone
@@ -471,16 +471,16 @@ AES_set_encrypt_key:
ldrb $t2,[$rounds,#17] ldrb $t2,[$rounds,#17]
ldrb $t3,[$rounds,#16] ldrb $t3,[$rounds,#16]
orr $i2,$i2,$t1,lsl#8 orr $i2,$i2,$t1,lsl#8
orr $i2,$i2,$t2,lsl#16
orr $i2,$i2,$t3,lsl#24
ldrb $i3,[$rounds,#23] ldrb $i3,[$rounds,#23]
orr $i2,$i2,$t2,lsl#16
ldrb $t1,[$rounds,#22] ldrb $t1,[$rounds,#22]
orr $i2,$i2,$t3,lsl#24
ldrb $t2,[$rounds,#21] ldrb $t2,[$rounds,#21]
ldrb $t3,[$rounds,#20] ldrb $t3,[$rounds,#20]
orr $i3,$i3,$t1,lsl#8 orr $i3,$i3,$t1,lsl#8
orr $i3,$i3,$t2,lsl#16 orr $i3,$i3,$t2,lsl#16
orr $i3,$i3,$t3,lsl#24
str $i2,[$key],#8 str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4] str $i3,[$key,#-4]
teq lr,#192 teq lr,#192
@@ -494,27 +494,26 @@ AES_set_encrypt_key:
.L192_loop: .L192_loop:
and $t2,lr,$i3,lsr#24 and $t2,lr,$i3,lsr#24
and $i1,lr,$i3,lsr#16 and $i1,lr,$i3,lsr#16
and $i2,lr,$i3,lsr#8
and $i3,lr,$i3
ldrb $t2,[$tbl,$t2] ldrb $t2,[$tbl,$t2]
and $i2,lr,$i3,lsr#8
ldrb $i1,[$tbl,$i1] ldrb $i1,[$tbl,$i1]
and $i3,lr,$i3
ldrb $i2,[$tbl,$i2] ldrb $i2,[$tbl,$i2]
ldrb $i3,[$tbl,$i3]
ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i1,lsl#24 orr $t2,$t2,$i1,lsl#24
ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i2,lsl#16 orr $t2,$t2,$i2,lsl#16
ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i3,lsl#8 orr $t2,$t2,$i3,lsl#8
eor $i3,$t2,$t1 eor $i3,$t2,$t1
eor $s0,$s0,$i3 @ rk[6]=rk[0]^... eor $s0,$s0,$i3 @ rk[6]=rk[0]^...
eor $s1,$s1,$s0 @ rk[7]=rk[1]^rk[6] eor $s1,$s1,$s0 @ rk[7]=rk[1]^rk[6]
eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7]
eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8]
str $s0,[$key],#24 str $s0,[$key],#24
eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7]
str $s1,[$key,#-20] str $s1,[$key,#-20]
eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8]
str $s2,[$key,#-16] str $s2,[$key,#-16]
str $s3,[$key,#-12]
subs $rounds,$rounds,#1 subs $rounds,$rounds,#1
str $s3,[$key,#-12]
subeq r2,$key,#216 subeq r2,$key,#216
beq .Ldone beq .Ldone
@@ -532,16 +531,16 @@ AES_set_encrypt_key:
ldrb $t2,[$rounds,#25] ldrb $t2,[$rounds,#25]
ldrb $t3,[$rounds,#24] ldrb $t3,[$rounds,#24]
orr $i2,$i2,$t1,lsl#8 orr $i2,$i2,$t1,lsl#8
orr $i2,$i2,$t2,lsl#16
orr $i2,$i2,$t3,lsl#24
ldrb $i3,[$rounds,#31] ldrb $i3,[$rounds,#31]
orr $i2,$i2,$t2,lsl#16
ldrb $t1,[$rounds,#30] ldrb $t1,[$rounds,#30]
orr $i2,$i2,$t3,lsl#24
ldrb $t2,[$rounds,#29] ldrb $t2,[$rounds,#29]
ldrb $t3,[$rounds,#28] ldrb $t3,[$rounds,#28]
orr $i3,$i3,$t1,lsl#8 orr $i3,$i3,$t1,lsl#8
orr $i3,$i3,$t2,lsl#16 orr $i3,$i3,$t2,lsl#16
orr $i3,$i3,$t3,lsl#24
str $i2,[$key],#8 str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4] str $i3,[$key,#-4]
mov $rounds,#14 mov $rounds,#14
@@ -553,52 +552,51 @@ AES_set_encrypt_key:
.L256_loop: .L256_loop:
and $t2,lr,$i3,lsr#24 and $t2,lr,$i3,lsr#24
and $i1,lr,$i3,lsr#16 and $i1,lr,$i3,lsr#16
and $i2,lr,$i3,lsr#8
and $i3,lr,$i3
ldrb $t2,[$tbl,$t2] ldrb $t2,[$tbl,$t2]
and $i2,lr,$i3,lsr#8
ldrb $i1,[$tbl,$i1] ldrb $i1,[$tbl,$i1]
and $i3,lr,$i3
ldrb $i2,[$tbl,$i2] ldrb $i2,[$tbl,$i2]
ldrb $i3,[$tbl,$i3]
ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i1,lsl#24 orr $t2,$t2,$i1,lsl#24
ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i2,lsl#16 orr $t2,$t2,$i2,lsl#16
ldr $t1,[$t3],#4 @ rcon[i++]
orr $t2,$t2,$i3,lsl#8 orr $t2,$t2,$i3,lsl#8
eor $i3,$t2,$t1 eor $i3,$t2,$t1
eor $s0,$s0,$i3 @ rk[8]=rk[0]^... eor $s0,$s0,$i3 @ rk[8]=rk[0]^...
eor $s1,$s1,$s0 @ rk[9]=rk[1]^rk[8] eor $s1,$s1,$s0 @ rk[9]=rk[1]^rk[8]
eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9]
eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10]
str $s0,[$key],#32 str $s0,[$key],#32
eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9]
str $s1,[$key,#-28] str $s1,[$key,#-28]
eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10]
str $s2,[$key,#-24] str $s2,[$key,#-24]
str $s3,[$key,#-20]
subs $rounds,$rounds,#1 subs $rounds,$rounds,#1
str $s3,[$key,#-20]
subeq r2,$key,#256 subeq r2,$key,#256
beq .Ldone beq .Ldone
and $t2,lr,$s3 and $t2,lr,$s3
and $i1,lr,$s3,lsr#8 and $i1,lr,$s3,lsr#8
and $i2,lr,$s3,lsr#16
and $i3,lr,$s3,lsr#24
ldrb $t2,[$tbl,$t2] ldrb $t2,[$tbl,$t2]
and $i2,lr,$s3,lsr#16
ldrb $i1,[$tbl,$i1] ldrb $i1,[$tbl,$i1]
and $i3,lr,$s3,lsr#24
ldrb $i2,[$tbl,$i2] ldrb $i2,[$tbl,$i2]
ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i1,lsl#8 orr $t2,$t2,$i1,lsl#8
ldrb $i3,[$tbl,$i3]
orr $t2,$t2,$i2,lsl#16 orr $t2,$t2,$i2,lsl#16
ldr $t1,[$key,#-48]
orr $t2,$t2,$i3,lsl#24 orr $t2,$t2,$i3,lsl#24
ldr $t1,[$key,#-48]
ldr $i1,[$key,#-44] ldr $i1,[$key,#-44]
ldr $i2,[$key,#-40] ldr $i2,[$key,#-40]
ldr $i3,[$key,#-36]
eor $t1,$t1,$t2 @ rk[12]=rk[4]^... eor $t1,$t1,$t2 @ rk[12]=rk[4]^...
ldr $i3,[$key,#-36]
eor $i1,$i1,$t1 @ rk[13]=rk[5]^rk[12] eor $i1,$i1,$t1 @ rk[13]=rk[5]^rk[12]
eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13]
eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14]
str $t1,[$key,#-16] str $t1,[$key,#-16]
eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13]
str $i1,[$key,#-12] str $i1,[$key,#-12]
eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14]
str $i2,[$key,#-8] str $i2,[$key,#-8]
str $i3,[$key,#-4] str $i3,[$key,#-4]
b .L256_loop b .L256_loop
@@ -819,24 +817,24 @@ AES_decrypt:
ldrb $t2,[$rounds,#1] ldrb $t2,[$rounds,#1]
ldrb $t3,[$rounds,#0] ldrb $t3,[$rounds,#0]
orr $s0,$s0,$t1,lsl#8 orr $s0,$s0,$t1,lsl#8
orr $s0,$s0,$t2,lsl#16
orr $s0,$s0,$t3,lsl#24
ldrb $s1,[$rounds,#7] ldrb $s1,[$rounds,#7]
orr $s0,$s0,$t2,lsl#16
ldrb $t1,[$rounds,#6] ldrb $t1,[$rounds,#6]
orr $s0,$s0,$t3,lsl#24
ldrb $t2,[$rounds,#5] ldrb $t2,[$rounds,#5]
ldrb $t3,[$rounds,#4] ldrb $t3,[$rounds,#4]
orr $s1,$s1,$t1,lsl#8 orr $s1,$s1,$t1,lsl#8
orr $s1,$s1,$t2,lsl#16
orr $s1,$s1,$t3,lsl#24
ldrb $s2,[$rounds,#11] ldrb $s2,[$rounds,#11]
orr $s1,$s1,$t2,lsl#16
ldrb $t1,[$rounds,#10] ldrb $t1,[$rounds,#10]
orr $s1,$s1,$t3,lsl#24
ldrb $t2,[$rounds,#9] ldrb $t2,[$rounds,#9]
ldrb $t3,[$rounds,#8] ldrb $t3,[$rounds,#8]
orr $s2,$s2,$t1,lsl#8 orr $s2,$s2,$t1,lsl#8
orr $s2,$s2,$t2,lsl#16
orr $s2,$s2,$t3,lsl#24
ldrb $s3,[$rounds,#15] ldrb $s3,[$rounds,#15]
orr $s2,$s2,$t2,lsl#16
ldrb $t1,[$rounds,#14] ldrb $t1,[$rounds,#14]
orr $s2,$s2,$t3,lsl#24
ldrb $t2,[$rounds,#13] ldrb $t2,[$rounds,#13]
ldrb $t3,[$rounds,#12] ldrb $t3,[$rounds,#12]
orr $s3,$s3,$t1,lsl#8 orr $s3,$s3,$t1,lsl#8
@@ -851,24 +849,24 @@ AES_decrypt:
mov $t3,$s0,lsr#8 mov $t3,$s0,lsr#8
strb $t1,[$rounds,#0] strb $t1,[$rounds,#0]
strb $t2,[$rounds,#1] strb $t2,[$rounds,#1]
strb $t3,[$rounds,#2]
strb $s0,[$rounds,#3]
mov $t1,$s1,lsr#24 mov $t1,$s1,lsr#24
strb $t3,[$rounds,#2]
mov $t2,$s1,lsr#16 mov $t2,$s1,lsr#16
strb $s0,[$rounds,#3]
mov $t3,$s1,lsr#8 mov $t3,$s1,lsr#8
strb $t1,[$rounds,#4] strb $t1,[$rounds,#4]
strb $t2,[$rounds,#5] strb $t2,[$rounds,#5]
strb $t3,[$rounds,#6]
strb $s1,[$rounds,#7]
mov $t1,$s2,lsr#24 mov $t1,$s2,lsr#24
strb $t3,[$rounds,#6]
mov $t2,$s2,lsr#16 mov $t2,$s2,lsr#16
strb $s1,[$rounds,#7]
mov $t3,$s2,lsr#8 mov $t3,$s2,lsr#8
strb $t1,[$rounds,#8] strb $t1,[$rounds,#8]
strb $t2,[$rounds,#9] strb $t2,[$rounds,#9]
strb $t3,[$rounds,#10]
strb $s2,[$rounds,#11]
mov $t1,$s3,lsr#24 mov $t1,$s3,lsr#24
strb $t3,[$rounds,#10]
mov $t2,$s3,lsr#16 mov $t2,$s3,lsr#16
strb $s2,[$rounds,#11]
mov $t3,$s3,lsr#8 mov $t3,$s3,lsr#8
strb $t1,[$rounds,#12] strb $t1,[$rounds,#12]
strb $t2,[$rounds,#13] strb $t2,[$rounds,#13]
@@ -885,146 +883,143 @@ AES_decrypt:
.align 2 .align 2
_armv4_AES_decrypt: _armv4_AES_decrypt:
str lr,[sp,#-4]! @ push lr str lr,[sp,#-4]! @ push lr
ldr $t1,[$key],#16 ldmia $key!,{$t1-$i1}
ldr $t2,[$key,#-12]
ldr $t3,[$key,#-8]
ldr $i1,[$key,#-4]
ldr $rounds,[$key,#240-16]
eor $s0,$s0,$t1 eor $s0,$s0,$t1
ldr $rounds,[$key,#240-16]
eor $s1,$s1,$t2 eor $s1,$s1,$t2
eor $s2,$s2,$t3 eor $s2,$s2,$t3
eor $s3,$s3,$i1 eor $s3,$s3,$i1
sub $rounds,$rounds,#1 sub $rounds,$rounds,#1
mov lr,#255 mov lr,#255
.Ldec_loop:
and $i1,lr,$s0,lsr#16 and $i1,lr,$s0,lsr#16
and $i2,lr,$s0,lsr#8 and $i2,lr,$s0,lsr#8
and $i3,lr,$s0 and $i3,lr,$s0
mov $s0,$s0,lsr#24 mov $s0,$s0,lsr#24
.Ldec_loop:
ldr $t1,[$tbl,$i1,lsl#2] @ Td1[s0>>16] ldr $t1,[$tbl,$i1,lsl#2] @ Td1[s0>>16]
ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24]
ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8]
ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0]
and $i1,lr,$s1 @ i0 and $i1,lr,$s1 @ i0
ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8]
and $i2,lr,$s1,lsr#16 and $i2,lr,$s1,lsr#16
ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0]
and $i3,lr,$s1,lsr#8 and $i3,lr,$s1,lsr#8
ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24]
mov $s1,$s1,lsr#24 mov $s1,$s1,lsr#24
ldr $i1,[$tbl,$i1,lsl#2] @ Td3[s1>>0] ldr $i1,[$tbl,$i1,lsl#2] @ Td3[s1>>0]
ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24]
ldr $i2,[$tbl,$i2,lsl#2] @ Td1[s1>>16] ldr $i2,[$tbl,$i2,lsl#2] @ Td1[s1>>16]
ldr $i3,[$tbl,$i3,lsl#2] @ Td2[s1>>8] ldr $i3,[$tbl,$i3,lsl#2] @ Td2[s1>>8]
eor $s0,$s0,$i1,ror#24 eor $s0,$s0,$i1,ror#24
eor $s1,$s1,$t1,ror#8 ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24]
eor $t2,$i2,$t2,ror#8
eor $t3,$i3,$t3,ror#8
and $i1,lr,$s2,lsr#8 @ i0 and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$i2,$t2,ror#8
and $i2,lr,$s2 @ i1 and $i2,lr,$s2 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s2,lsr#16 and $i3,lr,$s2,lsr#16
mov $s2,$s2,lsr#24 eor $s1,$s1,$t1,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
mov $s2,$s2,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0] ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
eor $s0,$s0,$i1,ror#16 eor $s0,$s0,$i1,ror#16
eor $s1,$s1,$i2,ror#24 ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
eor $s2,$s2,$t2,ror#8
eor $t3,$i3,$t3,ror#8
and $i1,lr,$s3,lsr#16 @ i0 and $i1,lr,$s3,lsr#16 @ i0
eor $s1,$s1,$i2,ror#24
and $i2,lr,$s3,lsr#8 @ i1 and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s3 @ i2 and $i3,lr,$s3 @ i2
mov $s3,$s3,lsr#24 eor $s2,$s2,$t2,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
mov $s3,$s3,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8] ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s0,$s0,$i1,ror#8 eor $s0,$s0,$i1,ror#8
ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s1,$s1,$i2,ror#16 eor $s1,$s1,$i2,ror#16
eor $s2,$s2,$i3,ror#24 eor $s2,$s2,$i3,ror#24
ldr $i1,[$key],#16
eor $s3,$s3,$t3,ror#8 eor $s3,$s3,$t3,ror#8
ldr $t1,[$key],#16 ldr $t1,[$key,#-12]
ldr $t2,[$key,#-12] ldr $t2,[$key,#-8]
ldr $t3,[$key,#-8] eor $s0,$s0,$i1
ldr $i1,[$key,#-4] ldr $t3,[$key,#-4]
eor $s0,$s0,$t1 and $i1,lr,$s0,lsr#16
eor $s1,$s1,$t2 eor $s1,$s1,$t1
eor $s2,$s2,$t3 and $i2,lr,$s0,lsr#8
eor $s3,$s3,$i1 eor $s2,$s2,$t2
and $i3,lr,$s0
eor $s3,$s3,$t3
mov $s0,$s0,lsr#24
subs $rounds,$rounds,#1 subs $rounds,$rounds,#1
bne .Ldec_loop bne .Ldec_loop
add $tbl,$tbl,#1024 add $tbl,$tbl,#1024
ldr $t1,[$tbl,#0] @ prefetch Td4 ldr $t2,[$tbl,#0] @ prefetch Td4
ldr $t2,[$tbl,#32] ldr $t3,[$tbl,#32]
ldr $t3,[$tbl,#64] ldr $t1,[$tbl,#64]
ldr $i1,[$tbl,#96] ldr $t2,[$tbl,#96]
ldr $i2,[$tbl,#128] ldr $t3,[$tbl,#128]
ldr $i3,[$tbl,#160] ldr $t1,[$tbl,#160]
ldr $t1,[$tbl,#192] ldr $t2,[$tbl,#192]
ldr $t2,[$tbl,#224] ldr $t3,[$tbl,#224]
and $i1,lr,$s0,lsr#16 ldrb $s0,[$tbl,$s0] @ Td4[s0>>24]
and $i2,lr,$s0,lsr#8
and $i3,lr,$s0
ldrb $s0,[$tbl,$s0,lsr#24] @ Td4[s0>>24]
ldrb $t1,[$tbl,$i1] @ Td4[s0>>16] ldrb $t1,[$tbl,$i1] @ Td4[s0>>16]
ldrb $t2,[$tbl,$i2] @ Td4[s0>>8]
ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
and $i1,lr,$s1 @ i0 and $i1,lr,$s1 @ i0
ldrb $t2,[$tbl,$i2] @ Td4[s0>>8]
and $i2,lr,$s1,lsr#16 and $i2,lr,$s1,lsr#16
ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
and $i3,lr,$s1,lsr#8 and $i3,lr,$s1,lsr#8
ldrb $i1,[$tbl,$i1] @ Td4[s1>>0] ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24] ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
ldrb $i2,[$tbl,$i2] @ Td4[s1>>16] ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
eor $s0,$i1,$s0,lsl#24 eor $s0,$i1,$s0,lsl#24
ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
eor $s1,$t1,$s1,lsl#8 eor $s1,$t1,$s1,lsl#8
eor $t2,$t2,$i2,lsl#8
eor $t3,$t3,$i3,lsl#8
and $i1,lr,$s2,lsr#8 @ i0 and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$t2,$i2,lsl#8
and $i2,lr,$s2 @ i1 and $i2,lr,$s2 @ i1
and $i3,lr,$s2,lsr#16 eor $t3,$t3,$i3,lsl#8
ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
and $i3,lr,$s2,lsr#16
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
eor $s0,$s0,$i1,lsl#8 eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
eor $s1,$i2,$s1,lsl#16 eor $s1,$i2,$s1,lsl#16
eor $s2,$t2,$s2,lsl#16
eor $t3,$t3,$i3,lsl#16
and $i1,lr,$s3,lsr#16 @ i0 and $i1,lr,$s3,lsr#16 @ i0
eor $s2,$t2,$s2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1 and $i2,lr,$s3,lsr#8 @ i1
and $i3,lr,$s3 @ i2 eor $t3,$t3,$i3,lsl#16
ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
and $i3,lr,$s3 @ i2
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16 eor $s0,$s0,$i1,lsl#16
ldr $i1,[$key,#0]
eor $s1,$s1,$i2,lsl#8 eor $s1,$s1,$i2,lsl#8
ldr $t1,[$key,#4]
eor $s2,$i3,$s2,lsl#8 eor $s2,$i3,$s2,lsl#8
ldr $t2,[$key,#8]
eor $s3,$t3,$s3,lsl#24 eor $s3,$t3,$s3,lsl#24
ldr $t3,[$key,#12]
ldr lr,[sp],#4 @ pop lr eor $s0,$s0,$i1
ldr $t1,[$key,#0] eor $s1,$s1,$t1
ldr $t2,[$key,#4] eor $s2,$s2,$t2
ldr $t3,[$key,#8] eor $s3,$s3,$t3
ldr $i1,[$key,#12]
eor $s0,$s0,$t1
eor $s1,$s1,$t2
eor $s2,$s2,$t3
eor $s3,$s3,$i1
sub $tbl,$tbl,#1024 sub $tbl,$tbl,#1024
mov pc,lr @ return ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_decrypt,.-_armv4_AES_decrypt .size _armv4_AES_decrypt,.-_armv4_AES_decrypt
.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" .asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 2 .align 2
...
@@ -19,6 +19,12 @@
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
@@ -123,12 +129,12 @@ gcm_ghash_4bit:
add $Zhh,$Htbl,$nlo,lsl#4 add $Zhh,$Htbl,$nlo,lsl#4
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
add $Thh,$Htbl,$nhi
ldrb $nlo,[$inp,#14] ldrb $nlo,[$inp,#14]
add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
mov $nhi,$nhi,lsl#1 add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4 eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28 eor $Zll,$Zll,$Zlh,lsl#28
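For reference, the reduction step that this loop keeps repeating (a sketch of the pattern used above): rem_4bit[] is a table of sixteen halfword constants held at sp in this routine, so the 4-bit remainder is doubled into a byte offset before the ldrh, and the patch writes that doubling as a plain add rather than a shifted mov, presumably because the plain form schedules more freely on the dual-issue pipeline:

    and   $nhi,$Zll,#0xf         @ rem = low 4 bits about to be shifted out of Z
    ldmia $Thh,{$Tll-$Thh}       @ load Htbl[nhi]
    add   $nhi,$nhi,$nhi         @ rem*2, since rem_4bit[] entries are 16-bit
    eor   $Zll,$Tll,$Zll,lsr#4   @ Z = (Z>>4) ^ Htbl[nhi], low word
    ldrh  $Tll,[sp,$nhi]         @ rem_4bit[rem]
    ...
    eor   $Zhh,$Zhh,$Tll,lsl#16  @ fold the reduction constant into the top of Z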
@@ -139,15 +145,15 @@ gcm_ghash_4bit:
eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4 eor $Zhh,$Thh,$Zhh,lsr#4
eor $nlo,$nlo,$nhi eor $nlo,$nlo,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16
and $nhi,$nlo,#0xf0 and $nhi,$nlo,#0xf0
and $nlo,$nlo,#0x0f and $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16
.Loop: .Loop:
add $Thh,$Htbl,$nlo,lsl#4 add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1 subs $cnt,$cnt,#1
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
and $nlo,$Zll,#0xf @ rem and $nlo,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
add $nlo,$nlo,$nlo add $nlo,$nlo,$nlo
eor $Zll,$Tll,$Zll,lsr#4 eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
@@ -161,22 +167,22 @@ gcm_ghash_4bit:
add $Thh,$Htbl,$nhi add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
and $nhi,$Zll,#0xf @ rem and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4 eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28 eor $Zll,$Zll,$Zlh,lsl#28
ldrplb $nhi,[$Xi,$cnt]
eor $Zlh,$Tlh,$Zlh,lsr#4 eor $Zlh,$Tlh,$Zlh,lsr#4
ldrplb $nhi,[$Xi,$cnt]
eor $Zlh,$Zlh,$Zhl,lsl#28 eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4 eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
eorpl $nlo,$nlo,$nhi eorpl $nlo,$nlo,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0 andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop bpl .Loop
ldr $len,[sp,#32] @ re-load $len/end ldr $len,[sp,#32] @ re-load $len/end
@@ -212,7 +218,7 @@ gcm_gmult_4bit:
add $Thh,$Htbl,$nhi add $Thh,$Htbl,$nhi
and $nhi,$Zll,#0xf @ rem and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
mov $nhi,$nhi,lsl#1 add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4 eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28 eor $Zll,$Zll,$Zlh,lsl#28
@@ -228,8 +234,8 @@ gcm_gmult_4bit:
.Loop2: .Loop2:
add $Thh,$Htbl,$nlo,lsl#4 add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1 subs $cnt,$cnt,#1
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
and $nlo,$Zll,#0xf @ rem and $nlo,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
add $nlo,$nlo,$nlo add $nlo,$nlo,$nlo
eor $Zll,$Tll,$Zll,lsr#4 eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
@@ -243,8 +249,8 @@ gcm_gmult_4bit:
add $Thh,$Htbl,$nhi add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
and $nhi,$Zll,#0xf @ rem and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
add $nhi,$nhi,$nhi add $nhi,$nhi,$nhi
eor $Zll,$Tll,$Zll,lsr#4 eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
@@ -255,8 +261,8 @@ gcm_gmult_4bit:
eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4 eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0 andpl $nhi,$nlo,#0xf0
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
andpl $nlo,$nlo,#0x0f andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop2 bpl .Loop2
___ ___
&Zsmash(); &Zsmash();
...
@@ -11,7 +11,12 @@
# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@@ -52,27 +57,27 @@ $code.=<<___ if ($i<16);
___ ___
$code.=<<___; $code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++ ldr $t2,[$Ktbl],#4 @ *K256++
str $T1,[sp,#`$i%16`*4]
mov $t0,$e,ror#$Sigma1[0] mov $t0,$e,ror#$Sigma1[0]
str $T1,[sp,#`$i%16`*4]
eor $t0,$t0,$e,ror#$Sigma1[1] eor $t0,$t0,$e,ror#$Sigma1[1]
eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
add $T1,$T1,$t0
eor $t1,$f,$g eor $t1,$f,$g
eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
and $t1,$t1,$e and $t1,$t1,$e
add $T1,$T1,$t0
eor $t1,$t1,$g @ Ch(e,f,g) eor $t1,$t1,$g @ Ch(e,f,g)
add $T1,$T1,$t1
add $T1,$T1,$h add $T1,$T1,$h
add $T1,$T1,$t2
mov $h,$a,ror#$Sigma0[0] mov $h,$a,ror#$Sigma0[0]
add $T1,$T1,$t1
eor $h,$h,$a,ror#$Sigma0[1] eor $h,$h,$a,ror#$Sigma0[1]
add $T1,$T1,$t2
eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
orr $t0,$a,$b orr $t0,$a,$b
and $t0,$t0,$c
and $t1,$a,$b and $t1,$a,$b
and $t0,$t0,$c
add $h,$h,$T1
orr $t0,$t0,$t1 @ Maj(a,b,c) orr $t0,$t0,$t1 @ Maj(a,b,c)
add $h,$h,$t0
add $d,$d,$T1 add $d,$d,$T1
add $h,$h,$T1 add $h,$h,$t0
___ ___
} }
@@ -80,19 +85,19 @@ sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___; $code.=<<___;
ldr $t1,[sp,#`($i+1)%16`*4] @ $i ldr $t1,[sp,#`($i+1)%16`*4] @ $i
ldr $t2,[sp,#`($i+14)%16`*4] ldr $t2,[sp,#`($i+14)%16`*4]
ldr $T1,[sp,#`($i+0)%16`*4] ldr $T1,[sp,#`($i+0)%16`*4]
ldr $inp,[sp,#`($i+9)%16`*4]
mov $t0,$t1,ror#$sigma0[0] mov $t0,$t1,ror#$sigma0[0]
ldr $inp,[sp,#`($i+9)%16`*4]
eor $t0,$t0,$t1,ror#$sigma0[1] eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
mov $t1,$t2,ror#$sigma1[0] mov $t1,$t2,ror#$sigma1[0]
add $T1,$T1,$t0
eor $t1,$t1,$t2,ror#$sigma1[1] eor $t1,$t1,$t2,ror#$sigma1[1]
add $T1,$T1,$inp
eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
add $T1,$T1,$t0
add $T1,$T1,$t1 add $T1,$T1,$t1
add $T1,$T1,$inp
___ ___
&BODY_00_15(@_); &BODY_00_15(@_);
} }
...
@@ -10,7 +10,13 @@
# SHA512 block procedure for ARMv4. September 2007.
# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
# Byte order [in]dependence. =========================================
#
@@ -73,33 +79,31 @@ $code.=<<___;
eor $t0,$t0,$Elo,lsl#23 eor $t0,$t0,$Elo,lsl#23
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e) eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
adds $Tlo,$Tlo,$t0 adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3 @ T += h
ldr $t0,[sp,#$Foff+0] @ f.lo ldr $t0,[sp,#$Foff+0] @ f.lo
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
ldr $t1,[sp,#$Foff+4] @ f.hi ldr $t1,[sp,#$Foff+4] @ f.hi
adds $Tlo,$Tlo,$t2
ldr $t2,[sp,#$Goff+0] @ g.lo ldr $t2,[sp,#$Goff+0] @ g.lo
adc $Thi,$Thi,$t3 @ T += h
ldr $t3,[sp,#$Goff+4] @ g.hi ldr $t3,[sp,#$Goff+4] @ g.hi
str $Elo,[sp,#$Eoff+0]
str $Ehi,[sp,#$Eoff+4]
str $Alo,[sp,#$Aoff+0]
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2 eor $t0,$t0,$t2
str $Elo,[sp,#$Eoff+0]
eor $t1,$t1,$t3 eor $t1,$t1,$t3
str $Ehi,[sp,#$Eoff+4]
and $t0,$t0,$Elo and $t0,$t0,$Elo
str $Alo,[sp,#$Aoff+0]
and $t1,$t1,$Ehi and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2 eor $t0,$t0,$t2
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t2,[$Ktbl,#4] @ K[i].lo ldr $t2,[$Ktbl,#4] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#0] @ K[i].hi ldr $t3,[$Ktbl,#0] @ K[i].hi
ldr $Elo,[sp,#$Doff+0] @ d.lo
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t0 adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2 adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3 @ T += K[i] adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo adds $Elo,$Elo,$Tlo
...