Commit 6f6a6130 authored by Andy Polyakov

aes/asm/bsaes-*.pl: improve decrypt performance.

Improve decrypt performance by 10-20% depending on platform. Thanks
to Jussi Kivilinna for providing a valuable hint. Also thanks to Ard
Biesheuvel.
Parent 99fb2212
@@ -23,14 +23,14 @@
 # to collect performance results, which for Cortex-A8 core are:
 #
 # encrypt	19.5 cycles per byte processed with 128-bit key
-# decrypt	24.0 cycles per byte processed with 128-bit key
+# decrypt	22.1 cycles per byte processed with 128-bit key
 # key conv.	440  cycles per 128-bit key/0.18 of 8x block
 #
-# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 22.6,
+# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
 # which is [much] worse than anticipated (for further details see
 # http://www.openssl.org/~appro/Snapdragon-S4.html).
 #
-# Cortex-A15 manages in 14.2/19.6 cycles [when integer-only code
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
 # manages in 20.0 cycles].
 #
 # When comparing to x86_64 results keep in mind that NEON unit is
@@ -377,6 +377,7 @@ sub MixColumns {
 # modified to emit output in order suitable for feeding back to aesenc[last]
 my @x=@_[0..7];
 my @t=@_[8..15];
+my $inv=@_[16];	# optional
 $code.=<<___;
 	vext.8	@t[0], @x[0], @x[0], #12	@ x0 <<< 32
 	vext.8	@t[1], @x[1], @x[1], #12
@@ -417,8 +418,9 @@ $code.=<<___;
 	veor	@t[3], @t[3], @x[7]
 	vext.8	@x[6], @x[2], @x[2], #8
 	veor	@x[7], @t[1], @t[5]
+___
+$code.=<<___ if (!$inv);
 	veor	@x[2], @t[0], @t[4]
 	veor	@x[4], @x[4], @t[3]
 	veor	@x[5], @x[5], @t[7]
 	veor	@x[3], @x[3], @t[6]
@@ -426,9 +428,18 @@ $code.=<<___;
 	veor	@x[6], @x[6], @t[2]
 	@ vmov	@x[7], @t[1]
 ___
+$code.=<<___ if ($inv);
+	veor	@t[3], @t[3], @x[4]
+	veor	@x[5], @x[5], @t[7]
+	veor	@x[2], @x[3], @t[6]
+	veor	@x[3], @t[0], @t[4]
+	veor	@x[4], @x[6], @t[2]
+	vmov	@x[6], @t[3]
+	@ vmov	@x[7], @t[1]
+___
 }
-sub InvMixColumns {
+sub InvMixColumns_orig {
 my @x=@_[0..7];
 my @t=@_[8..15];
@@ -581,6 +592,54 @@ $code.=<<___;
 ___
 }
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
+# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
+$code.=<<___;
+	@ multiplication by 0x05-0x00-0x04-0x00
+	vext.8	@t[0], @x[0], @x[0], #8
+	vext.8	@t[6], @x[6], @x[6], #8
+	vext.8	@t[7], @x[7], @x[7], #8
+	veor	@t[0], @t[0], @x[0]
+	vext.8	@t[1], @x[1], @x[1], #8
+	veor	@t[6], @t[6], @x[6]
+	vext.8	@t[2], @x[2], @x[2], #8
+	veor	@t[7], @t[7], @x[7]
+	vext.8	@t[3], @x[3], @x[3], #8
+	veor	@t[1], @t[1], @x[1]
+	vext.8	@t[4], @x[4], @x[4], #8
+	veor	@t[2], @t[2], @x[2]
+	vext.8	@t[5], @x[5], @x[5], #8
+	veor	@t[3], @t[3], @x[3]
+	veor	@t[4], @t[4], @x[4]
+	veor	@t[5], @t[5], @x[5]
+	veor	@x[0], @x[0], @t[6]
+	veor	@x[1], @x[1], @t[6]
+	veor	@x[2], @x[2], @t[0]
+	veor	@x[4], @x[4], @t[2]
+	veor	@x[3], @x[3], @t[1]
+	veor	@x[1], @x[1], @t[7]
+	veor	@x[2], @x[2], @t[7]
+	veor	@x[4], @x[4], @t[6]
+	veor	@x[5], @x[5], @t[3]
+	veor	@x[3], @x[3], @t[6]
+	veor	@x[6], @x[6], @t[4]
+	veor	@x[4], @x[4], @t[7]
+	veor	@x[5], @x[5], @t[7]
+	veor	@x[7], @x[7], @t[5]
+___
+	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
+}
 sub swapmove {
 my ($a,$b,$n,$mask,$t)=@_;
 $code.=<<___;
...
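The new InvMixColumns above relies on the factorization quoted in its comment: the 0e-0b-0d-09 InvMixColumns matrix is the ordinary 02-03-01-01 MixColumns matrix multiplied by a much sparser 05-00-04-00 circulant, so the decrypt path can do a short pre-multiply and then reuse the existing MixColumns code (with the noted index flips). As a quick sanity check of that identity, here is a stand-alone Perl sketch, not part of the patch (gmul and the array names are mine), which multiplies the two circulant matrices over GF(2^8) and compares the product with the textbook InvMixColumns matrix:

#!/usr/bin/env perl
# Stand-alone sanity check, not part of the patch: verify the matrix
# factorization quoted in the new InvMixColumns comment over GF(2^8).
use strict;
use warnings;

sub gmul {				# GF(2^8) multiply modulo x^8+x^4+x^3+x+1
    my ($x, $y) = @_;
    my $p = 0;
    for (1 .. 8) {
	$p ^= $x if $y & 1;
	$x = (($x << 1) ^ ($x & 0x80 ? 0x1b : 0)) & 0xff;
	$y >>= 1;
    }
    return $p;
}

# first rows of the three circulant matrices from the comment
my @invmix = (0x0e, 0x0b, 0x0d, 0x09);	# InvMixColumns
my @mix    = (0x02, 0x03, 0x01, 0x01);	# MixColumns
my @sparse = (0x05, 0x00, 0x04, 0x00);	# cheap pre-multiply

for my $i (0 .. 3) {
    for my $j (0 .. 3) {
	my $acc = 0;
	for my $k (0 .. 3) {
	    # row $i of MixColumns times column $j of the sparse matrix
	    $acc ^= gmul($mix[($k - $i) % 4], $sparse[($j - $k) % 4]);
	}
	die "mismatch at ($i,$j)\n" if $acc != $invmix[($j - $i) % 4];
    }
}
print "0e-0b-0d-09 == 02-03-01-01 x 05-00-04-00 over GF(2^8)\n";

Run with plain perl, the sketch either dies on a mismatch or prints the confirmation line.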
@@ -83,9 +83,9 @@
 # Add decryption procedure. Performance in CPU cycles spent to decrypt
 # one byte out of 4096-byte buffer with 128-bit key is:
 #
-# Core 2	11.0
-# Nehalem	9.16
-# Atom		20.9
+# Core 2	9.83
+# Nehalem	7.74
+# Atom		18.9 (estimated, not measured yet)
 #
 # November 2011.
 #
@@ -456,6 +456,7 @@ sub MixColumns {
 # modified to emit output in order suitable for feeding back to aesenc[last]
 my @x=@_[0..7];
 my @t=@_[8..15];
+my $inv=@_[16];	# optional
 $code.=<<___;
 	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
 	pshufd	\$0x93, @x[1], @t[1]
@@ -497,7 +498,8 @@ $code.=<<___;
 	pxor	@t[4], @t[0]
 	pshufd	\$0x4E, @x[2], @x[6]
 	pxor	@t[5], @t[1]
+___
+$code.=<<___ if (!$inv);
 	pxor	@t[3], @x[4]
 	pxor	@t[7], @x[5]
 	pxor	@t[6], @x[3]
@@ -505,9 +507,20 @@ $code.=<<___;
 	pxor	@t[2], @x[6]
 	movdqa	@t[1], @x[7]
 ___
+$code.=<<___ if ($inv);
+	pxor	@x[4], @t[3]
+	pxor	@t[7], @x[5]
+	pxor	@x[3], @t[6]
+	movdqa	@t[0], @x[3]
+	pxor	@t[2], @x[6]
+	movdqa	@t[6], @x[2]
+	movdqa	@t[1], @x[7]
+	movdqa	@x[6], @x[4]
+	movdqa	@t[3], @x[6]
+___
 }
-sub InvMixColumns {
+sub InvMixColumns_orig {
 my @x=@_[0..7];
 my @t=@_[8..15];
@@ -661,6 +674,54 @@ $code.=<<___;
 ___
 }
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
+# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
+$code.=<<___;
+	# multiplication by 0x05-0x00-0x04-0x00
+	pshufd	\$0x4E, @x[0], @t[0]
+	pshufd	\$0x4E, @x[6], @t[6]
+	pxor	@x[0], @t[0]
+	pshufd	\$0x4E, @x[7], @t[7]
+	pxor	@x[6], @t[6]
+	pshufd	\$0x4E, @x[1], @t[1]
+	pxor	@x[7], @t[7]
+	pshufd	\$0x4E, @x[2], @t[2]
+	pxor	@x[1], @t[1]
+	pshufd	\$0x4E, @x[3], @t[3]
+	pxor	@x[2], @t[2]
+	pxor	@t[6], @x[0]
+	pxor	@t[6], @x[1]
+	pshufd	\$0x4E, @x[4], @t[4]
+	pxor	@x[3], @t[3]
+	pxor	@t[0], @x[2]
+	pxor	@t[1], @x[3]
+	pshufd	\$0x4E, @x[5], @t[5]
+	pxor	@x[4], @t[4]
+	pxor	@t[7], @x[1]
+	pxor	@t[2], @x[4]
+	pxor	@x[5], @t[5]
+	pxor	@t[7], @x[2]
+	pxor	@t[6], @x[3]
+	pxor	@t[6], @x[4]
+	pxor	@t[3], @x[5]
+	pxor	@t[4], @x[6]
+	pxor	@t[7], @x[4]
+	pxor	@t[7], @x[5]
+	pxor	@t[5], @x[7]
+___
+	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
+}
 sub aesenc {	# not used
 my @b=@_[0..7];
 my @t=@_[8..15];
...
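The x86_64 file gets the same treatment. In byte terms the 05-00-04-00 pre-multiply is cheap because 0x05 = 0x04 ^ 0x01, so each output byte reduces to b[i] ^ 04*(b[i] ^ b[i+2]): a rotation of the column by two bytes, one XOR and two doublings, which the pshufd/pxor block above appears to mirror on the bit-sliced registers before handing off to MixColumns. The stand-alone Perl reference below, again not part of the patch and with hypothetical helper names, models that two-step path on plain bytes and spot-checks it against the textbook 0e-0b-0d-09 formula:

#!/usr/bin/env perl
# Stand-alone byte-level model, not part of the patch: compute InvMixColumns
# as "multiply the column by 05-00-04-00, then run plain MixColumns" and
# compare against the textbook 0e-0b-0d-09 formula.
use strict;
use warnings;

sub xtime { my $v = shift; (($v << 1) ^ ($v & 0x80 ? 0x1b : 0)) & 0xff }

sub gfmul {				# GF(2^8) multiply by a small constant
    my ($k, $v) = @_;
    my $r = 0;
    while ($k) { $r ^= $v if $k & 1; $v = xtime($v); $k >>= 1 }
    return $r;
}

sub mul05_04 {				# 05*b[i] ^ 04*b[i+2] == b[i] ^ 04*(b[i] ^ b[i+2])
    my @b = @_;
    return map { $b[$_] ^ xtime(xtime($b[$_] ^ $b[($_ + 2) % 4])) } 0 .. 3;
}

sub mixcolumn {				# regular MixColumns on one column
    my @c = @_;
    return map { gfmul(0x02, $c[$_]) ^ gfmul(0x03, $c[($_ + 1) % 4]) ^
		 $c[($_ + 2) % 4] ^ $c[($_ + 3) % 4] } 0 .. 3;
}

sub invmixcolumn {			# textbook reference
    my @c = @_;
    return map { gfmul(0x0e, $c[$_]) ^ gfmul(0x0b, $c[($_ + 1) % 4]) ^
		 gfmul(0x0d, $c[($_ + 2) % 4]) ^ gfmul(0x09, $c[($_ + 3) % 4]) } 0 .. 3;
}

for (1 .. 10000) {			# spot-check random columns
    my @col  = map { int rand 256 } 0 .. 3;
    my @fast = mixcolumn(mul05_04(@col));
    my @ref  = invmixcolumn(@col);
    die "mismatch on column @col\n" unless "@fast" eq "@ref";
}
print "InvMixColumns(col) == MixColumns(05-00-04-00 * col) for all samples\n";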